# Notebook for tests and experiments

In [42]:
import pandas as pd
import numpy as np
import re
import pytz
import os
from pathlib import Path
import sys
sys.path.append("/home/jovyan/shared/service-data")

from src.clean import clean_percentage, normalize_string, standardize_column_names, clean_fiscal_yr
from src.load import load_csv_from_raw
from src.export import export_to_csv
from src.merge import merge_si, merge_ss

In [43]:
# The input CSVs are looked up in an `inputs` folder that sits next to the
# current working directory (i.e. under the parent of the cwd).
base_dir = Path.cwd()
parent_dir = base_dir.parent

# Paths of the input CSV files, keyed by dataset name
data_files = {
    dataset: parent_dir / "inputs" / f"{dataset}.csv"
    for dataset in ("rbpo", "org_var", "serv_prog")
}

# Build the service inventory via the project helper.
# NOTE(review): the "Exported ..." messages shown under this cell presumably
# come from merge_si() -- confirm before relying on that side effect.
si = merge_si()

# Read the program-level extract and the service/program correspondence table
rbpo, serv_prog = (pd.read_csv(data_files[dataset]) for dataset in ("rbpo", "serv_prog"))

# Normalise column names, then clean the fiscal-year values one by one
rbpo = standardize_column_names(rbpo)
rbpo['fiscal_yr'] = rbpo['fiscal_yr'].apply(clean_fiscal_yr)

Exported dept.csv to /home/jovyan/shared/service-data/outputs/utils
Exported si.csv to /home/jovyan/shared/service-data/outputs


In [44]:
# Measure columns: spending and FTEs, planned and actual.
# The numeric suffix on the planned columns is turned into a year offset below
# (suffix 1 -> the report year itself, 2 -> one year later, 3 -> two years later).
fte_spend_cols = [
    'planned_spending_1', 'actual_spending', 'planned_spending_2', 'planned_spending_3',
    'planned_ftes_1', 'actual_ftes', 'planned_ftes_2', 'planned_ftes_3'
]

# Melt (unpivot) the DataFrame to long format: one row per id combination and measure column
drf = pd.melt(
    rbpo,
    id_vars=['fiscal_yr', 'org_id', 'program_id'],
    value_vars=fte_spend_cols,
    var_name='plan_actual_spendfte_yr',
    value_name='measure'
)

# Split 'plan_actual_spendfte_yr' into separate columns for planned/actual,
# spending/FTEs, and the year suffix. The 'actual_*' columns have no suffix, so
# their third part is missing; it defaults to '1', i.e. an adjustment of 0 years.
drf[['planned_actual', 'spending_fte', 'yr_adjust']] = drf['plan_actual_spendfte_yr'].str.split('_', expand=True)
drf['yr_adjust'] = drf['yr_adjust'].fillna('1').astype(int) - 1

# Calculate 4-digit 'measure_yr' and 'report_yr' from 'fiscal_yr' and 'yr_adjust'.
# The second half of 'fiscal_yr' (after the '-') is parsed once and reused.
fiscal_end_yr = drf['fiscal_yr'].str.split('-').str[1].astype(int)
drf['measure_yr'] = fiscal_end_yr + drf['yr_adjust']
drf['report_yr'] = fiscal_end_yr

# Latest fiscal year of the Service inventory (four digit fy, year of end of fy).
# Currently pinned by hand; the derived alternative is kept for reference:
# latest_si_fy = si['fiscal_yr'].str.split('-').str[1].astype(int).max()
latest_si_fy = 2024

# Separate actuals (report years up to the latest SI year) from planned
# values (report years after it). dropna() removes rows with any missing value.
drf_actuals = drf[
    (drf['planned_actual'] == 'actual') &
    (drf['report_yr'] <= latest_si_fy)
].dropna()

drf_planned = drf[
    (drf['planned_actual'] == 'planned') &
    (drf['report_yr'] > latest_si_fy)
].dropna()

# Each report year has 3 measure years for planned values.
# Only keep records that have the highest report year for that given program, measure type, and measure year
idx = drf_planned.groupby(['program_id', 'spending_fte', 'measure_yr'])['report_yr'].idxmax()
drf_planned = drf_planned.loc[idx]

# Checksums let the concat and pivot steps below be sanity-checked
drf_actuals_checksum = drf_actuals['measure'].sum()
drf_planned_checksum = drf_planned['measure'].sum()

print("drf_actuals.shape:", drf_actuals.shape)
print("checksum:", drf_actuals_checksum)
print("drf_planned.shape:", drf_planned.shape)
print("checksum:", drf_planned_checksum)

# Concatenate actuals and planned entries
drf = pd.concat([drf_actuals, drf_planned])
drf_checksum = drf['measure'].sum()

print("drf.shape:", drf.shape)
print("checksum:", drf_checksum)
print("checksum difference:", drf_checksum - (drf_planned_checksum+drf_actuals_checksum))
# info() prints its report itself and returns None, so it is not wrapped in print()
drf.info()

# Pivot to get a wide format table with spending/FTE columns.
# NOTE(review): pivot_table aggregates with the mean by default, so duplicate
# index/column combinations would be averaged silently; the checksum
# difference printed below is the guard against that.
print("pivoting drf")
drf = drf.pivot_table(
    index=['org_id', 'program_id', 'report_yr', 'measure_yr', 'planned_actual'],
    columns=['spending_fte'],
    values='measure'
).sort_values(
    by=['org_id', 'program_id', 'report_yr','measure_yr']
).reset_index()

print("drf.shape:", drf.shape)

ftes_checksum = drf['ftes'].sum()
print('ftes_checksum:', ftes_checksum)
spending_checksum = drf['spending'].sum()
print('spending_checksum:', spending_checksum)
print("checksum difference:", drf_checksum - (ftes_checksum+spending_checksum))
drf.info()

# Set up si_link_yr: a fiscal year column to be able to include years
# beyond the service inventory when joining by service id and fy.
# Measure years after the latest service fy are capped at the latest service fy;
# all other measure years are used unchanged.
drf['si_link_yr'] = drf['measure_yr'].clip(upper=latest_si_fy).astype(int)


drf_files = {
    "drf_actuals":drf_actuals,
    "drf_planned": drf_planned,
    "drf": drf
}


#export_to_csv(drf_files, Path.cwd())

drf_actuals.shape: (14540, 10)
checksum: 2173686010033.4302
drf_planned.shape: (4812, 10)
checksum: 1284942481118.59
drf.shape: (19352, 10)
checksum: 3458628491152.02
checksum difference: -0.00048828125
<class 'pandas.core.frame.DataFrame'>
Index: 19352 entries, 8508 to 33901
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   fiscal_yr                19352 non-null  object 
 1   org_id                   19352 non-null  int64  
 2   program_id               19352 non-null  object 
 3   plan_actual_spendfte_yr  19352 non-null  object 
 4   measure                  19352 non-null  float64
 5   planned_actual           19352 non-null  object 
 6   spending_fte             19352 non-null  object 
 7   yr_adjust                19352 non-null  int64  
 8   measure_yr               19352 non-null  int64  
 9   report_yr                19352 non-null  int64  
dtypes: float64(1), int64(4), object(5)
me

In [53]:
# Service-to-program links from the service inventory.
# explode() gives one row per program when 'program_id' holds list-like values
# (presumably the case for the merged inventory -- TODO confirm).
si_drf = si.loc[:, ['service_id', 'fiscal_yr', 'program_id']].explode('program_id')

# 4-digit year taken from the second half of 'fiscal_yr'; used as the join key
si_drf['si_yr'] = si_drf['fiscal_yr'].str.split('-').str[1].astype(int)

# Services without any program cannot be linked to program data
si_drf = si_drf[si_drf['program_id'].notna()]

# Left join keeps every service/program link, even without matching DRF rows;
# unmatched rows get NaN in the DRF columns.
service_fte_spending = pd.merge(
    si_drf,
    drf,
    how='left',
    left_on=['si_yr', 'program_id'],
    right_on=['si_link_yr', 'program_id']
)

# info() prints its report itself and returns None; wrapping it in print()
# only added a stray "None" line to the cell output.
service_fte_spending.info()
service_fte_spending


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30437 entries, 0 to 30436
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   service_id      30437 non-null  object 
 1   fiscal_yr       30437 non-null  object 
 2   program_id      30437 non-null  object 
 3   si_yr           30437 non-null  int64  
 4   org_id          29202 non-null  float64
 5   report_yr       29202 non-null  float64
 6   measure_yr      29202 non-null  float64
 7   planned_actual  29202 non-null  object 
 8   ftes            29107 non-null  float64
 9   spending        29125 non-null  float64
 10  si_link_yr      29202 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 2.6+ MB
None


Unnamed: 0,service_id,fiscal_yr,program_id,si_yr,org_id,report_yr,measure_yr,planned_actual,ftes,spending,si_link_yr
0,1000,2018-2019,BWM06,2019,129.0,2019.0,2019.0,actual,7.0,3.690273e+07,2019.0
1,1001,2022-2023,BGN01,2023,128.0,2023.0,2023.0,actual,3679.0,7.056224e+10,2023.0
2,1001,2018-2019,BGN01,2019,128.0,2019.0,2019.0,actual,2695.0,5.363957e+10,2019.0
3,1001,2019-2020,BGN01,2020,128.0,2020.0,2020.0,actual,2854.0,5.654511e+10,2020.0
4,1001,2020-2021,BGN01,2021,128.0,2021.0,2021.0,actual,2968.0,6.142159e+10,2021.0
...,...,...,...,...,...,...,...,...,...,...,...
30432,2105,2023-2024,BXR02,2024,246.0,2025.0,2027.0,planned,98.0,2.062208e+08,2024.0
30433,2106,2023-2024,BXR02,2024,246.0,2024.0,2024.0,actual,171.0,2.668840e+08,2024.0
30434,2106,2023-2024,BXR02,2024,246.0,2025.0,2025.0,planned,172.0,3.273289e+08,2024.0
30435,2106,2023-2024,BXR02,2024,246.0,2025.0,2026.0,planned,172.0,3.271138e+08,2024.0


In [None]:
# Older, index-based way of linking services to program spending/FTE data.
# NOTE(review): this cell has no execution count and reads `rbpo_melted`,
# which is not defined anywhere in the visible notebook; as written it would
# raise NameError on a fresh kernel. It also reassigns `service_fte_spending`.

# Set new multi-index for service inventory, drop existing collapsed program id column (temp1)
temp1 = si.set_index(['fiscal_yr','service_id']).drop(columns='program_id')

# Get the program_id into the service inventory
# Set index for service-program correspondence table (temp2)
temp2 = serv_prog.set_index(['fiscal_yr', 'service_id'])

# Join the service inventory (temp1) and the program correspondence table (temp2)
# on their shared (fiscal_yr, service_id) index; DataFrame.join defaults to a left join.
temp3 = temp1.join(temp2)

# then clean up this expanded service inventory (temp3): keep only rows that have a
# program_id and reset the index
temp3 = temp3[temp3['program_id'].notna()].reset_index()

# Generate a 4-digit year in the expanded service inventory (temp3) to link to the program data
# (second half of the 'YYYY-YYYY' fiscal year string)
temp3['si_link_yr'] = temp3['fiscal_yr'].str.split('-').str[1].astype(int)

# Set a new multi-index for the expanded service inventory (temp3).
# NOTE(review): no column is renamed here; this assumes temp3 already has an
# 'org_id' column matching the program table -- confirm.
temp3 = temp3.set_index(['si_link_yr', 'org_id', 'program_id'])

# Set index for program data (temp4)
# NOTE(review): `rbpo_melted` is undefined in the visible cells -- verify which frame is meant.
temp4 = rbpo_melted.set_index(['si_link_yr', 'org_id', 'program_id'])

# then join with expanded service inventory; overlapping column names get
# the '_si' / '_program' suffixes
service_fte_spending = temp3.join(temp4, lsuffix='_si', rsuffix='_program').reset_index()

import os
import csv
from collections import defaultdict

def get_csv_headers(directory, delimiter=';'):
    """
    Recursively scans a directory and its subdirectories for CSV files and extracts column headers.

    Files are visited in sorted order so the result is reproducible across
    filesystems. A UTF-8 byte-order mark at the start of a file is stripped
    instead of leaking into the first header, and empty files are skipped.

    Args:
        directory (str): Path to the directory containing CSV files.
        delimiter (str): Field delimiter used by the CSV files. Defaults to ';'.

    Returns:
        list: A list of tuples where each tuple contains the file path and a single column header.
    """
    headers = []

    # Walk through the directory and subdirectories
    for root, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend into subdirectories in a stable order
        dirs.sort()
        for filename in sorted(files):
            if not filename.lower().endswith(".csv"):
                continue

            file_path = os.path.join(root, filename)
            try:
                # utf-8-sig drops a leading BOM; newline="" is what the csv module expects
                with open(file_path, mode="r", encoding="utf-8-sig", newline="") as file:
                    # Read the first row as headers; None means the file has no rows
                    columns = next(csv.reader(file, delimiter=delimiter), None)
            except (OSError, UnicodeError, csv.Error) as e:
                # Best effort: report the unreadable file and carry on with the rest
                print(f"Error reading {file_path}: {e}")
                continue

            if columns is None:
                print(f"Skipping empty file {file_path}")
                continue

            headers.extend((file_path, col) for col in columns)

    return headers

def save_headers_to_csv(headers, output_file):
    """
    Appends the headers to a CSV file. If the file does not exist, it creates it.

    The title row ("File Path", "Column Header") is written whenever the
    output file is missing or exists but is still empty, so a pre-created
    empty file does not end up without a title row.

    Failures are reported on stdout rather than raised.

    Args:
        headers (list): List of tuples with file paths and column headers.
        output_file (str): Path to the output CSV file.
    """
    try:
        # Decide before opening: append mode would create a missing file
        needs_title_row = (
            not os.path.isfile(output_file) or os.path.getsize(output_file) == 0
        )
        with open(output_file, mode="a", encoding="utf-8", newline='') as file:
            writer = csv.writer(file)
            if needs_title_row:
                writer.writerow(["File Path", "Column Header"])
            writer.writerows(headers)
        print(f"Headers saved to {output_file}")
    except (OSError, UnicodeError, csv.Error) as e:
        print(f"Error saving to {output_file}: {e}")

def main():
    """
    Interactive entry point: ask for a source directory and an output file,
    then collect the CSV headers found under the directory and append them
    to the output file. Prints a message and does nothing else when the
    directory does not exist.
    """
    source_dir = input("Enter the directory path containing CSV files: ")
    destination = input("Enter the output file path (e.g., headers_output.csv): ")

    if os.path.isdir(source_dir):
        save_headers_to_csv(get_csv_headers(source_dir), destination)
    else:
        print("Invalid directory path.")

# Run the interactive prompts only when this code is executed as the main module.
# NOTE(review): inside a notebook kernel __name__ is also "__main__", so
# running this cell starts the input() prompts.
if __name__ == "__main__":
    main()

