# Figuring out the connection to DRF/DRR/DP figures

In [5]:
import pandas as pd
import numpy as np
import re
import pytz
import os
from pathlib import Path
import sys
# sys.path.append("/home/jovyan/shared/service-data")

from src.clean import clean_percentage, normalize_string, standardize_column_names, clean_fiscal_yr
from src.load import load_csv_from_raw
from src.export import export_to_csv
from src.merge import merge_si, merge_ss

ModuleNotFoundError: No module named 'src'

In [4]:
# Project locations are resolved relative to the notebook's working directory;
# the CSVs read below sit in an "inputs" folder one level above it.
base_dir = Path.cwd()
parent_dir = base_dir.parent

# Input CSV locations, keyed by dataset name
data_files = {
    "rbpo": parent_dir.joinpath("inputs", "rbpo.csv"),
    "org_var": parent_dir.joinpath("inputs", "org_var.csv"),
    "serv_prog": parent_dir.joinpath("inputs", "serv_prog.csv"),
}

# Load the service inventory (via merge_si) and the two CSV inputs used in this notebook
si = merge_si()
rbpo = pd.read_csv(data_files["rbpo"])
serv_prog = pd.read_csv(data_files["serv_prog"])

# Standardize rbpo's column names, then run each fiscal_yr value through clean_fiscal_yr
rbpo = standardize_column_names(rbpo)
rbpo['fiscal_yr'] = rbpo['fiscal_yr'].apply(clean_fiscal_yr)

NameError: name 'merge_si' is not defined

In [3]:
# Display `si` (the result of merge_si() in the previous cell) for inspection
si

NameError: name 'si' is not defined

# Measure columns to unpivot: planned/actual spending and FTEs.
# The list order is kept as-is because it fixes the row order of the melted frame.
fte_spend_cols = [
    'planned_spending_1',
    'actual_spending',
    'planned_spending_2',
    'planned_spending_3',
    'planned_ftes_1',
    'actual_ftes',
    'planned_ftes_2',
    'planned_ftes_3',
]

# Unpivot rbpo to long format: one row per (fiscal_yr, org_id, program_id, measure column)
drf = pd.melt(
    rbpo,
    id_vars=['fiscal_yr', 'org_id', 'program_id'],
    value_vars=fte_spend_cols,
    var_name='plan_actual_spendfte_yr',
    value_name='measure',
)

# Break 'plan_actual_spendfte_yr' on '_' into planned/actual, spending/ftes and an
# optional trailing year number (the 'actual_*' names have no third part).
name_parts = drf['plan_actual_spendfte_yr'].str.split('_', expand=True)
drf[['planned_actual', 'spending_fte', 'yr_adjust']] = name_parts
# A missing year number counts as 1; subtracting 1 turns it into a zero-based offset
drf['yr_adjust'] = drf['yr_adjust'].fillna('1').astype(int) - 1

# The part of 'fiscal_yr' after the '-' is parsed once as the integer report year;
# 'measure_yr' is that year shifted by 'yr_adjust'.
report_yr_values = drf['fiscal_yr'].str.split('-').str[1].astype(int)
drf['measure_yr'] = report_yr_values + drf['yr_adjust']
drf['report_yr'] = report_yr_values

# Latest fiscal year of the Service inventory (four-digit year in which the FY ends).
# Currently hard-coded; the commented line below derives it from `si` instead.
# latest_si_fy = si['fiscal_yr'].str.split('-').str[1].astype(int).max()
latest_si_fy = 2024

# Row masks for the two subsets
is_actual_row = drf['planned_actual'] == 'actual'
is_planned_row = drf['planned_actual'] == 'planned'
reported_by_latest_fy = drf['report_yr'] <= latest_si_fy
reported_after_latest_fy = drf['report_yr'] > latest_si_fy

# Actuals: 'actual' rows from reports up to and including latest_si_fy, without any NaN
drf_actuals = drf.loc[is_actual_row & reported_by_latest_fy].dropna()

# Planned: 'planned' rows from reports after latest_si_fy, without any NaN
drf_planned = drf.loc[is_planned_row & reported_after_latest_fy].dropna()

# Every report year carries three measure years of planned values, so the same
# (program, measure type, measure year) can appear in several reports.
# Keep only the row from the highest report year in each such group.
idx = (
    drf_planned
    .groupby(['program_id', 'spending_fte', 'measure_yr'])['report_yr']
    .idxmax()
)
drf_planned = drf_planned.loc[idx]

# Sum of 'measure' in each subset, used as a checksum to confirm that the
# concatenation below neither loses nor duplicates values.
drf_actuals_checksum = drf_actuals['measure'].sum()
drf_planned_checksum = drf_planned['measure'].sum()

print("drf_actuals.shape:", drf_actuals.shape)
print("checksum:", drf_actuals_checksum)
print("drf_planned.shape:", drf_planned.shape)
print("checksum:", drf_planned_checksum)

# Concatenate actuals and planned entries (this rebinds `drf` to the filtered rows)
drf = pd.concat([drf_actuals, drf_planned])
drf_checksum = drf['measure'].sum()

print("drf.shape:", drf.shape)
print("checksum:", drf_checksum)
# Expected to be 0 (up to floating-point rounding) if nothing was lost in the concat
print("checksum difference:", drf_checksum - (drf_planned_checksum+drf_actuals_checksum))
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
drf.info()

# Pivot to a wide table: one row per (org, program, report year, measure year,
# planned/actual) with one column per 'spending_fte' value.
# NOTE(review): pivot_table aggregates with the mean by default, so any duplicate
# key combinations would be averaged; the checksum difference printed below is the
# signal that this happened (or that values were otherwise lost) - confirm it is 0.
print("pivoting drf")
drf = drf.pivot_table(
    index=['org_id', 'program_id', 'report_yr', 'measure_yr', 'planned_actual'], 
    columns=['spending_fte'], 
    values='measure'
).sort_values(
    by=['org_id', 'program_id', 'report_yr','measure_yr']
).reset_index()

print("drf.shape:", drf.shape)

# Column totals after the pivot, compared against the pre-pivot checksum
ftes_checksum = drf['ftes'].sum()
print('ftes_checksum:', ftes_checksum)
spending_checksum = drf['spending'].sum()
print('spending_checksum:', spending_checksum)
print("checksum difference:", drf_checksum - (ftes_checksum+spending_checksum))
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
drf.info()

# si_link_yr: the fiscal year used to join against the service inventory, so that
# measure years beyond the inventory can still be matched by program and year.
# It equals measure_yr capped at latest_si_fy (later years map to latest_si_fy).
drf['si_link_yr'] = drf['measure_yr'].clip(upper=latest_si_fy).astype(int)


# The three frames keyed by name, as expected by the (currently disabled) export below
drf_files = {
    "drf_actuals": drf_actuals,
    "drf_planned": drf_planned,
    "drf": drf,
}


#export_to_csv(drf_files, Path.cwd())

# Build a service-to-program link table from the service inventory.
# explode() yields one row per element of 'program_id'
# (presumably a list-like column in `si` - TODO confirm).
si_drf = si.loc[:, ['service_id', 'fiscal_yr', 'program_id']]
si_drf = si_drf.explode('program_id')
# Integer year taken from the part of 'fiscal_yr' after the '-'
si_drf['si_yr'] = si_drf['fiscal_yr'].str.split('-').str[1].astype(int)
# Drop services with no program id to join on
si_drf = si_drf[si_drf['program_id'].notna()]

# Left join: keep every service/program row and attach the matching drf rows,
# matching the inventory year against drf's si_link_yr for the same program.
service_fte_spending = pd.merge(
    si_drf, 
    drf, 
    how='left', 
    left_on=['si_yr', 'program_id'], 
    right_on=['si_link_yr', 'program_id']
)

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
service_fte_spending.info()
service_fte_spending
