In [None]:
import pandas as pd
import numpy as np
import json
import re, pytz, os, requests, sys
from pathlib import Path
from datetime import datetime
import sys
sys.path.append("/workspaces/service-data")

from src.clean import clean_percentage, clean_fiscal_yr, normalize_string, standardize_column_names
from src.load import load_csv, download_csv_files
from src.export import export_to_csv
from src.merge import merge_si, merge_ss
from main import get_config


# Directory the kernel is running in, and the directory that contains it.
base_dir = Path(os.getcwd())
parent_dir = base_dir.parent

In [None]:
# si = pd.read_csv("https://github.com/gcperformance/service-data/releases/latest/download/si.csv", 
#                  keep_default_na=False, 
#                  na_values='', 
#                  delimiter=';'
#                  )

# ss = pd.read_csv("https://github.com/gcperformance/service-data/releases/latest/download/ss.csv", 
#                  keep_default_na=False, 
#                  na_values='', 
#                  delimiter=';'
#                  )

# Project configuration; handed to load_csv() by the cells that follow.
config = get_config()

# Read the service inventory from <parent_dir>/outputs/si.csv.
# - Only empty strings are treated as missing (pandas' default NA tokens are disabled).
# - skipfooter discards the final two lines of the file and needs the python engine.
si_path = parent_dir.joinpath('outputs', 'si.csv')
si = pd.read_csv(
    si_path,
    delimiter=';',
    keep_default_na=False,
    na_values='',
    skipfooter=2,
    engine='python',
)

In [None]:
# Build a long-format table of planned/actual spending and FTE measures per program,
# tagged with the latest service-inventory fiscal year of the owning organization.

# Load the RBPO extract and normalize its column names and fiscal-year labels
drf = load_csv('rbpo.csv', config, False)
drf = standardize_column_names(drf)
drf['fiscal_yr'] = drf['fiscal_yr'].apply(clean_fiscal_yr)

# Columns holding planned and actual measures (spending and FTEs); these are unpivoted below
fte_spend_cols = [
    'planned_spending_1',
    'actual_spending',
    'planned_spending_2',
    'planned_spending_3',
    'planned_ftes_1',
    'actual_ftes',
    'planned_ftes_2',
    'planned_ftes_3'
]

# Melt (unpivot) the DataFrame to long format
drf = pd.melt(
    drf,
    id_vars=['fiscal_yr', 'org_id', 'program_id'],
    value_vars=fte_spend_cols,
    var_name='plan_actual_spendfte_yr',
    value_name='measure'
)

# Split the melted column name into planned/actual, spending/FTEs and the year suffix.
# 'actual_*' names carry no suffix, so the missing third part is treated as '1',
# which becomes a year offset of 0 after subtracting 1.
drf[['planned_actual', 'spending_fte', 'yr_adjust']] = drf['plan_actual_spendfte_yr'].str.split('_', expand=True)
drf['yr_adjust'] = drf['yr_adjust'].fillna('1').astype(int) - 1

# Calculate 4-digit 'report_yr' (end year of the reporting fiscal year) and
# 'measure_yr' (the year the measure applies to) from 'fiscal_yr' and 'yr_adjust'
fy_end = drf['fiscal_yr'].str.split('-').str[1].astype(int)
drf['report_yr'] = fy_end
drf['measure_yr'] = fy_end + drf['yr_adjust']

# Latest service-inventory fiscal year label per organization
latest_si_fy_by_org = si.groupby('org_id')['fiscal_yr'].max()

drf = pd.merge(drf, latest_si_fy_by_org, on='org_id', how='left', suffixes=['', '_si'])
# Organizations absent from the service inventory get the placeholder '0-0' (end year 0)
drf['fiscal_yr_si'] = drf['fiscal_yr_si'].fillna('0-0')
drf['latest_si_yr'] = drf['fiscal_yr_si'].str.split('-').str[1].astype(int)

# Split planned vs actual. Only a blank measure disqualifies a row; a bare dropna()
# would also discard rows because of nulls in unrelated columns.
drf_actuals = drf[drf['planned_actual'] == 'actual'].dropna(subset=['measure']).copy()
drf_planned = drf[drf['planned_actual'] == 'planned'].dropna(subset=['measure']).copy()

# Determine the highest report year that has actuals, per program and measure type
latest_actuals = (
    drf_actuals
    .groupby(['org_id', 'program_id', 'spending_fte'], as_index=False)['report_yr']
    .max()
    .rename(columns={'report_yr': 'report_yr_actuals'})
)

# Merge the latest actuals report year into the planned table
drf_planned = pd.merge(
    left=drf_planned,
    right=latest_actuals,
    how='left',
    on=['org_id', 'program_id', 'spending_fte']
)

# Only keep planned years that are greater than the latest actual report year.
# Programs with no actuals have a missing 'report_yr_actuals'; comparing against NaN is
# always False and would drop every planned value for them, so treat missing as year 0.
drf_planned = drf_planned[
    drf_planned['measure_yr'] > drf_planned['report_yr_actuals'].fillna(0)
]

# Each report year has 3 measure years for planned values.
# Only keep records that have the highest report year for that given program, measure type, and measure year
idx = drf_planned.groupby(['org_id', 'program_id', 'spending_fte', 'measure_yr'])['report_yr'].idxmax()
drf_planned = drf_planned.loc[idx]

# Concatenate actuals and planned entries
drf = pd.concat([drf_actuals, drf_planned])

drf = drf[[
    'org_id',
    'latest_si_yr',
    'program_id',
    'report_yr',
    'measure_yr',
    'planned_actual',
    'spending_fte',
    'measure']].reset_index(drop=True)

# Set up si_link_yr: a fiscal year column to be able to include years
# beyond the service inventory when joining by service id and fy.
# If measure year > latest service fy, use latest service fy, else use measure_yr.
drf['si_link_yr'] = (
    drf['measure_yr']
    .where(drf['measure_yr'] <= drf['latest_si_yr'], drf['latest_si_yr'])
    .astype(int)
)

# Return years to fiscal year YYYY-YYYY format.
# NOTE: organizations missing from the service inventory have latest_si_yr == 0 and
# therefore end up labelled '-1-0' in 'latest_si_yr' and 'si_link_yr'.
for yr_col in ['report_yr', 'measure_yr', 'si_link_yr', 'latest_si_yr']:
    drf[yr_col] = (drf[yr_col] - 1).astype(str) + "-" + drf[yr_col].astype(str)




In [None]:
# Spot-check: show every row for program 'ISS02' of organization 130
is_org_130 = drf['org_id'] == 130
is_iss02 = drf['program_id'] == 'ISS02'
drf[is_org_130 & is_iss02]
# drf['valid_plan'] = (drf['report_yr'] > latest_si_fy)

# drf[(drf['org_id']==130) & (drf['program_id']=='ISS02') & (drf['planned_actual']=='planned') & (drf['report_yr'] > latest_si_fy)]


In [None]:
# Build a long-format table of planned/actual spending and FTE measures per program,
# tagged with the latest service-inventory fiscal year of the owning organization.

# Load and normalize
drf = load_csv('rbpo.csv', config, False)
drf = standardize_column_names(drf)
drf['fiscal_yr'] = drf['fiscal_yr'].apply(clean_fiscal_yr)

# Define columns related to planned and actual measures: spending and FTEs
# These columns will be unpivoted / melted
fte_spend_cols = [
    'planned_spending_1',
    'actual_spending',
    'planned_spending_2',
    'planned_spending_3',
    'planned_ftes_1',
    'actual_ftes',
    'planned_ftes_2',
    'planned_ftes_3'
]

# Melt (unpivot) the DataFrame to long format
drf = pd.melt(
    drf,
    id_vars=['fiscal_yr', 'org_id', 'program_id'],
    value_vars=fte_spend_cols,
    var_name='plan_actual_spendfte_yr',
    value_name='measure'
)

# Split the melted column name into planned/actual, spending/FTEs, and year offset (e.g. _1, _2, _3).
# 'actual_*' names carry no suffix, so the missing third part is treated as '1',
# which becomes a year offset of 0 after subtracting 1.
drf[['planned_actual', 'spending_fte', 'yr_adjust']] = drf['plan_actual_spendfte_yr'].str.split('_', n=2, expand=True)
drf['yr_adjust'] = drf['yr_adjust'].fillna('1').astype(int) - 1

# Parse fiscal year end (YYYY-YYYY -> second part).
# The text is passed straight to to_numeric so errors='coerce' can turn missing or
# malformed fiscal years into NA; casting to int first would raise before coercion.
fy_end = pd.to_numeric(drf['fiscal_yr'].str.split('-').str[-1], errors='coerce')

# Calculate 4-digit 'measure_yr' and 'report_yr' from 'fiscal_yr' and 'yr_adjust'
drf['report_yr'] = fy_end.astype('Int64')
drf['measure_yr'] = (fy_end + drf['yr_adjust']).astype('Int64')

# Latest SI fiscal year per org (end year as a number)
si_latest = (
    si.assign(lat_end=pd.to_numeric(si['fiscal_yr'].str.split('-').str[-1], errors='coerce'))
      .groupby('org_id', as_index=False)['lat_end'].max()
      .rename(columns={'lat_end': 'latest_si_yr'})
)

drf = drf.merge(si_latest, on='org_id', how='left')

# Split planned vs actual. Drop blank measures, and rows whose fiscal year could not be
# parsed: without a report year they cannot take part in the year comparisons below
# and would otherwise surface as '<NA>-<NA>' labels in the output.
drf_actuals = drf[drf['planned_actual'] == 'actual'].dropna(subset=['measure', 'report_yr']).copy()
drf_planned = drf[drf['planned_actual'] == 'planned'].dropna(subset=['measure', 'report_yr']).copy()

# Determine the highest report year that has actuals, per program and measure type
latest_actuals = (
    drf_actuals
    .groupby(['org_id', 'program_id', 'spending_fte'], as_index=False)['report_yr']
    .max()
    .rename(columns={'report_yr': 'report_yr_actuals'})
)

# Merge the latest actuals report year into the planned table
drf_planned = drf_planned.merge(
    latest_actuals,
    on=['org_id', 'program_id', 'spending_fte'],
    how='left'
)

# Only keep planned years that are greater than the latest actual report year.
# fillna(0) ensures that all planned values are kept for programs that have no
# associated actual report year (every real measure year is greater than 0).
drf_planned = drf_planned[
    drf_planned['measure_yr'] > drf_planned['report_yr_actuals'].fillna(0)
]

# Each report year has 3 measure years for planned values.
# Only keep records that have the highest report year for that given program, measure type, and measure year
idx = (
    drf_planned
    .groupby(['org_id', 'program_id', 'spending_fte', 'measure_yr'])['report_yr']
    .idxmax()
)
drf_planned = drf_planned.loc[idx]

# Concatenate actuals and planned entries (ignore_index already yields a fresh 0..n-1 index)
drf = pd.concat([drf_actuals, drf_planned], ignore_index=True)

drf = drf[[
    'org_id',
    'latest_si_yr',
    'program_id',
    'report_yr',
    'measure_yr',
    'planned_actual',
    'spending_fte',
    'measure']]

# Set up si_link_yr: a fiscal year column to be able to include years
# beyond the service inventory when joining by service id and fy.
# If measure year > latest service fy, use latest service fy, else use measure_yr.
# Organizations missing from the service inventory get latest_si_yr = 0.
drf['latest_si_yr'] = drf['latest_si_yr'].fillna(0).astype(int)
drf['si_link_yr'] = (
    drf['measure_yr']
    .where(drf['measure_yr'] <= drf['latest_si_yr'], drf['latest_si_yr'])
    .astype('Int64')
)

# Return years to fiscal year YYYY-YYYY format.
# NOTE: organizations missing from the service inventory (latest_si_yr == 0) end up
# labelled '-1-0' in 'latest_si_yr' and 'si_link_yr'.
for yr_col in ['report_yr', 'measure_yr', 'si_link_yr', 'latest_si_yr']:
    drf[yr_col] = (drf[yr_col] - 1).astype(str) + "-" + drf[yr_col].astype(str)