In [12]:
import pandas as pd
import numpy as np
import re, pytz, os, requests, sys
from pathlib import Path
from datetime import datetime
import sys
sys.path.append("/workspaces/service-data")

from src.clean import clean_percentage, clean_fiscal_yr, normalize_string, standardize_column_names
from src.load import load_csv, download_csv_files
from src.export import export_to_csv
from src.merge import merge_si, merge_ss
from main import get_config


base_dir = Path.cwd()
parent_dir = base_dir.parent

In [25]:
# si = pd.read_csv("https://github.com/gcperformance/service-data/releases/latest/download/si.csv", 
#                  keep_default_na=False, 
#                  na_values='', 
#                  delimiter=';'
#                  )

# ss = pd.read_csv("https://github.com/gcperformance/service-data/releases/latest/download/ss.csv", 
#                  keep_default_na=False, 
#                  na_values='', 
#                  delimiter=';'
#                  )

config=get_config()
download_csv_files(config)
org_var = load_csv('org_var.csv', config, snapshot=False)

Downloaded: si_2018.csv
Downloaded: si_2024.csv
Downloaded: ss_2018.csv
Downloaded: ss_2024.csv
Downloaded: org_var.csv
Downloaded: serv_prog.csv
Downloaded: ifoi_en.csv
Downloaded: ifoi_fr.csv
Downloaded: rbpo.csv


In [26]:
frames_en = []
frames_fr = []

for fiscal_yr, url in get_config()['program_csv_urls_en'].items():
    filename = url[(url.rfind('/')+1):url.rfind('.')]
    # print(filename)
    
    df = pd.read_csv(url)
    df['filename'] = filename
    df['fiscal_yr'] = fiscal_yr
    frames_en += [df]

for fiscal_yr, url in get_config()['program_csv_urls_fr'].items():
    filename = url[(url.rfind('/')+1):url.rfind('.')]
    # print(filename)
    
    df = pd.read_csv(url)
    df['filename'] = filename
    df['fiscal_yr'] = fiscal_yr
    frames_fr += [df]



In [28]:
program_df_en = pd.concat(frames_en)
program_df_en.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10237 entries, 0 to 1220
Data columns (total 13 columns):
 #   Column                                                                             Non-Null Count  Dtype 
---  ------                                                                             --------------  ----- 
 0   Ministry_name-Portefeuilleministériel_nom                                          10125 non-null  object
 1   Other-than-budgetary-expenditures_Autres-que-depenses-budgetaires                  112 non-null    object
 2   EntityDept_name_Eng-EntitéMin_nom_ang                                              10237 non-null  object
 3   ProgramorCoreResponsibility-ProgrammeouResponsabilitéessentielle_code_PROG         10237 non-null  object
 4   ProgramorCoreResponsibility_name-ProgrammeouResponsabilitéessentielle_nom_PROG     10237 non-null  object
 5   ProgramorCoreResponsibility-ProgrammeouResponsabilitéessentielle_description_PROG  7618 non-null   object
 6   Prog

In [27]:
program_df_en = pd.concat(frames_en)
program_df_en = program_df_en.merge(org_var, how='left', left_on='EntityDept_name_Eng-EntitéMin_nom_ang', right_on='org_name_variant')
program_df_en['code'] = program_df_en['ProgramInventory-Répertoiredesprogrammes_code_PROG'].combine_first(program_df_en['ProgramorCoreResponsibility-ProgrammeouResponsabilitéessentielle_code_PROG'])
program_df_en['name_en'] = program_df_en['ProgramInventory_name-Répertoiredesprogrammes_name_PROG'].combine_first(program_df_en['ProgramorCoreResponsibility_name-ProgrammeouResponsabilitéessentielle_nom_PROG'])

program_df_en.set_index(['fiscal_yr', 'org_id', 'code'])

program_df_en.head()

KeyError: 'ProgramInventory_name-Répertoiredesprogrammes_name_PROG'

In [None]:
program_df_fr = pd.concat(frames_fr)
program_df_fr = program_df_fr.merge(org_var, how='left', left_on='EntityDept_name_fra-EntitéMin_nom_fra', right_on='org_name_variant')
program_df_fr['code'] = program_df_fr['ProgramInventory-Répertoiredesprogrammes_code_PROG'].combine_first(program_df_en['ProgramorCoreResponsibility-ProgrammeouResponsabilitéessentielle_code_PROG'])
program_df_en['name_fr'] = program_df_en['ProgramInventory_name-Répertoiredesprogrammes_nom_PROG'].combine_first(program_df_en['ProgramorCoreResponsibility_name-ProgrammeouResponsabilitéessentielle_nom_PROG'])

program_df_fr.info()
# program_df_en.set_index(['fiscal_yr', 'org_id', 'code'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10237 entries, 0 to 10236
Data columns (total 15 columns):
 #   Column                                                                             Non-Null Count  Dtype  
---  ------                                                                             --------------  -----  
 0   Ministry_name-Portefeuilleministériel_nom                                          10139 non-null  object 
 1   Other-than-budgetary-expenditures_Autres-que-depenses-budgetaires                  98 non-null     object 
 2   EntityDept_name_fra-EntitéMin_nom_fra                                              10237 non-null  object 
 3   ProgramorCoreResponsibility-ProgrammeouResponsabilitéessentielle_code_PROG         10237 non-null  object 
 4   ProgramorCoreResponsibility_name-ProgrammeouResponsabilitéessentielle_nom_PROG     10237 non-null  object 
 5   ProgramorCoreResponsibility-ProgrammeouResponsabilitéessentielle_description_PROG  7633 non-null   obj

In [17]:
# missing_orgs = program_df_en[['EntityDept_name_Eng-EntitéMin_nom_ang', 'org_name_variant']][program_df_en['org_id'].isna()]
# missing_orgs['EntityDept_name_Eng-EntitéMin_nom_ang'].unique()