In [2]:
import pandas as pd
import numpy as np
import re, pytz, os, requests, sys
from pathlib import Path
from datetime import datetime
import sys
# Put the repository root on the module search path so the project imports
# that follow can be resolved. The membership check keeps repeated runs of this
# cell from stacking duplicate entries onto sys.path.
# NOTE(review): hard-coded absolute path — this only resolves where the repo is
# checked out at /workspaces/service-data; confirm before running elsewhere.
if "/workspaces/service-data" not in sys.path:
    sys.path.append("/workspaces/service-data")

from src.clean import clean_percentage, clean_fiscal_yr, normalize_string, standardize_column_names
from src.load import load_csv, download_csv_files
from src.export import export_to_csv
from src.merge import merge_si, merge_ss
from main import get_config


# Filesystem anchors for the notebook: the directory the kernel is currently
# running in, and the directory one level above it.
base_dir = Path.cwd()
parent_dir = base_dir.parent

In [3]:
# Disabled alternative: read si.csv and ss.csv directly from the latest GitHub
# release. Both files are semicolon-delimited, and only the empty string is
# treated as a missing value (keep_default_na=False, na_values='').
# si = pd.read_csv("https://github.com/gcperformance/service-data/releases/latest/download/si.csv", 
#                  keep_default_na=False, 
#                  na_values='', 
#                  delimiter=';'
#                  )

# ss = pd.read_csv("https://github.com/gcperformance/service-data/releases/latest/download/ss.csv", 
#                  keep_default_na=False, 
#                  na_values='', 
#                  delimiter=';'
#                  )

# Load the project configuration and hand it to the download helper, which
# reports each file it fetches (presumably the inputs named in the config —
# TODO confirm against src.load).
config=get_config()
download_csv_files(config)
# org_var is used by later cells to match department names (its
# 'org_name_variant' column) when deriving org_id.
# NOTE(review): the meaning of snapshot=False is defined in src.load and is not
# visible from this notebook.
org_var = load_csv('org_var.csv', config, snapshot=False)

Downloaded: si_2018.csv
Downloaded: si_2024.csv
Downloaded: ss_2018.csv
Downloaded: ss_2024.csv
Downloaded: org_var.csv
Downloaded: sid_registry.csv
Downloaded: serv_prog.csv
Downloaded: ifoi_en.csv
Downloaded: ifoi_fr.csv
Downloaded: rbpo.csv


In [4]:
# Build a bilingual program lookup: one row per (org_id, program_id) holding the
# English and French program names from the latest fiscal year in which that
# pair appears. Depends on `org_var`, which must already be loaded.

# Read the configuration once; both URL maps come from the same dict.
program_config = get_config()

frames_en = []
frames_fr = []

# Each entry pairs a fiscal-year label with the URL of that year's program CSV.
for fiscal_yr, url in program_config['program_csv_urls_en'].items():
    # Text between the URL's last '/' and its last '.', kept for traceability.
    filename = url[(url.rfind('/')+1):url.rfind('.')]

    df = pd.read_csv(url)
    df['filename'] = filename
    df['fiscal_yr'] = fiscal_yr
    frames_en.append(df)

for fiscal_yr, url in program_config['program_csv_urls_fr'].items():
    filename = url[(url.rfind('/')+1):url.rfind('.')]

    df = pd.read_csv(url)
    df['filename'] = filename
    df['fiscal_yr'] = fiscal_yr
    frames_fr.append(df)

# --- English program data ---
program_df_en = pd.concat(frames_en)
# Left-join org_var on the English department name so every program row is
# kept; org_id (presumably supplied by org_var — confirm) stays NaN when the
# name has no match.
program_df_en = program_df_en.merge(org_var, how='left', left_on='EntityDept_name_Eng-EntitéMin_nom_ang', right_on='org_name_variant')
# Prefer the program-inventory code/name; fall back to the
# program-or-core-responsibility column where the former is null.
program_df_en['program_id'] = program_df_en['ProgramInventory-Répertoiredesprogrammes_code_PROG'].combine_first(program_df_en['ProgramorCoreResponsibility-ProgrammeouResponsabilitéessentielle_code_PROG'])
program_df_en['program_en'] = program_df_en['ProgramInventory_name-Répertoiredesprogrammes_nom_PROG'].combine_first(program_df_en['ProgramorCoreResponsibility_name-ProgrammeouResponsabilitéessentielle_nom_PROG'])

program_df_en = program_df_en.set_index(['fiscal_yr', 'org_id', 'program_id'])

# --- French program data (same steps, matched on the French department name) ---
program_df_fr = pd.concat(frames_fr)
program_df_fr = program_df_fr.merge(org_var, how='left', left_on='EntityDept_name_fra-EntitéMin_nom_fra', right_on='org_name_variant')
program_df_fr['program_id'] = program_df_fr['ProgramInventory-Répertoiredesprogrammes_code_PROG'].combine_first(program_df_fr['ProgramorCoreResponsibility-ProgrammeouResponsabilitéessentielle_code_PROG'])
program_df_fr['program_fr'] = program_df_fr['ProgramInventory_name-Répertoiredesprogrammes_nom_PROG'].combine_first(program_df_fr['ProgramorCoreResponsibility_name-ProgrammeouResponsabilitéessentielle_nom_PROG'])

program_df_fr = program_df_fr.set_index(['fiscal_yr', 'org_id', 'program_id'])

# --- Combine languages ---
# Outer join on the shared (fiscal_yr, org_id, program_id) index so a program
# present in only one language is still kept. No merge indicator is requested
# because only the two name columns survive the next line.
program_df = pd.merge(program_df_en, program_df_fr, how='outer', right_index=True, left_index=True, suffixes=['_en', '_fr'])
program_df = program_df.loc[:, ['program_en', 'program_fr']].reset_index()

program_df = program_df.sort_values('fiscal_yr')

# Group by org_id and program_id and take the index label of the greatest
# fiscal_yr in each group.
# NOTE(review): groupby skips rows whose org_id or program_id is NaN (pandas
# default dropna=True), so programs whose department name did not match org_var
# are absent from the result — confirm this is intended.
latest_idx = program_df.groupby(['org_id', 'program_id'])['fiscal_yr'].idxmax()

# Use those labels to pull the corresponding rows.
program_df = program_df.loc[latest_idx, ['org_id', 'program_id', 'fiscal_yr', 'program_en', 'program_fr']]

program_df


Unnamed: 0,org_id,program_id,fiscal_yr,program_en,program_fr
9018,1.0,BWN01,2025-2026,Trade and Market Expansion,Croissance du commerce et des marchés
9019,1.0,BWN02,2025-2026,Sector Engagement and Development,Mobilisation et développement du secteur
9020,1.0,BWN03,2025-2026,Farm Products Council of Canada,Conseil des produits agricoles du Canada
9021,1.0,BWN04,2025-2026,Supply Management Initiatives,Initiatives de gestion de l'offre
9022,1.0,BWN05,2025-2026,Canadian Pari-Mutuel Agency,Agence canadienne du pari mutuel
...,...,...,...,...,...
10233,561.0,ISS00,2025-2026,Internal services,Services internes
10234,562.0,BYQ00,2025-2026,VIA HFR – VIA TGF Inc,VIA HFR – VIA TGF Inc
10235,563.0,BYR01,2025-2026,Freshwater Management,Gestion de l'eau douce
10236,563.0,BYR02,2025-2026,Freshwater Policy and Engagement,Politique et mobilisation de l'eau douce


In [None]:
# Build the bilingual program lookup: one row per (org_id, program_id) carrying
# the English and French program names from the latest fiscal year in which the
# pair appears. Depends on `org_var`, which must already be loaded.

# Source columns that carry the same header in the English and French CSVs.
PROGRAM_CODE_COL = 'ProgramInventory-Répertoiredesprogrammes_code_PROG'
CORE_RESP_CODE_COL = 'ProgramorCoreResponsibility-ProgrammeouResponsabilitéessentielle_code_PROG'
PROGRAM_NAME_COL = 'ProgramInventory_name-Répertoiredesprogrammes_nom_PROG'
CORE_RESP_NAME_COL = 'ProgramorCoreResponsibility_name-ProgrammeouResponsabilitéessentielle_nom_PROG'


def _load_program_frames(urls_by_fiscal_yr):
    """Read one program CSV per fiscal year.

    Args:
        urls_by_fiscal_yr: Mapping of fiscal-year label to CSV URL.

    Returns:
        A list of DataFrames, one per URL, each tagged with a ``filename``
        column (the URL's last path segment up to its first dot) and a
        ``fiscal_yr`` column.
    """
    frames = []
    for fiscal_yr, url in urls_by_fiscal_yr.items():
        frame = pd.read_csv(url)
        frame['filename'] = url.split('/')[-1].split('.')[0]
        frame['fiscal_yr'] = fiscal_yr
        frames.append(frame)
    return frames


def _index_program_names(frames, org_lookup, org_name_col, program_name_col):
    """Stack yearly frames, attach the org lookup, and resolve program code/name.

    Args:
        frames: DataFrames produced by ``_load_program_frames``.
        org_lookup: Table with an ``org_name_variant`` column to match against.
        org_name_col: Column in ``frames`` holding the department name.
        program_name_col: Name of the output column for the resolved program
            name (``program_en`` or ``program_fr``).

    Returns:
        The combined frame indexed by (fiscal_yr, org_id, program_id).
    """
    stacked = pd.concat(frames, ignore_index=True)

    # Left join keeps every program row even when the department name has no
    # match in the lookup.
    stacked = stacked.merge(
        org_lookup,
        how='left',
        left_on=org_name_col,
        right_on='org_name_variant'
    )

    # Prefer the program-inventory value; fall back to the
    # program-or-core-responsibility value where the former is null.
    stacked['program_id'] = stacked[PROGRAM_CODE_COL].combine_first(stacked[CORE_RESP_CODE_COL])
    stacked[program_name_col] = stacked[PROGRAM_NAME_COL].combine_first(stacked[CORE_RESP_NAME_COL])

    return stacked.set_index(['fiscal_yr', 'org_id', 'program_id'])


# Read the configuration once; both URL maps come from the same dict.
program_config = get_config()

frames_en = _load_program_frames(program_config['program_csv_urls_en'])
frames_fr = _load_program_frames(program_config['program_csv_urls_fr'])

# --- Process English and French Program Data ---

program_df_en = _index_program_names(
    frames_en,
    org_var,
    'EntityDept_name_Eng-EntitéMin_nom_ang',
    'program_en'
)

program_df_fr = _index_program_names(
    frames_fr,
    org_var,
    'EntityDept_name_fra-EntitéMin_nom_fra',
    'program_fr'
)

# --- Merge English and French Data ---

# Outer join on the shared index so a program present in only one language is
# still kept. No merge indicator is requested because only the two name columns
# are retained below.
program_df = pd.merge(
    program_df_en,
    program_df_fr,
    how='outer',
    left_index=True,
    right_index=True,
    suffixes=['_en', '_fr']
)

# Keep only necessary columns
program_df = program_df[['program_en', 'program_fr']].reset_index()

# --- Select Latest Fiscal Year per org_id–program_id ---

program_df = program_df.sort_values('fiscal_yr')

# NOTE(review): groupby skips rows whose org_id or program_id is NaN (pandas
# default dropna=True), so programs whose department name did not match org_var
# are absent from the result — confirm this is intended.
latest_idx = program_df.groupby(['org_id', 'program_id'])['fiscal_yr'].idxmax()
program_df = program_df.loc[latest_idx, ['org_id', 'program_id', 'fiscal_yr', 'program_en', 'program_fr']]

# Final DataFrame: one row per org_id–program_id with the most recent program names
program_df


Unnamed: 0,org_id,program_id,fiscal_yr,program_en,program_fr
9018,1.0,BWN01,2025-2026,Trade and Market Expansion,Croissance du commerce et des marchés
9019,1.0,BWN02,2025-2026,Sector Engagement and Development,Mobilisation et développement du secteur
9020,1.0,BWN03,2025-2026,Farm Products Council of Canada,Conseil des produits agricoles du Canada
9021,1.0,BWN04,2025-2026,Supply Management Initiatives,Initiatives de gestion de l'offre
9022,1.0,BWN05,2025-2026,Canadian Pari-Mutuel Agency,Agence canadienne du pari mutuel
...,...,...,...,...,...
10233,561.0,ISS00,2025-2026,Internal services,Services internes
10234,562.0,BYQ00,2025-2026,VIA HFR – VIA TGF Inc,VIA HFR – VIA TGF Inc
10235,563.0,BYR01,2025-2026,Freshwater Management,Gestion de l'eau douce
10236,563.0,BYR02,2025-2026,Freshwater Policy and Engagement,Politique et mobilisation de l'eau douce
