In [25]:
import pandas as pd
import numpy as np
import re, pytz, os, requests, sys
from pathlib import Path
from datetime import datetime
import sys
sys.path.append("/workspaces/service-data")

from src.clean import clean_percentage, clean_fiscal_yr, normalize_string, standardize_column_names
from src.load import load_csv
from src.export import export_to_csv
from src.merge import merge_si, merge_ss
from src.utils import dept_list, program_list
from main import get_config

# Working directory of the running kernel, plus the directory that contains it.
base_dir = Path(os.getcwd())
parent_dir = base_dir.parent

In [26]:
# Both release files are parsed with the same options:
#   - semicolon-delimited
#   - only empty fields are read as missing (pandas' default NA markers are disabled)
#   - the last line of each file is skipped; skipfooter is only supported by
#     the python parsing engine, hence engine='python'
release_csv_options = dict(
    keep_default_na=False,
    na_values='',
    delimiter=';',
    engine='python',
    skipfooter=1,
)

# Service inventory (si) and service standards (ss) from the latest GitHub release
si = pd.read_csv(
    "https://github.com/gcperformance/service-data/releases/latest/download/si.csv",
    **release_csv_options,
)
ss = pd.read_csv(
    "https://github.com/gcperformance/service-data/releases/latest/download/ss.csv",
    **release_csv_options,
)

# Project configuration, passed to the loaders and utilities in later cells
config = get_config()

In [27]:
# Supplementary reference files, each loaded with snapshot=False
rbpo, org_var, sid_registry = (
    load_csv(file_name, config, snapshot=False)
    for file_name in ('rbpo.csv', 'org_var.csv', 'sid_registry.csv')
)

# Department and program lists are produced by the shared utilities
dept = dept_list(config)
program = program_list(config)

# Current timestamp and calendar date in the America/Montreal timezone
timezone = pytz.timezone('America/Montreal')
current_datetime = pd.Timestamp.now(tz=timezone)
current_date = current_datetime.date()

# Columns that must hold integers, mapped to the dataframe that owns them
int_cols = {
    **dict.fromkeys(
        [
            'num_phone_enquiries',
            'num_applications_by_phone',
            'num_website_visits',
            'num_applications_online',
            'num_applications_by_mail',
            'num_applications_by_email',
            'num_applications_by_fax',
            'num_applications_by_other',
            'num_applications_total',
        ],
        si,
    ),
    **dict.fromkeys(['volume_meeting_target', 'total_volume'], ss),
}

# Anything that cannot be parsed as a number becomes 0 before the integer cast
for col_name, frame in int_cols.items():
    frame[col_name] = (
        pd.to_numeric(frame[col_name], errors = 'coerce')
        .fillna(0)
        .astype(int)
    )

# Numeric versions of the ids: drop the leading 'SRV' / 'STAN' prefix, then
# parse what is left (unparseable ids become NaN)
si['service_id_numeric'] = pd.to_numeric(
    si['service_id'].str.replace(r'^SRV', '', regex=True),
    errors = 'coerce',
)
ss['service_standard_id_numeric'] = pd.to_numeric(
    ss['service_standard_id'].str.replace(r'^STAN', '', regex=True),
    errors = 'coerce',
)

# Attach the org_id recorded in the SID registry for each service_id; it lands
# in 'org_id_sid_registry' and is missing where the registry has no match
si = pd.merge(
    si,
    sid_registry[['service_id', 'org_id']],
    on='service_id',
    how='left',
    suffixes=['', '_sid_registry'],
)

Exported dept.csv to /workspaces/service-data/outputs/utils
Exported program_list.csv to /workspaces/service-data/outputs/utils


In [28]:
# QA flags derived from the SID registry lookup ('org_id_sid_registry').

# Unregistered: the left merge found no registry row for this service_id,
# so the registry org_id is missing.
si['qa_unregistered_sid'] = si['org_id_sid_registry'].isna()

# Reused: the service_id is registered, but to a different organization than
# the one reporting it.
# Compare the two org ids as numbers: the registry column is float after the
# left merge (e.g. 539.0) while si['org_id'] is cast to str further down, so a
# raw `!=` would flag every registered row whenever this cell is re-run after
# that cast (or whenever the two columns simply differ in dtype).
reported_org_id = pd.to_numeric(si['org_id'], errors='coerce')
registered_org_id = pd.to_numeric(si['org_id_sid_registry'], errors='coerce')
si['qa_reused_sid'] = (reported_org_id != registered_org_id) & ~si['qa_unregistered_sid']

In [29]:
# QA: check every program_id reported in the service inventory against program_list.
# Columns added to si:
#   program_id_latest_valid_fy, qa_program_id_old
#       program ids whose latest valid fiscal year ends before the reported fiscal year
#   mismatched_program_ids, qa_program_id_wrong_org
#       program ids with no match in program_list for the service's org_id


def _join_sorted(values):
    """Join the non-null values, sorted as strings, using '<>' as the separator."""
    return '<>'.join(sorted(map(str, values.dropna())))


def _fy_ending_year(fiscal_yr):
    """Return the part after the '-' of 'YYYY-YYYY' strings as an int Series.

    Missing or unparseable values become 0. Coercion happens before fillna so
    that an unparseable value cannot reach astype(int) as NaN and raise.
    """
    ending = pd.to_numeric(fiscal_yr.str.split('-').str[1], errors = 'coerce')
    return ending.fillna(0).astype(int)


# org_id is half of the join key: make it a string on both sides.
# program org ids go through int first so they are written without decimals
# (astype(int) raises if any of them is not numeric).
si['org_id'] = si['org_id'].astype(str)
program['org_id'] = pd.to_numeric(program['org_id'], errors = 'coerce').astype(int).astype(str)

# Prepare a dataframe that splits service inventory into one-program-per-row: si_prog
# Exclude empty program ID rows, select relevant columns.
# .copy() makes si_prog independent of si, so the assignments below never
# write to (or warn about) a slice of si.
si_prog = si.loc[
    si['program_id'].notna(),
    ['fiscal_yr', 'service_id', 'program_id', 'org_id']].copy()

# Split and explode program_id to handle multiple program_id entries per cell,
# then trim whitespace so "A, B" yields "B" rather than " B" (which could
# never match program_list and would be reported as a wrong-org program id)
si_prog['program_id'] = si_prog['program_id'].str.split(',')
si_prog = si_prog.explode('program_id')
si_prog['program_id'] = si_prog['program_id'].str.strip()

# Join si_prog with program_list on program_id and org_id; '_merge' records
# whether each (program_id, org_id) pair was found in program_list
si_prog = si_prog.merge(program, on=['program_id', 'org_id'], how='left', suffixes=('_si', '_prog'), indicator=True)

# Both QA results are aggregated back to one row per reported service
service_keys = ['fiscal_yr', 'service_id', 'org_id']

# qa check: program id belongs to different department
# ('left_only' = no program_list row for this program_id under this org_id)
si_prog_wrong_org = (
    si_prog[si_prog['_merge'] == 'left_only']
    .groupby(service_keys, as_index=False)
    .agg({'program_id': _join_sorted})
    .rename(columns={'program_id': 'mismatched_program_ids'})
)

# qa check: program id is old/expired
si_prog['latest_valid_fy_ending_in'] = _fy_ending_year(si_prog['latest_valid_fy'])
si_prog['reported_fy_ending_in'] = _fy_ending_year(si_prog['fiscal_yr'])
si_prog['program_id_latest_valid_fy'] = si_prog['program_id']+': '+si_prog['latest_valid_fy']

is_expired = (
    (si_prog['latest_valid_fy_ending_in'] < si_prog['reported_fy_ending_in'])
    & (si_prog['_merge'] =='both')
)
si_prog_old = (
    si_prog[is_expired]
    .groupby(service_keys, as_index=False)
    .agg({'program_id_latest_valid_fy': _join_sorted})
)

# Merge into si.
# Drop the result columns first if they are already there, so re-running this
# cell does not produce '_x'/'_y' duplicates (no-op on a first run).
si = si.drop(
    columns=['program_id_latest_valid_fy', 'qa_program_id_old',
             'mismatched_program_ids', 'qa_program_id_wrong_org'],
    errors='ignore')

si = pd.merge(si, si_prog_old, on=service_keys, how='left')
si['qa_program_id_old'] = si['program_id_latest_valid_fy'].notna()

si = pd.merge(si, si_prog_wrong_org, on=service_keys, how='left')
si['qa_program_id_wrong_org'] = si['mismatched_program_ids'].notna()




In [30]:
# Show the service inventory, now carrying the numeric id, SID registry and
# program id QA columns added by the cells above.
si

Unnamed: 0,fiscal_yr,org_id,service_id,service_name_en,service_name_fr,service_description_en,service_description_fr,service_type,service_recipient_type,service_scope,...,automated_decision_system,fy_org_id_service_id,service_id_numeric,org_id_sid_registry,qa_unregistered_sid,qa_reused_sid,program_id_latest_valid_fy,qa_program_id_old,mismatched_program_ids,qa_program_id_wrong_org
0,2022-2023,539,712,Registry Services,Services de greffe,This service works closely with tribunal chair...,Ce service travaille en étroite collaboration ...,"APIR, INFO",CLIENT,EXTERN,...,,2022-2023_539_712,712,539.0,False,False,,False,,False
1,2022-2023,539,2018,General Inquiries,Demandes générales,The ATSSC hotline is a voice message inbox tha...,La messagerie vocale du SCDATA est accessible ...,INFO,SOCIETY,EXTERN,...,,2022-2023_539_2018,2018,539.0,False,False,,False,,False
2,2022-2023,539,SRV03188,ATSSC Access to Information and Privacy,Accès à l’information et protection des rensei...,The Access to Information Act gives Canadian c...,La Loi sur l’accès à l’information accorde aux...,INFO,SOCIETY,EXTERN,...,,2022-2023_539_SRV03188,3188,539.0,False,False,,False,,False
3,2022-2023,1,125,Domestic Statistics and Market Information Web,Statistiques canadiennes et site Web d'informa...,Provides information on market structure and p...,Offre des renseignements sur la structure et l...,INFO,CLIENT,EXTERN,...,,2022-2023_1_125,125,1.0,False,False,,False,,False
4,2022-2023,1,127,Canadian Soil Information Services (CanSIS),Système d'information sur les sols du Canada (...,CanSIS works in partnership with the provincia...,SISCan collabore avec des groupes provinciaux ...,INFO,CLIENT,EXTERN,...,,2022-2023_1_127,127,1.0,False,False,,False,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8937,2023-2024,139,1273,War Veterans Allowance,Allocation aux anciens combattants,The War Veterans Allowance (WVA) is in recogni...,L’allocation aux anciens combattants (AAC) est...,RES,CLIENT,EXTERN,...,N,2023-2024_139_1273,1273,139.0,False,False,,False,,False
8938,2023-2024,333,53,Review and Appeal hearings,Audiences de révision et d'appel,The independent avenue of appeal for disabilit...,Voie d’appel indépendante à l’égard des décisi...,RES,CLIENT,EXTERN,...,N,2023-2024_333_53,53,333.0,False,False,,False,,False
8939,2023-2024,246,2104,Women's Program,Programme de promotion de la femme,The purpose of the Women’s Program is to advan...,Le Programme de promotion de la femme vise à f...,GNC,SOCIETY,EXTERN,...,N,2023-2024_246_2104,2104,246.0,False,False,,False,,False
8940,2023-2024,246,2105,Gender-Based Violence Program,Programme de financement de la lutte contre la...,The Gender-Based Violence (GBV) Program takes ...,Les mesures du Programme de financement de la ...,GNC,SOCIETY,EXTERN,...,N,2023-2024_246_2105,2105,246.0,False,False,,False,,False
