In [1]:
import pandas as pd
import numpy as np
import re, pytz, os, requests, sys
from pathlib import Path
from datetime import datetime
import sys
sys.path.append("/workspaces/service-data")

from src.clean import clean_percentage, clean_fiscal_yr, normalize_string, standardize_column_names
from src.load import load_csv
from src.export import export_to_csv
from src.merge import merge_si, merge_ss

base_dir = Path.cwd()
parent_dir = base_dir.parent

In [2]:
si = pd.read_csv("https://github.com/gcperformance/service-data/releases/latest/download/si.csv", 
                 keep_default_na=False, 
                 na_values='', 
                 delimiter=';',
                 engine='python',
                 skipfooter=2
                 )

snap_si = pd.read_csv("https://github.com/gcperformance/service-data/releases/latest/download/2025-03-01_si.csv", 
                 keep_default_na=False, 
                 na_values='', 
                 delimiter=';',
                 engine='python',
                 skipfooter=2
                 )

ss = pd.read_csv("https://github.com/gcperformance/service-data/releases/latest/download/ss.csv", 
                 keep_default_na=False, 
                 na_values='', 
                 delimiter=';',
                 engine='python',
                 skipfooter=2
                 )

snap_ss = pd.read_csv("https://github.com/gcperformance/service-data/releases/latest/download/2025-03-01_ss.csv", 
                 keep_default_na=False, 
                 na_values='', 
                 delimiter=';',
                 engine='python',
                 skipfooter=2
                 )



In [3]:
import pandas as pd

def compare_simple(df_base, df_comp, key_name):
    """
    Compare two DataFrames: return differences and unmatched records.
    
    Args:
        df_base (pd.DataFrame): Baseline DataFrame to compare from.
        df_comp (pd.DataFrame): DataFrame to compare against the baseline.
        key_name (str): Column used as a unique identifier. Must be present in both DataFrames

    Returns:
        pd.DataFrame: Long-format DataFrame with differing fields and unmatched records.
    """
    df_base = df_base.reset_index(drop=True)
    df_comp = df_comp.reset_index(drop=True)

    # Prepare series of keys for comparison
    base_keys = df_base[key_name]
    comp_keys = df_comp[key_name]

    # Set the key as index for easier row-wise comparison
    df_base.set_index(key_name, inplace=True)
    df_comp.set_index(key_name, inplace=True)

    # Identify which keys are common or unique between the two dataframes
    match_keys = pd.merge(base_keys, comp_keys, how='outer', indicator=True)
    common_keys = match_keys[match_keys['_merge'] == 'both'].iloc[:, 0]
    keys_only_in_base = match_keys[match_keys['_merge'] == 'left_only'].iloc[:, 0]
    keys_only_in_comp = match_keys[match_keys['_merge'] == 'right_only'].iloc[:, 0]

    # Identify which columns are common or unique between the two dataframes
    common_cols = list(set(df_base.columns) & set(df_comp.columns))
    cols_only_in_base = list(set(df_base.columns) - set(df_comp.columns))
    cols_only_in_comp = list(set(df_comp.columns) - set(df_base.columns))

    # Filter both DataFrames to only those with common keys
    df_base_common = df_base.loc[common_keys, common_cols]
    df_comp_common = df_comp.loc[common_keys, common_cols]

    # Set the name for the column that stores field names during the reshaping process
    var_name = 'field'

    # Compare values field-by-field, row-by-row after filling NaNs with 0
    diff_wide = df_base_common.fillna(0).ne(df_comp_common.fillna(0)).reset_index()
    diff = diff_wide.melt(
        id_vars=[key_name],
        var_name=var_name,
        value_vars=diff_wide.columns.drop(key_name)
    )
    # Filter for where values are different
    diff = diff[diff['value']].set_index([key_name, var_name])

    # Prepare long-format versions of base and comp DataFrames for comparison
    df_base_long = df_base_common.reset_index().melt(
        id_vars=[key_name],
        var_name=var_name,
        value_vars=df_base_common.columns
    ).set_index([key_name, var_name])

    df_comp_long = df_comp_common.reset_index().melt(
        id_vars=[key_name],
        var_name=var_name,
        value_vars=df_comp_common.columns
    ).set_index([key_name, var_name])

    # Merge differences with their respective base and comp values
    compare_result = diff.join(df_base_long, rsuffix='_base').join(df_comp_long, rsuffix='_comp')
    compare_result.drop(columns=['value'], inplace=True)  # Drop diff indicator
    compare_result.reset_index(inplace=True)

    # Add records that are only in one of the datasets
    records_only_in_base = pd.DataFrame({
        key_name: keys_only_in_base,
        var_name: 'record only in base',
        'value_base': keys_only_in_base
    })

    records_only_in_comp = pd.DataFrame({
        key_name: keys_only_in_comp,
        var_name: 'record only in comp',
        'value_comp': keys_only_in_comp
    })

    fields_only_in_base = pd.DataFrame({
        key_name: cols_only_in_base,
        var_name: 'field only in base',
        'value_base': cols_only_in_base
    })

    fields_only_in_comp = pd.DataFrame({
        key_name: cols_only_in_comp,
        var_name: 'field only in comp',
        'value_comp': cols_only_in_comp
    })

    # Concatenate all results into a single DataFrame
    compare_result = pd.concat([
        compare_result, 
        records_only_in_base, 
        records_only_in_comp,
        fields_only_in_base,
        fields_only_in_comp
        ], ignore_index=True)

    return compare_result


In [4]:
def compare(compare_dict):
    """
    Compare two DataFrames: return differences and unmatched records.
    
    Args:
        df_base (pd.DataFrame): Baseline DataFrame to compare from.
        df_comp (pd.DataFrame): DataFrame to compare against the baseline.
        key_name (str): Column used as a unique identifier. Must be present in both DataFrames

    Returns:
        pd.DataFrame: Long-format DataFrame with differing fields and unmatched records.
    """
    
    df_base = compare_dict['df_base']
    df_comp = compare_dict['df_comp']
    key_name = compare_dict['key_name']
    
    df_base = df_base.reset_index(drop=True)
    df_comp = df_comp.reset_index(drop=True)

    # Prepare series of keys for comparison
    base_keys = df_base[key_name]
    comp_keys = df_comp[key_name]

    # Set the key as index for easier row-wise comparison
    df_base.set_index(key_name, inplace=True)
    df_comp.set_index(key_name, inplace=True)

    # Identify which keys are common or unique between the two dataframes
    match_keys = pd.merge(base_keys, comp_keys, how='outer', indicator=True)
    common_keys = match_keys[match_keys['_merge'] == 'both'].iloc[:, 0]
    keys_only_in_base = match_keys[match_keys['_merge'] == 'left_only'].iloc[:, 0]
    keys_only_in_comp = match_keys[match_keys['_merge'] == 'right_only'].iloc[:, 0]

    # Identify which columns are common or unique between the two dataframes
    common_cols = list(set(df_base.columns) & set(df_comp.columns))
    cols_only_in_base = list(set(df_base.columns) - set(df_comp.columns))
    cols_only_in_comp = list(set(df_comp.columns) - set(df_base.columns))

    # Filter both DataFrames to only those with common keys
    df_base_common = df_base.loc[common_keys, common_cols]
    df_comp_common = df_comp.loc[common_keys, common_cols]

    # Set the name for the column that stores field names during the reshaping process
    var_name = 'field'

    # Compare values field-by-field, row-by-row after filling NaNs with 0
    diff_wide = df_base_common.fillna(0).ne(df_comp_common.fillna(0)).reset_index()
    diff = diff_wide.melt(
        id_vars=[key_name],
        var_name=var_name,
        value_vars=diff_wide.columns.drop(key_name)
    )
    # Filter for where values are different
    diff = diff[diff['value']].set_index([key_name, var_name])

    # Prepare long-format versions of base and comp DataFrames for comparison
    df_base_long = df_base_common.reset_index().melt(
        id_vars=[key_name],
        var_name=var_name,
        value_vars=df_base_common.columns
    ).set_index([key_name, var_name])

    df_comp_long = df_comp_common.reset_index().melt(
        id_vars=[key_name],
        var_name=var_name,
        value_vars=df_comp_common.columns
    ).set_index([key_name, var_name])

    # Merge differences with their respective base and comp values
    compare_result = diff.join(df_base_long, rsuffix='_base').join(df_comp_long, rsuffix='_comp')
    compare_result.drop(columns=['value'], inplace=True)  # Drop diff indicator
    compare_result.reset_index(inplace=True)

    # Add records that are only in one of the datasets
    records_only_in_base = pd.DataFrame({
        key_name: keys_only_in_base,
        var_name: 'record only in base',
        'value_base': keys_only_in_base
    })

    records_only_in_comp = pd.DataFrame({
        key_name: keys_only_in_comp,
        var_name: 'record only in comp',
        'value_comp': keys_only_in_comp
    })

    fields_only_in_base = pd.DataFrame({
        key_name: cols_only_in_base,
        var_name: 'field only in base',
        'value_base': cols_only_in_base
    })

    fields_only_in_comp = pd.DataFrame({
        key_name: cols_only_in_comp,
        var_name: 'field only in comp',
        'value_comp': cols_only_in_comp
    })

    # Concatenate all results into a single DataFrame
    compare_result = pd.concat([
        compare_result, 
        records_only_in_base, 
        records_only_in_comp,
        fields_only_in_base,
        fields_only_in_comp
        ], ignore_index=True)
    
    # Prepare file name
    base_name = compare_dict['base_name']
    comp_name = compare_dict['comp_name']

    filename = f"compare_{base_name}_vs_{comp_name}"
    print(filename)

    # return compare_result

In [6]:
ss.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12708 entries, 0 to 12707
Data columns (total 29 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   fiscal_yr                   12708 non-null  object 
 1   service_id                  12708 non-null  object 
 2   service_name_en             12708 non-null  object 
 3   service_name_fr             12708 non-null  object 
 4   service_standard_id         12708 non-null  object 
 5   service_standard_en         12700 non-null  object 
 6   service_standard_fr         12640 non-null  object 
 7   type                        12708 non-null  object 
 8   gcss_tool_fiscal_yr         5709 non-null   object 
 9   channel                     12708 non-null  object 
 10  channel_comments_en         4567 non-null   object 
 11  channel_comments_fr         4554 non-null   object 
 12  target_type                 10377 non-null  object 
 13  target                      124

In [None]:
# table1 = pd.read_csv('../table1.csv')
# table2 = pd.read_csv('../table2.csv')

df_base = ss
df_comp = snap_ss
key_name = 'fy_org_id_service_id_service_standard'


comparison_table = compare_simple(df_base, df_comp, key_name)
comparison_table




Unnamed: 0,fy_org_id_service_id,field,value_base,value_comp
0,2023-2024_223_1728,record only in comp,,2023-2024_223_1728


In [None]:
compare_dict = {
    'df_base': si,
    'base_name': 'si',
    'df_comp': snap_si,
    'comp_name': 'snap_si',
    'key_name':'fy_org_id_service_id'
}

comparison_table = compare(compare_dict)
comparison_table

compare_si_vs_snap_si


In [None]:
compare_dict = {
    'df_base': si,
    'base_name': 'si',
    'df_comp': snap_si,
    'comp_name': 'snap_si',
    'key_name':'fy_org_id_service_id'
}

comparison_table = compare(compare_dict)
comparison_table

In [10]:
snap_si = pd.read_csv(
    Path('../outputs/si.csv'),
    keep_default_na=False, 
    na_values='',
    delimiter=';'
)

snap_si

Unnamed: 0,fiscal_yr,service_id,service_name_en,service_name_fr,service_description_en,service_description_fr,service_type,service_recipient_type,service_scope,client_target_groups,...,num_applications_total,org_name_variant,org_id,department_en,department_fr,program_id,automated_decision_system,automated_decision_system_description_fr,automated_decision_system_description_en,fy_org_id_service_id
0,2018-2019,1000,Reconciliation,Réconciliation,Advancing the whole of Government reconciliati...,Promouvoir une approche pangouvernementale rel...,INFO,CLIENT,"ENTERPRISE, EXTERN",NGO,...,0.0,Crown-Indigenous Relations and Northern Affair...,129.0,Crown-Indigenous Relations and Northern Affair...,Relations Couronne-Autochtones et Affaires du ...,BWM06,,,,2018-2019_129_1000
1,2022-2023,1001,Old Age Security (OAS) Benefits,Prestations de la Sécurité de la vieillesse,The Old Age Security (OAS) pension is a monthl...,La pension de la Sécurité de la vieillesse (SV...,RES,CLIENT,EXTERN,PERSON,...,3286418.0,Employment and Social Development Canada,128.0,Employment and Social Development Canada,Emploi et Développement social Canada,BGN01,,,,2022-2023_128_1001
2,2018-2019,1001,Old Age Security (OAS) Benefits,Prestations de la Sécurité de la vieillesse,The Old Age Security (OAS) pension is a monthl...,La pension de la Sécurité de la vieillesse (SV...,RES,CLIENT,EXTERN,PERSON,...,2948447.0,Employment and Social Development Canada,128.0,Employment and Social Development Canada,Emploi et Développement social Canada,BGN01,,,,2018-2019_128_1001
3,2019-2020,1001,Old Age Security (OAS) Benefits,Prestations de la Sécurité de la vieillesse,The Old Age Security (OAS) pension is a monthl...,La pension de la Sécurité de la vieillesse (SV...,RES,CLIENT,EXTERN,PERSON,...,3154536.0,Employment and Social Development Canada,128.0,Employment and Social Development Canada,Emploi et Développement social Canada,BGN01,,,,2019-2020_128_1001
4,2020-2021,1001,Old Age Security (OAS) Benefits,Prestations de la Sécurité de la vieillesse,The Old Age Security (OAS) pension is a monthl...,La pension de la Sécurité de la vieillesse (SV...,RES,CLIENT,EXTERN,PERSON,...,3217790.0,Employment and Social Development Canada,128.0,Employment and Social Development Canada,Emploi et Développement social Canada,BGN01,,,,2020-2021_128_1001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8929,2023-2024,53,Review and Appeal hearings,Audiences de révision et d'appel,The independent avenue of appeal for disabilit...,Voie d’appel indépendante à l’égard des décisi...,RES,CLIENT,EXTERN,PERSON,...,0.0,vrab-tacra,333.0,Veterans Review and Appeal Board,Tribunal des anciens combattants (révision et ...,BWL01,N,,,2023-2024_333_53
8930,2023-2024,2104,Women's Program,Programme de promotion de la femme,The purpose of the Women’s Program is to advan...,Le Programme de promotion de la femme vise à f...,GNC,SOCIETY,EXTERN,"NGO, PTC",...,772.0,wage,246.0,Women and Gender Equality Canada,Femmes et Égalité des genres Canada,BXR02,N,,,2023-2024_246_2104
8931,2023-2024,2105,Gender-Based Violence Program,Programme de financement de la lutte contre la...,The Gender-Based Violence (GBV) Program takes ...,Les mesures du Programme de financement de la ...,GNC,SOCIETY,EXTERN,"NGO, PTC",...,1.0,wage,246.0,Women and Gender Equality Canada,Femmes et Égalité des genres Canada,BXR02,N,,,2023-2024_246_2105
8932,2023-2024,2106,"Equality for Sex, Sexual Orientation, Gender I...","Programme de promotion de l'égalité des sexes,...","The objective of the Equality for Sex, Sexual ...",L’objectif du Programme de promotion de l’égal...,GNC,SOCIETY,EXTERN,"NGO, PTC",...,234.0,wage,246.0,Women and Gender Equality Canada,Femmes et Égalité des genres Canada,BXR02,N,,,2023-2024_246_2106
