In [10]:
import pandas as pd
import numpy as np
import re, pytz, os, requests, sys
from pathlib import Path
from datetime import datetime
import sys
# Extend the module search path so the `src.*` imports below can be resolved.
# NOTE(review): this is a hard-coded absolute path -- presumably the repository
# checkout location in the dev environment; confirm, or derive it from Path.cwd().
sys.path.append("/workspaces/service-data")

from src.clean import clean_percentage, clean_fiscal_yr, normalize_string, standardize_column_names
from src.load import load_csv
from src.export import export_to_csv
from src.merge import merge_si, merge_ss

# Current working directory of the kernel at the time this cell runs
base_dir = Path.cwd()
# Directory one level above the working directory
parent_dir = base_dir.parent

In [11]:
# si = pd.read_csv("https://github.com/gcperformance/service-data/releases/latest/download/si.csv", 
#                  keep_default_na=False, 
#                  na_values='', 
#                  delimiter=';',
#                  engine='python',
#                  skipfooter=2
#                  )

# snap_si = pd.read_csv("https://github.com/gcperformance/service-data/releases/latest/download/2025-03-01_si.csv", 
#                  keep_default_na=False, 
#                  na_values='', 
#                  delimiter=';',
#                  engine='python',
#                  skipfooter=2
#                  )



In [None]:
import pandas as pd

def compare(df_base, df_comp, key_name):
  """
  Compare two DataFrames: return differences and unmatched records.

  Rows are matched on ``key_name``. For every key present in both frames, each
  field whose value differs is reported as one row of the result. Keys present
  in only one frame are reported as a single 'record only in base' or
  'record only in comp' row, with the key repeated in the value column.

  Missing values are compared explicitly: two missing values (NaN/None) in the
  same field count as equal, while a missing value against any real value
  (including 0) counts as a difference. A column that exists in only one of the
  frames is reported for every common key.

  The input DataFrames are not modified. Keys are expected to be unique within
  each frame; duplicates are not checked.

  Args:
      df_base (pd.DataFrame): Baseline DataFrame to compare from.
      df_comp (pd.DataFrame): DataFrame to compare against the baseline.
      key_name (str): Column used as a unique identifier. Must be present in both DataFrames.

  Returns:
      pd.DataFrame | None: Long-format DataFrame with the columns
      [key_name, 'field', 'value_base', 'value_comp'] holding differing fields
      and unmatched records. Returns None, after printing a message, when
      ``key_name`` is not a column of both DataFrames.
  """
  try:
    # Work on re-indexed copies so the caller's frames are never touched
    df_base = df_base.reset_index(drop=True)
    df_comp = df_comp.reset_index(drop=True)

    # Grab the key columns while they are still regular columns
    base_keys = df_base[key_name]
    comp_keys = df_comp[key_name]

  except KeyError as err:
    print(f"{type(err)} {key_name} is not a valid column present")
    return None

  # Index both frames by the key for label-based row selection
  base_by_key = df_base.set_index(key_name)
  comp_by_key = df_comp.set_index(key_name)

  # Classify every key as common to both frames or unique to one of them
  match_keys = pd.merge(base_keys, comp_keys, how='outer', indicator=True)
  common_keys = match_keys[match_keys['_merge'] == 'both'].iloc[:, 0]
  keys_only_in_base = match_keys[match_keys['_merge'] == 'left_only'].iloc[:, 0]
  keys_only_in_comp = match_keys[match_keys['_merge'] == 'right_only'].iloc[:, 0]

  # Restrict both frames to the common keys, in the same row order
  base_common = base_by_key.loc[common_keys]
  comp_common = comp_by_key.loc[common_keys]

  # Name of the column that stores field names in the long-format frames
  var_name = 'field'

  # Cell-by-cell inequality. `ne` aligns on the union of the columns, so a
  # column missing from one frame is flagged for every row.
  differs = base_common.ne(comp_common)

  # NaN != NaN evaluates to True, so clear the flag where BOTH sides are
  # missing. Only columns present in both frames are touched; filling NaNs
  # with 0 instead would hide genuine NaN <-> 0 changes.
  shared_cols = base_common.columns.intersection(comp_common.columns)
  if len(shared_cols) > 0:
    both_missing = base_common[shared_cols].isna() & comp_common[shared_cols].isna()
    differs[shared_cols] = differs[shared_cols] & ~both_missing

  # Reshape the boolean grid to long format and keep only the differing cells
  diff_wide = differs.reset_index()
  flagged = diff_wide.melt(
      id_vars=[key_name],
      var_name=var_name,
      value_vars=diff_wide.columns.drop(key_name)
  )
  changed = flagged[flagged['value'].astype(bool)].set_index([key_name, var_name])

  # Long-format versions of the actual values, indexed by (key, field)
  base_long = base_common.reset_index().melt(
      id_vars=[key_name],
      var_name=var_name,
      value_vars=base_common.columns,
      value_name='value_base'
  ).set_index([key_name, var_name])

  comp_long = comp_common.reset_index().melt(
      id_vars=[key_name],
      var_name=var_name,
      value_vars=comp_common.columns,
      value_name='value_comp'
  ).set_index([key_name, var_name])

  # Attach the base and comp values to each differing (key, field) pair,
  # then drop the boolean indicator column
  compare_result = (
      changed
      .join(base_long)
      .join(comp_long)
      .drop(columns=['value'])
      .reset_index()
  )

  # Add records that are only in one of the datasets
  records_only_in_base = pd.DataFrame({
      key_name: keys_only_in_base,
      var_name: 'record only in base',
      'value_base': keys_only_in_base
  })

  records_only_in_comp = pd.DataFrame({
      key_name: keys_only_in_comp,
      var_name: 'record only in comp',
      'value_comp': keys_only_in_comp
  })

  # Concatenate all results into a single DataFrame
  return pd.concat(
      [compare_result, records_only_in_base, records_only_in_comp],
      ignore_index=True
  )


In [13]:
# Load the two CSV tables from one directory above the current working directory
table1 = pd.read_csv('../table1.csv')
table2 = pd.read_csv('../table2.csv')

# Compare them record-by-record, matching rows on the 'index' column
comparison_table = compare(table1, table2, 'index')
# Last expression of the cell: renders the comparison result
comparison_table

Unnamed: 0,index,field,value_base,value_comp
0,c,col_a,0.930153,50.0
1,d,col_a,0.581606,50.0
2,e,col_a,0.98462,50.0
3,e,col_b,0.775072,0.0
4,a,col_c,0.150425,
5,c,col_c,0.09261,
6,d,col_c,0.242782,
7,e,col_c,0.335649,
8,b,record only in base,b,
9,f,record only in base,f,


In [22]:
# Step-by-step version of the DataFrame comparison, kept as a flat script so
# every intermediate frame can be inspected from the notebook.
table1 = pd.read_csv('../table1.csv')
table2 = pd.read_csv('../table2.csv')

key = 'index'

# Build the working frames with non-in-place calls so the freshly loaded
# table1 / table2 are left exactly as read from disk.
df_base = table1.reset_index(drop=True)
df_comp = table2.reset_index(drop=True)

# Grab the key columns while they are still regular columns
base_keys = df_base[key]
comp_keys = df_comp[key]

# Index both frames by the key for label-based row selection
df_base = df_base.set_index(key)
df_comp = df_comp.set_index(key)

# Classify every key as common to both frames or unique to one of them
match_keys = pd.merge(base_keys, comp_keys, how='outer', indicator=True)

common_keys = match_keys[match_keys['_merge']=='both'].iloc[:, 0]
keys_only_in_base = match_keys[match_keys['_merge']=='left_only'].iloc[:, 0]
keys_only_in_comp = match_keys[match_keys['_merge']=='right_only'].iloc[:, 0]

# Restrict both frames to the common keys, in the same row order
df_base_common = df_base.loc[common_keys]
df_comp_common = df_comp.loc[common_keys]

# Check the overlap between columns
common_cols = set(df_base.columns) & set(df_comp.columns)
cols_only_in_base = set(df_base.columns) - set(df_comp.columns)
cols_only_in_comp = set(df_comp.columns) - set(df_base.columns)

# print(common_cols)
# print(cols_only_in_base)
# print(cols_only_in_comp)

var_name = 'field'

# ne = not equal, inverse of eq. It aligns on the union of the columns, so a
# column missing from one frame is flagged for every row.
diff_wide = df_base_common.ne(df_comp_common)

# NaN != NaN evaluates to True, so clear the flag where BOTH sides are missing.
# Only columns present in both frames are touched; filling NaNs with 0 instead
# would hide genuine NaN <-> 0 changes.
shared_cols = [col for col in diff_wide.columns if col in common_cols]
if shared_cols:
    both_missing = df_base_common[shared_cols].isna() & df_comp_common[shared_cols].isna()
    diff_wide[shared_cols] = diff_wide[shared_cols] & ~both_missing

diff_wide = diff_wide.reset_index()

# Reshape the boolean grid to long format and keep only the differing cells
diff = diff_wide.melt(
    value_vars = diff_wide.columns.drop(key), 
    var_name=var_name, 
    id_vars = key
)
diff = diff[diff['value'].astype(bool)].set_index([key, var_name])

# Long-format versions of the actual values, indexed by (key, field)
df_base_long = df_base_common.reset_index().melt(
    value_vars = df_base_common.columns, 
    var_name = var_name, 
    id_vars=[key]
).set_index([key, var_name])

df_comp_long = df_comp_common.reset_index().melt(
    value_vars = df_comp_common.columns, 
    var_name = var_name, 
    id_vars=[key]
).set_index([key, var_name])

# Attach base/comp values to each differing (key, field) pair; the suffixes
# turn the two joined 'value' columns into 'value_base' and 'value_comp'.
compare_result = diff.join(df_base_long, rsuffix='_base').join(df_comp_long, rsuffix='_comp')
# Drop the boolean diff indicator and bring key/field back as columns
compare_result = compare_result.drop(columns=['value']).reset_index()

# Records whose key exists in only one of the two tables
records_only_in_base = pd.DataFrame({
    key: keys_only_in_base,
    var_name: 'record only in base',
    'value_base': keys_only_in_base
})

records_only_in_comp = pd.DataFrame({
    key: keys_only_in_comp,
    var_name: 'record only in comp',
    'value_comp': keys_only_in_comp
})

compare_result = pd.concat([compare_result, records_only_in_base, records_only_in_comp], ignore_index=True)
compare_result




Unnamed: 0,index,field,value_base,value_comp
0,c,col_a,0.930153,50.0
1,d,col_a,0.581606,50.0
2,e,col_a,0.98462,50.0
3,e,col_b,0.775072,0.0
4,a,col_c,0.150425,
5,c,col_c,0.09261,
6,d,col_c,0.242782,
7,e,col_c,0.335649,
8,b,record only in base,b,
9,f,record only in base,f,


dang
