In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell execution so edits to the imported package are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from IPython.display import display

from carbon_bombs.conf import REPO_PATH
from carbon_bombs.conf import FPATH_OUT_BANK
from carbon_bombs.conf import FPATH_OUT_CB
from carbon_bombs.conf import FPATH_OUT_COMP
from carbon_bombs.conf import FPATH_OUT_CONX_BANK_COMP
from carbon_bombs.conf import FPATH_OUT_CONX_CB_COMP
from carbon_bombs.conf import FPATH_OUT_COUNTRY


def _join_key_columns(df, key_cols):
    """Return a Series holding, for each row, the key columns joined by " - ".

    Each key column is converted to text on its own, so the resulting string
    does not depend on the dtypes of the other columns of ``df``.
    """
    # ``astype(object)`` keeps the string concatenation valid even when the
    # frame is empty and ``map`` could not infer a text dtype.
    parts = [df[k].map(str).astype(object) for k in key_cols]
    joined = parts[0]
    for part in parts[1:]:
        joined = joined + " - " + part
    return joined


def compare_dataframes(df1, df2, key_col):
    """Compare two DataFrames column by column and report every differing value.

    ``df1`` is treated as the *new* data and ``df2`` as the *old* data. Rows
    are matched through a textual key built from ``key_col``; a short
    ``OK`` / ``KO`` report is printed for every column of ``df2``.

    Neither input DataFrame is modified: the helper ``"key_col"`` column is
    added to internal copies only.

    Parameters
    ----------
    df1 : pandas.DataFrame
        New version of the data.
    df2 : pandas.DataFrame
        Old (reference) version of the data.
    key_col : str or list of str
        Column name(s) identifying a row. When a single name is given, rows
        whose key is missing or equal to the string ``"None"`` are dropped
        from both frames first. No such filtering is done for a list.

    Returns
    -------
    pandas.DataFrame
        One row per differing value, with the columns ``new`` and ``old``
        (the two values), ``column`` (the compared column) and ``key_col``
        (the joined key, empty when the compared column is itself a key
        column). The key columns, or an ``index`` column for key-column
        comparisons, are also present as produced by ``reset_index``.

    Notes
    -----
    ``Series.compare`` requires identically labelled inputs, so the call is
    expected to fail if, after restricting to common keys, the two frames
    still do not hold the same keys the same number of times (for example
    differing duplicates) -- TODO confirm this is the desired behaviour.
    """
    if isinstance(key_col, list):
        key_cols = key_col
    else:
        # Single key: ignore rows without a usable key value.
        df1 = df1.loc[df1[key_col].notna() & (df1[key_col] != "None")]
        df2 = df2.loc[df2[key_col].notna() & (df2[key_col] != "None")]
        key_cols = [key_col]

    # ``assign`` returns a new frame, so the caller's data is left untouched
    # and no SettingWithCopyWarning is raised on the filtered slices.
    df1 = df1.assign(key_col=_join_key_columns(df1, key_cols))
    df2 = df2.assign(key_col=_join_key_columns(df2, key_cols))

    # Sorting both frames by key aligns them row by row.
    df1 = df1.sort_values(by="key_col").reset_index(drop=True).fillna("None")
    df2 = df2.sort_values(by="key_col").reset_index(drop=True).fillna("None")

    added_columns = set(df1.columns) - set(df2.columns)
    if added_columns:
        print("New df has new columns:", added_columns)

    missing_columns = set(df2.columns) - set(df1.columns)
    if missing_columns:
        print("New df misses columns:", missing_columns)

    uniq_keys1 = set(df1["key_col"])
    uniq_keys2 = set(df2["key_col"])

    if uniq_keys1 != uniq_keys2:
        new_keys = uniq_keys1 - uniq_keys2
        if new_keys:
            print("Found new keys:", new_keys)
        missing_keys = uniq_keys2 - uniq_keys1
        if missing_keys:
            print("Missing keys:", missing_keys)

        # Only rows present on both sides can be compared value by value.
        common_keys = uniq_keys1 & uniq_keys2
        df1 = df1.loc[df1["key_col"].isin(common_keys)].drop_duplicates().reset_index(drop=True)
        df2 = df2.loc[df2["key_col"].isin(common_keys)].drop_duplicates().reset_index(drop=True)

    # Key-indexed views are the same for every non-key column: build them once.
    df1_by_key = df1.set_index(key_cols)
    df2_by_key = df2.set_index(key_cols)

    full_comp = []

    for col in df2.columns:
        if col == "key_col":
            continue
        if col not in df1:
            print(f"NF // {col}  // not in new_df")
            continue

        if col in key_cols:
            # Key columns are compared positionally on the sorted frames.
            new_values, old_values = df1[col], df2[col]
        else:
            new_values, old_values = df1_by_key[col], df2_by_key[col]

        # Missing values and empty strings are both treated as "None".
        comp = (
            new_values.fillna("None").replace("", "None")
            .compare(
                old_values.fillna("None").replace("", "None"),
                result_names=("new", "old"),
            )
            .reset_index()
        )
        comp["column"] = col
        comp["key_col"] = ""

        # An "index" column means the comparison was positional (key column);
        # otherwise the key columns are available to rebuild the joined key.
        if "index" not in comp and len(comp):
            comp["key_col"] = _join_key_columns(comp, key_cols)

        if len(comp) == 0:
            print(f"OK -- {col} -- ALL OK")
        else:
            print(f"KO -- {col} -- NOT OK (n={len(comp)})")

        full_comp.append(comp)

    return pd.concat(full_comp).reset_index(drop=True)

def compare_cleaned_datasets(old_data_path="data_cleaned_SAVE"):
    """Compare every cleaned output CSV with its previously saved version.

    For each ``FPATH_OUT_*`` file, the current CSV is read alongside the file
    of the same name found in ``{REPO_PATH}/{old_data_path}``, and both are
    passed to ``compare_dataframes`` with the key column(s) registered for
    that file name. The file name is printed before each comparison.

    Parameters
    ----------
    old_data_path : str
        Folder, relative to ``REPO_PATH``, holding the old version of the
        cleaned files.

    Returns
    -------
    pandas.DataFrame
        All per-file comparisons concatenated, restricted to the columns
        ``file``, ``column``, ``key_col``, ``new`` and ``old``.
    """
    # Key column(s) identifying a row, looked up by cleaned file name.
    key_columns_by_file = {
        "carbon_bombs_data.csv": ["Carbon_bomb_name_source_CB", "Country_source_CB"],
        "company_data.csv": "Company_name",
        "bank_data.csv": "Bank Name",
        "connection_bank_company.csv": ["Bank", "Company"],
        "connection_carbonbombs_company.csv": ["Carbon_bomb_name", "Company", "Country"],
        "country_data.csv": "Country_source_CB",
    }

    new_file_paths = (
        FPATH_OUT_CB,
        FPATH_OUT_COMP,
        FPATH_OUT_BANK,
        FPATH_OUT_CONX_BANK_COMP,
        FPATH_OUT_CONX_CB_COMP,
        FPATH_OUT_COUNTRY,
    )
    output_columns = ["file", "column", "key_col", "new", "old"]

    per_file_diffs = []
    for new_path in new_file_paths:
        # The old file lives under the same name in the saved folder.
        file_name = new_path.rsplit("/", 1)[-1]
        old_path = f"{REPO_PATH}/{old_data_path}/{file_name}"

        new_df = pd.read_csv(new_path)
        old_df = pd.read_csv(old_path)

        print(file_name)
        file_diff = compare_dataframes(new_df, old_df, key_columns_by_file[file_name])
        file_diff["file"] = file_name
        per_file_diffs.append(file_diff[output_columns])

    return pd.concat(per_file_diffs).reset_index(drop=True)

In [3]:
# Compare the current cleaned CSVs with the copies stored in the
# "data_cleaned copy" folder under REPO_PATH.
# NOTE(review): the trailing "//" is redundant, since compare_cleaned_datasets
# already inserts a "/" between the folder and the file name.
full_comp = compare_cleaned_datasets("data_cleaned copy//")

carbon_bombs_data.csv
OK -- Carbon_bomb_name_source_CB -- ALL OK
OK -- Country_source_CB -- ALL OK
OK -- World_region -- ALL OK
OK -- Potential_GtCO2_source_CB -- ALL OK
OK -- Fuel_type_source_CB -- ALL OK
OK -- GEM_id_source_GEM -- ALL OK
OK -- GEM_url_source_GEM -- ALL OK
OK -- Latitude -- ALL OK
OK -- Longitude -- ALL OK
OK -- Latitude_longitude_source -- ALL OK
OK -- Operators_source_GEM -- ALL OK
OK -- Parent_company_source_GEM -- ALL OK
KO -- Companies_involved_source_GEM -- NOT OK (n=1)
OK -- GEM_project_name_source_GEM -- ALL OK
OK -- Carbon_bomb_description -- ALL OK
OK -- Carbon_bomb_start_year -- ALL OK
OK -- Status_source_CB -- ALL OK
OK -- Status_source_GEM -- ALL OK
OK -- Status_lvl_1 -- ALL OK
OK -- Status_lvl_2 -- ALL OK
company_data.csv
Found new keys: {'Guangdong Hanjian Holding Co Ltd'}
Missing keys: {'Guangdong Hanjian Investment Co Ltd'}
OK -- Company_name -- ALL OK
OK -- Address_headquarters_source_chatGPT -- ALL OK
OK -- Latitude -- ALL OK
OK -- Longitude -- ALL 

In [4]:
# Each row of full_comp is one differing value, so the row count is the
# total number of differences found across all files.
full_comp.shape

(2851, 5)

In [5]:
# Preview the first 50 differences (file, column, row key, new and old value).
full_comp.head(50)

Unnamed: 0,file,column,key_col,new,old
0,carbon_bombs_data.csv,Companies_involved_source_GEM,Qingchunta Coal Mine - China,Guangdong Hanjian Holding Co Ltd (94.0%);Shang...,Guangdong Hanjian Investment Co Ltd (94.0%);Sh...
1,connection_bank_company.csv,2016,ANZ - State Grid Corp of China,75434669.0,75434668.715946
2,connection_bank_company.csv,2016,ANZ - State Power Investment Corp (SPIC),97656855.0,97656855.054785
3,connection_bank_company.csv,2016,Agricultural Bank of China - Beijing Energy Ho...,180670028.0,180670027.912511
4,connection_bank_company.csv,2016,Agricultural Bank of China - China Datang Corp,166200147.0,166200146.610503
5,connection_bank_company.csv,2016,Agricultural Bank of China - China Energy Inve...,1831431160.0,1831431159.859382
6,connection_bank_company.csv,2016,Agricultural Bank of China - China Huadian Cor...,935007190.0,935007190.132086
7,connection_bank_company.csv,2016,Agricultural Bank of China - Hubei Yihua Chemi...,10218037.0,10218036.92
8,connection_bank_company.csv,2016,Agricultural Bank of China - Jinneng Holding G...,119965447.0,119965447.068704
9,connection_bank_company.csv,2016,Agricultural Bank of China - Lu'an Chemical Gr...,96181628.0,96181628.4


In [8]:
# Save the full list of differences to comparison.csv (relative path, so it
# is written to the current working directory), without the row index.
full_comp.to_csv("comparison.csv", index=False)