In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

import pandas as pd
from IPython.display import display

from carbon_bombs.conf import FPATH_OUT_BANK
from carbon_bombs.conf import FPATH_OUT_CB
from carbon_bombs.conf import FPATH_OUT_COMP
from carbon_bombs.conf import FPATH_OUT_CONX_BANK_COMP
from carbon_bombs.conf import FPATH_OUT_CONX_CB_COMP
from carbon_bombs.conf import FPATH_OUT_COUNTRY
from carbon_bombs.conf import REPO_PATH


def _build_row_keys(df, key_cols):
    """Return one " - "-joined string key per row of ``df``, built from ``key_cols``."""
    if len(df) == 0:
        # Row-wise ``apply`` on an empty frame is not guaranteed to return a
        # Series, so it cannot be assigned to a single column; build the empty
        # key column explicitly instead.
        return pd.Series([], index=df.index, dtype=object)
    return df.apply(lambda row: " - ".join(str(row[k]) for k in key_cols), axis=1)


def compare_dataframes(df1, df2, key_col):
    """Compare two DataFrames column by column, aligning rows on a key.

    ``df1`` is treated as the "new" frame and ``df2`` as the "old" one. A
    summary line is printed for every column of ``df2`` (``OK``, ``KO`` or
    ``NF`` when the column is absent from ``df1``), as well as for column and
    key mismatches between the two frames.

    The input frames are never modified: all work is done on copies.

    Parameters
    ----------
    df1 : pandas.DataFrame
        New version of the data.
    df2 : pandas.DataFrame
        Old (reference) version of the data.
    key_col : str or list of str
        Column name(s) identifying a row. When a single name is given, rows
        whose key is missing or equal to the string ``"None"`` are dropped
        from both frames before comparing. No such filtering is applied when
        a list is given.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-column differences. Each row holds the
        differing ``new`` and ``old`` values, the compared ``column`` name and
        ``key_col``, the " - "-joined key of the row (left as an empty string
        for differences found in the key columns themselves).

    Raises
    ------
    ValueError
        Propagated from ``Series.compare`` when the two frames cannot be
        aligned on the key (for instance duplicated keys with different
        counts in each frame).
    """
    if isinstance(key_col, list):
        key_cols = key_col
        # Copy so that the helper "key_col" column added below never leaks
        # into the caller's frames.
        df1 = df1.copy()
        df2 = df2.copy()
    else:
        df1 = df1.loc[df1[key_col].notna() & (df1[key_col] != "None")].copy()
        df2 = df2.loc[df2[key_col].notna() & (df2[key_col] != "None")].copy()
        key_cols = [key_col]

    df1["key_col"] = _build_row_keys(df1, key_cols)
    df2["key_col"] = _build_row_keys(df2, key_cols)

    # Sorting both frames on the same key puts matching rows at the same
    # position, which the positional comparison of key columns relies on.
    df1 = df1.sort_values(by="key_col").reset_index(drop=True).fillna("None")
    df2 = df2.sort_values(by="key_col").reset_index(drop=True).fillna("None")

    added_columns = set(df1.columns) - set(df2.columns)
    if added_columns:
        print("New df has new columns:", added_columns)

    removed_columns = set(df2.columns) - set(df1.columns)
    if removed_columns:
        print("New df misses columns:", removed_columns)

    new_keys = set(df1["key_col"])
    old_keys = set(df2["key_col"])

    if new_keys != old_keys:
        added_keys = new_keys - old_keys
        removed_keys = old_keys - new_keys
        if added_keys:
            print("Found new keys:", added_keys)
        if removed_keys:
            print("Missing keys:", removed_keys)

        # Only rows present on both sides can be compared value by value.
        shared_keys = new_keys & old_keys
        df1 = df1.loc[df1["key_col"].isin(shared_keys)].drop_duplicates().reset_index(drop=True)
        df2 = df2.loc[df2["key_col"].isin(shared_keys)].drop_duplicates().reset_index(drop=True)

    # Key-indexed views are loop-invariant: build them once instead of once
    # per compared column.
    df1_by_key = df1.set_index(key_cols)
    df2_by_key = df2.set_index(key_cols)

    comparisons = []

    for col in df2.columns:
        if col == "key_col":
            continue
        if col not in df1:
            print(f"NF // {col}  // not in new_df")
            continue

        if col in key_cols:
            # Key columns are dropped by set_index, so compare them by
            # position on the sorted frames.
            new_values, old_values = df1[col], df2[col]
        else:
            new_values, old_values = df1_by_key[col], df2_by_key[col]

        # Missing values and empty strings are treated as the same "None"
        # marker so that they do not show up as differences.
        comp = (
            new_values.fillna("None")
            .replace("", "None")
            .compare(
                old_values.fillna("None").replace("", "None"),
                result_names=("new", "old"),
            )
            .reset_index()
        )
        comp["column"] = col
        comp["key_col"] = ""

        # An "index" column means the comparison was positional (key column);
        # otherwise the key columns are available to rebuild the row key.
        if "index" not in comp and len(comp):
            comp["key_col"] = comp.apply(
                lambda row: " - ".join(str(row[k]) for k in key_cols), axis=1
            )

        if len(comp) == 0:
            print(f"OK -- {col} -- ALL OK")
        else:
            print(f"KO -- {col} -- NOT OK (n={len(comp)})")

        comparisons.append(comp)

    return pd.concat(comparisons).reset_index(drop=True)

def compare_cleaned_datasets(old_data_path="data_cleaned_SAVE"):
    """Compare every cleaned dataset with its previously saved version.

    For each output file, the current CSV is read from its configured path
    and the reference CSV with the same file name is read from
    ``<REPO_PATH>/<old_data_path>``. Both are passed to ``compare_dataframes``
    with the key column(s) registered for that file name.

    Parameters
    ----------
    old_data_path : str
        Directory holding the reference CSV files, joined onto ``REPO_PATH``
        (an absolute path is used as is).

    Returns
    -------
    pandas.DataFrame
        All differences found, with columns ``file``, ``column``, ``key_col``,
        ``new`` and ``old``.

    Raises
    ------
    KeyError
        If a dataset file name has no entry in the key-column mapping.
    """
    cleaned_datasets_fpaths = [
        FPATH_OUT_CB,
        FPATH_OUT_COMP,
        FPATH_OUT_BANK,
        FPATH_OUT_CONX_BANK_COMP,
        FPATH_OUT_CONX_CB_COMP,
        FPATH_OUT_COUNTRY,
    ]

    # Column(s) identifying a row in each dataset, keyed by file name.
    key_cols_map = {
        "carbon_bombs_data.csv": ["Carbon_bomb_name_source_CB", "Country_source_CB"],
        "company_data.csv": "Company_name",
        "bank_data.csv": "Bank Name",
        "connection_bank_company.csv": ["Bank", "Company"],
        "connection_carbonbombs_company.csv": ["Carbon_bomb_name", "Company", "Country"],
        "country_data.csv": "Country_source_CB",
    }

    comparisons = []

    for fpath in cleaned_datasets_fpaths:
        # Use os.path rather than manual "/" splitting and concatenation so
        # that separators (including trailing ones in old_data_path) are
        # handled by the standard library.
        fname = os.path.basename(fpath)
        save_path = os.path.join(REPO_PATH, old_data_path, fname)

        new_df = pd.read_csv(fpath)
        old_df = pd.read_csv(save_path)

        print(fname)
        comp = compare_dataframes(new_df, old_df, key_cols_map[fname])
        comp["file"] = fname
        comparisons.append(comp[["file", "column", "key_col", "new", "old"]])

    return pd.concat(comparisons).reset_index(drop=True)

In [12]:
# Compare the current cleaned datasets against the reference copies stored in
# the "data_cleaned_SAVE" folder of the repository; progress is printed per file.
full_comp = compare_cleaned_datasets("data_cleaned_SAVE//")

carbon_bombs_data.csv
New df has new columns: {'GEM_project_name_source_GEM'}
New df misses columns: {'Multiple_unit_concerned_source_GEM'}
OK -- Carbon_bomb_name_source_CB -- ALL OK
OK -- Country_source_CB -- ALL OK
OK -- World_region -- ALL OK
OK -- Potential_GtCO2_source_CB -- ALL OK
OK -- Fuel_type_source_CB -- ALL OK
KO -- GEM_id_source_GEM -- NOT OK (n=21)
KO -- GEM_url_source_GEM -- NOT OK (n=21)
KO -- Latitude -- NOT OK (n=114)
KO -- Longitude -- NOT OK (n=118)
KO -- Latitude_longitude_source -- NOT OK (n=13)
KO -- Operators_source_GEM -- NOT OK (n=21)
KO -- Parent_company_source_GEM -- NOT OK (n=21)
KO -- Companies_involved_source_GEM -- NOT OK (n=245)
NF // Multiple_unit_concerned_source_GEM  // not in new_df
KO -- Carbon_bomb_description -- NOT OK (n=21)
KO -- Carbon_bomb_start_year -- NOT OK (n=9)
OK -- Status_source_CB -- ALL OK
KO -- Status_source_GEM -- NOT OK (n=15)
KO -- Status_lvl_1 -- NOT OK (n=11)
KO -- Status_lvl_2 -- NOT OK (n=11)
company_data.csv
Found new keys: 

In [13]:
# (number of reported differences, number of report columns)
full_comp.shape

(748, 5)

In [14]:
# Preview the first 50 reported differences.
full_comp.head(50)

Unnamed: 0,file,column,key_col,new,old
0,carbon_bombs_data.csv,GEM_id_source_GEM,Bakken Shale - United States,OG0015302|OG0015304|OG0014465|OG0015529|OG0014...,OG0015293|OG0015294|OG0015295|OG0015296|OG0015...
1,carbon_bombs_data.csv,GEM_id_source_GEM,Barail Shale - India,OG0004979|OG0004966|OG0004965|OG0004969|OG0004970,No informations available on GEM
2,carbon_bombs_data.csv,GEM_id_source_GEM,Barnett Shale - United States,OG0014258|OG0015761|OG0014704|OG0015790|OG0014...,OG0014284|OG0014327|OG0014366|OG0014438|OG0014...
3,carbon_bombs_data.csv,GEM_id_source_GEM,Candeias Shale - Brazil,OG0000261|OG0000092|OG0000470|OG0000071|OG0000...,OG0000092
4,carbon_bombs_data.csv,GEM_id_source_GEM,Carboniferous Shale - Kazakhstan,OG0012594|OG0012644|OG0012597|OG0012641|OG0012...,No informations available on GEM
5,carbon_bombs_data.csv,GEM_id_source_GEM,DJ Basin Tight Oil - United States,OG0016917|OG0016981|OG0016951|OG0016925|OG0016...,No informations available on GEM
6,carbon_bombs_data.csv,GEM_id_source_GEM,Haynesville/Bossier Shale - United States,OG0014452|OG0016454|OG0014994|OG0016804|OG0016...,OG0016031|OG0016093|OG0016136|OG0016200|OG0016...
7,carbon_bombs_data.csv,GEM_id_source_GEM,La Luna Shale - Colombia,OG0003047|OG0002942|OG0002941|OG0002934|OG0003...,No informations available on GEM
8,carbon_bombs_data.csv,GEM_id_source_GEM,La Luna Shale - Venezuela,OG0005058|OG0005169|OG0005185|OG0005052|OG0005...,No informations available on GEM
9,carbon_bombs_data.csv,GEM_id_source_GEM,Lensky Basin CBM - Russia,OG0012588|OG0012589|OG0012377|OG0012328|OG0012...,No informations available on GEM


In [16]:
# Persist the comparison report. The index is a plain positional RangeIndex
# (the report is built with reset_index(drop=True)), so it is left out of the
# CSV instead of being written as an unnamed first column.
full_comp.to_csv("comparison.csv", index=False)