In [1]:
# Load IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell is executed, so edits to the imported project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os

import pandas as pd
from IPython.display import display

from carbon_bombs.conf import REPO_PATH
from carbon_bombs.conf import FPATH_OUT_BANK
from carbon_bombs.conf import FPATH_OUT_CB
from carbon_bombs.conf import FPATH_OUT_COMP
from carbon_bombs.conf import FPATH_OUT_CONX_BANK_COMP
from carbon_bombs.conf import FPATH_OUT_CONX_CB_COMP
from carbon_bombs.conf import FPATH_OUT_COUNTRY


def compare_dataframes(df1, df2, key_col):
    """Report the cell-level differences between a new and an old DataFrame.

    Rows of both frames are matched on ``key_col``. A helper column named
    ``"key_col"`` (the key values joined with ``" - "``) is built on private
    copies of the inputs, both copies are sorted on it, and every column of
    ``df2`` that also exists in ``df1`` is compared with ``Series.compare``
    after normalising missing values and empty strings to ``"None"``.
    A one-line ``OK``/``KO`` status is printed per column, together with
    notices about columns or keys present on one side only.

    Parameters
    ----------
    df1 : pandas.DataFrame
        The "new" frame. It is not modified.
    df2 : pandas.DataFrame
        The "old" (reference) frame. It is not modified.
    key_col : str or list of str
        Column name(s) identifying a row. When a single name is passed, rows
        whose key is missing or equal to the string ``"None"`` are dropped
        from both frames first; no such filtering happens for a list.

    Returns
    -------
    pandas.DataFrame
        The concatenation of the per-column comparison results: one row per
        differing cell with the values in ``"new"`` and ``"old"``, the compared
        column name in ``"column"`` and the joined key in ``"key_col"``
        (left empty for differences found in the key columns themselves,
        which are compared by row position). The key columns (or ``"index"``
        for positional comparisons) produced by ``reset_index`` are kept too.

    Notes
    -----
    ``Series.compare`` requires identically labelled operands, so it raises
    ``ValueError`` if, after restricting both frames to their common keys,
    the key labels still differ (for instance duplicated keys that are not
    duplicated the same number of times on both sides).
    """
    if not isinstance(key_col, list):
        df1 = df1.loc[df1[key_col].notna() & (df1[key_col] != "None")]
        df2 = df2.loc[df2[key_col].notna() & (df2[key_col] != "None")]
        key_col = [key_col]

    # Work on private copies: the helper column added below must neither leak
    # into the caller's frames nor be written onto a `.loc` slice (which would
    # trigger pandas' chained-assignment warning).
    df1 = df1.copy()
    df2 = df2.copy()

    def join_key(row):
        """Build the human-readable key of a row from its key columns."""
        return " - ".join([str(row[k]) for k in key_col])

    df1["key_col"] = df1.apply(join_key, axis=1)
    df2["key_col"] = df2.apply(join_key, axis=1)

    df1 = df1.sort_values(by="key_col").reset_index(drop=True).fillna("None")
    df2 = df2.sort_values(by="key_col").reset_index(drop=True).fillna("None")

    diff = set(df1.columns) - set(df2.columns)
    if diff:
        print("New df has new columns:", diff)

    diff = set(df2.columns) - set(df1.columns)
    if diff:
        print("New df misses columns:", diff)

    uniq_keys1 = set(df1["key_col"])
    uniq_keys2 = set(df2["key_col"])

    if uniq_keys1 != uniq_keys2:
        if uniq_keys1 - uniq_keys2:
            print("Found new keys:", uniq_keys1 - uniq_keys2)
        if uniq_keys2 - uniq_keys1:
            print("Missing keys:", uniq_keys2 - uniq_keys1)

        # Only rows whose key exists on both sides can be compared cell by cell.
        common_keys = uniq_keys1 & uniq_keys2
        df1 = df1.loc[df1["key_col"].isin(common_keys)].drop_duplicates().reset_index(drop=True)
        df2 = df2.loc[df2["key_col"].isin(common_keys)].drop_duplicates().reset_index(drop=True)

    # Key-indexed views are the same for every non-key column, so build them
    # once instead of once per loop iteration.
    df1_keyed = df1.set_index(key_col)
    df2_keyed = df2.set_index(key_col)

    full_comp = []

    for col in df2.columns:
        if col == "key_col":
            continue
        if col not in df1:
            print(f"NF // {col}  // not in new_df")
            continue

        if col in key_col:
            # Key columns are no longer columns of the key-indexed frames, so
            # they are compared positionally on the sorted frames instead.
            left, right = df1, df2
        else:
            left, right = df1_keyed, df2_keyed

        comp = (
            left[col]
            .fillna("None")
            .replace("", "None")
            .compare(
                right[col].fillna("None").replace("", "None"),
                result_names=("new", "old"),
            )
            .reset_index()
        )
        comp["column"] = col
        comp["key_col"] = ""

        # An "index" column means the comparison was positional (key column);
        # otherwise reset_index() restored the key columns and the readable
        # key can be rebuilt from them.
        if "index" not in comp and len(comp):
            comp["key_col"] = comp.apply(join_key, axis=1)

        if len(comp) == 0:
            print(f"OK -- {col} -- ALL OK")
        else:
            print(f"KO -- {col} -- NOT OK (n={len(comp)})")

        full_comp.append(comp)

    return pd.concat(full_comp).reset_index(drop=True)

def compare_cleaned_datasets(old_data_path="data_cleaned_SAVE"):
    """Compare every cleaned output CSV with a previously saved copy of it.

    For each output file path (``FPATH_OUT_*``), the current file is read as
    the "new" frame and the file with the same name found under
    ``{REPO_PATH}/{old_data_path}/`` is read as the "old" frame. Both are
    passed to ``compare_dataframes`` using the key column(s) registered for
    that file name, and the file name is printed before each comparison.

    Parameters
    ----------
    old_data_path : str, default "data_cleaned_SAVE"
        Folder, relative to ``REPO_PATH``, holding the reference CSV files.

    Returns
    -------
    pandas.DataFrame
        All differences found, one row per differing cell, with the columns
        ``file``, ``column``, ``key_col``, ``new`` and ``old``.

    Raises
    ------
    KeyError
        If an output file name has no entry in the key-column mapping.
    """
    cleaned_datasets_fpaths = [
        FPATH_OUT_CB,
        FPATH_OUT_COMP,
        FPATH_OUT_BANK,
        FPATH_OUT_CONX_BANK_COMP,
        FPATH_OUT_CONX_CB_COMP,
        FPATH_OUT_COUNTRY,
    ]

    # Column(s) identifying a row in each cleaned file, keyed by file name.
    key_cols_map = {
        "carbon_bombs_data.csv": ["Carbon_bomb_name_source_CB", "Country_source_CB"],
        "company_data.csv": "Company_name",
        "bank_data.csv": "Bank Name",
        "connection_bank_company.csv": ["Bank", "Company"],
        "connection_carbonbombs_company.csv": ["Carbon_bomb_name", "Company", "Country"],
        "country_data.csv": "Country_source_CB",
    }

    full_comp = []

    for fpath in cleaned_datasets_fpaths:
        # os.path.basename understands the platform's path separators, unlike
        # a manual split on "/".
        fname = os.path.basename(fpath)
        save_path = f"{REPO_PATH}/{old_data_path}/{fname}"

        new_df = pd.read_csv(fpath)
        old_df = pd.read_csv(save_path)

        print(fname)
        comp = compare_dataframes(new_df, old_df, key_cols_map[fname])
        comp["file"] = fname
        full_comp.append(comp[["file", "column", "key_col", "new", "old"]])

    return pd.concat(full_comp).reset_index(drop=True)

In [3]:
# Compare each current cleaned CSV with the copy of the same name stored in
# the "data_cleaned copy" folder under REPO_PATH. Per-column OK/KO lines are
# printed, and all differences are collected in `full_comp`.
full_comp = compare_cleaned_datasets("data_cleaned copy//")

carbon_bombs_data.csv
OK -- Carbon_bomb_name_source_CB -- ALL OK
OK -- Country_source_CB -- ALL OK
OK -- World_region -- ALL OK
OK -- Potential_GtCO2_source_CB -- ALL OK
OK -- Fuel_type_source_CB -- ALL OK
KO -- GEM_id_source_GEM -- NOT OK (n=8)
KO -- GEM_url_source_GEM -- NOT OK (n=8)
KO -- Latitude -- NOT OK (n=93)
KO -- Longitude -- NOT OK (n=101)
KO -- Latitude_longitude_source -- NOT OK (n=3)
KO -- Operators_source_GEM -- NOT OK (n=8)
KO -- Parent_company_source_GEM -- NOT OK (n=8)
KO -- Companies_involved_source_GEM -- NOT OK (n=8)
KO -- GEM_project_name_source_GEM -- NOT OK (n=8)
KO -- Carbon_bomb_description -- NOT OK (n=6)
KO -- Carbon_bomb_start_year -- NOT OK (n=2)
OK -- Status_source_CB -- ALL OK
KO -- Status_source_GEM -- NOT OK (n=4)
OK -- Status_lvl_1 -- ALL OK
OK -- Status_lvl_2 -- ALL OK
company_data.csv
Found new keys: {'Renos Land & Minerals Company', 'Texas American Resources Company', 'Freedom Production', 'Devon Energy Production Company', 'Payrock', 'Tidal Petrol

In [4]:
# Size of the report: one row per differing cell, across all compared files.
full_comp.shape

(319, 5)

In [5]:
# Preview the first 50 reported differences (file, column, row key, new and old value).
full_comp.head(50)

Unnamed: 0,file,column,key_col,new,old
0,carbon_bombs_data.csv,GEM_id_source_GEM,Anadarko Shelf_Oklahoma - United States,OG0016356|OG0016355|OG0016030|OG0015497|OG0016...,OG0014193|OG0014194|OG0014195|OG0014196|OG0014...
1,carbon_bombs_data.csv,GEM_id_source_GEM,Austin Chalk Tight - United States,OG0014847|OG0014771|OG0014772|OG0014845|OG0015...,OG0014341|OG0014357|OG0014410|OG0014491|OG0014...
2,carbon_bombs_data.csv,GEM_id_source_GEM,Candeias Shale - Brazil,OG0000261|OG0000092|OG0000068|OG0000470|OG0000...,OG0000261|OG0000092|OG0000470|OG0000071|OG0000...
3,carbon_bombs_data.csv,GEM_id_source_GEM,Ku-Maloob-Zaap Project - Mexico,OG0002257|OG0002369|OG0002370,No informations available on GEM
4,carbon_bombs_data.csv,GEM_id_source_GEM,Marcellus Shale - United States,OG0016103|OG0016056|OG0016338|OG0014606|OG0016...,OG0016103|OG0016056|OG0016338|OG0014606|OG0016...
5,carbon_bombs_data.csv,GEM_id_source_GEM,Oil shale China - China,OG0014124|OG0013943|OG0013965|OG0013968|OG0010...,No informations available on GEM
6,carbon_bombs_data.csv,GEM_id_source_GEM,PRB Tight Oil - United States,OG0016980|OG0016872|OG0016903|OG0017005|OG0016...,No informations available on GEM
7,carbon_bombs_data.csv,GEM_id_source_GEM,Umm Shaif/Nasr - United Arab Emirates,OG0012319|OG0012296,No informations available on GEM
8,carbon_bombs_data.csv,GEM_url_source_GEM,Anadarko Shelf_Oklahoma - United States,"https://www.gem.wiki/Perryton,_North_(George_M...",https://www.gem.wiki/Phantom_(Wolfcamp)_-_Anad...
9,carbon_bombs_data.csv,GEM_url_source_GEM,Austin Chalk Tight - United States,https://www.gem.wiki/John_Amoruso_(Bossier)_-_...,https://www.gem.wiki/Sugarkane_(Austin_Chalk)_...


In [6]:
# Save the full report to comparison.csv (relative path, so it lands in the
# current working directory). With to_csv defaults, the row index is written
# as the first column.
full_comp.to_csv("comparison.csv")