In [1]:
%load_ext autoreload
%autoreload 2

In [6]:
import pandas as pd
from carbon_bombs.processing.banks import create_banks_table
from carbon_bombs.processing.company import create_company_table

In [3]:
%%time
banks = create_banks_table(check_old_df_address=True)

CPU times: total: 32.5 s
Wall time: 4min 50s


In [4]:
%%time
companies = create_company_table(check_old_df_address=True)

CPU times: total: 2.53 s
Wall time: 2min 22s


In [9]:
old_banks = pd.read_csv("data_cleaned_SAVE/bank_informations.csv")
old_companies = pd.read_csv("data_cleaned_SAVE/company_informations.csv")
old_companies = old_companies.loc[~old_companies["Company_name"].isna()]

In [26]:
(old_companies["Company_name"] == "None").sum()

1

In [93]:
a = {1, 2}
n = {2, 3}
a.symmetric_difference(n)

n.symmetric_difference(a)
n.intersection(a)

{2}

In [103]:
import pandas as pd
from IPython.display import display

from carbon_bombs.conf import REPO_PATH
from carbon_bombs.conf import FPATH_OUT_BANK
from carbon_bombs.conf import FPATH_OUT_CB
from carbon_bombs.conf import FPATH_OUT_COMP
from carbon_bombs.conf import FPATH_OUT_CONX_BANK_COMP
from carbon_bombs.conf import FPATH_OUT_CONX_CB_COMP
from carbon_bombs.conf import FPATH_OUT_COUNTRY


def compare_dataframes(df1, df2, key_col):
    
    if not isinstance(key_col, list):
        df1 = df1.loc[(~df1[key_col].isna()) & (df1[key_col] != "None")]
        df2 = df2.loc[~df2[key_col].isna() & (df2[key_col] != "None")]
        key_col = [key_col]
        
        
    df1["key_col"] = df1.apply(lambda x: (" - ".join([str(x[k]) for k in key_col])), axis=1)
    df2["key_col"] = df2.apply(lambda x: (" - ".join([str(x[k]) for k in key_col])), axis=1)

    df1 = df1.sort_values(by="key_col").reset_index(drop=True).fillna("None")
    df2 = df2.sort_values(by="key_col").reset_index(drop=True).fillna("None")
    
    diff = set(df1.columns) - set(df2.columns)
    if diff:
        print(f"New df has new columns:", diff)

    diff = set(df2.columns) - set(df1.columns)
    if diff:
        print(f"New df misses columns:", diff)
        
    uniq_keys1 = set(df1["key_col"])
    uniq_keys2 = set(df2["key_col"])
        
    if uniq_keys1.symmetric_difference(uniq_keys2):
        if uniq_keys1 - uniq_keys2:
            print("Found new keys:", uniq_keys1 - uniq_keys2)
        if uniq_keys2 - uniq_keys1:
            print("Missing keys:", uniq_keys2 - uniq_keys1)
        
        common_keys = uniq_keys1.intersection(uniq_keys2)
        df1 = df1.loc[df1["key_col"].isin(common_keys)]
        df2 = df2.loc[df1["key_col"].isin(common_keys)]

    full_comp = []
        
    for col in df2.columns:
        if col == "key_col":
            continue
        if col not in df1:
            print(f"NF // {col}  // not in new_df")
            continue
    
        if col not in key_col:
            df1_ = df1.set_index(key_col).copy()
            df2_ = df2.set_index(key_col).copy()
        else:
            df1_ = df1
            df2_ = df2
            
        comp = df1_[col].fillna("None").replace("", "None").compare(
            df2_[col].fillna("None").replace("", "None"),
            result_names=("new", "old")
        ).reset_index()
        comp["column"] = col
        
        comp["key_col"] = ""

        if "index" not in comp:
            if len(comp):
                comp["key_col"] = comp.apply(lambda x: (" - ".join([str(x[k]) for k in key_col])), axis=1)
        
        
        if len(comp) == 0:
            print(f"OK -- {col} -- ALL OK")
        else:
            print(f"KO -- {col} -- NOT OK (n={len(comp)})")
            # display(comp)
            
        full_comp += [comp]
        
    full_comp = pd.concat(full_comp).reset_index(drop=True)
    
    return full_comp

def compare_cleaned_datasets(old_data_path="data_cleaned_SAVE"):
    
    cleaned_datasets_fpaths = [
        FPATH_OUT_CB,
        FPATH_OUT_COMP,
        FPATH_OUT_BANK,
        FPATH_OUT_CONX_BANK_COMP,
        FPATH_OUT_CONX_CB_COMP,
        FPATH_OUT_COUNTRY,
    ]
    
    key_cols_map = {
        "carbon_bombs_informations.csv": "Carbon_bomb_name_source_CB",
        "company_informations.csv": "Company_name",
        "bank_informations.csv": "Bank Name",
        "connexion_bank_company.csv": ["Bank", "Company"],
        "connexion_carbonbombs_company.csv": ["Carbon_bomb_name", "Company"],
        "country_informations.csv": "Country_source_CB",
    }
    
    full_comp = []
    
    for fpath in cleaned_datasets_fpaths:
        fname = fpath.split("/")[-1]
        save_path = f"{REPO_PATH}/{old_data_path}/{fname}"
        
        new_df = pd.read_csv(fpath)
        old_df = pd.read_csv(save_path)
        
        print(fname)
        comp = compare_dataframes(new_df, old_df, key_cols_map[fname])
        comp["file"] = fname
        comp = comp[["file", "column", "key_col", "new", "old"]]
        full_comp += [comp]
        
    full_comp = pd.concat(full_comp).reset_index(drop=True)
    
    return full_comp

In [104]:
full_comp = compare_cleaned_datasets("data_cleaned_SAVE//")

carbon_bombs_informations.csv
OK -- Carbon_bomb_name_source_CB -- ALL OK
OK -- Country_source_CB -- ALL OK
OK -- World_region -- ALL OK
OK -- Potential_GtCO2_source_CB -- ALL OK
OK -- Fuel_type_source_CB -- ALL OK
KO -- GEM_id_source_GEM -- NOT OK (n=1)
KO -- GEM_url_source_GEM -- NOT OK (n=1)
KO -- Latitude -- NOT OK (n=3)
KO -- Longitude -- NOT OK (n=1)
OK -- Latitude_longitude_source -- ALL OK
KO -- Operators_source_GEM -- NOT OK (n=1)
KO -- Parent_company_source_GEM -- NOT OK (n=1)
KO -- Companies_involved_source_GEM -- NOT OK (n=1)
KO -- Multiple_unit_concerned_source_GEM -- NOT OK (n=1)
KO -- Carbon_bomb_description -- NOT OK (n=18)
KO -- Carbon_bomb_start_year -- NOT OK (n=6)
OK -- Status_source_CB -- ALL OK
OK -- Status_source_GEM -- ALL OK
OK -- Status_lvl_1 -- ALL OK
OK -- Status_lvl_2 -- ALL OK
company_informations.csv
OK -- Company_name -- ALL OK
KO -- Address_headquarters_source_chatGPT -- NOT OK (n=8)
KO -- Latitude -- NOT OK (n=7)
KO -- Longitude -- NOT OK (n=7)
KO -- Ca

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["key_col"] = df2.apply(lambda x: (" - ".join([str(x[k]) for k in key_col])), axis=1)


KO -- World Region -- NOT OK (n=1)
connexion_bank_company.csv
OK -- Bank -- ALL OK
OK -- Company -- ALL OK
OK -- 2016 -- ALL OK
OK -- 2017 -- ALL OK
OK -- 2018 -- ALL OK
OK -- 2019 -- ALL OK
OK -- 2020 -- ALL OK
OK -- 2021 -- ALL OK
OK -- 2022 -- ALL OK
OK -- Grand Total -- ALL OK
connexion_carbonbombs_company.csv
OK -- Carbon_bomb_name -- ALL OK
KO -- Country -- NOT OK (n=8)
OK -- Company -- ALL OK
country_informations.csv
OK -- Country_source_CB -- ALL OK
OK -- Emissions_per_capita_tons_CO2 -- ALL OK
OK -- Year_Emissions_per_capita_tons_CO2 -- ALL OK
OK -- Emissions_thousand_tons_CO2 -- ALL OK
OK -- Year_Emissions_thousand_tons_CO2 -- ALL OK
OK -- GDP_millions_US_dollars -- ALL OK
OK -- Year_GDP_millions_US_dollars -- ALL OK
OK -- GDP_per_capita_US_dollars -- ALL OK
OK -- Year_GDP_per_capita_US_dollars -- ALL OK
OK -- Population_in_millions -- ALL OK
OK -- Year_Population_in_millions -- ALL OK
OK -- Surface_thousand_km2 -- ALL OK
OK -- Year_Surface_thousand_km2 -- ALL OK


In [105]:
full_comp.head(50)

Unnamed: 0,file,column,key_col,new,old
0,carbon_bombs_informations.csv,GEM_id_source_GEM,Maritsa Coal Mines,M3703|M3702,M3701
1,carbon_bombs_informations.csv,GEM_url_source_GEM,Maritsa Coal Mines,https://www.gem.wiki/Troyanovo-North_Coal_Mine...,https://www.gem.wiki/Troyanovo_3_Coal_Mine
2,carbon_bombs_informations.csv,Latitude,Central Arabian Onshore,25.38092,25.38092
3,carbon_bombs_informations.csv,Latitude,Eagle Ford Shale,23.723619,23.723619
4,carbon_bombs_informations.csv,Latitude,Maritsa Coal Mines,42.2737,42.1677
5,carbon_bombs_informations.csv,Longitude,Maritsa Coal Mines,26.0138,26.0376
6,carbon_bombs_informations.csv,Operators_source_GEM,Maritsa Coal Mines,Mini Maritsa Iztok EAD|Mini Maritsa Iztok EAD,Mini Maritsa Iztok EAD
7,carbon_bombs_informations.csv,Parent_company_source_GEM,Maritsa Coal Mines,Bulgarian Energy Holding (100.0%)|Bulgarian En...,Bulgarian Energy Holding (100.0%)
8,carbon_bombs_informations.csv,Companies_involved_source_GEM,Maritsa Coal Mines,Bulgarian Energy Holding (100.0%)|Bulgarian En...,Bulgarian Energy Holding (100.0%)
9,carbon_bombs_informations.csv,Multiple_unit_concerned_source_GEM,Maritsa Coal Mines,Troyanovo-North Coal Mine|Troyanovo 1 Coal Mine,


In [106]:
full_comp

Unnamed: 0,file,column,key_col,new,old
0,carbon_bombs_informations.csv,GEM_id_source_GEM,Maritsa Coal Mines,M3703|M3702,M3701
1,carbon_bombs_informations.csv,GEM_url_source_GEM,Maritsa Coal Mines,https://www.gem.wiki/Troyanovo-North_Coal_Mine...,https://www.gem.wiki/Troyanovo_3_Coal_Mine
2,carbon_bombs_informations.csv,Latitude,Central Arabian Onshore,25.38092,25.38092
3,carbon_bombs_informations.csv,Latitude,Eagle Ford Shale,23.723619,23.723619
4,carbon_bombs_informations.csv,Latitude,Maritsa Coal Mines,42.2737,42.1677
...,...,...,...,...,...
198,connexion_carbonbombs_company.csv,Country,Central Arabian Onshore - No informations on c...,Kuwait,Saudi Arabia
199,connexion_carbonbombs_company.csv,Country,Central Arabian Onshore - No informations on c...,Qatar,Kuwait
200,connexion_carbonbombs_company.csv,Country,Central Arabian Onshore - No informations on c...,Saudi Arabia,Qatar
201,connexion_carbonbombs_company.csv,Country,La Luna Shale - No informations on company,Colombia,Venezuela
