# Check manual match data

In [1]:
%load_ext autoreload
%autoreload 2

## Import packages

In [2]:
import pandas as pd
import numpy as np
from carbon_bombs.conf import FPATH_SRC_GEM_COAL, FPATH_SRC_GEM_GASOIL, FPATH_SRC_BOCC
from carbon_bombs.io.banking_climate_chaos import load_banking_climate_chaos
from carbon_bombs.io.khune_paper import load_carbon_bomb_gasoil_database, load_carbon_bomb_coal_database

In [3]:
from carbon_bombs.io.manual_match import manual_match_coal
from carbon_bombs.io.manual_match import manual_match_bank
from carbon_bombs.io.manual_match import manual_match_company
from carbon_bombs.io.manual_match import manual_match_gasoil
from carbon_bombs.io.manual_match import manual_match_lat_long

## Load data

In [4]:
# GEM source: read one named sheet from each tracker workbook.
gem_coal_df = pd.read_excel(
    FPATH_SRC_GEM_COAL,
    sheet_name="Global Coal Mine Tracker",
)
gem_gasoil_df = pd.read_excel(
    FPATH_SRC_GEM_GASOIL,
    sheet_name="Main data",
    engine="openpyxl",
)

In [5]:
# paper source
# Load the gas/oil and coal carbon bomb tables through the project loaders,
# then concatenate them into a single frame (`cb_df`) used by the later checks.
gasoil = load_carbon_bomb_gasoil_database()
coal = load_carbon_bomb_coal_database()
cb_df = pd.concat([gasoil, coal])

In [6]:
# Load the BOCC table through the project loader; the checks below read its
# "Bank" and "Company" columns.
bocc_df = load_banking_climate_chaos()

## Check

In [7]:
from carbon_bombs.io.manual_match import manual_match_coal
from carbon_bombs.io.manual_match import manual_match_gasoil

def check_manual_mathc_gem_id(manual_match, fuel, gem_names=None):
    """Print every manually matched unit name that is absent from GEM.

    Parameters
    ----------
    manual_match : dict
        Maps a carbon bomb name to a ``$``-separated string of GEM unit
        names. A trailing ``$`` is tolerated.
    fuel : str
        ``"coal"`` checks against ``gem_coal_df["Mine Name"]``; any other
        value checks against ``gem_gasoil_df["Unit name"]``. Only used
        when ``gem_names`` is not given.
    gem_names : iterable of str, optional
        Reference names to check against. When omitted, they are read from
        the notebook-level GEM dataframes according to ``fuel``.

    Returns
    -------
    None
        Missing units are reported with ``print`` only, one line per unit,
        formatted as ``"<carbon bomb> - unit not found: <unit>"``.
    """
    if gem_names is None:
        if fuel == "coal":
            gem_names = gem_coal_df["Mine Name"].unique()
        else:
            gem_names = gem_gasoil_df["Unit name"].unique()

    # Build the lookup once: set membership is a hash lookup, whereas `in`
    # on the array returned by `.unique()` rescans the array for every unit.
    known_names = set(gem_names)

    for cb, units in manual_match.items():
        for unit in units.split("$"):
            # "None" and "" entries are never reported. Skipping "" also
            # covers the empty piece produced by a trailing "$".
            if unit in ("None", ""):
                continue
            if unit not in known_names:
                print(f"{cb} - unit not found: {unit}")

### Check on coal

In [8]:
# Print each unit listed in `manual_match_coal` that does not appear in the
# "Mine Name" column of `gem_coal_df`.
check_manual_mathc_gem_id(manual_match_coal, fuel="coal")

Maritsa Coal Mines - unit not found: Troyanovo 3 Coal Mine


### Check on gasoil

In [9]:
# Print each unit listed in `manual_match_gasoil` that does not appear in the
# "Unit name" column of `gem_gasoil_df`; no output means every unit was found.
check_manual_mathc_gem_id(manual_match_gasoil, fuel="gasoil")

## Check on bank

In [10]:
# Build the set of BOCC bank names once instead of recomputing `.unique()`
# on every loop iteration.
bocc_banks = set(bocc_df["Bank"].unique())

# Only the mapped values are checked against BOCC; the keys are not needed.
for value in manual_match_bank.values():
    if value not in bocc_banks:
        print(f"`{value}` not in BOCC Bank")

## Check on companies

In [11]:
from carbon_bombs.utils.match_company_bocc import clean
from carbon_bombs.utils.match_company_bocc import fuzz

In [12]:
# Store a cleaned version of each BOCC company name next to the original.
bocc_df["Company_cleaned"] = bocc_df["Company"].apply(clean)

# One row per distinct (original, cleaned) pair. The index is reset so that a
# position in `clean_comp` can be used directly as a `.loc` label here.
clean_comp_df = (
    bocc_df[["Company", "Company_cleaned"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Cleaned names as an array, in the same order as `clean_comp_df`.
clean_comp = clean_comp_df["Company_cleaned"].values

In [13]:
# Build the set of BOCC company names once instead of recomputing `.unique()`
# on every loop iteration.
bocc_companies = set(bocc_df["Company"].unique())

# Only the mapped values are checked against BOCC; the keys are not needed.
for value in manual_match_company.values():
    if value in bocc_companies:
        continue

    print(f"`{value}` not in BOCC Companies")

    # Clean the unmatched name once rather than once per candidate
    # (assumes `clean` returns the same result for the same input).
    cleaned_value = clean(value)
    ratio = [fuzz.ratio(x, cleaned_value) for x in clean_comp]

    # `clean_comp` follows the row order of `clean_comp_df`, whose index was
    # reset, so the argmax position is also a valid `.loc` label.
    best_id = np.argmax(ratio)
    best_match_val = clean_comp_df.loc[best_id, "Company"]
    print(f"Best fuzz score found for this company: `{best_match_val}` (score = {ratio[best_id]})")
    print()

`BPX Operating` not in BOCC Companies
Best fuzz score found for this company: `APR Operating LLC` (score = 74)

`BPX Operating` not in BOCC Companies
Best fuzz score found for this company: `APR Operating LLC` (score = 74)

`Ballard Petroleum Holding` not in BOCC Companies
Best fuzz score found for this company: `Tamar Petroleum Ltd` (score = 80)

`COG Operating` not in BOCC Companies
Best fuzz score found for this company: `PPC Operating Co LLC` (score = 74)

`China Cinda Asset Management` not in BOCC Companies
Best fuzz score found for this company: `Xinzhou Asset Management Group Co Ltd` (score = 72)

`Cimarex Energy ` not in BOCC Companies
Best fuzz score found for this company: `Crew Energy Inc` (score = 78)

`Kaiser Francis Oil Co ` not in BOCC Companies
Best fuzz score found for this company: `Sheridan Holding Co II LLC` (score = 55)

`Mewbourne Oil Company` not in BOCC Companies
Best fuzz score found for this company: `Tourmaline Oil Corp` (score = 64)



## Check lat long

In [19]:
# Left join: every manual lat/long row is kept, and rows with no matching
# (project name, country) pair in `cb_df` get missing right-hand columns.
merge_df = manual_match_lat_long.merge(
    cb_df,
    how="left",
    left_on=["Carbon_bomb_name_source_CB", "Country_source_CB"],
    right_on=["Project Name", "Country"],
)

# Number of manual lat/long rows without a counterpart in `cb_df`.
merge_df["Project Name"].isna().sum()

0

In [20]:
# Display the manual lat/long rows whose "Project Name" is missing after the
# merge, i.e. the rows that found no counterpart in `cb_df`.
merge_df.loc[merge_df["Project Name"].isna()]

Unnamed: 0,Carbon_bomb_name_source_CB,Country_source_CB,Latitude,Longitude,New_project,Project Name,Country,Potential emissions (GtCO2),Fuel
