# Import packages and load data


In [1]:
# Import packages and load data
import pandas as pd
import numpy as np
from carbon_bombs.io.rystad import load_rystad_emission_database
from carbon_bombs.io.khune_paper import load_carbon_bomb_gasoil_database
from carbon_bombs.conf import FPATH_OUT_CB
from carbon_bombs.conf import SHEETNAME_RYSTAD_CB_EMISSION
from carbon_bombs.conf import SHEETNAME_RYSTAD_GASOIL_EMISSION
from carbon_bombs.conf import SHEETNAME_RYSTAD_CB_EMISSION_INFERIOR_1GT

# Carbon bombs v1, as returned by the khune_paper loader
cb_v1 = load_carbon_bomb_gasoil_database()

# Carbon bombs v2 is loaded in two steps: one sheet for projects >= 1 Gt and
# one sheet for projects < 1 Gt; both are concatenated below.
cb_emission_superior_1gt = load_rystad_emission_database(SHEETNAME_RYSTAD_CB_EMISSION)
cb_emission_inferior_1gt = load_rystad_emission_database(
    SHEETNAME_RYSTAD_CB_EMISSION_INFERIOR_1GT
)

# In the "< 1 Gt" sheet, keep only rows whose total potential emissions are
# at most 1 GtCO2 (rows above 1 GtCO2 are discarded).
is_at_most_1gt = cb_emission_inferior_1gt["Total_potential_emissions_in_GTCO2"].le(1)
cb_emission_inferior_1gt = cb_emission_inferior_1gt.loc[is_at_most_1gt]
cb_v2 = pd.concat([cb_emission_superior_1gt, cb_emission_inferior_1gt])

# Gas & oil projects > 5 MtCO2 (v2)
gasoil_v2 = load_rystad_emission_database(SHEETNAME_RYSTAD_GASOIL_EMISSION)

cb_v2.head(10)

Unnamed: 0,Project_name,Country,Latitude,Longitude,Start_up_year,Producing_potential_emissions_in_GTCO2,Short_term_expansion_potential_emissions_in_GTCO2,Long_term_expansion_potential_emissions_in_GTCO2,Total_potential_emissions_in_GTCO2,Project_name_raw
0,Permian Midland Tight,United States,31.476845,-102.62436,2000.0,5.104154,3.797199,8.412872,17.314227,"Permian Midland Tight, US"
1,Gorgon LNG T1-T3,Australia,-19.83333,114.4667,2016.0,0.930539,0.0,0.833659,1.764198,"Gorgon LNG T1-T3, AU"
2,Goldwyer Shale,Australia,-18.1,124.1,2039.0,0.0,0.0,2.818899,2.818899,"Goldwyer Shale, AU"
3,Velkerri Shale,Australia,-16.86521,134.37621,2043.0,0.0,0.0,1.015168,1.015168,"Velkerri Shale, AU"
4,Xinjiang,China,45.619999,85.089996,1958.0,1.820849,0.0,0.065279,1.886128,"Xinjiang, CN"
5,Xi'nan-CNPC,China,29.268999,105.1497,1945.0,1.526558,0.056123,0.072138,1.654819,"Xi'nan-CNPC, CN"
6,Changqing,China,36.19635,107.6328,1971.0,3.563677,0.0,0.378083,3.941759,"Changqing, CN"
7,Daqing,China,46.620701,125.004,1960.0,2.476757,0.00261,0.090713,2.57008,"Daqing, CN"
8,Tarim,China,37.46629,77.30368,1988.0,2.174101,0.044748,0.159722,2.378571,"Tarim, CN"
9,Shengli,China,37.465446,118.58768,1961.0,1.302346,0.012745,0.106765,1.421857,"Shengli, CN"


# Cleaning actions


In [2]:
# Align the v1 country label between versions
cb_v1["Country"] = cb_v1["Country"].replace({"United Arab Emirates": "UAE"})


def _strip_country_suffix(row):
    """Return the row's project name without a trailing "_<Country>" suffix.

    Non-string project names (e.g. missing values) are returned unchanged.
    """
    name = row["Project Name"]
    if not isinstance(name, str):
        return name
    return name.removesuffix(f"_{row['Country']}")


# Clean project names that carry the country as a suffix (multi-country projects).
# This runs after the country replacement above, so the suffix is compared
# against the already-cleaned country value.
cb_v1["Project Name"] = cb_v1.apply(_strip_country_suffix, axis=1)


# Define manual matching

In [3]:
# Manual overrides used when a v1 project name does not match its v2 name.
# Each pair is (project name in v1, project name in v2).
manual_matching = dict([
    ("Tarim (CNPC)", "Tarim"),
    ("Xinjiang (CNPC)", "Xinjiang"),
    ("Ahwaz Asmari", "Ahwaz (Ahwaz Asmari)"),
    ("Ahwaz Bangestan", "Ahwaz (Bangestan)"),
    ("South Pars (Phases 4-5) dry gas", "South Pars (Phases 4-5)"),
    ("Lula (X-Tupi)", "Tupi (x-Lula)"),
    # Deliberately NOT mapped -- not the same project in the new version:
    # ("Longmaxi Shale (Sichuan/Changyu)", "Longmaxi Shale"),
    ("South Pars (Phases 2-3) dry gas", "South Pars (Phases 2-3)"),
    ("South Pars (Phases 9-10) dry gas", "South Pars (Phases 9-10)"),
    # The v2 name really contains two consecutive spaces
    ("Tanzanian Coastal Offshore", "Tanzanian Coastal  Offshore"),
])

# Manual overrides towards plain gas & oil projects (none defined so far)
manual_matching_project = dict()

# Process datasets and match carbon bombs projects between versions / Create a comparison dataframe

In [4]:
from carbon_bombs.utils.logger import LOGGER

# Purpose of this cell: build `comparison_df`, an outer join of the v1 and v2
# carbon bomb tables on a "<project name>|<country>" key, then report country
# mismatches and the manual name overrides that produced a match.

# First, create two dataframes with only the columns we want and rename them
# (.copy() so the column assignments below do not touch cb_v1 / cb_v2)
df_v1 = cb_v1[['New_project','Project Name', 'Country','Potential emissions (GtCO2)']].copy()
df_v2 = cb_v2[[
    'Project_name',
    'Country',
    'Producing_potential_emissions_in_GTCO2',
    'Short_term_expansion_potential_emissions_in_GTCO2',
    'Long_term_expansion_potential_emissions_in_GTCO2',
    'Total_potential_emissions_in_GTCO2',
    'Project_name_raw'
]].copy()


# Rename columns so every column carries a _v1 / _v2 suffix after the merge
df_v1 = df_v1.rename(columns={
    'New_project': 'project_status_v1',
    'Project Name': 'project_name_v1',
    'Country': 'country_v1',
    'Potential emissions (GtCO2)': 'total_emissions_v1',
})

df_v2 = df_v2.rename(columns={
    'Project_name': 'project_name_v2',
    'Country': 'country_v2',
    'Total_potential_emissions_in_GTCO2': 'total_emissions_v2',
    'Producing_potential_emissions_in_GTCO2': 'producing_emissions_v2',
    'Short_term_expansion_potential_emissions_in_GTCO2': 'short_term_emissions_v2',
    'Long_term_expansion_potential_emissions_in_GTCO2': 'long_term_emissions_v2',
    'Project_name_raw': 'project_name_raw_v2'
})

# Remove trailing and leading spaces from project names
# (inner whitespace is left untouched)
df_v1['project_name_v1'] = df_v1['project_name_v1'].str.strip()
df_v2['project_name_v2'] = df_v2['project_name_v2'].str.strip()

# Create a mapping series for manual matches
# NOTE(review): `manual_mapping` is not referenced again in this cell; the
# dict `manual_matching` is what is actually passed to .map() below.
manual_mapping = pd.Series(manual_matching)

# Apply manual matching to project names while keeping the original country:
# a v1 name that is a key of `manual_matching` is replaced by its v2 name,
# any other name is kept as is; the result is lower-cased for matching.
df_v1['matching_name'] = df_v1['project_name_v1'].map(manual_matching).fillna(df_v1['project_name_v1']).str.lower()
df_v2['matching_name'] = df_v2['project_name_v2'].str.lower()

# Create composite keys combining project name and country (both lower-cased)
df_v1['matching_key'] = df_v1['matching_name'] + '|' + df_v1['country_v1'].str.lower()
df_v2['matching_key'] = df_v2['matching_name'] + '|' + df_v2['country_v2'].str.lower()

# Perform outer join using the composite key.
# indicator=True adds a `_merge` column ('left_only' / 'right_only' / 'both').
# `matching_name` exists on both sides, so it comes out as
# `matching_name_x` / `matching_name_y`.
comparison_df = df_v1.merge(
    df_v2,
    on='matching_key',
    how='outer',
    indicator=True
)

# Check for country mismatches: log and collect rows present on both sides
# whose country strings differ.
# NOTE(review): the join key already contains the lower-cased country, so
# joined rows can presumably only differ by letter case here -- confirm
# whether this check is still needed.
country_mismatches = []
for idx, row in comparison_df.iterrows():
    if pd.notna(row['country_v1']) and pd.notna(row['country_v2']) and row['country_v1'] != row['country_v2']:
        LOGGER.warning(f"Country mismatch for project '{row['project_name_v1']}' -> '{row['project_name_v2']}': "
                      f"V1: {row['country_v1']} vs V2: {row['country_v2']}")
        country_mismatches.append({
            'project': row['project_name_v1'],
            'project_v2': row['project_name_v2'],
            'country_v1': row['country_v1'],
            'country_v2': row['country_v2']
        })

# Clean up the DataFrame: drop the two helper name columns created by the merge
# (errors='ignore' makes this a no-op if they are already gone)
comparison_df = comparison_df.drop(['matching_name_x', 'matching_name_y'], axis=1, errors='ignore')

# Print manual matches that were successful, i.e. v1 names listed in
# `manual_matching` whose row was found on both sides of the merge
successful_manual_matches = comparison_df[
    comparison_df['project_name_v1'].isin(manual_matching.keys()) & 
    (comparison_df['_merge'] == 'both')
]
if not successful_manual_matches.empty:
    print("\nSuccessful manual matches:")
    for _, row in successful_manual_matches.iterrows():
        print(f"V1: '{row['project_name_v1']}' -> V2: '{row['project_name_v2']}'")




Successful manual matches:
V1: 'Ahwaz Asmari' -> V2: 'Ahwaz (Ahwaz Asmari)'
V1: 'Ahwaz Bangestan' -> V2: 'Ahwaz (Bangestan)'
V1: 'South Pars (Phases 2-3) dry gas' -> V2: 'South Pars (Phases 2-3)'
V1: 'South Pars (Phases 4-5) dry gas' -> V2: 'South Pars (Phases 4-5)'
V1: 'South Pars (Phases 9-10) dry gas' -> V2: 'South Pars (Phases 9-10)'
V1: 'Tanzanian Coastal Offshore' -> V2: 'Tanzanian Coastal  Offshore'
V1: 'Tarim (CNPC)' -> V2: 'Tarim'
V1: 'Lula (X-Tupi)' -> V2: 'Tupi (x-Lula)'
V1: 'Xinjiang (CNPC)' -> V2: 'Xinjiang'


# Add matching with project > 5 MTCO2


In [5]:
# Label how each v1 project was matched:
#   - 'with no FID limit'    : matched to a v2 carbon bomb in the previous cell
#   - 'with FID before 2050' : matched to a project of the gas & oil sheet below
#   - 'no match'             : everything else
has_v1_name = comparison_df['project_name_v1'].notna()
has_v2_name = comparison_df['project_name_v2'].notna()
comparison_df['matching_method'] = np.where(
    has_v1_name & has_v2_name, 'with no FID limit', 'no match'
)

# v1 carbon bombs that found no v2 carbon bomb
unmatched_cb_v1 = comparison_df[has_v1_name & ~has_v2_name]

# Build the same "<name>|<country>" lower-cased key on the gas & oil projects
gasoil_v2['matching_key'] = gasoil_v2['Project_name'].str.lower() + '|' + gasoil_v2['Country'].str.lower()

# Left merge: every unmatched v1 row is kept; rows without a gas & oil
# counterpart simply get NaN in the gasoil columns (e.g. 'Project_name').
match_cb_v1_gasoil_projects = unmatched_cb_v1.merge(gasoil_v2, on='matching_key', how='left')

# Only rows that actually found a gas & oil project count as matched.
# Taking the keys from the whole left-merge result (as before) flagged every
# unmatched v1 project as 'with FID before 2050', even without any match.
gasoil_hits = match_cb_v1_gasoil_projects[
    match_cb_v1_gasoil_projects['Project_name'].notna()
]
matched_keys = gasoil_hits['matching_key']
matched_name_map = dict(zip(matched_keys, gasoil_hits['Project_name']))

# Update comparison df based on matched projects
mask = comparison_df['matching_key'].isin(matched_keys)
comparison_df.loc[mask, 'matching_method'] = 'with FID before 2050'
comparison_df.loc[mask, 'project_name_v2'] = comparison_df.loc[mask, 'matching_key'].map(matched_name_map)

# Add manual matching for simple gasoil projects
# (the dictionary is currently empty, so the two assignments below select no row)
manual_matching_project = {}
comparison_df.loc[comparison_df['project_name_v1'].isin(manual_matching_project.keys()), 'project_name_v2'] = comparison_df['project_name_v1'].map(manual_matching_project)
comparison_df.loc[comparison_df['project_name_v1'].isin(manual_matching_project.keys()), 'matching_method'] = 'with FID before 2050'

# Filter projects of V2 that are not Carbon Bombs (below 1GTCO2) 

In [6]:
# Rows that exist only in v2 (no v1 project name) and whose v2 total is below
# 1 GtCO2 are not carbon bombs: drop them. Rows with a missing v2 total are kept.
is_v2_only_below_threshold = (
    comparison_df['project_name_v1'].isna()
    & comparison_df['total_emissions_v2'].lt(1)
)
comparison_df = comparison_df.loc[~is_v2_only_below_threshold]

# Summary information


In [7]:
# Building blocks: presence of a project name on each side and position of the
# v2 total relative to the 1 GtCO2 threshold. Comparisons against a missing
# total are False for both `>= 1` and `< 1`.
v1_notna = comparison_df['project_name_v1'].notna()
v2_notna = comparison_df['project_name_v2'].notna()
v2_emissions = comparison_df['total_emissions_v2']
v2_emissions_valid = v2_emissions >= 1
v2_emissions_below = v2_emissions < 1

# Define the conditions (one boolean mask per line of the report)
carbon_bombs_v1 = v1_notna
carbon_bombs_v2 = v2_notna & v2_emissions_valid
matched_cb_v1_v2 = v1_notna & v2_notna & v2_emissions_valid
matched_below_threshold = v1_notna & v2_notna & v2_emissions_below
# Matched rows without a v2 total: projects matched through the gas & oil sheet
matched_simple_gasoil = v1_notna & v2_notna & v2_emissions.isna()
unmatched_v1 = v1_notna & comparison_df['project_name_v2'].isna()
new_cb_v2 = comparison_df['project_name_v1'].isna() & v2_notna

# Count the entries
count_cb_v1 = carbon_bombs_v1.sum()
count_cb_v2 = carbon_bombs_v2.sum()
count_matched_cb = matched_cb_v1_v2.sum()
count_matched_below = matched_below_threshold.sum()
count_matched_gasoil = matched_simple_gasoil.sum()
count_total_matched = count_matched_cb + count_matched_below + count_matched_gasoil
count_unmatched_v1 = unmatched_v1.sum()
count_new_cb_v2 = new_cb_v2.sum()

# Print the report
print("Report:")
print(f"- Number of carbon bombs in v1: {count_cb_v1}")
print(f"- Number of carbon bombs in v2 (≥ 1 GTCO2): {count_cb_v2}")
print(f"- Number of carbon bombs in v1 matched with a carbon bomb in v2 (≥ 1 GTCO2): {count_matched_cb}")
print(f"- Number of carbon bombs in v1 matched with a carbon bomb in v2 below 1 GTCO2 threshold: {count_matched_below}")
# "\n" (not "/n") so that the reminder is printed on its own line
print(f"- Number of carbon bombs in v1 matched with a simple gasoil project (no emissions data): {count_matched_gasoil}\n"
"Reminder : WARNING : Those project won't have a column project_name_raw_v2")
print(f"- Total number of carbon bombs in v1 matched in v2: {count_total_matched}")
print(f"- Number of carbon bombs in v1 not matched in v2: {count_unmatched_v1}")
print(f"- Number of new carbon bombs in v2 (not present in v1): {count_new_cb_v2}")


Report:
- Number of carbon bombs in v1: 195
- Number of carbon bombs in v2 (≥ 1 GTCO2): 154
- Number of carbon bombs in v1 matched with a carbon bomb in v2 (≥ 1 GTCO2): 123
- Number of carbon bombs in v1 matched with a carbon bomb in v2 below 1 GTCO2 threshold: 50
- Total number of carbon bombs in v1 matched in v2: 175
- Number of carbon bombs in v1 not matched in v2: 20
- Number of new carbon bombs in v2 (not present in v1): 31


# Calculate difference between versions (Potential emissions)

In [8]:
# Absolute change in total potential emissions between versions (v2 - v1)
emissions_delta = comparison_df['total_emissions_v2'].sub(comparison_df['total_emissions_v1'])
comparison_df['total_emissions_difference'] = emissions_delta

# Same change expressed as a percentage of the v1 value, rounded to 2 decimals
comparison_df['total_emissions_pct_difference'] = (
    emissions_delta.div(comparison_df['total_emissions_v1']).mul(100).round(2)
)

# Summary restricted to rows that have a v2 project name
print("\nEmissions and Location Differences Summary:")
matched_projects = comparison_df[comparison_df['project_name_v2'].notna()]

absolute_delta = matched_projects['total_emissions_difference'].abs()
print("\nEmissions differences (for matched projects):")
print(f"Mean absolute difference: {absolute_delta.mean():.2f} GTCO2")
print(f"Max absolute difference: {absolute_delta.max():.2f} GTCO2")

# Projects whose emissions moved by more than 10% in either direction
exceeds_10_pct = matched_projects['total_emissions_pct_difference'].abs().gt(10)
projects_big_emission_diff = matched_projects[exceeds_10_pct]
print(f"Projects with >10% difference: {len(projects_big_emission_diff)}")
if not projects_big_emission_diff.empty:
    print("Projects:")
    for name_v1, pct_change, total_v1, total_v2 in zip(
        projects_big_emission_diff['project_name_v1'],
        projects_big_emission_diff['total_emissions_pct_difference'],
        projects_big_emission_diff['total_emissions_v1'],
        projects_big_emission_diff['total_emissions_v2'],
    ):
        print(f"- {name_v1}: {pct_change:.1f}% difference "
              f"({total_v1:.2f} -> {total_v2:.2f} GTCO2)")



Emissions and Location Differences Summary:

Emissions differences (for matched projects):
Mean absolute difference: 1.00 GTCO2
Max absolute difference: 8.49 GTCO2
Projects with >10% difference: 127
Projects:
- Abqaiq: 72.2% difference (1.59 -> 2.74 GTCO2)
- ACG (Azeri-Chirag-Guneshli Deep Water): -38.3% difference (1.66 -> 1.02 GTCO2)
- Agha Jari: 28.5% difference (1.56 -> 2.01 GTCO2)
- Ahwaz Asmari: -28.1% difference (2.24 -> 1.61 GTCO2)
- Ahwaz Bangestan: 14.3% difference (1.43 -> 1.64 GTCO2)
- Al Khaleej Gas project: -13.2% difference (1.14 -> 0.99 GTCO2)
- Anadarko Shelf_Oklahoma: -79.0% difference (1.82 -> 0.38 GTCO2)
- Area 1 LNG (T1&T2): -26.5% difference (1.01 -> 0.74 GTCO2)
- Asab: 25.8% difference (1.37 -> 1.73 GTCO2)
- Athabasca Oil Sands Project: -20.7% difference (1.36 -> 1.08 GTCO2)
- Azadegan: -80.2% difference (2.28 -> 0.45 GTCO2)
- Bab: -21.5% difference (4.20 -> 3.29 GTCO2)
- Barnett Shale: -17.6% difference (1.36 -> 1.12 GTCO2)
- Basrah Gas project: -16.9% differen

# Data processing for file output comparison : 
1. Add the information from Kjell's file
2. Project status in v2 = concatenation of the statuses (Producing, Short term, Long term) for which the project's emissions (per status) are non-zero.
3. Status evolution v1/v2: [no longer a CB, a new CB, unchanged]
4. A column flagging whether the status was "not started" in Kjell's paper and there are now emissions in the production phase (meaning the project has started); column name: "has_started_since_v1"


In [9]:
# 2. Project status in v2 = concatenation of the statuses (producing, short term,
#    long term) whose per-status emissions are strictly positive.
def determine_project_status(row):
    """Build the v2 status label of one comparison row.

    Args:
        row: mapping-like object (e.g. a DataFrame row) exposing the keys
            'producing_emissions_v2', 'short_term_emissions_v2' and
            'long_term_emissions_v2'.

    Returns:
        The labels of every phase whose emissions are > 0, joined with ', ',
        in the order producing, short_term_expansion, long_term_expansion.
        An empty string when no phase qualifies (NaN never compares > 0).
    """
    phase_labels = (
        ('producing_emissions_v2', 'producing'),
        ('short_term_emissions_v2', 'short_term_expansion'),
        ('long_term_emissions_v2', 'long_term_expansion'),
    )
    return ', '.join(label for column, label in phase_labels if row[column] > 0)

# Row-wise v2 status label
comparison_df['project_status_v2'] = comparison_df.apply(determine_project_status, axis=1)

# 3. Define carbon bombs status changes
in_v1 = comparison_df['project_name_v1'].notna()
in_v2 = comparison_df['project_name_v2'].notna()
v2_total = comparison_df['total_emissions_v2']

# Ordered (condition, label) pairs: np.select keeps the label of the FIRST
# condition that holds, so the last pair only catches rows present in both
# versions whose v2 total is missing (neither < 1 nor >= 1).
status_rules = [
    (in_v1 & ~in_v2, "No longer classified as carbon bomb"),
    (~in_v1 & in_v2, "Newly identified carbon bomb"),
    (in_v1 & in_v2 & v2_total.lt(1), "Now below 1 GtCO2 threshold"),
    (in_v1 & in_v2 & v2_total.ge(1), "Still classified as carbon bomb"),
    (in_v1 & in_v2, "Now classified as new oil and gas expansion project"),
]
conditions = [condition for condition, _ in status_rules]
choices = [label for _, label in status_rules]

comparison_df['new_carbon_bombs_status'] = np.select(
    conditions, choices, default="Status unknown"
)

# 4. Flag has_started_since_v1: v1 status equal to "not started" and strictly
#    positive producing emissions in v2 (missing values count as "" / 0)
was_not_started = comparison_df['project_status_v1'].fillna("").eq("not started")
is_producing_now = comparison_df['producing_emissions_v2'].fillna(0).gt(0)
comparison_df['has_started_since_v1'] = np.where(
    was_not_started & is_producing_now, True, False
)

# Add GEM link

In [10]:
# Purpose of this cell: copy the GEM url of each v1 carbon bomb from the
# previously generated output file (FPATH_OUT_CB) into `comparison_df`.
# There is no shared key, so both tables are sorted by their emission column
# and joined by position.

# Carbon Bombs v1 output
from carbon_bombs.conf import FPATH_OUT_CB
cb_v1_with_gem_link = pd.read_csv(FPATH_OUT_CB)

# Filter Oil&Gas projects
cb_v1_with_gem_link = cb_v1_with_gem_link.loc[
    cb_v1_with_gem_link.Fuel_type_source_CB == 'Oil&Gas'
]

# Keep only the rows that have a v1 project name
comparison_df_with_only_v1 = comparison_df[comparison_df['project_name_v1'].notnull()].copy()

# Sort both DataFrames by their emission column.
# reset_index() (without drop) keeps the original row labels in an 'index'
# column; they are restored with set_index('index') further down.
comparison_df_sorted = comparison_df_with_only_v1.sort_values('total_emissions_v1').reset_index()
cb_sorted = cb_v1_with_gem_link.sort_values('Potential_GtCO2_source_CB').reset_index(drop=True)

# Security checks: same number of rows, and no tie in the sort column
# (a tie would make the positional pairing ambiguous).
# NOTE(review): nothing verifies that the sorted emission values of the two
# tables actually agree row by row -- the pairing presumably relies on both
# files carrying the same v1 emission figures; confirm.
if len(comparison_df_sorted) != len(cb_sorted):
    raise ValueError("Mismatch in row counts between filtered comparison_df and cb_v1_with_gem_link.")
if comparison_df_sorted['total_emissions_v1'].duplicated().any():
    raise ValueError("Duplicated values found in total_emissions_v1. Join by index may be unreliable.")

# Perform index-based join (.values drops the index, so the assignment is positional)
comparison_df_sorted['GEM_url'] = cb_sorted['GEM_url_source_GEM'].values

# Reassign the GEM_url values back to comparison_df_with_only_v1 using the original index
comparison_df_with_only_v1 = comparison_df_sorted.set_index('index')
comparison_df_with_only_v1.index.name = None  # Clean index name if needed

# Update original comparison_df: start from an empty GEM_url column, then let
# DataFrame.update fill it for the row labels present in the v1-only frame
comparison_df = comparison_df.copy()
comparison_df['GEM_url'] = None
comparison_df.update(comparison_df_with_only_v1[['GEM_url']])




# Clean dataframe and save result 

In [11]:
# Drop the merge indicator column, which is not part of the exported file
comparison_df = comparison_df.drop(columns='_merge')

# Order rows by v1 name, then v2 name; rows with a missing name go last
comparison_df = comparison_df.sort_values(
    ['project_name_v1', 'project_name_v2'],
    na_position='last',
)

# Exported columns, grouped by origin: v1 fields, v2 fields, derived fields
v1_columns = [
    'project_name_v1',
    'country_v1',
    'project_status_v1',
    'total_emissions_v1',
]
v2_columns = [
    'project_name_v2',
    'country_v2',
    'project_status_v2',
    'producing_emissions_v2',
    'short_term_emissions_v2',
    'long_term_emissions_v2',
    'total_emissions_v2',
    'project_name_raw_v2',
]
derived_columns = [
    'total_emissions_difference',
    'new_carbon_bombs_status',
    'has_started_since_v1',
    'GEM_url',
]
ordered_columns = v1_columns + v2_columns + derived_columns

# Keep only the exported columns, in that order
comparison_df = comparison_df.loc[:, ordered_columns]

# Save to CSV (semicolon-separated, without the index), relative to the
# current working directory
comparison_df.to_csv('carbon_bombs_comparison.csv', sep=';', index=False)

In [12]:
# Check units for producing_emissions, short_term_emissions & long_term_emissions
# After A.Bogaert message https://data-for-good.slack.com/archives/C08C639D8HM/p1751271580413059

In [13]:
# Step 1: sum of the three per-phase v2 emission columns (columns H, I and J
# of the exported file, given the column order used when saving)
producing = comparison_df['producing_emissions_v2']
short_term = comparison_df['short_term_emissions_v2']
long_term = comparison_df['long_term_emissions_v2']
comparison_df['sum_HIJ'] = producing.add(short_term).add(long_term)

# Step 2: ratio of that sum (H+I+J) to the v2 total (column K)
comparison_df['ratio_vs_total'] = comparison_df['sum_HIJ'].div(comparison_df['total_emissions_v2'])

# Step 3: a ratio above 1 means the phases add up to more than the total
# (possible unit mismatch)
has_ratio_above_one = comparison_df['ratio_vs_total'].gt(1)
potential_unit_issues = comparison_df.loc[has_ratio_above_one]

# Step 4: print the v2 names of the flagged projects
print("\nList of concerned projects (project_name_v2):")
print(potential_unit_issues['project_name_v2'].tolist())


List of concerned projects (project_name_v2):
['Al Khaleej Gas project', 'Anadarko Shelf_Oklahoma', 'Area 1 LNG (T1&T2)', 'Area-1 Future Phases', 'Azadegan', 'Basrah Gas project', 'Beaufort Sea Offshore', 'Bowland Shale', 'Cambrian/Silurian Marine Shale', 'Campos Offshore', 'Central Arabian Offshore', 'Central Arabian Onshore', 'Central Arabian Onshore', 'Chukchi Sea Offshore', 'Dolphin', 'Dovletabad-Donmez', 'El Sharara', 'Gazprom dobycha Nadym', 'Ghawar Ain Dar N', 'Greater Liza (Liza)', 'Gulf Deepwater Offshore', 'Halfayah', 'Hassi Messaoud', 'Johan Sverdrup', 'Karachaganak', 'Kish Gas Project', 'Kuznetsk Depression (Kuzbass) CBM', 'La Luna Shale', 'Lensky Basin CBM', 'Leviathan', 'Libra', 'MZLNG Joint Development (T1-T2)', 'NLNG Base Project', 'North Kara Sea Offshore', 'North Slope Onshore', 'Oil shale China', 'Parnaiba Onshore', 'Pazanan', 'Ratawi', 'Safaniya YTF Concession', 'South Pars (Phases 2-3)', 'South Pars (Phases 20-21)', 'South Pars (Phases 9-10)', 'Sudair Shale', 'Syn