# Import packages and load data


In [1]:
# Import packages and load data
import pandas as pd
import numpy as np
from carbon_bombs.io.rystad import load_rystad_cb_emission_database
from carbon_bombs.io.rystad import load_rystad_gasoil_emission_database
from carbon_bombs.io.khune_paper import load_carbon_bomb_gasoil_database
from carbon_bombs.conf import FPATH_OUT_CB

# Carbon Bombs v1
cb_v1 = load_carbon_bomb_gasoil_database()

# Carbon Bombs v2
cb_v2 = load_rystad_cb_emission_database()

# Gasoil Project > 5 MTCO2 v2 
gasoil_v2 = load_rystad_gasoil_emission_database()

# Carbon Bombs v1 with GEM link




# Cleaning actions


In [2]:
# Clean country names between versions
cb_v1["Country"] = cb_v1["Country"].replace({"United Arab Emirates": "UAE"})
# Clean project names when multiple country
cb_v1["Project Name"] = cb_v1.apply(
    lambda row: row["Project Name"].removesuffix(f"_{row['Country']}") if isinstance(row["Project Name"], str) else row["Project Name"],
    axis=1
)


# Define manual matching

In [3]:
# Define manual matching dictionary
manual_matching = {
    "Tarim (CNPC)": "Tarim",
    "Xinjiang (CNPC)": "Xinjiang",
    "Ahwaz Asmari":"Ahwaz (Ahwaz Asmari)",
    "Ahwaz Bangestan":"Ahwaz (Bangestan)",
    "Austin Chalk Tight":"Austin Chalk Horizontal", # A vérifier 
    "Greater Burgan": "Greater Burgan (CR in field)",
    "Hassi R'Mel (Domestic)": "Hassi R'Mel (Pipe Export)",
    "West Siberia Onshore" : "Lukoil-West Siberia",
    "South Pars (Phases 4-5) dry gas": "South Pars (Phases 4-5)",
    "South Pars (Phases 22-24)":"South Pars (Phases 23-28)",
    # 2 project can be match for Yamal in v1 : Bovanenkovo Zone or Tambei Zone, we keep the southern zone for Tambei
    "Tambey Zone (Yamal Megaproject)" : "Southern Zone (Yamal Megaproject)", # A vérifier
    "Greater Turbot (Stabroek)" : "Stabroek Future Phases",
    # Only one project in tanzania for both version, we decide to match them manually
    "Tanzanian Coastal Offshore":"Tanzania LNG (Block 1/4 and 2)", # A vérifier
    "Lula (X-Tupi)":"Tupi (x-Lula)",
    "Longmaxi Shale (Sichuan/Changyu)": "Yaan Slope Onshore Sichuan Province", # A vérifier
    "South Pars (Phases 2-3) dry gas":"South Pars (Phases 2-3)",
    "South Pars (Phases 9-10) dry gas":"South Pars (Phases 9-10)"
}

# Process datasets and match carbon bombs projects between versions / Create a comparison dataframe

In [8]:
from carbon_bombs.utils.logger import LOGGER

# First, create two dataframes with only the columns we want and rename them
df_v1 = cb_v1[['New_project','Project Name', 'Country','Potential emissions (GtCO2)']].copy()
df_v2 = cb_v2[[
    'Project_name',
    'Country',
    'Potential_GtCO2_producing',
    'Potential_GtCO2_short_term_expansion',
    'Potential_GtCO2_long_term_expansion',
    'Potential_GtCO2_total'
]].copy()


# Rename columns
df_v1 = df_v1.rename(columns={
    'New_project': 'project_status_v1',
    'Project Name': 'project_name_v1',
    'Country': 'country_v1',
    'Potential emissions (GtCO2)': 'total_emissions_v1',
})

df_v2 = df_v2.rename(columns={
    'Project_name': 'project_name_v2',
    'Country': 'country_v2',
    'Potential_GtCO2_total': 'total_emissions_v2',
    'Potential_GtCO2_producing': 'producing_emissions_v2',
    'Potential_GtCO2_short_term_expansion': 'short_term_emissions_v2',
    'Potential_GtCO2_long_term_expansion': 'long_term_emissions_v2'
})

# Remove trailing and leading spaces from project names
df_v1['project_name_v1'] = df_v1['project_name_v1'].str.strip()
df_v2['project_name_v2'] = df_v2['project_name_v2'].str.strip()

# Create a mapping series for manual matches
manual_mapping = pd.Series(manual_matching)

# Apply manual matching to project names while keeping the original country
df_v1['matching_name'] = df_v1['project_name_v1'].map(manual_matching).fillna(df_v1['project_name_v1']).str.lower()
df_v2['matching_name'] = df_v2['project_name_v2'].str.lower()

# Create composite keys combining project name and country
df_v1['matching_key'] = df_v1['matching_name'] + '|' + df_v1['country_v1'].str.lower()
df_v2['matching_key'] = df_v2['matching_name'] + '|' + df_v2['country_v2'].str.lower()

# Perform outer join using the composite key
comparison_df = df_v1.merge(
    df_v2,
    on='matching_key',
    how='outer',
    indicator=True
)

# Check for country mismatches
country_mismatches = []
for idx, row in comparison_df.iterrows():
    if pd.notna(row['country_v1']) and pd.notna(row['country_v2']) and row['country_v1'] != row['country_v2']:
        LOGGER.warning(f"Country mismatch for project '{row['project_name_v1']}' -> '{row['project_name_v2']}': "
                      f"V1: {row['country_v1']} vs V2: {row['country_v2']}")
        country_mismatches.append({
            'project': row['project_name_v1'],
            'project_v2': row['project_name_v2'],
            'country_v1': row['country_v1'],
            'country_v2': row['country_v2']
        })

# Clean up the DataFrame
comparison_df = comparison_df.drop(['matching_name_x', 'matching_name_y'], axis=1, errors='ignore')

# Print manual matches that were successful
successful_manual_matches = comparison_df[
    comparison_df['project_name_v1'].isin(manual_matching.keys()) & 
    (comparison_df['_merge'] == 'both')
]
if not successful_manual_matches.empty:
    print("\nSuccessful manual matches:")
    for _, row in successful_manual_matches.iterrows():
        print(f"V1: '{row['project_name_v1']}' -> V2: '{row['project_name_v2']}'")




Successful manual matches:
V1: 'Ahwaz Asmari' -> V2: 'Ahwaz (Ahwaz Asmari)'
V1: 'Ahwaz Bangestan' -> V2: 'Ahwaz (Bangestan)'
V1: 'Austin Chalk Tight' -> V2: 'Austin Chalk Horizontal'
V1: 'Greater Burgan' -> V2: 'Greater Burgan (CR in field)'
V1: 'Hassi R'Mel (Domestic)' -> V2: 'Hassi R'Mel (Pipe Export)'
V1: 'West Siberia Onshore' -> V2: 'Lukoil-West Siberia'
V1: 'South Pars (Phases 2-3) dry gas' -> V2: 'South Pars (Phases 2-3)'
V1: 'South Pars (Phases 22-24)' -> V2: 'South Pars (Phases 23-28)'
V1: 'South Pars (Phases 4-5) dry gas' -> V2: 'South Pars (Phases 4-5)'
V1: 'South Pars (Phases 9-10) dry gas' -> V2: 'South Pars (Phases 9-10)'
V1: 'Tambey Zone (Yamal Megaproject)' -> V2: 'Southern Zone (Yamal Megaproject)'
V1: 'Greater Turbot (Stabroek)' -> V2: 'Stabroek Future Phases'
V1: 'Tanzanian Coastal Offshore' -> V2: 'Tanzania LNG (Block 1/4 and 2)'
V1: 'Tarim (CNPC)' -> V2: 'Tarim'
V1: 'Lula (X-Tupi)' -> V2: 'Tupi (x-Lula)'
V1: 'Xinjiang (CNPC)' -> V2: 'Xinjiang'
V1: 'Longmaxi Shale 

# Add matching with project > 5 MTCO2


In [9]:
# Add distinction between methods (no match, with FID before 2050 or with no FID limit)
# First, add a column matching_method that contains with no FID limit when there is a match
comparison_df['matching_method'] = comparison_df.apply(
    lambda row: 'with no FID limit' if pd.notnull(row['project_name_v1']) and pd.notnull(row['project_name_v2']) 
    else 'no match',
    axis=1
)

# Filter rows from cb v1 where there is a no match
unmatched_cb_v1 = comparison_df[
    comparison_df['project_name_v1'].notnull() &
    comparison_df['project_name_v2'].isnull()
]

# Create a column matching_key in gasoil_v2 
gasoil_v2['matching_key'] = gasoil_v2['Project_name'].str.lower() + '|' + gasoil_v2['Country'].str.lower()

# Perform merge
match_cb_v1_gasoil_projects = unmatched_cb_v1.merge(gasoil_v2, on='matching_key', how='left')

# Update comparison df based on matched projects
matched_keys = match_cb_v1_gasoil_projects['matching_key']
matched_name_map = dict(zip(matched_keys, match_cb_v1_gasoil_projects['Project_name']))
mask = comparison_df['matching_key'].isin(matched_keys)
comparison_df.loc[mask, 'matching_method'] = 'with FID before 2050'
comparison_df.loc[mask, 'project_name_v2'] = comparison_df.loc[mask, 'matching_key'].map(matched_name_map)

# Summary informations 


In [10]:
# Sample logic from DataFrame
v1_notna = comparison_df['project_name_v1'].notna()
v2_notna = comparison_df['project_name_v2'].notna()
v2_emissions = comparison_df['total_emissions_v2']
v2_emissions_valid = v2_emissions >= 1
v2_emissions_below = v2_emissions < 1

# Define the conditions
carbon_bombs_v1 = v1_notna
carbon_bombs_v2 = v2_notna & v2_emissions_valid
matched_cb_v1_v2 = v1_notna & v2_notna & v2_emissions_valid
matched_below_threshold = v1_notna & v2_notna & v2_emissions_below
matched_simple_gasoil = v1_notna & v2_notna & v2_emissions.isna()
unmatched_v1 = v1_notna & comparison_df['project_name_v2'].isna()
new_cb_v2 = comparison_df['project_name_v1'].isna() & v2_notna

# Count the entries
count_cb_v1 = carbon_bombs_v1.sum()
count_cb_v2 = carbon_bombs_v2.sum()
count_matched_cb = matched_cb_v1_v2.sum()
count_matched_below = matched_below_threshold.sum()
count_matched_gasoil = matched_simple_gasoil.sum()
count_total_matched = count_matched_cb + count_matched_below + count_matched_gasoil
count_unmatched_v1 = unmatched_v1.sum()
count_new_cb_v2 = new_cb_v2.sum()

# Print the report
print(f"Report:")
print(f"- Number of carbon bombs in v1: {count_cb_v1}")
print(f"- Number of carbon bombs in v2 (≥ 1 GTCO2): {count_cb_v2}")
print(f"- Number of carbon bombs in v1 matched with a carbon bomb in v2 (≥ 1 GTCO2): {count_matched_cb}")
print(f"- Number of carbon bombs in v1 matched with a carbon bomb in v2 below 1 GTCO2 threshold: {count_matched_below}")
print(f"- Number of carbon bombs in v1 matched with a simple gasoil project (no emissions data): {count_matched_gasoil}")
print(f"- Total number of carbon bombs in v1 matched in v2: {count_total_matched}")
print(f"- Number of carbon bombs in v1 not matched in v2: {count_unmatched_v1}")
print(f"- Number of new carbon bombs in v2 (not present in v1): {count_new_cb_v2}")


Report:
- Number of carbon bombs in v1: 195
- Number of carbon bombs in v2 (≥ 1 GTCO2): 154
- Number of carbon bombs in v1 matched with a carbon bomb in v2 (≥ 1 GTCO2): 129
- Number of carbon bombs in v1 matched with a carbon bomb in v2 below 1 GTCO2 threshold: 15
- Number of carbon bombs in v1 matched with a simple gasoil project (no emissions data): 11
- Total number of carbon bombs in v1 matched in v2: 155
- Number of carbon bombs in v1 not matched in v2: 40
- Number of new carbon bombs in v2 (not present in v1): 52


# Calculate difference between versions (Potential emissions)

In [11]:
# Calculate emissions difference (V2 - V1)
comparison_df['total_emissions_difference'] = comparison_df['total_emissions_v2'] - comparison_df['total_emissions_v1']

# Add percentage difference for emissions
comparison_df['total_emissions_pct_difference'] = (
    (comparison_df['total_emissions_v2'] - comparison_df['total_emissions_v1']) / 
    comparison_df['total_emissions_v1'] * 100
).round(2)


# Print summary of differences
print("\nEmissions and Location Differences Summary:")
matched_projects = comparison_df[comparison_df['project_name_v2'].notna()]

print(f"\nEmissions differences (for matched projects):")
print(f"Mean absolute difference: {matched_projects['total_emissions_difference'].abs().mean():.2f} GTCO2")
print(f"Max absolute difference: {matched_projects['total_emissions_difference'].abs().max():.2f} GTCO2")

# Projects with significant emissions differences
projects_big_emission_diff = matched_projects[matched_projects['total_emissions_pct_difference'].abs() > 10]
print(f"Projects with >10% difference: {len(projects_big_emission_diff)}")
if not projects_big_emission_diff.empty:
    print("Projects:")
    for _, row in projects_big_emission_diff.iterrows():
        print(f"- {row['project_name_v1']}: {row['total_emissions_pct_difference']:.1f}% difference "
              f"({row['total_emissions_v1']:.2f} -> {row['total_emissions_v2']:.2f} GTCO2)")



Emissions and Location Differences Summary:

Emissions differences (for matched projects):
Mean absolute difference: 0.99 GTCO2
Max absolute difference: 8.47 GTCO2
Projects with >10% difference: 99
Projects:
- Abqaiq: 72.2% difference (1.59 -> 2.74 GTCO2)
- ACG (Azeri-Chirag-Guneshli Deep Water): -38.3% difference (1.66 -> 1.02 GTCO2)
- Agha Jari: 28.5% difference (1.56 -> 2.01 GTCO2)
- Ahwaz Asmari: -28.1% difference (2.24 -> 1.61 GTCO2)
- Ahwaz Bangestan: 14.3% difference (1.43 -> 1.64 GTCO2)
- Al Khaleej Gas project: -13.2% difference (1.14 -> 0.99 GTCO2)
- Asab: 25.8% difference (1.37 -> 1.73 GTCO2)
- Athabasca Oil Sands Project: -20.7% difference (1.36 -> 1.08 GTCO2)
- Austin Chalk Tight: 75.2% difference (1.39 -> 2.44 GTCO2)
- Bab: -21.5% difference (4.20 -> 3.29 GTCO2)
- Barnett Shale: -17.6% difference (1.36 -> 1.12 GTCO2)
- Basrah Gas project: -16.9% difference (1.04 -> 0.87 GTCO2)
- Bovanenkovo Zone (Yamal Megaproject): -42.0% difference (11.16 -> 6.48 GTCO2)
- Bowland Shale

# Calculate difference between versions (Coordinates) --> Deprecated

In [8]:
# from geopy.distance import geodesic

# # Calculate distance between coordinates using geopy
# def calculate_distance(row):
#     """Calculate the distance between two points in kilometers using geopy"""
#     if pd.isna(row['latitude_v1']) or pd.isna(row['longitude_v1']) or pd.isna(row['latitude_v2']) or pd.isna(row['longitude_v2']):
#         return None
    
#     coords_1 = (row['latitude_v1'], row['longitude_v1'])
#     coords_2 = (row['latitude_v2'], row['longitude_v2'])
    
#     return geodesic(coords_1, coords_2).kilometers
# # Calculate distance difference (V2 - V1)
# comparison_df['distance_km'] = comparison_df.apply(calculate_distance, axis=1)

# print(f"\nLocation differences (for matched projects):")
# print(f"Mean distance between points: {matched_projects['distance_km'].mean():.2f} km")
# print(f"Max distance between points: {matched_projects['distance_km'].max():.2f} km")

# # Projects with significant location differences
# projects_big_location_diff = matched_projects[matched_projects['distance_km'] > 10]
# print(f"Projects with >10km difference: {len(projects_big_location_diff)}")
# if not projects_big_location_diff.empty:
#     print("Projects:")
#     for _, row in projects_big_location_diff.iterrows():
#         print(f"- {row['project_name_v1']}: {row['distance_km']:.1f} km difference")
#         print(f"  V1 coordinates: ({row['latitude_v1']:.4f}, {row['longitude_v1']:.4f})")
#         print(f"  V2 coordinates: ({row['latitude_v2']:.4f}, {row['longitude_v2']:.4f})")

# Data processing for file output comparison : 
1. Ajout des informations du fichier de Kjell 
2. Status du projet dans la v2 = concatenation des statut (Producing, Short term, Long term) lorsque les emissions du projet (par status) ne sont pas nul.
3. Status evolution v1/v2 : [plus une CB, une nouvelle CB, resté la meme chose]
4. une colonne qui flag si le statut était not started dans le papier de Kjell et qu’il y a maintenant des émissions dans la phase production (ce qui veut dire que le projet a commencé) nom : "has_started_since_v1"


In [12]:
# 2. Statut projet v2 = concatenation des statut (Producing, Short term, Long term) lorsque les emissions du projet (par status) ne sont pas nul.
def determine_project_status(row):
    status = []
    if row['producing_emissions_v2'] > 0:
        status.append('producing')
    if row['short_term_emissions_v2'] > 0:
        status.append('short_term_expansion')
    if row['long_term_emissions_v2'] > 0:
        status.append('long_term_expansion')
    return ', '.join(status)

comparison_df['project_status_v2'] = comparison_df.apply(determine_project_status, axis=1)

# 3. Define carbon bombs status changes
conditions = [
    comparison_df['project_name_v1'].notna() & comparison_df['project_name_v2'].isna(),
    comparison_df['project_name_v1'].isna() & comparison_df['project_name_v2'].notna(),
    comparison_df['project_name_v1'].notna() & comparison_df['project_name_v2'].notna() & (comparison_df['total_emissions_v2'] < 1),
    comparison_df['project_name_v1'].notna() & comparison_df['project_name_v2'].notna() & (comparison_df['total_emissions_v2'] >= 1),
    comparison_df['project_name_v1'].notna() & comparison_df['project_name_v2'].notna()
]

choices = [
    "No longer classified as carbon bomb",
    "Newly identified carbon bomb",
    "Now below 1 GtCO2 threshold",
    "Still classified as carbon bomb",
    "Now classified as simple gasoil project"
]

comparison_df['new_carbon_bombs_status'] = np.select(
    conditions, choices, default="Status unknown"
)

# 4. Flag has_started_since_v1
comparison_df['has_started_since_v1'] = np.where(
    (comparison_df['project_status_v1'].fillna("") == "not started") &
    (comparison_df['producing_emissions_v2'].fillna(0) > 0),
    True, False
)

# Add GEM link

In [13]:
# Carbon Bombs v1 output
from carbon_bombs.conf import FPATH_OUT_CB
cb_v1_with_gem_link = pd.read_csv(FPATH_OUT_CB)

# Filter Oil&Gas projects
cb_v1_with_gem_link = cb_v1_with_gem_link.loc[
    cb_v1_with_gem_link.Fuel_type_source_CB == 'Oil&Gas'
]

# Filter out rows where project_name_v1 is not null
comparison_df_with_only_v1 = comparison_df[comparison_df['project_name_v1'].notnull()].copy()

# Sort both DataFrames
comparison_df_sorted = comparison_df_with_only_v1.sort_values('total_emissions_v1').reset_index()
cb_sorted = cb_v1_with_gem_link.sort_values('Potential_GtCO2_source_CB').reset_index(drop=True)

# Security checks
if len(comparison_df_sorted) != len(cb_sorted):
    raise ValueError("Mismatch in row counts between filtered comparison_df and cb_v1_with_gem_link.")
if comparison_df_sorted['total_emissions_v1'].duplicated().any():
    raise ValueError("Duplicated values found in total_emissions_v1. Join by index may be unreliable.")

# Perform index-based join
comparison_df_sorted['GEM_url'] = cb_sorted['GEM_url_source_GEM'].values

# Reassign the GEM_url values back to comparison_df_with_only_v1 using the original index
comparison_df_with_only_v1 = comparison_df_sorted.set_index('index')
comparison_df_with_only_v1.index.name = None  # Clean index name if needed

# Update original comparison_df
comparison_df = comparison_df.copy()
comparison_df['GEM_url'] = None
comparison_df.update(comparison_df_with_only_v1[['GEM_url']])




In [11]:
comparison_df_with_only_v1.head(50)

Unnamed: 0,project_status_v1,project_name_v1,country_v1,total_emissions_v1,matching_key,project_name_v2,country_v2,producing_emissions_v2,short_term_emissions_v2,long_term_emissions_v2,total_emissions_v2,_merge,matching_method,total_emissions_difference,total_emissions_pct_difference,project_status_v2,new_carbon_bombs_status,has_started_since_v1,GEM_url
82,operating,Greater Liza (Liza),Guyana,1.002784,greater liza (liza)|guyana,Greater Liza (Liza),,,,,,left_only,with FID before 2050,,,,Now classified as simple gasoil project,False,No informations available on GEM
10,not started,Area 1 LNG (T1&T2),Mozambique,1.006655,area 1 lng (t1&t2)|mozambique,,,,,,,left_only,with FID before 2050,,,,No longer classified as carbon bomb,False,https://www.gem.wiki/Area_1_Gas_Block_(Mozambi...
59,operating,El Sharara,Libya,1.007846,el sharara|libya,,,,,,,left_only,with FID before 2050,,,,No longer classified as carbon bomb,False,https://www.gem.wiki/El_Sharara_Oil_Field_(Libya)
191,operating,South Pars (Phases 2-3) dry gas,Iran,1.016274,south pars (phases 2-3)|iran,South Pars (Phases 2-3),Iran,0.957057,0.0,0.0,0.957057,both,with no FID limit,-0.059218,-5.83,producing,Now below 1 GtCO2 threshold,False,https://www.gem.wiki/South_Pars_(Phase_2-3)_Ga...
192,operating,South Pars (Phases 20-21),Iran,1.017173,south pars (phases 20-21)|iran,South Pars (Phases 20-21),Iran,0.842407,0.0,0.0,0.842407,both,with no FID limit,-0.174766,-17.18,producing,Now below 1 GtCO2 threshold,False,https://www.gem.wiki/South_Pars_(Phase_20-21)_...
12,not started,Area-1 Future Phases,Mozambique,1.022724,area-1 future phases|mozambique,Area-1 Future Phases,Mozambique,0.0,0.0,0.9980631,0.998063,both,with no FID limit,-0.024661,-2.41,long_term_expansion,Now below 1 GtCO2 threshold,False,https://www.gem.wiki/Area_1_Gas_Block_(Mozambi...
239,not started,Yucatan Platform Offshore,Mexico,1.025798,yucatan platform offshore|mexico,,,,,,,left_only,with FID before 2050,,,,No longer classified as carbon bomb,False,https://www.gem.wiki/Akal_Oil_and_Gas_Field_(M...
87,operating,Harmaliyah,Saudi Arabia,1.028096,harmaliyah|saudi arabia,Harmaliyah,Saudi Arabia,0.789263,0.0,0.292545,1.081808,both,with no FID limit,0.053712,5.22,"producing, long_term_expansion",Still classified as carbon bomb,False,No informations available on GEM
205,not started,Tanzanian Coastal Offshore,Tanzania,1.031198,tanzania lng (block 1/4 and 2)|tanzania,Tanzania LNG (Block 1/4 and 2),Tanzania,0.0,0.0,1.137953,1.137953,both,with no FID limit,0.106755,10.35,long_term_expansion,Still classified as carbon bomb,False,No informations available on GEM
201,not started,Sudair Shale,Saudi Arabia,1.032893,sudair shale|saudi arabia,,,,,,,left_only,with FID before 2050,,,,No longer classified as carbon bomb,False,No informations available on GEM


# Clean dataframe and save result 

In [12]:
# Clean up final DataFrame
comparison_df = comparison_df.drop('_merge', axis=1)

# Sort the DataFrame to group matched and unmatched projects
comparison_df = comparison_df.sort_values(
    by=['project_name_v1', 'project_name_v2'],
    na_position='last'
)

# Reorder columns
ordered_columns = [
    'project_name_v1',
    'country_v1',
    'project_status_v1',
    'total_emissions_v1',
    'project_name_v2',
    'country_v2',
    'project_status_v2',
    'producing_emissions_v2',
    'short_term_emissions_v2',
    'long_term_emissions_v2',
    'total_emissions_v2',
    'total_emissions_difference',
    'new_carbon_bombs_status',
    'has_started_since_v1',
    'GEM_url'
]

# Reorder columns and save to CSV
comparison_df = comparison_df[ordered_columns]

# Save to CSV
comparison_df.to_csv('carbon_bombs_comparison.csv', index=False, sep=';')

# Graphs


In [13]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Only keep matched projects (those with rows'project_name_v1' and 'project_name_v2' columns not empty)
# filtered_df = comparison_df[comparison_df['project_name_v1'].notna() & comparison_df['project_name_v2'].notna()]
# # filtered_df = filtered_df[
# #     (filtered_df['emissions_difference'] >= -2.5) & 
# #     (filtered_df['emissions_difference'] <= 2.5)
# # ]


# # Create the histogram
# sns.histplot(filtered_df['emissions_difference'].dropna(), bins=30, kde=True, color='skyblue')

# # Add titles and labels
# plt.title('Distribution of Emissions Difference', fontsize=16)
# plt.xlabel('Emissions Difference', fontsize=14)
# plt.ylabel('Frequency', fontsize=14)

# # Show the plot
# plt.show()