# Import packages and load data


In [1]:
# Import packages and load data
import pandas as pd
from carbon_bombs.io.rystad import load_rystad_cb_emission_database
from carbon_bombs.conf import FPATH_OUT_CB

# Carbon Bombs v2
cb_v2 = load_rystad_cb_emission_database()

# Carbon Bombs v1
cb_v1 = pd.read_csv(FPATH_OUT_CB)


# Define manual matching

In [2]:
# Define manual matching dictionary
manual_matching = {
    "Tarim (CNPC)": "Tarim",
    "Xinjiang Hongshan Coal Mine": "Xinjiang", # Add more manual matches here
}

# Process datasets and match carbon bombs projects between versions / Create a comparison dataframe

In [3]:
from carbon_bombs.utils.logger import LOGGER

# Define manual matching dictionary
manual_matching = {
    "Tarim (CNPC)": "Tarim",
    # Add more manual matches here
}

# First, create two dataframes with only the columns we want and rename them
df_v1 = cb_v1[['Carbon_bomb_name_source_CB', 'Country_source_CB', 'Potential_GtCO2_source_CB', 'Latitude', 'Longitude']].copy()
df_v2 = cb_v2[['Project_name', 'Country', 'Total_potential_emissions_in_GTCO2', 'Latitude', 'Longitude']].copy()

# Rename columns
df_v1 = df_v1.rename(columns={
    'Carbon_bomb_name_source_CB': 'project_name_v1',
    'Country_source_CB': 'country_v1',
    'Potential_GtCO2_source_CB': 'emissions_v1',
    'Latitude': 'latitude_v1',
    'Longitude': 'longitude_v1',
})

df_v2 = df_v2.rename(columns={
    'Project_name': 'project_name_v2',
    'Country': 'country_v2',
    'Total_potential_emissions_in_GTCO2': 'emissions_v2',
    'Latitude': 'latitude_v2',
    'Longitude': 'longitude_v2'
})

# Create a mapping series for manual matches
manual_mapping = pd.Series(manual_matching)

# Apply manual matching to create alternative project names for matching
df_v1['matching_name'] = df_v1['project_name_v1'].map(manual_matching).fillna(df_v1['project_name_v1'])
df_v2['matching_name'] = df_v2['project_name_v2']

# Perform outer join to get all projects from both versions
comparison_df = df_v1.merge(
    df_v2,
    left_on='matching_name',
    right_on='matching_name',
    how='outer',
    indicator=True
)

# Check for country mismatches
country_mismatches = []
for idx, row in comparison_df.iterrows():
    if pd.notna(row['country_v1']) and pd.notna(row['country_v2']) and row['country_v1'] != row['country_v2']:
        LOGGER.warning(f"Country mismatch for project '{row['project_name_v1']}' -> '{row['project_name_v2']}': "
                      f"V1: {row['country_v1']} vs V2: {row['country_v2']}")
        country_mismatches.append({
            'project': row['project_name_v1'],
            'project_v2': row['project_name_v2'],
            'country_v1': row['country_v1'],
            'country_v2': row['country_v2']
        })

# Clean up the DataFrame
comparison_df = comparison_df.drop('matching_name', axis=1)

# Print manual matches that were successful
successful_manual_matches = comparison_df[
    comparison_df['project_name_v1'].isin(manual_matching.keys()) & 
    (comparison_df['_merge'] == 'both')
]
if not successful_manual_matches.empty:
    print("\nSuccessful manual matches:")
    for _, row in successful_manual_matches.iterrows():
        print(f"V1: '{row['project_name_v1']}' -> V2: '{row['project_name_v2']}'")

# Print country mismatches if any
if country_mismatches:
    print("\nDetailed country mismatches:")
    for mismatch in country_mismatches:
        print(f"Project: {mismatch['project']} -> {mismatch['project_v2']}")
        print(f"  V1 country: {mismatch['country_v1']}")
        print(f"  V2 country: {mismatch['country_v2']}")



Successful manual matches:
V1: 'Tarim (CNPC)' -> V2: 'Tarim'


# Summary informations 


In [4]:
# Print summary
print(f"\nSummary:")
print(f"Total projects in v1: {len(df_v1)}")
print(f"Total projects in v2: {len(df_v2)}")
print(f"Matched projects: {len(comparison_df[comparison_df['_merge'] == 'both'])}")
print(f"Projects only in v1: {len(comparison_df[comparison_df['_merge'] == 'left_only'])}")
print(f"Projects only in v2: {len(comparison_df[comparison_df['_merge'] == 'right_only'])}")
print(f"Number of country mismatches: {len(country_mismatches)}")

# Print projects only in V2
v2_only = comparison_df[comparison_df['_merge'] == 'right_only']
if not v2_only.empty:
    print("\nProjects only in V2:")
    for _, row in v2_only.iterrows():
        print(f"- {row['project_name_v2']} ({row['country_v2']})")


Summary:
Total projects in v1: 425
Total projects in v2: 17
Matched projects: 12
Projects only in v1: 413
Projects only in v2: 5
Number of country mismatches: 0

Projects only in V2:
- Cambay Shale (India)
- Tarim Basin Onshore (China)
- Xi'nan-CNPC (China)
- Xinjiang (China)
- Yaan Slope Onshore Sichuan Province (China)


# Calculate difference between versions (Potential emissions and coordinates)

In [5]:
from geopy.distance import geodesic

# Calculate distance between coordinates using geopy
def calculate_distance(row):
    """Calculate the distance between two points in kilometers using geopy"""
    if pd.isna(row['latitude_v1']) or pd.isna(row['longitude_v1']) or pd.isna(row['latitude_v2']) or pd.isna(row['longitude_v2']):
        return None
    
    coords_1 = (row['latitude_v1'], row['longitude_v1'])
    coords_2 = (row['latitude_v2'], row['longitude_v2'])
    
    return geodesic(coords_1, coords_2).kilometers

# Calculate emissions difference (V2 - V1)
comparison_df['emissions_difference'] = comparison_df['emissions_v2'] - comparison_df['emissions_v1']

# Add percentage difference for emissions
comparison_df['emissions_pct_difference'] = (
    (comparison_df['emissions_v2'] - comparison_df['emissions_v1']) / 
    comparison_df['emissions_v1'] * 100
).round(2)

# Calculate distance difference (V2 - V1)
comparison_df['distance_km'] = comparison_df.apply(calculate_distance, axis=1)

# Print summary of differences
print("\nEmissions and Location Differences Summary:")
matched_projects = comparison_df[comparison_df['project_name_v2'].notna()]

print(f"\nEmissions differences (for matched projects):")
print(f"Mean absolute difference: {matched_projects['emissions_difference'].abs().mean():.2f} GTCO2")
print(f"Max absolute difference: {matched_projects['emissions_difference'].abs().max():.2f} GTCO2")

# Projects with significant emissions differences
projects_big_emission_diff = matched_projects[matched_projects['emissions_pct_difference'].abs() > 10]
print(f"Projects with >10% difference: {len(projects_big_emission_diff)}")
if not projects_big_emission_diff.empty:
    print("Projects:")
    for _, row in projects_big_emission_diff.iterrows():
        print(f"- {row['project_name_v1']}: {row['emissions_pct_difference']:.1f}% difference "
              f"({row['emissions_v1']:.2f} -> {row['emissions_v2']:.2f} GTCO2)")

print(f"\nLocation differences (for matched projects):")
print(f"Mean distance between points: {matched_projects['distance_km'].mean():.2f} km")
print(f"Max distance between points: {matched_projects['distance_km'].max():.2f} km")

# Projects with significant location differences
projects_big_location_diff = matched_projects[matched_projects['distance_km'] > 10]
print(f"Projects with >10km difference: {len(projects_big_location_diff)}")
if not projects_big_location_diff.empty:
    print("Projects:")
    for _, row in projects_big_location_diff.iterrows():
        print(f"- {row['project_name_v1']}: {row['distance_km']:.1f} km difference")
        print(f"  V1 coordinates: ({row['latitude_v1']:.4f}, {row['longitude_v1']:.4f})")
        print(f"  V2 coordinates: ({row['latitude_v2']:.4f}, {row['longitude_v2']:.4f})")



Emissions and Location Differences Summary:

Emissions differences (for matched projects):
Mean absolute difference: 0.81 GTCO2
Max absolute difference: 4.56 GTCO2
Projects with >10% difference: 7
Projects:
- Agha Jari: 28.5% difference (1.56 -> 2.01 GTCO2)
- Changqing: -20.2% difference (4.94 -> 3.94 GTCO2)
- Gachsaran: -15.0% difference (1.74 -> 1.48 GTCO2)
- Goldwyer Shale: -36.9% difference (4.47 -> 2.82 GTCO2)
- Longmaxi Shale: -80.1% difference (5.69 -> 1.13 GTCO2)
- Sembar Shale: -13.3% difference (2.84 -> 2.46 GTCO2)
- Tarim (CNPC): 21.4% difference (1.96 -> 2.38 GTCO2)

Location differences (for matched projects):
Mean distance between points: 746.24 km
Max distance between points: 2164.33 km
Projects with >10km difference: 10
Projects:
- Changqing: 163.3 km difference
  V1 coordinates: (37.4467, 106.6668)
  V2 coordinates: (36.1964, 107.6328)
- Daqing: 2126.0 km difference
  V1 coordinates: (34.9245, 104.9102)
  V2 coordinates: (46.6207, 125.0040)
- Goldwyer Shale: 1322.4 km

# Clean dataframe and save result 

In [None]:
# Clean up final DataFrame
comparison_df = comparison_df.drop('_merge', axis=1)

# Sort the DataFrame to group matched and unmatched projects
comparison_df = comparison_df.sort_values(
    by=['project_name_v1', 'project_name_v2'],
    na_position='last'
)

# Reorder columns
ordered_columns = [
    'project_name_v1',
    'project_name_v2',
    'country_v1',
    'country_v2',
    'emissions_v1',
    'emissions_v2',
    'emissions_difference',
    'emissions_pct_difference',
    'latitude_v1',
    'longitude_v1',
    'latitude_v2',
    'longitude_v2',
    'distance_km'
]

# Reorder columns and save to CSV
comparison_df = comparison_df[ordered_columns]

# Save to CSV
comparison_df.to_csv('carbon_bombs_comparison.csv', index=False, sep=';')

KeyError: "['lat_v1', 'lon_v1', 'lat_v2', 'lon_v2'] not in index"