In [121]:
import pandas as pd
import numpy as np
import pycountry_convert as pc
import matplotlib.pyplot as plt
import seaborn as sns
import pycountry
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [122]:
invalid = []

In [123]:
totals_path = './combined_data/combined_co2_ghg_totals.csv'
per_gdp_path = './combined_data/combined_co2_ghg_per_gdp.csv'
per_capita_path = './combined_data/combined_co2_ghg_per_capita.csv'
sector_wise_path = './combined_data/combined_co2_ghg_by_sector.csv'

# Read the CSV files using the correct delimiter
totals_df = pd.read_csv(totals_path, delimiter=',')
per_gdp_df = pd.read_csv(per_gdp_path, delimiter=',')
per_capita_df = pd.read_csv(per_capita_path, delimiter=',')
sector_wise_df = pd.read_csv(sector_wise_path, delimiter=',')

  sector_wise_df = pd.read_csv(sector_wise_path, delimiter=',')


In [124]:
totals_df.dropna(inplace=True)
per_gdp_df.dropna(inplace=True)
per_capita_df.dropna(inplace=True)
sector_wise_df.dropna(inplace=True)

In [125]:
# Displaying first few rows for a quick inspection
print(totals_df.head())

# Summary of the dataset
print(totals_df.info())

# Statistical summary
print(totals_df.describe())

  Substance EDGAR Country Code                 Country  Year    Total CO2  \
0       CO2                ABW                   Aruba  1970  0,025213789   
1       CO2                AFG             Afghanistan  1970  1,734053007   
2       CO2                AGO                  Angola  1970  8,948152992   
3       CO2                AIA                Anguilla  1970  0,002177587   
4       CO2                AIR  International Aviation  1970  168,6025154   

     Total GHG  
0  0,045175752  
1  17,33619212  
2  20,13836441  
3  0,004256361  
4  171,1604553  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11236 entries, 0 to 11235
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Substance           11236 non-null  object
 1   EDGAR Country Code  11236 non-null  object
 2   Country             11236 non-null  object
 3   Year                11236 non-null  int64 
 4   Total CO2           11236 non-null

In [126]:
def country_to_continent(country_name):
    try:
        # Handling specific cases directly
        if country_name == 'Côte d’Ivoire':  # Make sure to use the exact character
            return 'Africa'
        elif country_name == 'International Aviation' or country_name == 'International Shipping' or country_name == 'EU27' or country_name == 'GLOBAL TOTAL' or country_name == None:
            return 'N/A'
        elif pd.isna(country_name):  # Handling NaN values
            return 'N/A'
        elif country_name == "Switzerland and Liechtenstein":
            return 'Europe'
        elif country_name == 'Faroes':
            return 'Europe'
        elif country_name == 'Western Sahara':
            return 'Africa'
        elif country_name == 'The Gambia':
            return 'Africa'
        elif country_name == 'Timor-Leste':
            return 'Asia'
        # if country_name == """Côte d'Ivoire""":
        #     return 'Africa'
        elif country_name == 'Spain and Andorra':
            return 'Europe'
        elif country_name == 'France and Monaco':
            return 'Europe'
        elif country_name == 'Israel and Palestine, State of':
            return 'Asia'
        elif country_name == 'Italy, San Marino and the Holy See':
            return 'Europe'
        elif country_name == 'Myanmar/Burma':
            return 'Asia'
        elif country_name == 'Serbia and Montenegro':
            return 'Europe'
        elif country_name == 'Sudan and South Sudan':
            return 'Africa'
        else:
            # Standard case
            country_alpha2 = pc.country_name_to_country_alpha2(country_name)
            country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
            country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
            return country_continent_name
    except Exception as e:
        invalid.append(country_name)
        print("Invalid country is ", country_name)
        return None

In [127]:
totals_df.columns

Index(['Substance', 'EDGAR Country Code', 'Country', 'Year', 'Total CO2',
       'Total GHG'],
      dtype='object')

In [128]:
totals_df['Country']

0                         Aruba
1                   Afghanistan
2                        Angola
3                      Anguilla
4        International Aviation
                  ...          
11231              South Africa
11232                    Zambia
11233                  Zimbabwe
11234                      EU27
11235              GLOBAL TOTAL
Name: Country, Length: 11236, dtype: object

In [129]:
# Mapping countries to continents
totals_df['Continent'] = totals_df['Country'].apply(country_to_continent)
per_gdp_df['Continent'] = per_gdp_df['Country'].apply(country_to_continent)
per_capita_df['Continent'] = per_capita_df['Country'].apply(country_to_continent)
sector_wise_df['Continent'] = sector_wise_df['Country'].apply(country_to_continent)


In [130]:
# Example: Convert 'Total CO2' and 'Total GHG' columns to numeric
totals_df['Total CO2'] = pd.to_numeric(totals_df['Total CO2'].str.replace(',', '.'), errors='coerce')
totals_df['Total GHG'] = pd.to_numeric(totals_df['Total GHG'].str.replace(',', '.'), errors='coerce')

# Similarly for other dataframes if needed
# per_gdp_df, per_capita_df, sector_wise_df


In [131]:
per_capita_df['CO2 Value'] = pd.to_numeric(per_capita_df['CO2 Value'].str.replace(',', '.'), errors='coerce')
per_capita_df['GHG Value'] = pd.to_numeric(per_capita_df['GHG Value'].str.replace(',', '.'), errors='coerce')

In [132]:
per_gdp_df['CO2 per GDP'] = pd.to_numeric(per_gdp_df['CO2 per GDP'].str.replace(',', '.'), errors='coerce')
per_gdp_df['GHG per GDP'] = pd.to_numeric(per_gdp_df['GHG per GDP'].str.replace(',', '.'), errors='coerce')

In [133]:
sector_wise_df['CO2 by Sector'] = pd.to_numeric(sector_wise_df['CO2 by Sector'].str.replace(',', '.'), errors='coerce')
sector_wise_df['GHG by Sector'] = pd.to_numeric(sector_wise_df['GHG by Sector'].str.replace(',', '.'), errors='coerce')

In [134]:
grouped_totals_co2 = totals_df.groupby(['Continent', 'Year'])['Total CO2'].mean().reset_index()
grouped_totals_ghg = totals_df.groupby(['Continent', 'Year'])['Total GHG'].mean().reset_index()

In [135]:
grouped_per_capita_co2 = per_capita_df.groupby(['Continent', 'Year'])['CO2 Value'].mean().reset_index()
grouped_per_capita_ghg = per_capita_df.groupby(['Continent', 'Year'])['GHG Value'].mean().reset_index()

In [136]:
grouped_per_gdp_co2 = per_gdp_df.groupby(['Continent', 'Year'])['CO2 per GDP'].mean().reset_index()
grouped_per_gdp_ghg = per_gdp_df.groupby(['Continent', 'Year'])['CO2 per GDP'].mean().reset_index()

In [137]:
grouped_sector_wise_co2 = sector_wise_df.groupby(['Continent', 'Year'])['CO2 by Sector'].mean().reset_index()
grouped_sector_wise_ghg = sector_wise_df.groupby(['Continent', 'Year'])['GHG by Sector'].mean().reset_index()

In [142]:
merged_df_co2 = pd.merge(grouped_totals_co2, grouped_per_gdp_co2, on=['Continent', 'Year'])
merged_df_co2 = pd.merge(merged_df_co2, grouped_per_capita_co2, on=['Continent', 'Year'])
merged_df_co2 = pd.merge(merged_df_co2, grouped_sector_wise_co2, on=['Continent', 'Year'])

In [143]:
merged_df_ghg = pd.merge(grouped_totals_ghg, grouped_per_gdp_ghg, on=['Continent', 'Year'])
merged_df_ghg = pd.merge(merged_df_ghg, grouped_per_capita_ghg, on=['Continent', 'Year'])
merged_df_ghg = pd.merge(merged_df_ghg, grouped_sector_wise_ghg, on=['Continent', 'Year'])

In [144]:
# Pivot the data to have years as columns and continents as rows
pivot_co2 = merged_df_co2.pivot(index='Continent', columns='Year', values='Total CO2').fillna(0)
pivot_ghg = merged_df_ghg.pivot(index='Continent', columns='Year', values='Total GHG').fillna(0)

# Define a function to prepare data for modeling
def prepare_data_for_modeling(pivot_df):
    X = pivot_df.iloc[:, :-3].values  # Use all years except the last three for features
    y = pivot_df.iloc[:, -3:].values  # Use the last three years for the target variable
    return X, y

# Prepare data for CO2 and GHG models
X_co2, y_co2 = prepare_data_for_modeling(pivot_co2)
X_ghg, y_ghg = prepare_data_for_modeling(pivot_ghg)

# Creating Random Forest models for CO2 and GHG predictions
model_co2 = RandomForestRegressor(n_estimators=100, random_state=42)
model_ghg = RandomForestRegressor(n_estimators=100, random_state=42)

# Splitting data into training and test sets
X_train_co2, X_test_co2, y_train_co2, y_test_co2 = train_test_split(X_co2, y_co2, test_size=0.2, random_state=42)
X_train_ghg, X_test_ghg, y_train_ghg, y_test_ghg = train_test_split(X_ghg, y_ghg, test_size=0.2, random_state=42)

# Training the models
model_co2.fit(X_train_co2, y_train_co2)
model_ghg.fit(X_train_ghg, y_train_ghg)

# Predicting emissions for the next three years
predicted_co2 = model_co2.predict(X_test_co2)
predicted_ghg = model_ghg.predict(X_test_ghg)

# Assessing model performance (using RMSE)
rmse_co2 = np.sqrt(mean_squared_error(y_test_co2, predicted_co2))
rmse_ghg = np.sqrt(mean_squared_error(y_test_ghg, predicted_ghg))

# Displaying the RMSE values and predicted emissions
print("RMSE for CO2 Model:", rmse_co2)
print("Predicted CO2 Emissions:", predicted_co2)
print("RMSE for GHG Model:", rmse_ghg)
print("Predicted GHG Emissions:", predicted_ghg)

RMSE for CO2 Model: 210.5933366247567
Predicted CO2 Emissions: [[ 48.56675432  50.59802821  50.41394732]
 [143.19988509 152.5486648  154.26105239]]
RMSE for GHG Model: 281.91309917177256
Predicted GHG Emissions: [[ 87.6552168   89.86450581  90.02538334]
 [197.33567124 207.37111015 210.00435688]]


In [146]:
merged_df_co2.head()

Unnamed: 0,Continent,Year,Total CO2,CO2 per GDP,CO2 Value,CO2 by Sector
0,Africa,1990,12.696884,0.154542,0.959513,2.002889
1,Africa,1991,12.80607,0.160238,0.965966,2.043134
2,Africa,1992,13.125,0.158208,0.945813,2.088068
3,Africa,1993,13.166357,0.157656,0.947032,2.100615
4,Africa,1994,13.270459,0.193316,1.007051,2.105229


In [153]:
random_forest_regressor = merged_df_co2

In [154]:
random_forest_regressor.columns

Index(['Continent', 'Year', 'Total CO2', 'CO2 per GDP', 'CO2 Value',
       'CO2 by Sector'],
      dtype='object')

In [165]:
random_forest_regressor = random_forest_regressor.drop(['Continent', 'Year'], axis=1)

In [166]:
# Preparing data for the model
X = random_forest_regressor.drop(['Total CO2'], axis=1).values
y = random_forest_regressor[['Total CO2']].values

In [168]:
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train

array([[3.26401739e-01, 6.88150237e+00, 5.32737567e+01],
       [2.16822832e-01, 3.16942075e+00, 1.04950220e+01],
       [4.92252472e-01, 8.12872783e+00, 2.33307777e+01],
       [1.02078384e+00, 1.26818208e+01, 4.27661604e+00],
       [3.06086128e-01, 6.22765057e+00, 1.66795751e+03],
       [2.42114341e-01, 5.92891068e+00, 2.24577712e+03],
       [3.37058988e-01, 8.16204132e+00, 2.24351729e+01],
       [2.70863961e-01, 7.82080472e+00, 2.07604017e+01],
       [1.97718886e-01, 6.74380749e+00, 1.88698177e+01],
       [1.82850429e-01, 2.79985926e+00, 1.04035450e+01],
       [2.74101649e-01, 6.44175799e+00, 2.05403447e+03],
       [4.72014401e-01, 6.05953396e+00, 2.39407666e+01],
       [1.61120155e-01, 1.42886533e+00, 4.08339862e+00],
       [5.46021424e-01, 9.03180758e+00, 2.60722172e+01],
       [9.08746600e-01, 1.31248082e+01, 5.46616225e+00],
       [1.85050811e-01, 1.91609020e+00, 5.43919043e+00],
       [1.68754593e-01, 1.22516782e+00, 2.52461228e+00],
       [2.66872010e-01, 6.22690

In [173]:
# Model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions
predictions = model.predict(X_test)

# Evaluating the model
rmse = np.sqrt(mean_squared_error(y_test, predictions))

# Display RMSE
rmse
predictions[0:3]

  return fit_method(estimator, *args, **kwargs)


array([ 76.61049816, 189.65308316,  15.54057683])