# CO2 Emissions Tracker

This notebook will contain:
- Data extraction
- Data visualization
- Dataset creation
- Model preparation
- Training algorithm
- Model testing
- Model preview

# Note

In order to run the code, ensure a Python environment is created and the required packages are installed.

## EDA 

### Data Preparation

### Importing plugins

In [16]:
import pandas as pd
import numpy as np 
import torch
import torch.nn as nn
import lightning as l 
import matplotlib.pyplot as plt 
import plotly.express as px
# Route DataFrame.plot() calls through plotly instead of the default backend.
pd.options.plotting.backend = "plotly"
# Show up to 200 columns when a DataFrame is displayed, so wide frames are not truncated.
pd.options.display.max_columns = 200




### Step 1. Data Cleanup, and standardizing the format

In [39]:
# Read every raw CSV source into its own DataFrame.

# Annual CO2 emissions
co2_emission_data = pd.read_csv('./data/Annual CO2 Emissions.csv')

# GDP figures
gdp_data = pd.read_csv('./data/gdp/gdp_data.csv')

# Inflation, interest-rate and unemployment indicators
country_stats = pd.read_csv('./data/inflation/inflation interest unemployment.csv')

# General country statistics (used for land area)
land_size = pd.read_csv('./data/world_country_stats.csv')

# Annual surface temperature change
surface_temp = pd.read_csv('./data/temperature/Annual_Surface_Temperature_Change.csv')



In [None]:
# Inspect which columns the GDP dataset provides.
gdp_data.columns

In [60]:
# Trim every raw dataset down to the columns used later on. Each selection is
# copied so that later edits do not touch the frame it was taken from.
# gdp_data is left as loaded (columns of interest: country_name, country_code, year, value).

# Country indicators: the iso2c, adminregion and incomeLevel columns are left out.
country_stats = country_stats[[
    'country',
    'year',
    'Inflation, consumer prices (annual %)',
    'Inflation, GDP deflator (annual %)',
    'Real interest rate (%)',
    'Deposit interest rate (%)',
    'Lending interest rate (%)',
    'Unemployment, total (% of total labor force) (national estimate)',
    'Unemployment, total (% of total labor force) (modeled ILO estimate)',
    'iso3c',
]].copy()

# Land size: region, fertility_rate and median_age are left out.
land_size = land_size[['country', 'land_area']].copy()

# Surface temperature: keep the country identifiers plus one column per year,
# labelled '1961' through '2022'; the descriptive metadata columns are left out.
surface_temp = surface_temp[
    ['Country', 'ISO3', *map(str, range(1961, 2023))]
].copy()

# CO2 emissions: entity name, country code, year and the emission value.
co2_emission_data = co2_emission_data[
    ['Entity', 'Code', 'Year', 'Annual CO2 Emissions (tonnes)']
].copy()

In [None]:
## Determine the span of years covered by each dataset and the span they all share.
## The shared ("optimal") range starts at the latest first year and ends at the
## earliest last year, which avoids years for which some dataset has no data at all.

# The temperature dataset is in wide format: its year labels are the all-digit column names.
temp_years = [int(col) for col in surface_temp.columns if col.isdigit()]

# (first year, last year) per dataset
year_ranges = {
    'CO2': (co2_emission_data['Year'].min(), co2_emission_data['Year'].max()),
    'GDP': (gdp_data['year'].min(), gdp_data['year'].max()),
    'Country Stats': (country_stats['year'].min(), country_stats['year'].max()),
    'Temperature': (min(temp_years), max(temp_years))}

# Print current ranges
for dataset, (start, end) in year_ranges.items():
    print(f"{dataset}: {start} - {end}")

# Calculate optimal range (the loop variable is deliberately not called `range`,
# so the builtin is not shadowed)
optimal_start = max(first_year for first_year, _ in year_ranges.values())
optimal_end = min(last_year for _, last_year in year_ranges.values())

print(f"\nOptimal year range: {optimal_start} - {optimal_end}")

In [57]:
# creating a function to standardize the datasets, so code is not repeated below.
def std_dataset(df, id_columns, value_column, year_column='year',
                start_year=None, end_year=None):
    """Return the rows of *df* inside a year range, sorted by country code and year.

    The input frame is left untouched: when the year column has to be converted
    to numbers, the conversion is applied to a copy.

    Args:
        df: Long-format DataFrame with one row per country and year.
        id_columns: Identifier columns in the order (country name, country code);
            the second entry is used as the primary sort key.
        value_column: Name(s) of the value column(s). Currently unused; kept so
            existing calls keep working.
        year_column: Name of the column holding the year.
        start_year: First year to keep (inclusive). Defaults to the notebook-level
            ``optimal_start``.
        end_year: Last year to keep (inclusive). Defaults to the notebook-level
            ``optimal_end``.

    Returns:
        A new DataFrame containing only the rows whose year lies in
        ``[start_year, end_year]``, sorted by ``id_columns[1]`` and then by year.
        Rows whose year cannot be parsed as a number are dropped.
    """
    # Fall back to the range computed earlier in the notebook.
    if start_year is None:
        start_year = optimal_start
    if end_year is None:
        end_year = optimal_end

    years = df[year_column]
    if years.dtype != 'int64':
        # Unparseable years become NaN and are excluded by the range filter below.
        years = pd.to_numeric(years, errors='coerce')
        # assign() returns a new frame, so the caller's DataFrame is not modified.
        df = df.assign(**{year_column: years})

    # filtering for optimal range (both bounds inclusive)
    in_range = years.between(start_year, end_year)

    # sorting by country code and year
    return df[in_range].sort_values([id_columns[1], year_column])


In [61]:
# Reshape the surface temperature dataset from wide format (one column per year)
# into long format (one row per country and year).
surface_temp_long = surface_temp.melt(id_vars=['Country', 'ISO3'], var_name='year', value_name='temperature_change')

# The melted year labels are strings; std_dataset converts them to numbers, keeps the
# optimal year range and sorts by ISO3 and year, the same treatment as the datasets below.
surface_temp_long = std_dataset(surface_temp_long, id_columns=['Country', 'ISO3'], value_column='temperature_change', year_column='year')

# CO2 emissions are already in long format: restrict to the optimal years and sort
co2_long = std_dataset(co2_emission_data, id_columns=['Entity', 'Code'], value_column='Annual CO2 Emissions (tonnes)', year_column='Year')

# GDP is already in long format: restrict to the optimal years and sort
gdp_long = std_dataset(gdp_data,id_columns=['country_name', 'country_code'],value_column='value', year_column='year')

# Country stats: restrict to the optimal years and sort
country_stats_long = std_dataset(country_stats,id_columns=['country','iso3c'], value_column=['Inflation, consumer prices (annual %)','Inflation, GDP deflator (annual %)','Real interest rate (%)', 'Deposit interest rate (%)', 'Lending interest rate (%)',
       'Unemployment, total (% of total labor force) (national estimate)',
       'Unemployment, total (% of total labor force) (modeled ILO estimate)'], year_column='year')








In [44]:




# Wide table of emissions: one row per year, one column per country code.
co2_emission_structured_data = co2_emission_data.pivot(
    index='Year',
    columns='Code',
    values='Annual CO2 Emissions (tonnes)',
)

# Lookup table from country code to entity name: keep the first row seen for
# each entity and index the result by its code.
country_codes = (
    co2_emission_data[['Code', 'Entity']]
    .drop_duplicates(subset=['Entity'], keep='first')
    .set_index('Code')
)

# Plain dict {code: entity name} for quick lookups.
country_mapping = country_codes['Entity'].to_dict()







In [None]:
# Plotting the CO2 emissions as time series, one line per country.
# The plot is built from the Year x Code pivot table, so every column (country code)
# becomes its own trace. (The previous input name `testing` is not defined anywhere
# in this notebook.)
fig = px.line(co2_emission_structured_data,
              title='CO2 Emissions by Country',
              width=1500,
              height=1000)

# Replace the country codes shown in the legend with full country names
for trace in fig.data:
    # The trace name is the country code taken from the pivot's column label
    country_code = trace.name
    country_name = country_mapping.get(country_code, country_code)  # fallback to code if not found
    trace.name = country_name

# Update layout with better labels
fig.update_layout(
    xaxis_title="Year",
    yaxis_title="CO2 Emissions (t)",
    title_x=0.5,  # Center the title
)

# Show the plot
fig.show()