In [1]:
# Import necessary libraries and packages
import pandas as pd
from pandas import DataFrame
import numpy as np
from matplotlib import pyplot
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
import warnings
from math import exp
from numpy import log

import plotly.graph_objs as go
from plotly.subplots import make_subplots
import seaborn as sns
# Preferred settings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas warnings raised by later
# cells (e.g. SettingWithCopyWarning) — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# Show up to 100 columns when a DataFrame is displayed
pd.set_option('display.max_columns', 100)


In [2]:
# Read the two raw tables: OWID CO2 emissions (data_e) and the annual
# temperature time series (data_t).
data_e = pd.read_csv('../Data/carbon-emissions/owid-co2-data.csv')
data_t = pd.read_csv('../Data/Indicators/tas_timeseries_annual_cru_1901-2020_IND.csv')

# Report (rows, columns) for each table, emissions first
for raw_table in (data_e, data_t):
    print(raw_table.shape)

(25204, 58)
(120, 46)


## Data Cleaning and Preprocessing

We focused on post-1950 data for our comparative analysis and global EDA, since some countries are missing significant data before that time. The emission activities of major carbon-emitting countries, especially India, intensified in the post-1950 period. For our modelling process, however, which focuses on India, the data from 1878 onward is continuous, so we include all data for India from 1900. First, we'll be predicting the CO2 emissions for India.

In [3]:
# Start from the full OWID emissions table; it is narrowed down to India
# in a later cell.
df_India_e = data_e

# Count the missing values in every column
df_India_e.isna().sum()

iso_code                                3256
country                                    0
year                                       0
co2                                     1255
consumption_co2                        21228
co2_growth_prct                          273
co2_growth_abs                          1619
trade_co2                              21228
co2_per_capita                          1897
consumption_co2_per_capita             21228
share_global_co2                        1255
cumulative_co2                          1255
share_global_cumulative_co2             1255
co2_per_gdp                             9815
consumption_co2_per_gdp                21443
co2_per_unit_energy                    16063
coal_co2                                8016
cement_co2                             12956
flaring_co2                            20822
gas_co2                                16359
oil_co2                                 4665
other_industry_co2                     23205
cement_co2

In [4]:
# Keep only India's per-capita CO2 emissions from 1900 onward (1900 included).
# The filter is built from the raw `data_e` table instead of from
# `df_India_e` itself, so the cell gives the same result every time it is
# re-run, even after later cells have moved `year` into the index.
india_from_1900 = (data_e['country'] == 'India') & (data_e['year'] >= 1900)

# .copy() makes the result an independent frame rather than a slice of `data_e`
df_India_e = data_e.loc[india_from_1900, ['year', 'co2_per_capita']].copy()

# Display the reformatted frame
df_India_e

Unnamed: 0,year,co2_per_capita
10857,1900,0.041
10858,1901,0.045
10859,1902,0.049
10860,1903,0.050
10861,1904,0.056
...,...,...
10973,2016,1.799
10974,2017,1.818
10975,2018,1.922
10976,2019,1.922


In [5]:
# Pull the year and India temperature columns out of the raw temperature table
temperature_columns = ['year', 'India']
df_India_t = data_t[temperature_columns]

# Display the selection
df_India_t

Unnamed: 0,year,India
0,1901,24.26
1,1902,24.52
2,1903,23.96
3,1904,24.02
4,1905,23.87
...,...,...
115,2016,25.27
116,2017,25.15
117,2018,25.00
118,2019,24.94


In [6]:
def _with_datetime_year_index(frame):
    """Return ``frame`` indexed by its ``year`` values parsed as datetimes.

    The input frame is never modified. The function is safe to call on a
    frame that has already been converted: the ``year`` column is moved into
    the index only if it is still a column, and the index is parsed with the
    ``'%Y'`` format only if it is not already a ``DatetimeIndex``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the year either in a ``year`` column or in its index.

    Returns
    -------
    pandas.DataFrame
        A frame whose index is a ``DatetimeIndex``.
    """
    # Convert year to index (skipped when a previous run already did it)
    if 'year' in frame.columns:
        frame = frame.set_index('year')
    # Convert year to DateTime object (skipped when already converted)
    if not isinstance(frame.index, pd.DatetimeIndex):
        frame = frame.copy()  # leave the caller's object untouched
        frame.index = pd.to_datetime(frame.index, format='%Y')
    return frame


# Rebind instead of mutating in place so this cell can be re-run without
# raising a KeyError for the already-consumed 'year' column.
df_India_t = _with_datetime_year_index(df_India_t)
df_India_e = _with_datetime_year_index(df_India_e)

In [7]:
# Preview the first five rows of the re-indexed emissions frame
df_India_e.head()

Unnamed: 0_level_0,co2_per_capita
year,Unnamed: 1_level_1
1900-01-01,0.041
1901-01-01,0.045
1902-01-01,0.049
1903-01-01,0.05
1904-01-01,0.056


In [17]:
# Year-over-year change in India's temperature.
# The level series is rebuilt from the raw `data_t` table on every run, so
# re-running this cell cannot difference an already-differenced series.
temperature_levels = data_t[['year', 'India']].set_index('year')
temperature_levels.index = pd.to_datetime(temperature_levels.index, format='%Y')

# diff() leaves NaN in the first row (no previous year); dropna() removes it
df_India_t = temperature_levels.diff().dropna()
df_India_t

Unnamed: 0_level_0,India
year,Unnamed: 1_level_1
1902-01-01,0.26
1903-01-01,-0.56
1904-01-01,0.06
1905-01-01,-0.15
1906-01-01,0.24
...,...
2016-01-01,0.38
2017-01-01,-0.12
2018-01-01,-0.15
2019-01-01,-0.06


In [12]:
# The differenced temperature series starts in 1902, so drop the earlier
# emission years to line the two series up. Filtering by date instead of by
# position (iloc[2:]) keeps the result unchanged if the cell is re-run.
first_shared_year = pd.Timestamp('1902-01-01')
df_India_e = df_India_e[df_India_e.index >= first_shared_year]
df_India_e

Unnamed: 0_level_0,co2_per_capita
year,Unnamed: 1_level_1
1902-01-01,0.049
1903-01-01,0.050
1904-01-01,0.056
1905-01-01,0.056
1906-01-01,0.065
...,...
2016-01-01,1.799
2017-01-01,1.818
2018-01-01,1.922
2019-01-01,1.922


In [18]:
# Give the temperature column a descriptive name
df_India_t = df_India_t.rename(columns={'India': 'temperature'})

# Place emissions and temperature side by side, aligned on the year index
frames_to_join = [df_India_e, df_India_t]
df_India = pd.concat(frames_to_join, axis=1)
df_India

Unnamed: 0_level_0,co2_per_capita,temperature
year,Unnamed: 1_level_1,Unnamed: 2_level_1
1902-01-01,0.049,0.26
1903-01-01,0.050,-0.56
1904-01-01,0.056,0.06
1905-01-01,0.056,-0.15
1906-01-01,0.065,0.24
...,...,...
2016-01-01,1.799,0.38
2017-01-01,1.818,-0.12
2018-01-01,1.922,-0.15
2019-01-01,1.922,-0.06
