In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

# Data Description

* Population
* GDP
* DCT - $CO_{2}$ Emissions – Total - `CDI = CO2 Intensity Current-Year Score`
* DPT - $CO_{2}$ emissions/kWh elect. & heat - `CEH = CO2 Emissions per kWh`
* DMT - $CH_{4}$ emissions - `CHI = $CH_{4}$ Intensity Current-Year Score`
* DNT - $N_{2}O$ emissions - `NOI = $N_{2}O$ Intensity Current-Year Score`
* DBT - Black Carbon emissions - `BCI = Black Carbon Current-Year Score`

In [2]:
climate = pd.read_csv('data_2020.csv')

In [3]:
climate.drop(columns=['Unnamed: 0'],inplace=True)

In [4]:
climate.shape

(3094, 7)

In [6]:
climate.head(10)

Unnamed: 0,country,year,CDA,CHA,FGA,NDA,BCA
0,Albania,1995,100.0,73.07252,0.0,78.221486,100.0
1,Algeria,1995,40.836434,98.636534,96.334651,72.620891,70.996796
2,Angola,1995,12.888007,70.729875,0.0,100.0,43.865361
3,Antigua and Barbuda,1995,44.483927,80.575637,0.0,36.690524,68.683208
4,Argentina,1995,40.485271,100.0,100.0,63.435774,45.942816
5,Armenia,1995,72.726561,89.833401,0.0,97.707838,100.0
6,Australia,1995,42.566037,99.66689,100.0,76.458848,82.549405
7,Austria,1995,50.903881,100.0,92.800663,93.225015,100.0
8,Azerbaijan,1995,60.41653,86.983408,100.0,74.176342,88.589201
9,Bahrain,1995,25.727473,33.991729,100.0,43.459231,42.950239


In [5]:
climate = climate[climate['year']!=2020]
climate = climate[climate['year']!=2019]

In [6]:
climate['year'].unique()

array([1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
       2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016,
       2017, 2018], dtype=int64)

# `Time Series` for 2021-2027 per country

In [7]:
from statsmodels.tsa.ar_model import AR
from statsmodels.tsa.arima_model import ARIMA
import datetime

In [8]:
countries = list(climate['country'].unique())
len(countries)

119

In [9]:
climate.isna().sum() # checking missing values

country    0
year       0
CDA        0
CHA        0
FGA        0
NDA        0
BCA        0
dtype: int64

In [10]:
cda = []
cha = []
fga = []
nda = []
bca = []

In [11]:
# CDA time series

for country in countries:
    df = climate[climate['country']==str(country)].reset_index()
    #df.index = pd.to_datetime(df['year'],format='%Y')#.dt.year
    features1 = df['CDA']
    model1 = AR(features1)
    model1_fit = model1.fit(maxlag=3)
    d_f1 = model1_fit.predict(start=len(features1), end=len(features1)+8, dynamic=False)
    cda += d_f1.to_list()

    #print('Country {0} completed'.format(country))

In [12]:
# CHA time series

for country in countries:
    df = climate[climate['country']==str(country)].reset_index()
    #df.index = pd.to_datetime(df['year'],format='%Y')#.dt.year
    features = df['CHA']
    if len((climate[climate['country']==str(country)]['CHA']).unique()) <=3:
        cha_ = features[:9].tolist()
        cha+=cha_
    else:
        model = AR(features)
        model_fit = model.fit(maxlag=3)
        d_f = model_fit.predict(start=len(features), end=len(features)+8, dynamic=False)
        cha+=d_f.to_list()
    #print('Country {0} completed'.format(country)) # problems with values repetition

In [13]:
# FGA time series

for country in countries:
    df = climate[climate['country']==str(country)].reset_index()
    #df.index = pd.to_datetime(df['year'],format='%Y')#.dt.year
    features = df['FGA']
    if len((climate[climate['country']==str(country)]['FGA']).unique()) <=3:
        fga_ = features[:9].tolist()
        fga+=fga_
    else:
        model = AR(features)
        model_fit = model.fit(maxlag=3)
        d_f = model_fit.predict(start=len(features), end=len(features)+8, dynamic=False)
        fga+=d_f.to_list()
    #print('Country {0} completed'.format(country))

In [14]:
# NDA time series

for country in countries:
    df = climate[climate['country']==str(country)].reset_index()
    #df.index = pd.to_datetime(df['year'],format='%Y')#.dt.year
    features = df['NDA']
    if len((climate[climate['country']==str(country)]['NDA']).unique()) <=3:
        nda_ = features[:9].tolist()
        nda+=nda_
    else:
        model = AR(features)
        model_fit = model.fit(maxlag=3)
        d_f = model_fit.predict(start=len(features), end=len(features)+8, dynamic=False)
        nda+=d_f.to_list()
    #print('Country {0} completed'.format(country))

In [15]:
# BCA time series

for country in countries:
    df = climate[climate['country']==str(country)].reset_index()
    #df.index = pd.to_datetime(df['year'],format='%Y')#.dt.year
    features = df['BCA']
    if len((climate[climate['country']==str(country)]['BCA']).unique()) <=3:
        bca_ = features[:9].tolist()
        bca+=bca_
    else:
        model = AR(features)
        model_fit = model.fit(maxlag=3)
        d_f = model_fit.predict(start=len(features), end=len(features)+8, dynamic=False)
        bca+=d_f.to_list()
    #print('Country {0} completed'.format(country))

## We have to generate total 1071 predictions

In [16]:
print('The CDA columns has {0} predictions'.format(len(cda)))
print('The CHA columns has {0} predictions'.format(len(cha)))
print('The FGA columns has {0} predictions'.format(len(fga)))
print('The NDA columns has {0} predictions'.format(len(nda)))
print('The BCA columns has {0} predictions'.format(len(bca)))

The CDA columns has 1071 predictions
The CHA columns has 1071 predictions
The FGA columns has 1071 predictions
The NDA columns has 1071 predictions
The BCA columns has 1071 predictions


In [17]:
df_cda = pd.DataFrame({'CDA':cda})
df_cha = pd.DataFrame({'CHA':cha})
df_fga = pd.DataFrame({'FGA':fga})
df_nda = pd.DataFrame({'NDA':nda})
df_bca = pd.DataFrame({'BCA':bca})

In [18]:
print('The CDA columns has {0} predictions'.format(df_cda.shape[0]))
print('The CHA columns has {0} predictions'.format(df_cha.shape[0]))
print('The FGA columns has {0} predictions'.format(df_fga.shape[0]))
print('The NDA columns has {0} predictions'.format(df_nda.shape[0]))
print('The BCA columns has {0} predictions'.format(df_bca.shape[0]))

The CDA columns has 1071 predictions
The CHA columns has 1071 predictions
The FGA columns has 1071 predictions
The NDA columns has 1071 predictions
The BCA columns has 1071 predictions


In [19]:
years = [num for num in range(2019,2028)]
year = pd.DataFrame({'year':years*119})
year.shape

(1071, 1)

In [20]:
countries_total = []
for i in countries:
    for j in range(0,9):
        countries_total.append(i)

In [21]:
len(countries_total)

1071

In [22]:
data = pd.DataFrame(countries_total,columns=['country'])
climate_pred = data.join(year)

In [23]:
climate_pred = climate_pred.join(df_cda)
climate_pred = climate_pred.join(df_cha)
climate_pred = climate_pred.join(df_fga)
climate_pred = climate_pred.join(df_nda)
climate_pred = climate_pred.join(df_bca)

In [24]:
climate_pred.shape

(1071, 7)

In [25]:
climate.describe()

Unnamed: 0,year,CDA,CHA,FGA,NDA,BCA
count,2856.0,2856.0,2856.0,2856.0,2856.0,2856.0
mean,2006.5,45.612832,69.813558,75.881315,60.529332,59.847407
std,6.923399,20.547769,28.557937,26.864981,28.985214,31.603002
min,1995.0,0.0,0.0,0.0,0.0,0.0
25%,2000.75,32.712642,51.514287,68.328556,39.979301,35.903772
50%,2006.5,45.657211,75.672376,87.055491,63.171014,60.031367
75%,2012.25,58.373207,100.0,92.399902,82.429205,94.238995
max,2018.0,100.0,100.0,100.0,100.0,100.0


In [26]:
climate_pred.describe()

Unnamed: 0,year,CDA,CHA,FGA,NDA,BCA
count,1071.0,1071.0,1071.0,1071.0,1071.0,1071.0
mean,2023.0,45.725665,69.564284,89.543604,57.516227,56.748969
std,2.583195,25.695135,25.134798,7.964706,27.170484,28.029411
min,2019.0,-286.130726,0.462149,46.899592,-131.953531,-2.408672
25%,2021.0,34.900554,52.789285,87.327318,42.719765,36.919752
50%,2023.0,47.844046,72.765109,90.702536,61.198179,55.07563
75%,2025.0,59.931267,91.311357,94.227968,76.651957,79.333657
max,2027.0,99.830786,130.559686,105.09036,142.115631,109.121687


In [27]:
climate_pred['CDA'] = climate_pred['CDA'].apply(lambda x: 0 if x<=0 else x)
climate_pred['CHA'] = climate_pred['CHA'].apply(lambda x: 0 if x<=0 else x)
climate_pred['FGA'] = climate_pred['FGA'].apply(lambda x: 0 if x<=0 else x)
climate_pred['NDA'] = climate_pred['NDA'].apply(lambda x: 0 if x<=0 else x)
climate_pred['BCA'] = climate_pred['BCA'].apply(lambda x: 0 if x<=0 else x)

climate_pred['CDA'] = climate_pred['CDA'].apply(lambda x: 100 if x>=100 else x)
climate_pred['CHA'] = climate_pred['CHA'].apply(lambda x: 100 if x>=100 else x)
climate_pred['FGA'] = climate_pred['FGA'].apply(lambda x: 100 if x>=100 else x)
climate_pred['NDA'] = climate_pred['NDA'].apply(lambda x: 100 if x>=100 else x)
climate_pred['BCA'] = climate_pred['BCA'].apply(lambda x: 100 if x>=100 else x)

In [28]:
climate_pred.describe()

Unnamed: 0,year,CDA,CHA,FGA,NDA,BCA
count,1071.0,1071.0,1071.0,1071.0,1071.0,1071.0
mean,2023.0,46.888759,69.353128,89.515581,58.131653,56.69686
std,2.583195,19.190392,24.799256,7.921643,24.373976,27.845778
min,2019.0,0.0,0.462149,46.899592,0.0,0.0
25%,2021.0,34.900554,52.789285,87.327318,42.719765,36.919752
50%,2023.0,47.844046,72.765109,90.702536,61.198179,55.07563
75%,2025.0,59.931267,91.311357,94.227968,76.651957,79.333657
max,2027.0,99.830786,100.0,100.0,100.0,100.0


In [29]:
final_df = []
for country in countries:
    df1 = climate[climate['country']==str(country)].reset_index()
    df2 = climate_pred[climate_pred['country']==str(country)].reset_index()
    final = df1.append(df2)
    final = final.reset_index()
    final.drop(columns=['level_0','index'],inplace=True)
    final_df.append(final)

In [30]:
def list_to_df(lst):
    df = lst[0]
    for item in range(1,len(lst)):
        df = df.append(lst[item])
    return df

In [31]:
all_pred_2018 = list_to_df(final_df)

In [32]:
all_pred_2018 = all_pred_2018.reset_index()
all_pred_2018.drop(columns=['index'],inplace=True)

In [33]:
all_pred_2018.shape

(3927, 7)

In [35]:
all_pred_2018.head(35)

Unnamed: 0,country,year,CDA,CHA,FGA,NDA,BCA
0,Albania,1995,100.0,73.07252,0.0,78.221486,100.0
1,Albania,1996,100.0,73.07252,0.0,78.221486,100.0
2,Albania,1997,100.0,73.07252,0.0,78.221486,100.0
3,Albania,1998,100.0,73.07252,0.0,78.221486,100.0
4,Albania,1999,100.0,73.07252,0.0,78.221486,100.0
5,Albania,2000,74.923269,76.760669,0.0,63.022453,48.045073
6,Albania,2001,39.235149,85.603156,18.915822,68.29695,0.0
7,Albania,2002,18.968655,100.0,40.199894,71.154023,0.0
8,Albania,2003,1.230429,100.0,51.079315,71.641696,0.0
9,Albania,2004,0.0,100.0,57.123048,60.929528,0.0


# Export xlsx and csv

In [34]:
climate_pred.to_csv('climate_pred_2018.csv')

In [35]:
all_pred_2018.to_csv('climate_all_pred_2018.csv')

# `Survival Analysis` to predict which variable represents the biggest change concern