# EPA112A - Programming for Data Science - Group 31

- Georges Puttaert - 4686160
- Thijs Roolvink - 4961382
- Gijs de Werd - 4717775

## Research Question

 **What is the relative influence of social, economic, and environmental indicators on a country's GDP, and can we accurately predict GDP based on these indicators?**

*Chosen Countries per category:*
- The Netherlands 
- Germany
- Greece
- Ireland

### Packages

In [78]:
import wbdata
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
import matplotlib.pyplot as plt

### Indicators from the World Bank

In [79]:
# Define indicators Inequality and Social Welfare
health_indicators = {'SH.XPD.CHEX.GD.ZS': "Health Expenditure as a Percentage of GDP", "SH.IMM.IDPT": "Immunization"}
GDP_indicator = {'NY.GDP.PCAP.CD': 'gdppc'}
life_exp_indicator = {'SP.DYN.LE00.IN': 'Life Expectancy at Birth', 'SH.DYN.MORT': 'Child Mortality'}
disease_indicator = {'SH.TBS.INCD': 'Indicence of Diseases'} 

#Define indicators Import & Export
export_indicator = {'NE.EXP.GNFS.KD.ZG': 'Export'}
import_indicator = {'NE.IMP.GNFS.KD.ZG': 'Import'}
freight_indicator = {'IS.AIR.GOOD.MT.K1': 'Freight'}

#Define indicators Evironmental
renewable_energy_indicator = {'EG.FEC.RNEW.ZS': 'Renewable energy consumption (% of total final energy consumption)'}


### DataFrames for chosen indicators and countries from the World Bank

In [80]:
countries = ['NLD', 'DEU', 'GRC', 'IRL']

#Dataframes Inequality and Social Welfare
df_health = wbdata.get_dataframe(health_indicators, country=countries, convert_date=True)
df_gdp = wbdata.get_dataframe(GDP_indicator, country=countries, convert_date=True)
df_life_exp = wbdata.get_dataframe(life_exp_indicator, country=countries, convert_date=True)
df_diseases = wbdata.get_dataframe(disease_indicator, country=countries, convert_date=True)

#Dataframes Import & Export
df_export = wbdata.get_dataframe(export_indicator, country=countries, convert_date=True)
df_import = wbdata.get_dataframe(import_indicator, country=countries, convert_date=True)
df_freight = wbdata.get_dataframe(freight_indicator, country=countries, convert_date=True)

### Data Cleaning

In [81]:
### Inequality and Social Welfare ###
# Reset index of the dataframes
df_health = df_health.reset_index()
df_gdp = df_gdp.reset_index()
df_life_exp = df_life_exp.reset_index()
df_diseases = df_diseases.reset_index()

#Formatting the date column to year
df_health['date'] = df_health['date'].dt.year
df_health = df_health.drop(['Immunization'], axis = 1)
df_gdp['date'] = df_gdp['date'].dt.year
df_life_exp['date'] = df_life_exp['date'].dt.year
df_diseases['date'] = df_diseases['date'].dt.year 

In [82]:
### Import & Export ###
# Reset index of the dataframes
df_export = df_export.reset_index()
df_import = df_import.reset_index()
df_freight = df_freight.reset_index()

#Formatting the date column to year
df_export['date'] = df_export['date'].dt.year
df_import['date'] = df_import['date'].dt.year
df_freight['date'] = df_freight['date'].dt.year

### Indicators of the European Data Bank 
- Air GHG - https://ec.europa.eu/eurostat/databrowser/view/sdg_13_10__custom_8184934/default/table?lang=en 
- The recycling rate of municipal waste - https://ec.europa.eu/eurostat/databrowser/view/cei_wm011/default/table?lang=en

In [83]:
df_emissions = df = pd.read_csv('Datasets/sdg_13_10_linear.csv')
df_recycling = pd.read_csv('Datasets/cei_wm011_linear.csv')

GHG_NL = df_emissions[(df_emissions['geo'] == 'NL') & (df_emissions['airpol'] == 'GHG') & (df_emissions['unit'] == 'T_HAB') & (df_emissions['src_crf'] == 'TOTXMEMONIA')]
recycling_NL = df_recycling[df_recycling['geo'] == 'NL']

### Visualizing the indicators

In [84]:
#GDP per capita
fig1 = px.line(df_gdp, x = 'date', y = 'gdppc', color = 'country', title = 'GDP per Capita')

### Inequality and Social Welfare ###

#Health expenditure as a percentae of GDP
fig2 = px.line(df_health, x = 'date', y = 'Health Expenditure as a Percentage of GDP', color = 'country', title = 'Health Expenditure as a Percentage of GDP')


#Life Expectancy at Birth
fig4 = px.line(df_life_exp, x = 'date', y = 'Life Expectancy at Birth', color = 'country', title = 'Life Expectancy at Birth')

#Child Mortality
fig5 = px.line(df_life_exp, x = 'date', y = 'Child Mortality', color = 'country', title = 'Child Mortality')

# Indices of Diseases
fig6  = px.line(df_diseases, x = 'date', y = 'Indicence of Diseases', color = 'country', title = 'Indices of Diseases')

fig1.show()
fig2.show()
fig4.show()
fig5.show()
fig6.show()

In [85]:
#GDP per capita
fig1 = px.line(df_gdp, x = 'date', y = 'gdppc', color = 'country', title = 'GDP per Capita')

### Import & Export ###
#Export
fig2 = px.line(df_export, x = 'date', y = 'Export', color = 'country', title = 'Exports of goods and services (annual percentage growth)')

#Import 
fig3 = px.line(df_import, x = 'date', y = 'Import', color = 'country', title = 'Imports of goods and services (annual percentage growth)')

#Freight
fig4 = px.line(df_freight, x = 'date', y = 'Freight', color = 'country', title = 'Air transport, freight (million ton-km)')

fig1.show()
fig2.show()
fig3.show()
fig4.show()

In [86]:
### Inequality and Social Welfare ###
#Dataframes with dates starting in 2000 unitll 2020
df_health_filtered = df_health[(df_health['date'] >= 2000) & (df_health['date'] <= 2020)]
df_gdp_filtered = df_gdp[(df_gdp['date'] >= 2000) & (df_gdp['date'] <= 2020)]
df_life_exp_filtered = df_life_exp[(df_life_exp['date'] >= 2000) & (df_life_exp['date'] <= 2020)]
df_diseases_filtered = df_diseases[(df_diseases['date'] >= 2000) & (df_diseases['date'] <= 2020)]

In [87]:
#GDP per capita
fig1 = px.line(df_gdp_filtered, x = 'date', y = 'gdppc', color = 'country', title = 'GDP per Capita')

### Inequality and Social Welfare ###
#Health expenditure as a percentae of GDP
fig2 = px.line(df_health_filtered, x = 'date', y = 'Health Expenditure as a Percentage of GDP', color = 'country', title = 'Health Expenditure as a Percentage of GDP')


#Life Expectancy at Birth
fig4 = px.line(df_life_exp_filtered, x = 'date', y = 'Life Expectancy at Birth', color = 'country', title = 'Life Expectancy at Birth')

#Child Mortality
fig5 = px.line(df_life_exp_filtered, x = 'date', y = 'Child Mortality', color = 'country', title = 'Child Mortality')

# Indices of Diseases
fig6  = px.line(df_diseases_filtered, x = 'date', y = 'Indicence of Diseases', color = 'country', title = 'Indices of Diseases')

fig1.show()
fig2.show()
fig4.show()
fig5.show()
fig6.show()

In [88]:
### Import & Export ###
#Dataframes with dates starting in 1975 unitll 2020
df_gdp_filtered_ie = df_gdp[(df_gdp['date'] >= 1975) & (df_gdp['date'] <= 2020)]
df_export_filtered = df_export[(df_export['date'] >= 1975) & (df_export['date'] <= 2020)]
df_import_filtered = df_import[(df_import['date'] >= 1975) & (df_import['date'] <= 2020)]
df_freight_filtered = df_freight[(df_freight['date'] >= 1975) & (df_freight['date'] <= 2020)]

In [89]:
#GDP per capita
fig1 = px.line(df_gdp_filtered_ie, x = 'date', y = 'gdppc', color = 'country', title = 'GDP per Capita')

### Import & Export ###
#Export
fig2 = px.line(df_export_filtered, x = 'date', y = 'Export', color = 'country', title = 'Exports of goods and services (annual percentage growth)')

#Import 
fig3 = px.line(df_import_filtered, x = 'date', y = 'Import', color = 'country', title = 'Imports of goods and services (annual percentage growth)')

#Freight
fig4 = px.line(df_freight_filtered, x = 'date', y = 'Freight', color = 'country', title = 'ir transport, freight (million ton-km)')

fig1.show()
fig2.show()
fig3.show()
fig4.show()

## The Netherlands

### Inequality and Social Welfare

Data Preparation

In [90]:
#Combining all dataframes
dfs_nld = [df_gdp_filtered[df_gdp_filtered['country'] == 'Netherlands'], df_health_filtered[df_health_filtered['country'] == 'Netherlands'], df_life_exp_filtered[df_life_exp_filtered['country'] == 'Netherlands'], df_diseases_filtered[df_diseases_filtered['country'] == 'Netherlands']]
df_combined_nld = pd.concat(dfs_nld, axis = 1)

#Merge columns date
df_combined_nld = df_combined_nld.drop(['date'], axis = 1)
df_combined_nld['date'] = df_gdp_filtered[df_gdp_filtered['country'] == 'Netherlands']['date']

#Merge columns country
df_combined_nld = df_combined_nld.drop(['country'], axis = 1)
df_combined_nld['country'] = df_gdp_filtered[df_gdp_filtered['country'] == 'Netherlands']['country']

#Check for missing data
print(f'Number of missing data: {df_combined_nld.isnull().sum().sum()}')

#Copy df_combined_nld to make train set
df_combined_nld_train = df_combined_nld.copy()
df_combined_nld_train =  df_combined_nld_train[df_combined_nld_train['date'] <= 2017]

#Copy df_combined_nld to make test set
df_combined_nld_test = df_combined_nld.copy()
df_combined_nld_test = df_combined_nld_test[df_combined_nld_test['date'] > 2017]

Number of missing data: 0


Correlation between datasets

In [91]:
df_corr_nld = df_combined_nld_test.drop('date', axis = 1)
corr = df_corr_nld.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
corr.where(~mask, inplace=True)

fig = px.imshow(corr, text_auto=True, color_continuous_scale= px.colors.sequential.Emrld, width= 1500, height=750)

fig.show()





### Import & Export

Data Preparation

In [92]:
#Combining all dataframes
dfs_nld_ie = [df_gdp_filtered_ie[df_gdp_filtered_ie['country'] == 'Netherlands'], df_export_filtered[df_export_filtered['country'] == 'Netherlands'], df_import_filtered[df_import_filtered['country'] == 'Netherlands'], df_freight_filtered[df_freight_filtered['country'] == 'Netherlands']]
df_combined_nld_ie = pd.concat(dfs_nld_ie, axis = 1)

#Merge columns date
df_combined_nld_ie = df_combined_nld_ie.drop(['date'], axis = 1)
df_combined_nld_ie['date'] = df_gdp_filtered_ie[df_gdp_filtered_ie['country'] == 'Netherlands']['date']

#Merge columns country
df_combined_nld_ie = df_combined_nld_ie.drop(['country'], axis = 1)
df_combined_nld_ie['country'] = df_gdp_filtered_ie[df_gdp_filtered_ie['country'] == 'Netherlands']['country']

#Check for missing data
print(f'Number of missing data: {df_combined_nld_ie.isnull().sum().sum()}')

#Copy df_combined_nld_ie to make train set
df_combined_nld_train_ie = df_combined_nld_ie.copy()
df_combined_nld_train_ie = df_combined_nld_train_ie[df_combined_nld_train_ie['date'] <= 2017]

#Copy df_combined_nld_ie to make test set
df_combined_nld_test_ie = df_combined_nld_ie.copy()
df_combined_nld_test_ie = df_combined_nld_test_ie[df_combined_nld_test_ie['date'] > 2017]

Number of missing data: 0


Correlation between datasets

In [93]:
df_corr_nld = df_combined_nld_test_ie.drop('date', axis = 1)
corr = df_corr_nld.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
corr.where(~mask, inplace=True)

fig = px.imshow(corr, text_auto=True, color_continuous_scale= px.colors.sequential.Emrld, width= 1500, height=750)

fig.show()





## Germany

### Inequality and Social Welfare

Data Preparation

In [94]:
#Combining all dataframes
dfs_deu = [df_gdp_filtered[df_gdp_filtered['country'] == 'Germany'], df_health_filtered[df_health_filtered['country'] == 'Germany'], df_life_exp_filtered[df_life_exp_filtered['country'] == 'Germany'], df_diseases_filtered[df_diseases_filtered['country'] == 'Germany']]
df_combined_deu = pd.concat(dfs_deu, axis = 1)

#Merge columns date
df_combined_deu = df_combined_deu.drop(['date'], axis = 1)
df_combined_deu['date'] = df_gdp_filtered[df_gdp_filtered['country'] == 'Germany']['date']

#Merge columns country
df_combined_deu = df_combined_deu.drop(['country'], axis = 1)
df_combined_deu['country'] = df_gdp_filtered[df_gdp_filtered['country'] == 'Germany']['country']

#Check for missing data
print(f'Number of missing data: {df_combined_deu.isnull().sum().sum()}')

#Copy df_combined_deu to make train set
df_combined_deu_train = df_combined_deu.copy()
df_combined_deu_train =  df_combined_deu_train[df_combined_deu_train['date'] <= 2017]

#Copy df_combined_arg to make test set
df_combined_deu_test = df_combined_deu.copy()
df_combined_deu_test = df_combined_deu_test[df_combined_deu_test['date'] > 2017]

Number of missing data: 0


Correlation between datasets

In [95]:
df_corr_deu = df_combined_deu_test.drop('date', axis = 1)
corr = df_corr_deu.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
corr.where(~mask, inplace=True)

fig = px.imshow(corr, text_auto=True, color_continuous_scale= px.colors.sequential.Emrld, width= 1500, height=750)

fig.show()





### Import & Export


Data Preparation

In [96]:
#Combining all dataframes
dfs_deu_ie = [df_gdp_filtered_ie[df_gdp_filtered_ie['country'] == 'Germany'], df_export_filtered[df_export_filtered['country'] == 'Germany'], df_import_filtered[df_import_filtered['country'] == 'Germany'], df_freight_filtered[df_freight_filtered['country'] == 'Germany']]
df_combined_deu_ie = pd.concat(dfs_deu_ie, axis = 1)

#Merge columns date
df_combined_deu_ie = df_combined_deu_ie.drop(['date'], axis = 1)
df_combined_deu_ie['date'] = df_gdp_filtered_ie[df_gdp_filtered_ie['country'] == 'Germany']['date']

#Merge columns country
df_combined_deu_ie = df_combined_deu_ie.drop(['country'], axis = 1)
df_combined_deu_ie['country'] = df_gdp_filtered_ie[df_gdp_filtered_ie['country'] == 'Germany']['country']

#Check for missing data
print(f'Number of missing data: {df_combined_deu_ie.isnull().sum().sum()}')

#Copy df_combined_deu_ie to make train set
df_combined_deu_train_ie = df_combined_deu_ie.copy()
df_combined_deu_train_ie = df_combined_deu_train_ie[df_combined_deu_train_ie['date'] <= 2017]

#Copy df_combined_deu_ie to make test set
df_combined_deu_test_ie = df_combined_deu_ie.copy()
df_combined_deu_test_ie = df_combined_deu_test_ie[df_combined_deu_test_ie['date'] > 2017]

Number of missing data: 0


Correlation between datasets

In [97]:
df_corr_deu = df_combined_deu_test_ie.drop('date', axis = 1)
corr = df_corr_deu.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
corr.where(~mask, inplace=True)

fig = px.imshow(corr, text_auto=True, color_continuous_scale= px.colors.sequential.Emrld, width= 1500, height=750)

fig.show()





## Greece

### Inequality and Social Welfare

Data Preparation

In [98]:
#Combining all dataframes
dfs_grc = [df_gdp_filtered[df_gdp_filtered['country'] == 'Greece'], df_health_filtered[df_health_filtered['country'] == 'Greece'], df_life_exp_filtered[df_life_exp_filtered['country'] == 'Greece'], df_diseases_filtered[df_diseases_filtered['country'] == 'Greece']]
df_combined_grc = pd.concat(dfs_grc, axis = 1)

#Merge columns date
df_combined_grc = df_combined_grc.drop(['date'], axis = 1)
df_combined_grc['date'] = df_gdp_filtered[df_gdp_filtered['country'] == 'Greece']['date']

#Merge columns country
df_combined_grc = df_combined_grc.drop(['country'], axis = 1)
df_combined_grc['country'] = df_gdp_filtered[df_gdp_filtered['country'] == 'Greece']['country']

#Check for missing data
print(f'Number of missing data: {df_combined_grc.isnull().sum().sum()}')

#Copy df_combined_grc to make train set
df_combined_grc_train = df_combined_grc.copy()
df_combined_grc_train =  df_combined_grc_train[df_combined_grc_train['date'] <= 2017]

#Copy df_combined_grc to make test set
df_combined_grc_test = df_combined_grc.copy()
df_combined_grc_test = df_combined_grc_test[df_combined_grc_test['date'] > 2017]

Number of missing data: 0


Correlation between datasets

In [99]:
df_corr_grc = df_combined_grc_test.drop('date', axis = 1)
corr = df_corr_grc.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
corr.where(~mask, inplace=True)

fig = px.imshow(corr, text_auto=True, color_continuous_scale= px.colors.sequential.Emrld, width= 1500, height=750)

fig.show()





### Import & Export

Data Preparation

In [100]:
#Combining all dataframes
dfs_grc_ie = [df_gdp_filtered_ie[df_gdp_filtered_ie['country'] == 'Greece'], df_export_filtered[df_export_filtered['country'] == 'Greece'], df_import_filtered[df_import_filtered['country'] == 'Greece'], df_freight_filtered[df_freight_filtered['country'] == 'Greece']]
df_combined_grc_ie = pd.concat(dfs_grc_ie, axis = 1)

#Merge columns date
df_combined_grc_ie = df_combined_grc_ie.drop(['date'], axis = 1)
df_combined_grc_ie['date'] = df_gdp_filtered_ie[df_gdp_filtered_ie['country'] == 'Greece']['date']

#Merge columns country
df_combined_grc_ie = df_combined_grc_ie.drop(['country'], axis = 1)
df_combined_grc_ie['country'] = df_gdp_filtered_ie[df_gdp_filtered_ie['country'] == 'Greece']['country']

#Check for missing data
print(f'Number of missing data: {df_combined_grc_ie.isnull().sum().sum()}')

#Copy df_combined_grc_ie to make train set
df_combined_grc_train_ie = df_combined_grc_ie.copy()
df_combined_grc_train_ie = df_combined_grc_train_ie[df_combined_grc_train_ie['date'] <= 2017]

#Copy df_combined_grc_ie to make test set
df_combined_grc_test_ie = df_combined_grc_ie.copy()
df_combined_grc_test_ie = df_combined_grc_test_ie[df_combined_grc_test_ie['date'] > 2017]

Number of missing data: 0


Correlation between datasets

In [101]:
df_corr_grc = df_combined_grc_test_ie.drop('date', axis = 1)
corr = df_corr_grc.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
corr.where(~mask, inplace=True)

fig = px.imshow(corr, text_auto=True, color_continuous_scale= px.colors.sequential.Emrld, width= 1500, height=750)

fig.show()





## Ireland

### Inequality and Social Welfare

Data Preparation

In [102]:
#Combining all dataframes
dfs_irl = [df_gdp_filtered[df_gdp_filtered['country'] == 'Ireland'], df_health_filtered[df_health_filtered['country'] == 'Ireland'], df_life_exp_filtered[df_life_exp_filtered['country'] == 'Ireland'], df_diseases_filtered[df_diseases_filtered['country'] == 'Ireland']]
df_combined_irl = pd.concat(dfs_irl, axis = 1)

#Merge columns date
df_combined_irl = df_combined_irl.drop(['date'], axis = 1)
df_combined_irl['date'] = df_gdp_filtered[df_gdp_filtered['country'] == 'Ireland']['date']

#Merge columns country
df_combined_irl = df_combined_irl.drop(['country'], axis = 1)
df_combined_irl['country'] = df_gdp_filtered[df_gdp_filtered['country'] == 'Ireland']['country']

#Check for missing data
print(f'Number of missing data: {df_combined_irl.isnull().sum().sum()}')

#Copy df_combined_irl to make train set
df_combined_irl_train = df_combined_irl.copy()
df_combined_irl_train =  df_combined_irl_train[df_combined_irl_train['date'] <= 2017]

#Copy df_combined_irl to make test set
df_combined_irl_test = df_combined_irl.copy()
df_combined_irl_test = df_combined_irl_test[df_combined_irl_test['date'] > 2017]

Number of missing data: 0


Correlation between datasets

In [103]:
df_corr_irl = df_combined_irl_test.drop('date', axis = 1)
corr = df_corr_irl.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
corr.where(~mask, inplace=True)

fig = px.imshow(corr, text_auto=True, color_continuous_scale= px.colors.sequential.Emrld, width= 1500, height=750)

fig.show()





### Import & Export

Data Preparation

In [104]:
#Combining all dataframes
dfs_irl_ie = [df_gdp_filtered_ie[df_gdp_filtered_ie['country'] == 'Ireland'], df_export_filtered[df_export_filtered['country'] == 'Ireland'], df_import_filtered[df_import_filtered['country'] == 'Ireland'], df_freight_filtered[df_freight_filtered['country'] == 'Ireland']]
df_combined_irl_ie = pd.concat(dfs_irl_ie, axis = 1)

#Merge columns date
df_combined_irl_ie = df_combined_irl_ie.drop(['date'], axis = 1)
df_combined_irl_ie['date'] = df_gdp_filtered_ie[df_gdp_filtered_ie['country'] == 'Ireland']['date']

#Merge columns country
df_combined_irl_ie = df_combined_irl_ie.drop(['country'], axis = 1)
df_combined_irl_ie['country'] = df_gdp_filtered_ie[df_gdp_filtered_ie['country'] == 'Ireland']['country']

#Check for missing data
print(f'Number of missing data: {df_combined_irl_ie.isnull().sum().sum()}')

#Copy df_combined_irl_ie to make train set
df_combined_irl_train_ie = df_combined_irl_ie.copy()
df_combined_irl_train_ie = df_combined_irl_train_ie[df_combined_irl_train_ie['date'] <= 2017]

#Copy df_combined_irl_ie to make test set
df_combined_irl_test_ie = df_combined_irl_ie.copy()
df_combined_irl_test_ie = df_combined_irl_test_ie[df_combined_irl_test_ie['date'] > 2017]

Number of missing data: 0


Correlation between datasets

In [105]:
df_corr_irl = df_combined_irl_test_ie.drop('date', axis = 1)
corr = df_corr_irl.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
corr.where(~mask, inplace=True)

fig = px.imshow(corr, text_auto=True, color_continuous_scale= px.colors.sequential.Emrld, width= 1500, height=750)

fig.show()





## Random Forest

Model 

In [106]:
#Sorted data of the countries in a dictonary
dict_countries_data = {'The Netherlands - Inequality and Social Welfare': [df_combined_nld_train, df_combined_nld_test, df_combined_nld],
                       'Germany - Inequality and Social Welfare': [df_combined_deu_train, df_combined_deu_test, df_combined_deu],
                       'Greece - Inequality and Social Welfare': [df_combined_grc_train, df_combined_grc_test, df_combined_grc],
                       'Ireland - Inequality and Social Welfare': [df_combined_irl_train, df_combined_irl_test, df_combined_irl], 
                       'The Netherlands - Import and Export': [df_combined_nld_train_ie, df_combined_nld_test_ie, df_combined_nld_ie],
                       'Germany - Import and Export': [df_combined_deu_train_ie, df_combined_deu_test_ie, df_combined_deu_ie],
                       'Greece - Import and Export': [df_combined_grc_train_ie, df_combined_grc_test_ie, df_combined_grc_ie],
                       'Ireland - Import and Export': [df_combined_irl_train_ie, df_combined_irl_test_ie, df_combined_irl_ie]}

In [107]:
mae_list = []
rmse_list = []
pred_list = []


for key in dict_countries_data.keys():
    #Setting up X and Y
    X_test = dict_countries_data[key][1].drop(['gdppc', 'date', 'country'], axis=1)
    X_train = dict_countries_data[key][0].drop(['gdppc', 'date', 'country'], axis=1)
    y_test = dict_countries_data[key][1]['gdppc']
    y_train = dict_countries_data[key][0]['gdppc']

    scaler = StandardScaler()
    X_test_scaled = scaler.fit_transform(X_test)
    X_train_scaled = scaler.fit_transform(X_train)

    model = RandomForestRegressor()

    #Checking for best grid
    # param_grid = {
    #     'n_estimators': [500, 1000, 2000],
    #     'max_features': [1, 'sqrt'],
    #     'max_depth': [10, 20, 30],
    #     'min_samples_split': [2, 5, 10],
    #     'min_samples_leaf': [1, 2, 4]
    # }

    # grid_search = GridSearchCV(estimator= model, param_grid=param_grid, cv=3, n_jobs=-1)
    # grid_search.fit(X_train_scaled, y_train)

    # # Then you could fit the model with the best parameters
    # best_grid = grid_search.best_estimator_

    ##Fit model
    model.fit(X_train_scaled, y_train)

    score = model.score(X_test_scaled, y_test)
    predictions = model.predict(X_test_scaled)
    last_year = list(y_train)[0]
    predictions_CI = np.insert(predictions, 0, last_year)

    errors = model.predict(X_test_scaled) - y_test.values

    #Mean Absolute Error
    mae = np.mean(np.abs(errors))
    mae_list.append(mae)

    #Root Mean Square Error
    rmse = np.sqrt(np.mean(np.abs((errors) ** 2)))
    rmse_list.append(rmse)

    #Standard Error of the Mean (SEM)
    sem = np.std(errors) / np.sqrt(len(errors))

    #Confidence Interval
    critical_value = 1.984
    CI_lower = mae - critical_value * (np.std(errors) / np.sqrt(len(errors)))
    CI_upper = mae + critical_value * (np.std(errors) / np.sqrt(len(errors)))
    CI = (CI_lower, CI_upper)

    CI_lower_pred = predictions - critical_value * sem
    CI_upper_pred = predictions + critical_value * sem

    #Visaulizing results
    list_predicted_gdp =  list(predictions) + list(y_train)
    df_results = pd.DataFrame()
    df_results['gdppc'] = list_predicted_gdp
    df_results['date'] = list(dict_countries_data[key][2]['date'])

    # Initialise the figure
    fig = go.Figure()

    # Add the line for the actual GDP
    fig.add_trace(
        go.Scatter(
            x=dict_countries_data[key][2]['date'],
            y=dict_countries_data[key][2]['gdppc'],
            mode='lines',
            name='Actual GDP',
            line=dict(color='blue')
        )
    )

    # Add the line for the predicted GDP
    fig.add_trace(
        go.Scatter(
            x=df_results['date'],
            y=df_results['gdppc'],
            mode='lines',
            name='Predicted GDP',
            line=dict(color='red', dash='dash')
        )
    )


    title = "GDP prediction of " + str(key) + " using Random Forest"  
    
    fig.update_layout(
    title=dict(text= title))

    # Show the figure
    fig.show()

    print(f'The Mean Absolute Error (MAE) for Random Forest is {mae}')
    print(f'The Confidence Level of MAE is {CI}')
    print(f'The Root Mean Square Error (RMSE) for Random Forest is {rmse}')

The Mean Absolute Error (MAE) for Random Forest is 8587.855724487514
The Confidence Level of MAE is (2782.8555556889132, 14392.855893286114)
The Root Mean Square Error (RMSE) for Random Forest is 9971.663215663319


The Mean Absolute Error (MAE) for Random Forest is 8335.081724087744
The Confidence Level of MAE is (276.13319388640866, 16394.03025428908)
The Root Mean Square Error (RMSE) for Random Forest is 10907.446484242035


The Mean Absolute Error (MAE) for Random Forest is 4270.700097102711
The Confidence Level of MAE is (-416.2882830475546, 8957.688477252977)
The Root Mean Square Error (RMSE) for Random Forest is 4923.8797611865275


The Mean Absolute Error (MAE) for Random Forest is 32671.569552331915
The Confidence Level of MAE is (27477.025998431593, 37866.11310623224)
The Root Mean Square Error (RMSE) for Random Forest is 32984.794162055994


The Mean Absolute Error (MAE) for Random Forest is 24620.73552133661
The Confidence Level of MAE is (9488.997444273653, 39752.47359839956)
The Root Mean Square Error (RMSE) for Random Forest is 27940.80709927381


The Mean Absolute Error (MAE) for Random Forest is 23153.76043255468
The Confidence Level of MAE is (10180.067899280677, 36127.452965828685)
The Root Mean Square Error (RMSE) for Random Forest is 25775.53965979776


The Mean Absolute Error (MAE) for Random Forest is 9408.058386470939
The Confidence Level of MAE is (274.9530122462256, 18541.16376069565)
The Root Mean Square Error (RMSE) for Random Forest is 9788.215179687624


The Mean Absolute Error (MAE) for Random Forest is 53043.20846862925
The Confidence Level of MAE is (30088.10688757745, 75998.31004968105)
The Root Mean Square Error (RMSE) for Random Forest is 56702.59621734277


## Linear Regression

Model

In [108]:
for key in dict_countries_data.keys():
    #Setting up X and Y
    X_test = dict_countries_data[key][1].drop(['gdppc', 'date', 'country'], axis=1)
    X_train = dict_countries_data[key][0].drop(['gdppc', 'date', 'country'], axis=1)
    y_test = dict_countries_data[key][1]['gdppc']
    y_train = dict_countries_data[key][0]['gdppc']

    scaler = StandardScaler()
    X_test_scaled = scaler.fit_transform(X_test)
    X_train_scaled = scaler.fit_transform(X_train)

    model = LinearRegression()

    ##Fit model
    model.fit(X_train, y_train)

    score = model.score(X_test, y_test)
    predictions = model.predict(X_test)

    #Visaulizing results
    list_predicted_gdp =  list(predictions) + list(y_train)
    df_results = pd.DataFrame()
    df_results['gdppc'] = list_predicted_gdp
    df_results['date'] = list(dict_countries_data[key][2]['date'])

    # Initialise the figure
    fig = go.Figure()

    # Add the line for the actual GDP
    fig.add_trace(
        go.Scatter(
            x=dict_countries_data[key][2]['date'],
            y=dict_countries_data[key][2]['gdppc'],
            mode='lines',
            name='Actual GDP',
            line=dict(color='blue')
        )
    )

    # Add the line for the predicted GDP
    fig.add_trace(
        go.Scatter(
            x=df_results['date'],
            y=df_results['gdppc'],
            mode='lines',
            name='Predicted GDP',
            line=dict(color='red', dash='dash')
        )
    )

    title = "GDP prediction of " + str(key) + " using Linear Regression"
    
    fig.update_layout(
    title=dict(text= title))

    # Show the figure
    fig.show()

## Neural Network

Model

In [109]:
for key in dict_countries_data.keys():
    #Setting up X and Y
    X_test = dict_countries_data[key][1].drop(['gdppc', 'date', 'country'], axis=1)
    X_train = dict_countries_data[key][0].drop(['gdppc', 'date', 'country'], axis=1)
    y_test = dict_countries_data[key][1]['gdppc']
    y_train = dict_countries_data[key][0]['gdppc']

    scaler = StandardScaler()
    X_test_scaled = scaler.fit_transform(X_test)
    X_train_scaled = scaler.fit_transform(X_train)

    model = MLPRegressor(hidden_layer_sizes=(128, 64, 32), activation='relu', solver='adam',
                            max_iter= 50000)

    ##Fit model
    model.fit(X_train, y_train)

    score = model.score(X_test, y_test)
    predictions = model.predict(X_test)

    #Visaulizing results
    list_predicted_gdp =  list(predictions) + list(y_train)
    df_results = pd.DataFrame()
    df_results['gdppc'] = list_predicted_gdp
    df_results['date'] = list(dict_countries_data[key][2]['date'])

    # Initialise the figure
    fig = go.Figure()

    # Add the line for the actual GDP
    fig.add_trace(
        go.Scatter(
            x=dict_countries_data[key][2]['date'],
            y=dict_countries_data[key][2]['gdppc'],
            mode='lines',
            name='Actual GDP',
            line=dict(color='blue')
        )
    )

    # Add the line for the predicted GDP
    fig.add_trace(
        go.Scatter(
            x=df_results['date'],
            y=df_results['gdppc'],
            mode='lines',
            name='Predicted GDP',
            line=dict(color='red', dash='dash')
        )
    )

    title = "GDP prediction of " + str(key) + " using Neural Network"
    
    fig.update_layout(
    title=dict(text= title))

    # Show the figure
    fig.show()