# EPA112A - Programming for Data Science - Group 31

- Georges Puttaert - 4686160
- Thijs Roolvink
- Gijs de Werd

## Research Question

 **Income Inequality and Social Welfare: What is the impact of income inequality on social welfare indicators, such as life expectancy, education access, and healthcare quality for countries in the low, lower middle, upper middle and high income categories?**

Link to countries per income category:
<br>
https://datatopics.worldbank.org/world-development-indicators/the-world-by-income-and-region.html

Chosen Countries per category:
- **Upper income**: United States of America (USA)
- **Upper middle income**: Argentina (ARG)
- **Lower middle income**: India (IND)
- **Low income**: Ethiopia (ETH)

### Packages

In [28]:
import wbdata
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

### Indicators from the World Bank

In [29]:
# Define indicators Inequality and Social Welfare
health_indicators = {'SH.XPD.CHEX.GD.ZS': "Health Expenditure as a Percentage of GDP", "SH.IMM.IDPT": "Immunization"}
GDP_indicator = {'NY.GDP.PCAP.CD': 'gdppc'}
life_exp_indicator = {'SP.DYN.LE00.IN': 'Life Expectancy at Birth', 'SH.DYN.MORT': 'Child Mortality'}
immuni_childmortality_indicator = {'SH.STA.MMRT': 'Maternal Mortality Rate'}
disease_indicator = {'SH.TBS.INCD': 'Indicence of Diseases'} 

#Define indicators Import & Export
export_indicator = {'NE.EXP.GNFS.KD.ZG': 'Export'}
import_indicator = {'NE.IMP.GNFS.KD.ZG': 'Import'}
freight_indicator = {'IS.AIR.GOOD.MT.K1': 'Freight'}


### DataFrames for chosen indicators and countries from the World Bank

In [30]:
countries = ['USA', 'ARG', 'IND', 'ETH']

#Dataframes Inequality and Social Welfare
df_health = wbdata.get_dataframe(health_indicators, country=countries, convert_date=True)
df_gdp = wbdata.get_dataframe(GDP_indicator, country=countries, convert_date=True)
df_life_exp = wbdata.get_dataframe(life_exp_indicator, country=countries, convert_date=True)
df_immuni_childmortality = wbdata.get_dataframe(immuni_childmortality_indicator, country=countries, convert_date=True)
df_diseases = wbdata.get_dataframe(disease_indicator, country=countries, convert_date=True)

#Dataframes Import & Export
df_export = wbdata.get_dataframe(export_indicator, country=countries, convert_date=True)
df_import = wbdata.get_dataframe(import_indicator, country=countries, convert_date=True)
df_freight = wbdata.get_dataframe(freight_indicator, country=countries, convert_date=True)

### Data Cleaning

In [31]:
### Inequality and Social Welfare ###
# Reset index of the dataframes
df_health = df_health.reset_index()
df_gdp = df_gdp.reset_index()
df_life_exp = df_life_exp.reset_index()
df_immuni_childmortality = df_immuni_childmortality.reset_index()
df_diseases = df_diseases.reset_index()

#Formatting the date column to year
df_health['date'] = df_health['date'].dt.year
df_gdp['date'] = df_gdp['date'].dt.year
df_life_exp['date'] = df_life_exp['date'].dt.year
df_immuni_childmortality['date'] = df_immuni_childmortality['date'].dt.year
df_diseases['date'] = df_diseases['date'].dt.year 

In [32]:
### Import & Export ###
# Reset index of the dataframes
df_export = df_export.reset_index()
df_import = df_import.reset_index()
df_freight = df_freight.reset_index()

#Formatting the date column to year
df_export['date'] = df_export['date'].dt.year
df_import['date'] = df_import['date'].dt.year
df_freight['date'] = df_freight['date'].dt.year

### Visualizing the indicators

In [33]:
#GDP per capita
fig1 = px.line(df_gdp, x = 'date', y = 'gdppc', color = 'country', title = 'GDP per Capita')

### Inequality and Social Welfare ###

#Health expenditure as a percentae of GDP
fig2 = px.line(df_health, x = 'date', y = 'Health Expenditure as a Percentage of GDP', color = 'country', title = 'Health Expenditure as a Percentage of GDP')

#Immunization
fig3 = px.line(df_health, x = 'date', y = 'Immunization', color = 'country', title = 'Immunization')

#Life Expectancy at Birth
fig4 = px.line(df_life_exp, x = 'date', y = 'Life Expectancy at Birth', color = 'country', title = 'Life Expectancy at Birth')

#Child Mortality
fig5 = px.line(df_life_exp, x = 'date', y = 'Child Mortality', color = 'country', title = 'Child Mortality')

# Maternal Mortality Rate
fig6 = px.line(df_immuni_childmortality, x = 'date', y = 'Maternal Mortality Rate', color = 'country', title = 'Maternal Mortality Rate')

# Indices of Diseases
fig7  = px.line(df_diseases, x = 'date', y = 'Indicence of Diseases', color = 'country', title = 'Indices of Diseases')

fig1.show()
fig2.show()
fig3.show()
fig4.show()
fig5.show()
fig6.show()
fig7.show()

In [34]:
#GDP per capita
fig1 = px.line(df_gdp, x = 'date', y = 'gdppc', color = 'country', title = 'GDP per Capita')

### Import & Export ###
#Export
fig2 = px.line(df_export, x = 'date', y = 'Export', color = 'country', title = 'Exports of goods and services (annual percentage growth)')

#Import 
fig3 = px.line(df_import, x = 'date', y = 'Import', color = 'country', title = 'Imports of goods and services (annual percentage growth)')

#Freight
fig4 = px.line(df_freight, x = 'date', y = 'Freight', color = 'country', title = 'ir transport, freight (million ton-km)')

fig1.show()
fig2.show()
fig3.show()
fig4.show()

### Setting up a date range from 2000 until 2020

In [35]:
### Inequality and Social Welfare ###
#Dataframes with dates starting in 2000 unitll 2020
df_health_filtered = df_health[(df_health['date'] >= 2000) & (df_health['date'] <= 2020)]
df_gdp_filtered = df_gdp[(df_gdp['date'] >= 2000) & (df_gdp['date'] <= 2020)]
df_life_exp_filtered = df_life_exp[(df_life_exp['date'] >= 2000) & (df_life_exp['date'] <= 2020)]
df_immuni_childmortality_filtered = df_immuni_childmortality[(df_immuni_childmortality['date'] >= 2000) & (df_immuni_childmortality['date'] <= 2020)]
df_diseases_filtered = df_diseases[(df_diseases['date'] >= 2000) & (df_diseases['date'] <= 2020)]

In [36]:
#GDP per capita
fig1 = px.line(df_gdp_filtered, x = 'date', y = 'gdppc', color = 'country', title = 'GDP per Capita')

### Inequality and Social Welfare ###
#Health expenditure as a percentae of GDP
fig2 = px.line(df_health_filtered, x = 'date', y = 'Health Expenditure as a Percentage of GDP', color = 'country', title = 'Health Expenditure as a Percentage of GDP')

#Immunization
fig3 = px.line(df_health_filtered, x = 'date', y = 'Immunization', color = 'country', title = 'Immunization')

#Life Expectancy at Birth
fig4 = px.line(df_life_exp_filtered, x = 'date', y = 'Life Expectancy at Birth', color = 'country', title = 'Life Expectancy at Birth')

#Child Mortality
fig5 = px.line(df_life_exp_filtered, x = 'date', y = 'Child Mortality', color = 'country', title = 'Child Mortality')

# Maternal Mortality Rate
fig6 = px.line(df_immuni_childmortality_filtered, x = 'date', y = 'Maternal Mortality Rate', color = 'country', title = 'Maternal Mortality Rate')

# Indices of Diseases
fig7  = px.line(df_diseases_filtered, x = 'date', y = 'Indicence of Diseases', color = 'country', title = 'Indices of Diseases')

fig1.show()
fig2.show()
fig3.show()
fig4.show()
fig5.show()
fig6.show()
fig7.show()

In [37]:
### Import & Export ###
#Dataframes with dates starting in 2000 unitll 2020
df_export_filtered = df_export[(df_export['date'] >= 2000) & (df_export['date'] <= 2000)]
df_import_filtered = df_import[(df_import['date'] >= 2000) & (df_import['date'] <= 2000)]
df_freight_filtered = df_freight[(df_freight['date'] >= 2000) & (df_freight['date'] <= 2000)]

In [38]:
#GDP per capita
fig1 = px.line(df_gdp_filtered, x = 'date', y = 'gdppc', color = 'country', title = 'GDP per Capita')

### Import & Export ###
#Export
fig2 = px.line(df_export_filtered, x = 'date', y = 'Export', color = 'country', title = 'Exports of goods and services (annual percentage growth)')

#Import 
fig3 = px.line(df_import_filtered, x = 'date', y = 'Import', color = 'country', title = 'Imports of goods and services (annual percentage growth)')

#Freight
fig4 = px.line(df_freight_filtered, x = 'date', y = 'Freight', color = 'country', title = 'ir transport, freight (million ton-km)')

fig1.show()
fig2.show()
fig3.show()
fig4.show()

## United States of America

### Inequality and Social Welfare

Data Preparation

In [39]:
#Combining all dataframes
dfs_usa = [df_gdp_filtered[df_gdp_filtered['country'] == 'United States'], df_health_filtered[df_health_filtered['country'] == 'United States'], df_life_exp_filtered[df_life_exp_filtered['country'] == 'United States'], df_immuni_childmortality_filtered[df_immuni_childmortality_filtered['country'] == 'United States'], df_diseases_filtered[df_diseases_filtered['country'] == 'United States']]
df_combined_usa = pd.concat(dfs_usa, axis = 1)

#Merge columns date
df_combined_usa = df_combined_usa.drop(['date'], axis = 1)
df_combined_usa['date'] = df_gdp_filtered[df_gdp_filtered['country'] == 'United States']['date']

#Merge columns country
df_combined_usa = df_combined_usa.drop(['country'], axis = 1)
df_combined_usa['country'] = df_gdp_filtered[df_gdp_filtered['country'] == 'United States']['country']

#Check for missing data
print(f'Number of missing data: {df_combined_usa.isnull().sum().sum()}')

#Copy df_combined_usa to make train set
df_combined_usa_train = df_combined_usa.copy()
df_combined_usa_train =  df_combined_usa_train[df_combined_usa_train['date'] <= 2017]

#Copy df_combined_usa to make test set
df_combined_usa_test = df_combined_usa.copy()
df_combined_usa_test = df_combined_usa_test[df_combined_usa_test['date'] > 2017]

Number of missing data: 0


Correlation between datasets

In [40]:
df_corr_usa = df_combined_usa_test.drop('date', axis = 1)
corr = df_corr_usa.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
corr.where(~mask, inplace=True)

fig = px.imshow(corr, text_auto=True, color_continuous_scale='RdBu', width= 1500, height=750)

fig.show()





### Import & Export

Data Preparation

In [41]:
#Combining all dataframes
dfs_usa_ie = [df_gdp_filtered[df_gdp_filtered['country'] == 'United States'], df_export_filtered[df_export_filtered['country'] == 'United States'], df_import_filtered[df_import_filtered['country'] == 'United States'], df_freight_filtered[df_freight_filtered['country'] == 'United States']]
df_combined_usa_ie = pd.concat(dfs_usa_ie, axis = 1)

#Merge columns date
df_combined_usa_ie = df_combined_usa_ie.drop(['date'], axis = 1)
df_combined_usa_ie['date'] = df_gdp_filtered[df_gdp_filtered['country'] == 'United States']['date']

#Merge columns country
df_combined_usa_ie = df_combined_usa_ie.drop(['country'], axis = 1)
df_combined_usa_ie['country'] = df_gdp_filtered[df_gdp_filtered['country'] == 'United States']['country']

#Check for missing data
print(f'Number of missing data: {df_combined_usa_ie.isnull().sum().sum()}')

#Copy df_combined_usa_ie to make train set
df_combined_usa_train_ie = df_combined_usa_ie.copy()
df_combined_usa_train_ie = df_combined_usa_train_ie[df_combined_usa_train_ie['date'] <= 2017]

#Copy df_combined_usa_ie to make test set
df_combined_usa_test_ie = df_combined_usa_ie.copy()
df_combined_usa_test_ie = df_combined_usa_test_ie[df_combined_usa_test_ie['date'] > 2017]

Number of missing data: 60


Correlation between datasets

In [42]:
df_corr_usa = df_combined_usa_test_ie.drop('date', axis = 1)
corr = df_corr_usa.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
corr.where(~mask, inplace=True)

fig = px.imshow(corr, text_auto=True, color_continuous_scale='RdBu', width= 1500, height=750)

fig.show()





## Argentina

### Inequality and Social Welfare

Data Preparation

In [43]:
#Combining all dataframes
dfs_arg = [df_gdp_filtered[df_gdp_filtered['country'] == 'Argentina'], df_health_filtered[df_health_filtered['country'] == 'Argentina'], df_life_exp_filtered[df_life_exp_filtered['country'] == 'Argentina'], df_immuni_childmortality_filtered[df_immuni_childmortality_filtered['country'] == 'Argentina'], df_diseases_filtered[df_diseases_filtered['country'] == 'Argentina']]
df_combined_arg = pd.concat(dfs_arg, axis = 1)

#Merge columns date
df_combined_arg = df_combined_arg.drop(['date'], axis = 1)
df_combined_arg['date'] = df_gdp_filtered[df_gdp_filtered['country'] == 'Argentina']['date']

#Merge columns country
df_combined_arg = df_combined_arg.drop(['country'], axis = 1)
df_combined_arg['country'] = df_gdp_filtered[df_gdp_filtered['country'] == 'Argentina']['country']

#Check for missing data
print(f'Number of missing data: {df_combined_arg.isnull().sum().sum()}')

#Copy df_combined_arg to make train set
df_combined_arg_train = df_combined_arg.copy()
df_combined_arg_train =  df_combined_arg_train[df_combined_arg_train['date'] <= 2017]

#Copy df_combined_arg to make test set
df_combined_arg_test = df_combined_arg.copy()
df_combined_arg_test = df_combined_arg_test[df_combined_arg_test['date'] > 2017]

Number of missing data: 0


Correlation between datasets

In [44]:
df_corr_arg = df_combined_arg_test.drop('date', axis = 1)
corr = df_corr_arg.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
corr.where(~mask, inplace=True)

fig = px.imshow(corr, text_auto=True, color_continuous_scale='RdBu', width= 1500, height=750)

fig.show()





### Import & Export


Data Preparation

In [45]:
#Combining all dataframes
dfs_arg_ie = [df_gdp_filtered[df_gdp_filtered['country'] == 'Argentina'], df_export_filtered[df_export_filtered['country'] == 'Argentina'], df_import_filtered[df_import_filtered['country'] == 'Argentina'], df_freight_filtered[df_freight_filtered['country'] == 'Argentina']]
df_combined_arg_ie = pd.concat(dfs_arg_ie, axis = 1)

#Merge columns date
df_combined_arg_ie = df_combined_arg_ie.drop(['date'], axis = 1)
df_combined_arg_ie['date'] = df_gdp_filtered[df_gdp_filtered['country'] == 'Argentina']['date']

#Merge columns country
df_combined_arg_ie = df_combined_arg_ie.drop(['country'], axis = 1)
df_combined_arg_ie['country'] = df_gdp_filtered[df_gdp_filtered['country'] == 'Argentina']['country']

#Check for missing data
print(f'Number of missing data: {df_combined_arg_ie.isnull().sum().sum()}')

#Copy df_combined_arg_ie to make train set
df_combined_arg_train_ie = df_combined_arg_ie.copy()
df_combined_arg_train_ie = df_combined_arg_train_ie[df_combined_arg_train_ie['date'] <= 2017]

#Copy df_combined_arg_ie to make test set
df_combined_arg_test_ie = df_combined_arg_ie.copy()
df_combined_arg_test_ie = df_combined_arg_test_ie[df_combined_arg_test_ie['date'] > 2017]

Number of missing data: 60


Correlation between datasets

In [46]:
df_corr_arg = df_combined_arg_test_ie.drop('date', axis = 1)
corr = df_corr_arg.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
corr.where(~mask, inplace=True)

fig = px.imshow(corr, text_auto=True, color_continuous_scale='RdBu', width= 1500, height=750)

fig.show()





## India

### Inequality and Social Welfare

Data Preparation

In [47]:
#Combining all dataframes
dfs_ind = [df_gdp_filtered[df_gdp_filtered['country'] == 'India'], df_health_filtered[df_health_filtered['country'] == 'India'], df_life_exp_filtered[df_life_exp_filtered['country'] == 'India'], df_immuni_childmortality_filtered[df_immuni_childmortality_filtered['country'] == 'India'], df_diseases_filtered[df_diseases_filtered['country'] == 'India']]
df_combined_ind = pd.concat(dfs_ind, axis = 1)

#Merge columns date
df_combined_ind = df_combined_ind.drop(['date'], axis = 1)
df_combined_ind['date'] = df_gdp_filtered[df_gdp_filtered['country'] == 'India']['date']

#Merge columns country
df_combined_ind = df_combined_ind.drop(['country'], axis = 1)
df_combined_ind['country'] = df_gdp_filtered[df_gdp_filtered['country'] == 'India']['country']

#Check for missing data
print(f'Number of missing data: {df_combined_ind.isnull().sum().sum()}')

#Copy df_combined_ind to make train set
df_combined_ind_train = df_combined_ind.copy()
df_combined_ind_train =  df_combined_ind_train[df_combined_ind_train['date'] <= 2017]

#Copy df_combined_ind to make test set
df_combined_ind_test = df_combined_ind.copy()
df_combined_ind_test = df_combined_ind_test[df_combined_ind_test['date'] > 2017]

Number of missing data: 0


Correlation between datasets

In [48]:
df_corr_ind = df_combined_ind_test.drop('date', axis = 1)
corr = df_corr_ind.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
corr.where(~mask, inplace=True)

fig = px.imshow(corr, text_auto=True, color_continuous_scale='RdBu', width= 1500, height=750)

fig.show()





### Import & Export

Data Preparation

In [49]:
#Combining all dataframes
dfs_ind_ie = [df_gdp_filtered[df_gdp_filtered['country'] == 'India'], df_export_filtered[df_export_filtered['country'] == 'India'], df_import_filtered[df_import_filtered['country'] == 'India'], df_freight_filtered[df_freight_filtered['country'] == 'India']]
df_combined_ind_ie = pd.concat(dfs_ind_ie, axis = 1)

#Merge columns date
df_combined_ind_ie = df_combined_ind_ie.drop(['date'], axis = 1)
df_combined_ind_ie['date'] = df_gdp_filtered[df_gdp_filtered['country'] == 'India']['date']

#Merge columns country
df_combined_ind_ie = df_combined_ind_ie.drop(['country'], axis = 1)
df_combined_ind_ie['country'] = df_gdp_filtered[df_gdp_filtered['country'] == 'India']['country']

#Check for missing data
print(f'Number of missing data: {df_combined_ind_ie.isnull().sum().sum()}')

#Copy df_combined_ind_ie to make train set
df_combined_ind_train_ie = df_combined_ind_ie.copy()
df_combined_ind_train_ie = df_combined_ind_train_ie[df_combined_ind_train_ie['date'] <= 2017]

#Copy df_combined_ind_ie to make test set
df_combined_ind_test_ie = df_combined_ind_ie.copy()
df_combined_ind_test_ie = df_combined_ind_test_ie[df_combined_ind_test_ie['date'] > 2017]

Number of missing data: 60


Correlation between datasets

In [50]:
df_corr_ind = df_combined_ind_test_ie.drop('date', axis = 1)
corr = df_corr_ind.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
corr.where(~mask, inplace=True)

fig = px.imshow(corr, text_auto=True, color_continuous_scale='RdBu', width= 1500, height=750)

fig.show()





## Ethiopia

### Data Preparation

In [51]:
#Combining all dataframes
dfs_eth = [df_gdp_filtered[df_gdp_filtered['country'] == 'Ethiopia'], df_health_filtered[df_health_filtered['country'] == 'Ethiopia'], df_life_exp_filtered[df_life_exp_filtered['country'] == 'Ethiopia'], df_immuni_childmortality_filtered[df_immuni_childmortality_filtered['country'] == 'Ethiopia'], df_diseases_filtered[df_diseases_filtered['country'] == 'Ethiopia']]
df_combined_eth = pd.concat(dfs_eth, axis = 1)

#Merge columns date
df_combined_eth = df_combined_eth.drop(['date'], axis = 1)
df_combined_eth['date'] = df_gdp_filtered[df_gdp_filtered['country'] == 'Ethiopia']['date']

#Merge columns country
df_combined_eth = df_combined_eth.drop(['country'], axis = 1)
df_combined_eth['country'] = df_gdp_filtered[df_gdp_filtered['country'] == 'Ethiopia']['country']

#Check for missing data
print(f'Number of missing data: {df_combined_eth.isnull().sum().sum()}')

#Copy df_combined_eth to make train set
df_combined_eth_train = df_combined_eth.copy()
df_combined_eth_train =  df_combined_eth_train[df_combined_eth_train['date'] <= 2017]

#Copy df_combined_eth to make test set
df_combined_eth_test = df_combined_eth.copy()
df_combined_eth_test = df_combined_eth_test[df_combined_eth_test['date'] > 2017]

Number of missing data: 0


### Correlation between datasets

In [52]:
df_corr_eth = df_combined_eth_test.drop('date', axis = 1)
corr = df_corr_eth.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
corr.where(~mask, inplace=True)

fig = px.imshow(corr, text_auto=True, color_continuous_scale='RdBu', width= 1500, height=750)

fig.show()





## Random Forest

### Model 

In [53]:
#Sorted data of the countries in a dictonary
dict_countries_data = {'United States - Inequality and Social Welfare': [df_combined_usa_train, df_combined_usa_test, df_combined_usa],
                       'Argentina - Inequality and Social Welfare': [df_combined_arg_train, df_combined_arg_test, df_combined_arg],
                       'India - Inequality and Social Welfare': [df_combined_ind_train, df_combined_ind_test, df_combined_ind],
                       'Ethiopia - Inequality and Social Welfare': [df_combined_eth_train, df_combined_eth_test, df_combined_eth], 
                       'United States - Import and Export': [df_combined_usa_train_ie, df_combined_usa_test_ie, df_combined_usa_ie],
                       'Argentina - Import and Export': [df_combined_arg_train_ie, df_combined_arg_test_ie, df_combined_arg_ie],
                       'India - Import and Export': [df_combined_ind_train_ie, df_combined_ind_test_ie, df_combined_ind_ie]}

In [54]:
mae_list = []
rmse_list = []
pred_list = []


for key in dict_countries_data.keys():
    #Setting up X and Y
    X_test = dict_countries_data[key][1].drop(['gdppc', 'date', 'country'], axis=1)
    X_train = dict_countries_data[key][0].drop(['gdppc', 'date', 'country'], axis=1)
    y_test = dict_countries_data[key][1]['gdppc']
    y_train = dict_countries_data[key][0]['gdppc']

    scaler = StandardScaler()
    X_test_scaled = scaler.fit_transform(X_test)
    X_train_scaled = scaler.fit_transform(X_train)

    model = RandomForestRegressor()

    #Checking for best grid
    # param_grid = {
    #     'n_estimators': [500, 1000, 2000],
    #     'max_features': [1, 'sqrt'],
    #     'max_depth': [10, 20, 30],
    #     'min_samples_split': [2, 5, 10],
    #     'min_samples_leaf': [1, 2, 4]
    # }

    # grid_search = GridSearchCV(estimator= model, param_grid=param_grid, cv=3, n_jobs=-1)
    # grid_search.fit(X_train_scaled, y_train)

    # # Then you could fit the model with the best parameters
    # best_grid = grid_search.best_estimator_

    ##Fit model
    model.fit(X_train_scaled, y_train)

    score = model.score(X_test_scaled, y_test)
    predictions = model.predict(X_test_scaled)
    last_year = list(y_train)[0]
    predictions_CI = np.insert(predictions, 0, last_year)

    errors = model.predict(X_test_scaled) - y_test.values

    #Mean Absolute Error
    mae = np.mean(np.abs(errors))
    mae_list.append(mae)

    #Root Mean Square Error
    rmse = np.sqrt(np.mean(np.abs((errors) ** 2)))
    rmse_list.append(rmse)

    #Standard Error of the Mean (SEM)
    sem = np.std(errors) / np.sqrt(len(errors))

    #Confidence Interval
    critical_value = 1.984
    CI_lower = mae - critical_value * (np.std(errors) / np.sqrt(len(errors)))
    CI_upper = mae + critical_value * (np.std(errors) / np.sqrt(len(errors)))
    CI = (CI_lower, CI_upper)

    CI_lower_pred = predictions - critical_value * sem
    CI_upper_pred = predictions + critical_value * sem

    #Visaulizing results
    list_predicted_gdp =  list(predictions) + list(y_train)
    df_results = pd.DataFrame()
    df_results['gdppc'] = list_predicted_gdp
    df_results['date'] = list(dict_countries_data[key][2]['date'])

    # Initialise the figure
    fig = go.Figure()

    # Add the line for the actual GDP
    fig.add_trace(
        go.Scatter(
            x=dict_countries_data[key][2]['date'],
            y=dict_countries_data[key][2]['gdppc'],
            mode='lines',
            name='Actual GDP',
            line=dict(color='blue')
        )
    )

    # Add the line for the predicted GDP
    fig.add_trace(
        go.Scatter(
            x=df_results['date'],
            y=df_results['gdppc'],
            mode='lines',
            name='Predicted GDP',
            line=dict(color='red', dash='dash')
        )
    )

    fig.add_trace(
    go.Scatter(
        x=df_results['date'],
        y=CI_upper_pred,
        mode='lines',
        name='Upper Confidence Limit',
        line=dict(width=0),
        fill=None
        )
    )   
    fig.add_trace(
        go.Scatter(
            x=df_results['date'],
            y=CI_lower_pred,
            mode='lines',
            name='Lower Confidence Limit',
            line=dict(width=0),
            fill='tonexty'
        )
    )

    title = "GDP prediction of " + str(key) + " using Random Forest"  
    
    fig.update_layout(
    title=dict(text= title))

    # Show the figure
    fig.show()

    print(f'The Mean Absolute Error (MAE) for Random Forest is {mae}')
    print(f'The Confidence Level of MAE is {CI}')
    print(f'The Root Mean Square Error (RMSE) for Random Forest is {rmse}')

The Mean Absolute Error (MAE) for Random Forest is 16334.87692826249
The Confidence Level of MAE is (11077.984435207094, 21591.769421317887)
The Root Mean Square Error (RMSE) for Random Forest is 16967.322552555324


The Mean Absolute Error (MAE) for Random Forest is 1281.6639756412048
The Confidence Level of MAE is (-559.4259475738925, 3122.753898856302)
The Root Mean Square Error (RMSE) for Random Forest is 1702.010928905055


The Mean Absolute Error (MAE) for Random Forest is 861.7289882648183
The Confidence Level of MAE is (767.1411088622256, 956.3168676674111)
The Root Mean Square Error (RMSE) for Random Forest is 865.6764199436511


The Mean Absolute Error (MAE) for Random Forest is 506.72341469336965
The Confidence Level of MAE is (404.68328826401574, 608.7635411227236)
The Root Mean Square Error (RMSE) for Random Forest is 514.4941411796686



invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide



ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

## Linear Regression

### Model

In [None]:
for key in dict_countries_data.keys():
    #Setting up X and Y
    X_test = dict_countries_data[key][1].drop(['gdppc', 'date', 'country'], axis=1)
    X_train = dict_countries_data[key][0].drop(['gdppc', 'date', 'country'], axis=1)
    y_test = dict_countries_data[key][1]['gdppc']
    y_train = dict_countries_data[key][0]['gdppc']

    scaler = StandardScaler()
    X_test_scaled = scaler.fit_transform(X_test)
    X_train_scaled = scaler.fit_transform(X_train)

    model = LinearRegression()

    ##Fit model
    model.fit(X_train, y_train)

    score = model.score(X_test, y_test)
    predictions = model.predict(X_test)

    #Visaulizing results
    list_predicted_gdp =  list(predictions) + list(y_train)
    df_results = pd.DataFrame()
    df_results['gdppc'] = list_predicted_gdp
    df_results['date'] = list(dict_countries_data[key][2]['date'])

    # Initialise the figure
    fig = go.Figure()

    # Add the line for the actual GDP
    fig.add_trace(
        go.Scatter(
            x=dict_countries_data[key][2]['date'],
            y=dict_countries_data[key][2]['gdppc'],
            mode='lines',
            name='Actual GDP',
            line=dict(color='blue')
        )
    )

    # Add the line for the predicted GDP
    fig.add_trace(
        go.Scatter(
            x=df_results['date'],
            y=df_results['gdppc'],
            mode='lines',
            name='Predicted GDP',
            line=dict(color='red', dash='dash')
        )
    )

    title = "GDP prediction of " + str(key) + " using Linear Regression"
    
    fig.update_layout(
    title=dict(text= title))

    # Show the figure
    fig.show()

## Neural Network

### Model

In [None]:
for key in dict_countries_data.keys():
    #Setting up X and Y
    X_test = dict_countries_data[key][1].drop(['gdppc', 'date', 'country'], axis=1)
    X_train = dict_countries_data[key][0].drop(['gdppc', 'date', 'country'], axis=1)
    y_test = dict_countries_data[key][1]['gdppc']
    y_train = dict_countries_data[key][0]['gdppc']

    scaler = StandardScaler()
    X_test_scaled = scaler.fit_transform(X_test)
    X_train_scaled = scaler.fit_transform(X_train)

    model = MLPRegressor(hidden_layer_sizes=(128, 64, 32), activation='relu', solver='adam',
                            max_iter= 50000)

    ##Fit model
    model.fit(X_train, y_train)

    score = model.score(X_test, y_test)
    predictions = model.predict(X_test)

    #Visaulizing results
    list_predicted_gdp =  list(predictions) + list(y_train)
    df_results = pd.DataFrame()
    df_results['gdppc'] = list_predicted_gdp
    df_results['date'] = list(dict_countries_data[key][2]['date'])

    # Initialise the figure
    fig = go.Figure()

    # Add the line for the actual GDP
    fig.add_trace(
        go.Scatter(
            x=dict_countries_data[key][2]['date'],
            y=dict_countries_data[key][2]['gdppc'],
            mode='lines',
            name='Actual GDP',
            line=dict(color='blue')
        )
    )

    # Add the line for the predicted GDP
    fig.add_trace(
        go.Scatter(
            x=df_results['date'],
            y=df_results['gdppc'],
            mode='lines',
            name='Predicted GDP',
            line=dict(color='red', dash='dash')
        )
    )

    title = "GDP prediction of " + str(key) + " using Neural Network"
    
    fig.update_layout(
    title=dict(text= title))

    # Show the figure
    fig.show()