# EPA112A - Programming for Data Science - Group 31

- Georges Puttaert - 4686160
- Thijs Roolvink
- Gijs de Werd

## Research Question

 **Income Inequality and Social Welfare: What is the impact of income inequality on social welfare indicators, such as life expectancy, education access, and healthcare quality for countries in the low, lower middle, upper middle and high income categories?**

Link to countries per income category:
<br>
https://datatopics.worldbank.org/world-development-indicators/the-world-by-income-and-region.html

Chosen Countries per category:
- **High income**: United States of America (USA)
- **Upper middle income**: Argentina (ARG)
- **Lower middle income**: India (IND)
- **Low income**: Ethiopia (ETH)

### Packages

In [163]:
import wbdata
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import GridSearchCV

### Indicators from the World Bank

In [164]:
# World Bank indicator codes mapped to the column labels used throughout this notebook
health_indicators = {
    "SH.XPD.CHEX.GD.ZS": "Health Expenditure as a Percentage of GDP",
    "SH.IMM.IDPT": "Immunization",
}
GDP_indicator = {"NY.GDP.PCAP.CD": "gdppc"}
life_exp_indicator = {
    "SP.DYN.LE00.IN": "Life Expectancy at Birth",
    "SH.DYN.MORT": "Child Mortality",
}
# NOTE(review): despite the variable name, this mapping only holds the maternal mortality series
immuni_childmortality_indicator = {"SH.STA.MMRT": "Maternal Mortality Rate"}
# NOTE(review): this series was annotated as "Incidence of Diseases" yet is labelled
# 'Unemployement'; the label is kept as-is because the plotting cells use it as a column name
unemployement_indicator = {"SH.TBS.INCD": "Unemployement"}

### DataFrames for chosen indicators and countries from the World Bank

In [165]:
# ISO-3 codes of the four countries under study (one per income category)
countries = ['USA', 'ARG', 'IND', 'ETH']

# Every indicator group is requested with the same settings: the chosen countries,
# and convert_date=True (the cleaning step afterwards uses the .dt accessor on 'date')
wb_query = dict(country=countries, convert_date=True)

df_health = wbdata.get_dataframe(health_indicators, **wb_query)
df_gdp = wbdata.get_dataframe(GDP_indicator, **wb_query)
df_life_exp = wbdata.get_dataframe(life_exp_indicator, **wb_query)
df_immuni_childmortality = wbdata.get_dataframe(immuni_childmortality_indicator, **wb_query)
df_unemployement = wbdata.get_dataframe(unemployement_indicator, **wb_query)

### Data Cleaning

In [166]:
def _index_to_year_columns(frame):
    """Return ``frame`` with its index moved into columns and 'date' reduced to the year."""
    flat = frame.reset_index()
    flat['date'] = flat['date'].dt.year
    return flat


# NOTE: run this cell once per fetch; the frames are overwritten, and a second run
# would call .dt on a 'date' column that already holds plain years
df_health = _index_to_year_columns(df_health)
df_gdp = _index_to_year_columns(df_gdp)
df_life_exp = _index_to_year_columns(df_life_exp)
df_immuni_childmortality = _index_to_year_columns(df_immuni_childmortality)
df_unemployement = _index_to_year_columns(df_unemployement)

### Comparing the four chosen countries

In [167]:
# (data frame, column to plot, figure title) for each indicator, in display order
line_specs = [
    (df_gdp, 'gdppc', 'GDP per Capita'),
    (df_health, 'Health Expenditure as a Percentage of GDP', 'Health Expenditure as a Percentage of GDP'),
    (df_health, 'Immunization', 'Immunization'),
    (df_life_exp, 'Life Expectancy at Birth', 'Life Expectancy at Birth'),
    (df_life_exp, 'Child Mortality', 'Child Mortality'),
    (df_immuni_childmortality, 'Maternal Mortality Rate', 'Maternal Mortality Rate'),
    (df_unemployement, 'Unemployement', 'Unemployement Rate'),
]

# One line per country over time; the fig1..fig7 names are kept
fig1, fig2, fig3, fig4, fig5, fig6, fig7 = [
    px.line(frame, x='date', y=column, color='country', title=title)
    for frame, column, title in line_specs
]

# Render the figures in the same order they were built
for figure in (fig1, fig2, fig3, fig4, fig5, fig6, fig7):
    figure.show()

### Setting up a date range from 2000 until 2020

In [168]:
# Quick check of the object type returned when selecting the 'date' column
type(df_health['date'])

pandas.core.series.Series

In [169]:
# Keep only the observations dated 2000 through 2020 (both bounds included)
first_year, last_year = 2000, 2020

df_health_filtered = df_health[df_health['date'].between(first_year, last_year)]
df_gdp_filtered = df_gdp[df_gdp['date'].between(first_year, last_year)]
df_life_exp_filtered = df_life_exp[df_life_exp['date'].between(first_year, last_year)]
df_immuni_childmortality_filtered = df_immuni_childmortality[
    df_immuni_childmortality['date'].between(first_year, last_year)
]
df_unemployement_filtered = df_unemployement[df_unemployement['date'].between(first_year, last_year)]

In [170]:
# (filtered data frame, column to plot, figure title) for each indicator, in display order
line_specs = [
    (df_gdp_filtered, 'gdppc', 'GDP per Capita'),
    (df_health_filtered, 'Health Expenditure as a Percentage of GDP', 'Health Expenditure as a Percentage of GDP'),
    (df_health_filtered, 'Immunization', 'Immunization'),
    (df_life_exp_filtered, 'Life Expectancy at Birth', 'Life Expectancy at Birth'),
    (df_life_exp_filtered, 'Child Mortality', 'Child Mortality'),
    (df_immuni_childmortality_filtered, 'Maternal Mortality Rate', 'Maternal Mortality Rate'),
    (df_unemployement_filtered, 'Unemployement', 'Unemployement Rate'),
]

# One line per country over the filtered period; the fig1..fig7 names are kept
fig1, fig2, fig3, fig4, fig5, fig6, fig7 = [
    px.line(frame, x='date', y=column, color='country', title=title)
    for frame, column, title in line_specs
]

# Render the figures in the same order they were built
for figure in (fig1, fig2, fig3, fig4, fig5, fig6, fig7):
    figure.show()

## Random Forest

### Data Preparation

In [171]:
# Build one USA table holding every indicator, joined explicitly on (country, date).
# Joining on the keys (instead of concatenating the frames side by side) matches rows by
# year rather than by whatever index labels each frame happens to carry, and it does not
# create duplicate 'country'/'date' columns that have to be dropped and re-added.
usa_name = 'United States'
key_columns = ['country', 'date']

dfs_usa = [
    frame[frame['country'] == usa_name]
    for frame in (
        df_gdp_filtered,
        df_health_filtered,
        df_life_exp_filtered,
        df_immuni_childmortality_filtered,
        df_unemployement_filtered,
    )
]

# Left joins keep the rows and the row order of the GDP frame; validate guards against
# a (country, date) pair appearing more than once in any indicator frame
df_combined_usa = dfs_usa[0]
for indicator_frame in dfs_usa[1:]:
    df_combined_usa = df_combined_usa.merge(
        indicator_frame, on=key_columns, how='left', validate='one_to_one'
    )

# Indicator columns first, then 'date' and 'country' at the end
indicator_columns = [col for col in df_combined_usa.columns if col not in key_columns]
df_combined_usa = df_combined_usa[indicator_columns + ['date', 'country']]

#Check for missing data
print(f'Number of missing data: {df_combined_usa.isnull().sum().sum()}')

# Chronological hold-out: years up to and including 2017 train the model,
# the later years are kept for testing
split_year = 2017
df_combined_usa_train = df_combined_usa[df_combined_usa['date'] <= split_year].copy()
df_combined_usa_test = df_combined_usa[df_combined_usa['date'] > split_year].copy()

Number of missing data: 0


### Model 

In [172]:
# Features are every indicator except the target; 'date' and 'country' are identifiers,
# and 'gdppc' is the value being predicted
X_test = df_combined_usa_test.drop(['gdppc', 'date', 'country'], axis=1)
X_train = df_combined_usa_train.drop(['gdppc', 'date', 'country'], axis=1)
y_test = df_combined_usa_test['gdppc']
y_train = df_combined_usa_train['gdppc']

# Fit the scaler on the training rows only and reuse those statistics for the test rows;
# fitting it on the test rows as well would standardise them with their own mean/variance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fixed seed so the forest (and the grid search built on it) is reproducible across runs
model = RandomForestRegressor(random_state=42)

In [173]:
# Hyper-parameter candidates explored exhaustively by the grid search
param_grid = dict(
    n_estimators=[500, 1000, 2000],
    max_features=[1, 'sqrt'],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 3-fold cross-validation over every combination; n_jobs=-1 runs the fits in parallel
grid_search = GridSearchCV(model, param_grid, cv=3, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Estimator configured with the best-scoring parameter combination
best_grid = grid_search.best_estimator_

In [174]:
# Train the selected estimator on the unscaled training features
best_grid.fit(X_train, y_train)

# Predictions and score on the held-out rows (same unscaled feature layout as in training)
predictions = best_grid.predict(X_test)
score = best_grid.score(X_test, y_test)

### Visualization of the results

In [175]:
# Predicted values for the test years followed by the known values of the training years.
# Each value is paired with the date of the row it came from, so the plot does not depend
# on the combined frame happening to list the test years before the training years.
list_predicted_gdp = list(predictions) + list(y_train)
list_result_dates = list(df_combined_usa_test['date']) + list(df_combined_usa_train['date'])
df_results_usa = pd.DataFrame(
    {'gdppc': list_predicted_gdp, 'date': list_result_dates}
).sort_values('date')

# Actual series in chronological order so the line is drawn left to right
df_actual_usa = df_combined_usa.sort_values('date')

# Initialise the figure
fig = go.Figure()

# Add the line for the actual GDP
fig.add_trace(
    go.Scatter(
        x=df_actual_usa['date'],
        y=df_actual_usa['gdppc'],
        mode='lines',
        name='Actual GDP',
        line=dict(color='blue')
    )
)

# Add the line for the predicted GDP (identical to the actual values over the training years)
fig.add_trace(
    go.Scatter(
        x=df_results_usa['date'],
        y=df_results_usa['gdppc'],
        mode='lines',
        name='Predicted GDP',
        line=dict(color='red', dash='dash')
    )
)

# Title and axis labels so the figure can be read on its own
fig.update_layout(
    title='United States: actual vs predicted GDP per capita',
    xaxis_title='Year',
    yaxis_title='GDP per capita',
)

# Show the figure
fig.show()