# Importing dataset

In [None]:
import numpy as np
import pandas as pd
import seaborn as sb
sb.set()
from sklearn import metrics
import missingno as msn
%matplotlib inline
import matplotlib.pyplot as plt
# Read the two sheets of the WHR 2018 workbook: the main table and the supporting factors.
WORKBOOK_2018 = 'WHR2018Chapter2OnlineData.xls'
data = pd.read_excel(WORKBOOK_2018, sheet_name='Table2.1')
reg_data = pd.read_excel(WORKBOOK_2018, sheet_name='SupportingFactors')

# Keep only the country -> region lookup from the supporting-factors sheet.
region = pd.DataFrame(reg_data[["country", "Region indicator"]])

# Left merge: every row of the main table is kept; countries without a match get NaN.
data_M = data.merge(region, on='country', how='left')

# Move the region column to position 1, directly after the first column.
col = data_M.columns.tolist()
col.remove('Region indicator')
col.insert(1, 'Region indicator')
data_M = data_M[col]
data_M.head(100)    # merged data

In [None]:
# Label rows whose country had no match in the region lookup.
# The result is assigned back instead of calling fillna(inplace=True) on the selected
# column: that form is chained assignment, which pandas warns about and which leaves
# data_M unchanged once Copy-on-Write is active.
data_M['Region indicator'] = data_M['Region indicator'].fillna("None")

# Count the real regions by excluding the "None" placeholder explicitly, so the number is
# still right when every country was matched (the old len(...) - 1 would be off by one).
region_labels = data_M['Region indicator']
print("number of regions:", region_labels[region_labels != "None"].nunique())
data_M['Region indicator'].value_counts()    # information about the regions

In [None]:
# Bar chart of the number of rows per region.
f, axes = plt.subplots(1, 1, figsize=(32, 5))
# Pass the column through `x=`: recent seaborn releases no longer treat the first
# positional argument as the x variable. Drawing on `axes` makes the target explicit.
sb.countplot(x=data_M["Region indicator"], ax=axes)

# Visualisation of missing data

In [None]:
# Summarise column dtypes and non-null counts of the merged frame.
data_M.info()

In [None]:
# Shuffle all rows with frac=1 instead of asking for a hard-coded 1562 rows:
# sample(n) raises ValueError as soon as the frame holds fewer rows than n.
msn.matrix(data_M.sample(frac=1))   # visualise the locations where the values are missing

msn.bar(data_M.sample(frac=1))    # counting the data points present for each variable in the dataset

# Filling in the missing data

In [None]:
import scipy as sp

In [None]:
# Fill gaps by linear interpolation between neighbouring rows. This is pandas' own
# 'linear' method, which treats rows as equally spaced; scipy is not involved.
# Only numeric columns are interpolated: the frame also holds text columns
# (country, region), and interpolating object-dtype columns is deprecated in pandas.
# NOTE(review): interpolation runs down the whole table, so a gap at the start or end of
# one country's rows is filled using the adjacent country's values - confirm this is intended.
numeric_cols = data_M.select_dtypes(include='number').columns
data_pred = data_M.copy()
data_pred[numeric_cols] = data_M[numeric_cols].interpolate(method='linear')
data_pred.head(1562)

In [None]:
# Non-null counts after interpolation, for comparison with the merged frame's summary.
data_pred.info()

In [None]:
# Any gaps still visible here were not filled by the interpolation, which was run
# without extrapolation.
# frac=1 shuffles every row without hard-coding the row count (sample(1562) raises
# ValueError if the frame has fewer rows).
msn.matrix(data_pred.sample(frac=1))

msn.bar(data_pred.sample(frac=1))

# Visualisation of filled-in data

In [None]:
# Pairwise distributions of the original (unfilled) data, with the year column left out.
sb.pairplot(data=data_M.drop(columns=['year']))

In [None]:
# Pairwise distributions of the filled-in data; these should resemble the original data's.
sb.pairplot(data=data_pred.drop(columns=['year']))

In [None]:
# Drop every row that still has a missing value after interpolation.
# No extrapolation was used for points outside the range of the given data because
# (1) there are few of them and (2) their values would be less reliable, since they would
# be predicted from already-predicted values.
data_complete = data_pred.dropna()
data_complete.head(1562)

In [None]:
# Confirm the row count and that no column has missing values after dropna().
data_complete.info()

In [None]:
# Correlation heatmap of all numeric variables except the year.
data_c = data_complete.drop(['year'], axis=1)
f, axes = plt.subplots(1, 1, figsize=(12, 8))
# data_c still contains the text columns (country, region). Restrict to numeric columns
# before calling corr(): recent pandas raises on non-numeric data instead of silently
# skipping it.
corr_matrix = data_c.select_dtypes(include='number').corr()
# Read the first row/column to see which variable correlates most with Life Ladder.
sb.heatmap(corr_matrix, vmin=-1, vmax=1, annot=True, fmt=".2f", ax=axes)

# Building regression models

## <font color='purple'>Preparation of train-test datasets</font>

In [None]:
# Start the modelling frame from the complete data, without the World Bank GINI estimate.
data_comp = data_complete.drop(columns=['GINI index (World Bank estimate)'])

<font color='red'>**_We are not going to use this column in building our model because fewer than half of its values were actually given in the dataset, so the model would be based almost entirely on the values we predicted when filling in the missing data points._**</font>

In [None]:
# Remove the two ladder-dispersion columns from the modelling frame.
ladder_derived_columns = [
    'Standard deviation of ladder by country-year',
    'Standard deviation/Mean of ladder by country-year',
]
data_comp = data_comp.drop(columns=ladder_derived_columns)

<font color='red'>**_We are not going to use these columns because they are directly related to, or calculated from, Life Ladder, which is the variable we are trying to predict._**</font>

In [None]:
# Drop the identifier columns and renumber the rows from 0.
# data_comp1 keeps the old row labels in an 'index' column; data_comp does not.
identifier_columns = ['year', 'country', 'Region indicator']
data_comp1 = data_comp.drop(columns=identifier_columns).reset_index()

data_comp = data_comp1.drop(columns=['index'])

<font color='red'>**_We are not going to use these columns because they are either string variables or do not have a linear relationship with Life Ladder._**</font>

In [None]:
# Preview the first rows of the frame used for modelling.
data_comp.head()

In [None]:
# Response: Life Ladder as a one-column frame. Predictors: every other column.
y = data_comp[['Life Ladder']]
X = data_comp.drop(columns=['Life Ladder'])

## <font color='purple'>Linear regression model</font>

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fixed random_state makes the split, and every
# score computed from it further down, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("X_Train Set :", X_train.shape)
print("y_Train Set :", y_train.shape)
print("X_Test Set  :", X_test.shape)
print("y_Test Set  :", y_test.shape)

In [None]:
from sklearn.linear_model import LinearRegression
# Unfitted linear regression estimator with default settings; it is fitted in a later cell.
linreg = LinearRegression()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error

In [None]:
# Fit the linear model on the training split and report its parameters.
linreg.fit(X_train, y_train)
print('Intercept of Regression \t: b = ', linreg.intercept_)
print('Coefficients of Regression \t: a = ', linreg.coef_)
print()
# y_train is a one-column frame, so coef_ is 2-D; row 0 holds one coefficient per predictor.
pd.DataFrame({"Predictors": X_train.columns, "Coefficients": linreg.coef_[0]})

### <font color='grey'> Using cross validation to get an estimate of how well the model will do</font>

In [None]:
# Out-of-fold predictions: each row is predicted by a model fitted on the other four folds.
y_val_pred_L = cross_val_predict(linreg, X, y, cv=5)    # estimate how well this model will do for predictions

# The points are cross-validated predictions for the whole dataset, not training-set
# predictions, so the axes are labelled accordingly.
f, axes = plt.subplots(1, 1, figsize=(12, 12))
axes.scatter(y, y_val_pred_L, color="blue")
axes.plot(y, y, 'w-', linewidth=1)    # reference line: prediction == truth
axes.set_xlabel("True values of the Response Variable (Cross-validation)")
axes.set_ylabel("Predicted values of the Response Variable (Cross-validation)")
plt.show()

print("Goodness of Fit of Model \t")
print("Score of model (R^2) \t:", cross_val_score(linreg, X, y, cv=5))    # R^2 values for each training and validation iteration
print("Error of prediction (MSE) \t:", mean_squared_error(y, y_val_pred_L))
print("Accuracy of prediction \t:", metrics.r2_score(y, y_val_pred_L))    # how well this model should do for predictions
print()

### <font color='grey'> Actual performance of the model</font>

In [None]:
# Evaluate the fitted linear model on the held-out test split.
y_test_pred_L = linreg.predict(X_test)

f, axes = plt.subplots(1, 1, figsize=(12, 12))
axes.scatter(y_test, y_test_pred_L, color="green")
axes.plot(y_test, y_test, 'w-', linewidth=1)    # reference line: prediction == truth
axes.set_xlabel("True values of the Response Variable (Test)")
axes.set_ylabel("Predicted values of the Response Variable (Test)")
plt.show()

print("Goodness of Fit of Model \tTest Dataset")
print("Score of model (R^2) \t:", linreg.score(X_test, y_test))
# Output label corrected from the misspelled "Erroe".
print("Error of prediction (MSE) \t:", mean_squared_error(y_test, y_test_pred_L))
print("Accuracy of prediction \t:", metrics.r2_score(y_test, y_test_pred_L))    # how well the model actually did on the test set
print()

## <font color='purple'>Random forest regressor model</font>

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Seed the forest: bootstrap sampling is random, so an unseeded model reports different
# scores on every run.
rfr = RandomForestRegressor(random_state=42)

In [None]:
# Fit the forest on the training split; the one-column target frame is flattened to 1-D.
rfr.fit(X_train, y_train.to_numpy().ravel())

### <font color='grey'> Using cross validation to get an estimate of how well the model will do</font>

In [None]:
# Out-of-fold predictions: each row is predicted by a forest fitted on the other four folds.
y_val_pred_R = cross_val_predict(rfr, X, y.values.ravel(), cv=5)    # estimate how well this model will do for predictions

# The points are cross-validated predictions for the whole dataset, not training-set
# predictions, so the axes are labelled accordingly.
f, axes = plt.subplots(1, 1, figsize=(12, 12))
axes.scatter(y, y_val_pred_R, color="blue")
axes.plot(y, y, 'w-', linewidth=1)    # reference line: prediction == truth
axes.set_xlabel("True values of the Response Variable (Cross-validation)")
axes.set_ylabel("Predicted values of the Response Variable (Cross-validation)")
plt.show()

print("Goodness of Fit of Model \t")
print("Score of model (R^2) \t:", cross_val_score(rfr, X, y.values.ravel(), cv=5))    # R^2 values for each training and validation iteration
print("Error of prediction (MSE) \t:", mean_squared_error(y, y_val_pred_R))
print("Accuracy of prediction \t:", metrics.r2_score(y, y_val_pred_R))    # how well this model should do for predictions
print()

### <font color='grey'> Actual performance of the model</font>

In [None]:
# Evaluate the fitted random forest on the held-out test split.
y_test_pred_R = rfr.predict(X_test)

f, axes = plt.subplots(1, 1, figsize=(12, 12))
axes.scatter(y_test, y_test_pred_R, color="green")
axes.plot(y_test, y_test, 'w-', linewidth=1)    # reference line: prediction == truth
axes.set_xlabel("True values of the Response Variable (Test)")
axes.set_ylabel("Predicted values of the Response Variable (Test)")
plt.show()

print("Goodness of Fit of Model \tTest Dataset")
print("Score of model (R^2) \t:", rfr.score(X_test, y_test))
# Output label corrected from the misspelled "Erroe".
print("Error of prediction (MSE) \t:", mean_squared_error(y_test, y_test_pred_R))
print("Accuracy of prediction \t:", metrics.r2_score(y_test, y_test_pred_R))    # how well the model actually did on the test set
print()

## <font color='purple'>Multi-layer perceptron regressor model</font>

### <font color='grey'> Data scaling</font>

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training features only, so the test split does not influence
# the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train)

In [None]:
# Apply the fitted scaler to both splits; each result is re-wrapped as a DataFrame.
X_tr, X_te = (pd.DataFrame(scaler.transform(split)) for split in (X_train, X_test))

In [None]:
# Scale the target with its own scaler. Refitting the shared `scaler` on y_train would
# overwrite the feature statistics learned from X_train, leaving no way to transform new
# feature data the way the training features were transformed.
y_scaler = StandardScaler().fit(y_train)
y_tr = pd.DataFrame(y_scaler.transform(y_train))
y_te = pd.DataFrame(y_scaler.transform(y_test))

<font color='red'>**_Data scaling is important here to avoid exploding gradients._**</font>

In [None]:
from sklearn.neural_network import MLPRegressor
# One hidden layer of 10 units. `(10)` is just the integer 10; `(10,)` is the one-element
# tuple the parameter is documented to take.
# random_state is fixed so that weight initialisation and batch shuffling, and therefore
# the reported scores, are reproducible.
mlpr = MLPRegressor(hidden_layer_sizes=(10,), activation='tanh', solver='adam', alpha=0.001, batch_size='auto',
               learning_rate='constant', learning_rate_init=0.01, power_t=0.5, max_iter=1000, shuffle=True,
               random_state=42, tol=0.0001, verbose=False, warm_start=False, momentum=0.9,
               nesterovs_momentum=True, early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
               epsilon=1e-08)

In [None]:
# Train the network on the scaled training split. fit() returns the estimator itself,
# so `reg` and `mlpr` refer to the same fitted model.
reg = mlpr.fit(X_tr, y_tr.to_numpy().ravel())

### <font color='grey'> Using cross validation to get an estimate of how well the model will do</font>

In [None]:
from sklearn.pipeline import make_pipeline

# Standardise the target so the errors are on the same scale as the network's training target.
y_sc = pd.DataFrame(StandardScaler().fit_transform(y))

# Scale the features inside a pipeline so each fold fits its own scaler on that fold's
# training rows. cross_val_predict / cross_val_score clone the pipeline, so the already
# fitted `reg` is left untouched.
# Previously this cell called reg.predict on the whole dataset, most of which the model
# had been trained on, so the result was not a cross-validated estimate.
mlp_cv = make_pipeline(StandardScaler(), reg)
y_val_pred_M = cross_val_predict(mlp_cv, X, y_sc.values.ravel(), cv=5)    # out-of-fold predictions

f, axes = plt.subplots(1, 1, figsize=(12, 12))
axes.scatter(y_sc, y_val_pred_M, color="blue")
axes.plot(y_sc, y_sc, 'w-', linewidth=1)    # reference line: prediction == truth
axes.set_xlabel("True values of the Response Variable (Cross-validation)")
axes.set_ylabel("Predicted values of the Response Variable (Cross-validation)")
plt.show()

print("Goodness of Fit of Model \t")
print("Score of model (R^2) \t:", cross_val_score(mlp_cv, X, y_sc.values.ravel(), cv=5))    # R^2 values for each training and validation iteration
print("Error of prediction (MSE) \t:", mean_squared_error(y_sc, y_val_pred_M))
print("Accuracy of prediction \t:", metrics.r2_score(y_sc, y_val_pred_M))    # how well this model should do for predictions
print()

### <font color='grey'> Actual performance of the model</font>

In [None]:
# Predict the (scaled) target for the scaled test features.
y_test_pred_M = reg.predict(X_te)

In [None]:
# Test-set performance of the network, in standardised target units.
f, axes = plt.subplots(1, 1, figsize=(12, 12))
axes.scatter(y_te, y_test_pred_M, color="green")
axes.plot(y_te, y_te, 'w-', linewidth=1)    # reference line: prediction == truth
# This plot shows the test split; the labels previously said "(Train)".
axes.set_xlabel("True values of the Response Variable (Test)")
axes.set_ylabel("Predicted values of the Response Variable (Test)")
plt.show()

print("Goodness of Fit of Model \tTest Dataset")
print("Score of model (R^2) \t:", reg.score(X_te, y_te))
# Output label corrected from the misspelled "Erroe".
print("Error of prediction (MSE) \t:", mean_squared_error(y_te, y_test_pred_M))
print("Accuracy of prediction \t:", metrics.r2_score(y_te, y_test_pred_M))    # how well the model actually did on the test set
print()

# Prediction of 2018 Life Ladder

In [None]:
# Load the 'Table2.1' sheet of the WHR 2019 workbook; rows for 2018 are selected from it further down.
data_2018 = pd.read_excel('WHR2019Chapter2OnlineData.xls',sheet_name = 'Table2.1')

In [None]:
# Summarise column names, dtypes and non-null counts of the 2019 sheet.
data_2018.info()

In [None]:
# Columns to take from the 2019 sheet, listed in the order they are selected.
columns_2018 = [
    'Year',
    'Country name',
    'Life Ladder',
    'Log GDP per capita',
    'Social support',
    'Healthy life expectancy at birth',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
    'Positive affect',
    'Negative affect',
    'Confidence in national government',
    'Democratic Quality',
    'Delivery Quality',
    'Standard deviation of ladder by country-year',
    'Standard deviation/Mean of ladder by country-year',
    'GINI index (World Bank estimate)',
    'GINI index (World Bank estimate), average 2000-16',
    'gini of household income reported in Gallup, by wp5-year',
]
overall = pd.DataFrame(data_2018[columns_2018])

In [None]:
# Non-null counts of the selected columns, before any gap filling.
overall.info()

In [None]:
# Missing-value pattern of the selected 2019-sheet columns.
# frac=1 shuffles every row without hard-coding the row count (sample(1704) raises
# ValueError if the frame has fewer rows).
msn.matrix(overall.sample(frac=1))
msn.bar(overall.sample(frac=1))

In [None]:
# Fill gaps by linear interpolation between neighbouring rows.
# Only numeric columns are interpolated: the frame also holds the text column
# 'Country name', and interpolating object-dtype columns is deprecated in pandas.
numeric_cols_2018 = overall.select_dtypes(include='number').columns
pred_2018 = overall.copy()
pred_2018[numeric_cols_2018] = overall[numeric_cols_2018].interpolate(method='linear')

In [None]:
# Missing-value pattern after interpolation.
# frac=1 shuffles every row without hard-coding the row count (sample(1704) raises
# ValueError if the frame has fewer rows).
msn.matrix(pred_2018.sample(frac=1))
msn.bar(pred_2018.sample(frac=1))

In [None]:
# Drop incomplete rows first, then remove the columns that are not model inputs.
columns_not_used = [
    'GINI index (World Bank estimate)',
    'Standard deviation of ladder by country-year',
    'Standard deviation/Mean of ladder by country-year',
    'Country name',
]
pred_2018 = pred_2018.dropna().drop(columns=columns_not_used)

pred_2018.info()

In [None]:
# Keep the rows for 2018, drop the year column and renumber the rows from 0.
is_2018 = pred_2018["Year"] == 2018
year_2018 = pred_2018.loc[is_2018].drop(columns=['Year']).reset_index(drop=True)
year_2018.info()

In [None]:
# Show up to the first 135 rows of the 2018 frame.
year_2018.head(135)

In [None]:
# Response and predictors for the 2018 rows.
y_2018 = pd.DataFrame(year_2018['Life Ladder'])
X_2018 = year_2018.drop(['Life Ladder'], axis=1)

# NOTE(review): these features come from a different workbook than the training data, so
# their column labels may not be spelled identically (e.g. the GINI average column) -
# confirm against the training sheet. Estimators fitted on a DataFrame check feature
# names at predict time, so reuse the training labels by position when the column counts
# agree. This keeps the existing assumption that both frames list features in the same order.
if X_2018.shape[1] == X_train.shape[1]:
    X_2018.columns = X_train.columns

In [None]:
# Predict 2018 Life Ladder from the unscaled 2018 features with the linear model
# and the random forest.
y_2018_pred_L = linreg.predict(X_2018)
y_2018_pred_R = rfr.predict(X_2018)

In [None]:
# Data scaling for MLPRegressor.
# New data must be transformed with the training features' statistics. Refitting a scaler
# on X_2018 would feed the network inputs on a different scale from the one it was
# trained on. Plain arrays are used so differing column labels cannot matter.
x_scaler_train = StandardScaler().fit(X_train.to_numpy())
X_M_2018 = pd.DataFrame(x_scaler_train.transform(X_2018.to_numpy()))

In [None]:
# Data scaling for MLPRegressor.
# Express the 2018 target in the training target's standardised units, which are the
# units the network predicts in, instead of rescaling it with its own mean and variance.
y_scaler_train = StandardScaler().fit(y_train.to_numpy())
y_M_2018 = pd.DataFrame(y_scaler_train.transform(y_2018.to_numpy()))

In [None]:
# Predict the 2018 target with the network from the scaled 2018 features.
y_2018_pred_M = reg.predict(X_M_2018)

In [None]:
# One panel per model: (true values, predicted values, marker colour).
# The third panel uses the scaled target, matching the network's scaled predictions.
panels = [
    (y_2018, y_2018_pred_L, "blue"),
    (y_2018, y_2018_pred_R, "green"),
    (y_M_2018, y_2018_pred_M, "red"),
]

f, axes = plt.subplots(1, 3, figsize=(24, 6))
for ax, (truth, predicted, colour) in zip(axes, panels):
    ax.scatter(truth, predicted, color=colour)
    ax.plot(truth, truth, 'w-', linewidth=1)    # reference line: prediction == truth
    ax.set_xlabel("True values of the Response Variable")
    ax.set_ylabel("Predicted values of the Response Variable")
plt.show()

In [None]:
# Scores of the three models on the 2018 data.
print("Goodness of Fit of Linear Regression Model \t")
print("Score of the model (R^2) \t:", linreg.score(X_2018, y_2018))
print("Error of prediction (MSE) \t:", mean_squared_error(y_2018, y_2018_pred_L))
print("Accuracy of prediction:", metrics.r2_score(y_2018, y_2018_pred_L))
print()

print("Goodness of Fit of Random Forest Regressor Model \t")
print("Score of model (R^2) \t:", rfr.score(X_2018, y_2018))
# Output label corrected from the misspelled "Erroe".
print("Error of prediction (MSE) \t:", mean_squared_error(y_2018, y_2018_pred_R))
print("Accuracy of prediction \t:", metrics.r2_score(y_2018, y_2018_pred_R))
print()

# These figures belong to the multi-layer perceptron (`reg`); the heading previously
# repeated "Linear Regression". They are in scaled target units.
print("Goodness of Fit of Multi-layer Perceptron Regressor Model \t")
print("Score of the model (R^2) \t:", reg.score(X_M_2018, y_M_2018))
print("Error of prediction (MSE) \t:", mean_squared_error(y_M_2018, y_2018_pred_M))
print("Accuracy of prediction:", metrics.r2_score(y_M_2018, y_2018_pred_M))
print()

# Conclusion

Based on the scores obtained from cross-validation and from the predictions on the test datasets, the MLPRegressor and the random forest regressor have higher scores and accuracies than the linear regression model, and hence would be the best models for future Life Ladder predictions.