In [196]:
# Import libraries
import sys
import pandas as pd
import math
import numpy as np
from operator import itemgetter
import time
import joblib

from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import RFE, VarianceThreshold, SelectFromModel
from sklearn.feature_selection import SelectKBest, mutual_info_regression, mutual_info_classif, chi2
from sklearn import metrics
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import KBinsDiscretizer, scale

In [197]:
# Global parameters
norm_features=0                                     #Normalize features switch
feat_select=1                                       #Control Switch for Feature Selection
fs_type=2                                           #Feature Selection type (1=Stepwise Backwards Removal, 2=Wrapper Select, 3=Univariate Selection)
lv_filter=0                                         #Control switch for low variance filter on features
feat_start=2                                        #Start column of features
dataset=1                                           #Which merged data to use (0=reduced, 1=full)

In [198]:
# Load data
df0=pd.read_csv('2021.csv',engine='python')
df1=df0[['Country name','Happiness score', 'Freedom to make life choices', 'Social support']]
df2 = pd.read_csv('CPI 2021 Score.csv',engine='python')
df3=pd.read_csv('happyscore_income.csv',engine='python')
df4=df3[['country','avg_satisfaction', 'GDP', 'avg_income']]
df5= pd.merge(left = df1, right = df2,how= 'inner',left_on='Country name', right_on='Country')
df5.drop(['Country'],axis=1, inplace=True)
df6= pd.merge(left = df5, right = df4,how= 'inner',left_on='Country name', right_on='country')
df6.drop(['country'],axis=1, inplace=True)
df6.head()

Unnamed: 0,Country name,Happiness score,Freedom to make life choices,Social support,CPI 2021 Score,avg_satisfaction,GDP,avg_income
0,Finland,7.842,0.949,0.954,89,7.9,1.29025,17310.195
1,Denmark,7.62,0.946,0.954,90,8.4,1.32548,17496.51
2,Switzerland,7.571,0.919,0.942,86,8.0,1.39651,23400.04
3,Iceland,7.554,0.955,0.983,78,8.1,1.30232,18828.345
4,Netherlands,7.464,0.913,0.942,83,7.6,1.32944,18234.435


In [199]:
df7= pd.merge(left = df0, right = df2,how= 'inner',left_on='Country name', right_on='Country')
df7.drop(['Country'],axis=1, inplace=True)
df8= pd.merge(left = df7, right = df3,how= 'inner',left_on='Country name', right_on='country')
df8.drop(['country'],axis=1, inplace=True)
df8.head()

Unnamed: 0,Country name,Regional indicator,Happiness score,Standard error of ladder score,upperwhisker,lowerwhisker,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,...,adjusted_satisfaction,avg_satisfaction,std_satisfaction,avg_income,median_income,income_inequality,region,happyScore,GDP,country.1
0,Finland,Western Europe,7.842,0.032,7.904,7.78,10.775,0.954,72.0,0.949,...,70.0,7.9,1.53,17310.195,14962.56,27.72375,'Western Europe',7.406,1.29025,Finland
1,Denmark,Western Europe,7.62,0.035,7.687,7.552,10.933,0.954,72.7,0.946,...,74.0,8.4,1.53,17496.51,15630.885,28.155,'Western Europe',7.527,1.32548,Denmark
2,Switzerland,Western Europe,7.571,0.036,7.643,7.5,11.117,0.942,74.4,0.919,...,70.0,8.0,1.62,23400.04,19442.92,32.93,'Western Europe',7.587,1.39651,Switzerland
3,Iceland,Western Europe,7.554,0.059,7.67,7.438,10.878,0.983,73.0,0.955,...,71.0,8.1,1.64,18828.345,16179.315,28.78,'Western Europe',7.561,1.30232,Iceland
4,Netherlands,Western Europe,7.464,0.027,7.518,7.41,10.932,0.942,72.4,0.913,...,69.0,7.6,1.38,18234.435,15880.545,29.27125,'Western Europe',7.378,1.32944,Netherlands


In [200]:
header=[]

if dataset==0:
    for col in df6.columns:
        header.append(col)
else:
    header=['Logged GDP per capita','Social support',
              'Healthy life expectancy','Freedom to make life choices','Generosity','Perceptions of corruption',
              'Ladder score in Dystopia','Dystopia + residual','CPI 2021 Score','adjusted_satisfaction',
              'avg_satisfaction','std_satisfaction','avg_income','median_income','income_inequality','GDP']

print(header)

['Logged GDP per capita', 'Social support', 'Healthy life expectancy', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption', 'Ladder score in Dystopia', 'Dystopia + residual', 'CPI 2021 Score', 'adjusted_satisfaction', 'avg_satisfaction', 'std_satisfaction', 'avg_income', 'median_income', 'income_inequality', 'GDP']


In [201]:
# Data and target

if dataset==0:
    data = df6[['Freedom to make life choices', 'Social support','CPI 2021 Score','avg_satisfaction', 'GDP', 'avg_income']]
    target = df6[['Happiness score']]
else:
    data = df8[['Logged GDP per capita','Social support',
              'Healthy life expectancy','Freedom to make life choices','Generosity','Perceptions of corruption',
              'Ladder score in Dystopia','Dystopia + residual','CPI 2021 Score','adjusted_satisfaction',
              'avg_satisfaction','std_satisfaction','avg_income','median_income','income_inequality','GDP']]    #Take out unnecessary statistical features
    target = df8[['Happiness score']]

data = data.to_numpy()
target = np.ravel(target)


In [202]:
# Preprocess (normalize) data
if norm_features==1:
    data = scale(data)

In [203]:
# Feature Selection

#Low Variance Filter
if lv_filter==1:
    print('--LOW VARIANCE FILTER ON--', '\n')
    
    #LV Threshold
    sel = VarianceThreshold(threshold=0.5)                                          #Removes any feature with less than 20% variance
    fit_mod=sel.fit(data)
    fitted=sel.transform(data)
    sel_idx=fit_mod.get_support()

    #Get lists of selected and non-selected features (names and indexes)
    temp=[]
    temp_idx=[]
    temp_del=[]
    for i in range(len(data[0])):
        if sel_idx[i]==1:                                                           #Selected Features get added to temp header
            temp.append(header[i+feat_start])
            temp_idx.append(i)
        else:                                                                       #Indexes of non-selected features get added to delete array
            temp_del.append(i)

    print('Selected:', temp)
    print('Features (total, selected):', len(data[0]), len(temp))
    print('\n')

    #Filter selected columns from original dataset
    header = header[0:feat_start]
    for field in temp:
        header.append(field)
    data = np.delete(data, temp_del, axis=1)                                 #Deletes non-selected features by index


#Feature Selection
if feat_select==1:
    '''Three steps:
       1) Run Feature Selection
       2) Get lists of selected and non-selected features
       3) Filter columns from original dataset
       '''
    
    print('--FEATURE SELECTION ON--', '\n')
    
    ##1) Run Feature Selection #######
    #Wrapper Select via model
    if fs_type==2:
        
        rgr = RandomForestRegressor(n_estimators=100, criterion='squared_error', max_depth=None, min_samples_split=3, max_features=0.33)   # Deprecated since version 1.0: Criterion “mse” was deprecated in v1.0 and will be removed in version 1.2. Use criterion="squared_error" which is equivalent. 
        sel = SelectFromModel(rgr, prefit=False, threshold='mean', max_features=None)
        print ('Wrapper Select: ')

        fit_mod=sel.fit(data, target)    
        sel_idx=fit_mod.get_support()       
        

    ##2) Get lists of selected and non-selected features (names and indexes) #######
    temp=[]
    temp_idx=[]
    temp_del=[]
    for i in range(len(data[0])):
        if sel_idx[i]==1:                                                           #Selected Features get added to temp header
            temp.append(header[i+feat_start])
            temp_idx.append(i)
        else:                                                                       #Indexes of non-selected features get added to delete array
            temp_del.append(i)
    print('Selected:', temp)
    print('Features (total/selected):', len(data[0]), len(temp))
    print('\n')
            
               
    ##3) Filter selected columns from original dataset #########
    header = header[0:feat_start]
    for field in temp:
        header.append(field)
    data = np.delete(data, temp_del, axis=1)                                 #Deletes non-selected features by index

--FEATURE SELECTION ON-- 

Wrapper Select: 
Selected: ['Healthy life expectancy', 'Freedom to make life choices', 'Generosity', 'std_satisfaction', 'avg_income', 'income_inequality', 'GDP']
Features (total/selected): 16 7




In [204]:
# Train models

print('--ML Model Output--', '\n')

#Test/Train split
data_train, data_test, target_train, target_test = train_test_split(data, target, test_size=0.25)

####Cross-Val Regressors####
#Setup Crossval regression scorers
scorers = {'Neg_MSE': 'neg_mean_squared_error', 'expl_var': 'explained_variance'} 

#SciKit Decision Tree Regressor - Cross Val
start_ts=time.time()
rgr = DecisionTreeRegressor(criterion='squared_error', splitter='best', max_depth=None, min_samples_split=3, min_samples_leaf=1, max_features=None)
scores = cross_validate(rgr, data, target, scoring=scorers, cv=5)

scores_RMSE = np.asarray([math.sqrt(-x) for x in scores['test_Neg_MSE']])                                                                      #Turns negative MSE scores into RMSE
scores_Expl_Var = scores['test_expl_var']
print("Decision Tree RMSE:: %0.2f (+/- %0.2f)" % ((scores_RMSE.mean()), (scores_RMSE.std() * 2)))
print("Decision Tree Expl Var: %0.2f (+/- %0.2f)" % ((scores_Expl_Var.mean()), (scores_Expl_Var.std() * 2)))
print("CV Runtime:", time.time()-start_ts)
print("\n")
    
#SciKit Random Forest Regressor - Cross Val
start_ts=time.time()
rgr = RandomForestRegressor(n_estimators=100, criterion='squared_error', max_depth=None, min_samples_split=3, max_features=0.33)   # Deprecated since version 1.0: Criterion “mse” was deprecated in v1.0 and will be removed in version 1.2. Use criterion="squared_error" which is equivalent. 
scores = cross_validate(estimator=rgr, X=data, y=target, scoring=scorers, cv=5)

scores_RMSE = np.asarray([math.sqrt(-x) for x in scores['test_Neg_MSE']])                                       #Turns negative MSE scores into RMSE
scores_Expl_Var = scores['test_expl_var']
print("Random Forest RMSE:: %0.2f (+/- %0.2f)" % ((scores_RMSE.mean()), (scores_RMSE.std() * 2)))
print("Random Forest Expl Var: %0.2f (+/- %0.2f)" % ((scores_Expl_Var.mean()), (scores_Expl_Var.std() * 2)))
print("CV Runtime:", time.time()-start_ts)
print("\n")
    
#SciKit Gradient Boosting - Cross Val
start_ts=time.time()
rgr=GradientBoostingRegressor(n_estimators=100, loss='squared_error', learning_rate=0.1, max_depth=3, min_samples_split=3)   # Deprecated since version 1.0: The loss ‘ls’ was deprecated in v1.0 and will be removed in version 1.2. Use loss='squared_error' which is equivalent.
scores=cross_validate(estimator=rgr, X=data, y=target, scoring=scorers, cv=5)                                                                                                 

scores_RMSE = np.asarray([math.sqrt(-x) for x in scores['test_Neg_MSE']])                                       #Turns negative MSE scores into RMSE
scores_Expl_Var = scores['test_expl_var']
print("Gradient Boosting RMSE:: %0.2f (+/- %0.2f)" % ((scores_RMSE.mean()), (scores_RMSE.std() * 2)))
print("Gradient Boosting Expl Var: %0.2f (+/- %0.2f)" % ((scores_Expl_Var.mean()), (scores_Expl_Var.std() * 2)))
print("CV Runtime:", time.time()-start_ts)
print("\n")

#SciKit Ada Boosting - Cross Val
start_ts=time.time()
rgr=AdaBoostRegressor(n_estimators=100, base_estimator=None, loss='linear', learning_rate=0.5)
scores=cross_validate(estimator=rgr, X=data, y=target, scoring=scorers, cv=5)                                                                                                 

scores_RMSE = np.asarray([math.sqrt(-x) for x in scores['test_Neg_MSE']])                                       #Turns negative MSE scores into RMSE
scores_Expl_Var = scores['test_expl_var']
print("Ada Boosting RMSE:: %0.2f (+/- %0.2f)" % ((scores_RMSE.mean()), (scores_RMSE.std() * 2)))
print("Ada Boosting Expl Var: %0.2f (+/- %0.2f)" % ((scores_Expl_Var.mean()), (scores_Expl_Var.std() * 2)))
print("CV Runtime:", time.time()-start_ts)
print("\n")

#SciKit Neural Network - Cross Val
start_ts=time.time()
rgr=MLPRegressor(activation='logistic', solver='lbfgs', alpha=0.0001, max_iter=1000, hidden_layer_sizes=(10,))
scores=cross_validate(estimator=rgr, X=data, y=target, scoring=scorers, cv=5)                                                                                                 

scores_RMSE = np.asarray([math.sqrt(-x) for x in scores['test_Neg_MSE']])                                       #Turns negative MSE scores into RMSE
scores_Expl_Var = scores['test_expl_var']
print("Neural Network RMSE:: %0.2f (+/- %0.2f)" % ((scores_RMSE.mean()), (scores_RMSE.std() * 2)))
print("Neural Network Expl Var: %0.2f (+/- %0.2f)" % ((scores_Expl_Var.mean()), (scores_Expl_Var.std() * 2)))
print("CV Runtime:", time.time()-start_ts)

--ML Model Output-- 

Decision Tree RMSE:: 0.96 (+/- 0.21)
Decision Tree Expl Var: -13.69 (+/- 27.46)
CV Runtime: 0.019981861114501953


Random Forest RMSE:: 0.75 (+/- 0.56)
Random Forest Expl Var: -4.41 (+/- 10.80)
CV Runtime: 0.6092503070831299


Gradient Boosting RMSE:: 0.74 (+/- 0.43)
Gradient Boosting Expl Var: -6.14 (+/- 16.61)
CV Runtime: 0.2599925994873047


Ada Boosting RMSE:: 0.78 (+/- 0.55)
Ada Boosting Expl Var: -5.71 (+/- 16.33)
CV Runtime: 0.704174280166626


Neural Network RMSE:: 1.11 (+/- 1.39)
Neural Network Expl Var: 0.01 (+/- 0.05)
CV Runtime: 0.02612781524658203
