In [None]:
# Import libraries
import sys
import pandas as pd
import math
import numpy as np
from operator import itemgetter
import time
import joblib

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import RFE, VarianceThreshold, SelectFromModel
from sklearn.feature_selection import SelectKBest, mutual_info_regression, mutual_info_classif, chi2
from sklearn import metrics
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import KBinsDiscretizer, scale

In [None]:
# Global parameters
# All run-time switches for the notebook live here; 1 = on, 0 = off.

# -- Dataset choice and column layout --
dataset = 0          # Which merged data to use (0 = reduced, 1 = full)
feat_start = 2       # Start column of the features inside `header`

# -- Preprocessing --
norm_features = 0    # Normalize features switch
binning = 1          # Bin the target switch
bin_cnt = 3          # Number of classes when the target is binned

# -- Feature selection --
lv_filter = 0        # Low variance filter on features switch
feat_select = 0      # Feature selection switch
fs_type = 3          # Selection method: 1 = Stepwise Backwards Removal,
                     # 2 = Wrapper Select, 3 = Univariate Selection,
                     # 4 = random-forest importances at or above their mean
k_cnt = 3            # Number of 'Top k' best ranked features to keep;
                     # only used by fs_type 1 and 3

In [None]:
# Load data
# Read the three raw CSV files first; df0 and df3 are kept in full because
# the wide merge in the next cell uses every column.
df0 = pd.read_csv('2021.csv', engine='python')
df2 = pd.read_csv('CPI 2021 Score.csv', engine='python')
df3 = pd.read_csv('happyscore_income.csv', engine='python')

# Narrow slices used for the reduced dataset.
df1 = df0[['Country name', 'Happiness score', 'Freedom to make life choices', 'Social support']]
df4 = df3[['country', 'avg_satisfaction', 'GDP', 'avg_income']]

# Inner joins on the country name: only countries present in both sides
# survive each step. The right-hand key column duplicates 'Country name',
# so it is dropped after each join.
df5 = df1.merge(df2, how='inner', left_on='Country name', right_on='Country').drop(columns=['Country'])
df6 = df5.merge(df4, how='inner', left_on='Country name', right_on='country').drop(columns=['country'])
df6.head()

Unnamed: 0,Country name,Happiness score,Freedom to make life choices,Social support,CPI 2021 Score,avg_satisfaction,GDP,avg_income
0,Finland,7.842,0.949,0.954,89,7.9,1.29025,17310.195
1,Denmark,7.62,0.946,0.954,90,8.4,1.32548,17496.51
2,Switzerland,7.571,0.919,0.942,86,8.0,1.39651,23400.04
3,Iceland,7.554,0.955,0.983,78,8.1,1.30232,18828.345
4,Netherlands,7.464,0.913,0.942,83,7.6,1.32944,18234.435


In [None]:
# Full dataset: same two inner joins as the reduced frame, but starting from
# the complete happiness table (df0) and the complete income table (df3).
# The right-hand join key is dropped after each merge because it repeats
# 'Country name'.
df7 = df0.merge(df2, how='inner', left_on='Country name', right_on='Country').drop(columns=['Country'])
df8 = df7.merge(df3, how='inner', left_on='Country name', right_on='country').drop(columns=['country'])
df8.head()

Unnamed: 0,Country name,Regional indicator,Happiness score,Standard error of ladder score,upperwhisker,lowerwhisker,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,...,adjusted_satisfaction,avg_satisfaction,std_satisfaction,avg_income,median_income,income_inequality,region,happyScore,GDP,country.1
0,Finland,Western Europe,7.842,0.032,7.904,7.78,10.775,0.954,72.0,0.949,...,70.0,7.9,1.53,17310.195,14962.56,27.72375,'Western Europe',7.406,1.29025,Finland
1,Denmark,Western Europe,7.62,0.035,7.687,7.552,10.933,0.954,72.7,0.946,...,74.0,8.4,1.53,17496.51,15630.885,28.155,'Western Europe',7.527,1.32548,Denmark
2,Switzerland,Western Europe,7.571,0.036,7.643,7.5,11.117,0.942,74.4,0.919,...,70.0,8.0,1.62,23400.04,19442.92,32.93,'Western Europe',7.587,1.39651,Switzerland
3,Iceland,Western Europe,7.554,0.059,7.67,7.438,10.878,0.983,73.0,0.955,...,71.0,8.1,1.64,18828.345,16179.315,28.78,'Western Europe',7.561,1.30232,Iceland
4,Netherlands,Western Europe,7.464,0.027,7.518,7.41,10.932,0.942,72.4,0.913,...,69.0,7.6,1.38,18234.435,15880.545,29.27125,'Western Europe',7.378,1.32944,Netherlands


In [None]:
# Column names for the modelling data.
# Layout contract used by the feature-selection cell: the first `feat_start`
# entries are the non-feature columns (country name and target) and the
# remaining entries name the columns of `data`, in the same order.
if dataset==0:
    header = list(df6.columns)
else:
    # The full feature list must carry the same two leading non-feature
    # columns as the reduced frame. Without them every lookup of the form
    # header[i + feat_start] is shifted by two names and runs past the end
    # of the list for the last two features.
    header = ['Country name', 'Happiness score',
              'Logged GDP per capita','Social support',
              'Healthy life expectancy','Freedom to make life choices','Generosity','Perceptions of corruption',
              'Ladder score in Dystopia','Dystopia + residual','CPI 2021 Score','adjusted_satisfaction',
              'avg_satisfaction','std_satisfaction','avg_income','median_income','income_inequality','GDP']

print(header)

['Country name', 'Happiness score', 'Freedom to make life choices', 'Social support', 'CPI 2021 Score', 'avg_satisfaction', 'GDP', 'avg_income']


In [None]:
# Data and target
# Choose the merged frame and the feature columns for the configured
# `dataset`, then convert once: `data` becomes a 2-D feature array and
# `target` a flat array of happiness scores.
if dataset == 0:
    model_frame = df6
    feature_cols = ['Freedom to make life choices', 'Social support', 'CPI 2021 Score',
                    'avg_satisfaction', 'GDP', 'avg_income']
else:
    # Full frame with the unnecessary statistical columns left out.
    model_frame = df8
    feature_cols = ['Logged GDP per capita', 'Social support',
                    'Healthy life expectancy', 'Freedom to make life choices', 'Generosity',
                    'Perceptions of corruption', 'Ladder score in Dystopia', 'Dystopia + residual',
                    'CPI 2021 Score', 'adjusted_satisfaction', 'avg_satisfaction', 'std_satisfaction',
                    'avg_income', 'median_income', 'income_inequality', 'GDP']

data = model_frame[feature_cols].to_numpy()
target = np.ravel(model_frame[['Happiness score']])


In [None]:
# Preprocess (normalize) data
# NOTE: this cell overwrites `data` and `target`; re-run the "Data and target"
# cell before running it a second time, otherwise the already-binned labels
# are binned again.
if norm_features==1:
    data = scale(data)

if binning==1:
    # Discretize the continuous target into `bin_cnt` ordinal classes.
    # The strategy matters: 'quantile' yields equally populated bins, whereas
    # 'kmeans' places the edges around clusters of the target values.
    enc = KBinsDiscretizer(n_bins=[bin_cnt], encode='ordinal', strategy='kmeans')
    target_bin = enc.fit_transform(target.reshape(-1,1))
    bin_labels = np.ravel(target_bin)

    # Report min / max / size of every bin using one boolean mask per bin.
    for j in range(bin_cnt):
        members = target[bin_labels == j]
        if members.size == 0:
            # An empty bin has no min/max; report it instead of crashing.
            print('Bin', j, ':', 'empty')
            continue
        print('Bin', j, ':', members.min(), members.max(), members.size)
    print('\n')

    # From here on the target is the flat array of bin labels.
    target = bin_labels

Bin 0 : 3.145 5.171 40
Bin 1 : 5.266 6.491 49
Bin 2 : 6.69 7.842 19






In [None]:
# Feature Selection

def _keep_selected(features, names, mask, first_feature):
    """Keep only the feature columns flagged in `mask`.

    Parameters
    ----------
    features : 2-D numpy array with one column per feature.
    names : list of column names. The first `first_feature` entries are
        non-feature columns and are always kept; the remaining entries are
        matched to the columns of `features` by position.
    mask : sequence with one truthy/falsy flag per feature column
        (a boolean support mask or a list of 0/1 values).
    first_feature : index in `names` of the first feature name.

    Returns
    -------
    (filtered_features, filtered_names, selected_names)
        `filtered_features` is a new array containing the kept columns,
        `filtered_names` is the non-feature prefix followed by the kept
        feature names, and `selected_names` is the kept feature names alone.
    """
    keep = np.asarray(mask).astype(bool)
    selected = [names[i + first_feature] for i in np.flatnonzero(keep)]
    return features[:, keep], names[:first_feature] + selected, selected


#Low Variance Filter
if lv_filter==1:
    print('--LOW VARIANCE FILTER ON--', '\n')

    # Keeps only the features whose variance exceeds the threshold (0.5).
    # The variance is computed on `data` as it is at this point, so the
    # outcome depends on whether the features were normalized beforehand.
    sel = VarianceThreshold(threshold=0.5)
    fit_mod = sel.fit(data)
    sel_idx = fit_mod.get_support()

    total_features = data.shape[1]
    data, header, temp = _keep_selected(data, header, sel_idx, feat_start)

    print('Selected:', temp)
    print('Features (total, selected):', total_features, len(temp))
    print('\n')


#Feature Selection
if feat_select==1:
    # Three steps:
    #   1) Run the configured selector to obtain a per-feature mask
    #   2) Translate the mask into the selected feature names
    #   3) Filter `data` and `header` down to the selected features
    print('--FEATURE SELECTION ON--', '\n')

    # Every selector below is a classifier-based or chi-squared method, so a
    # binned (class-label) target is required. Failing loudly here avoids
    # silently reusing a stale `sel` / `sel_idx` left by the low variance filter.
    if binning != 1:
        raise ValueError('Feature selection requires a binned target (set binning=1).')

    fs_seed = 42                                                                    #Fixed seed so the forest-based selectors are reproducible

    ##1) Run Feature Selection #######
    if fs_type==1:
        #Stepwise Recursive Backwards Feature removal
        clf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=None, min_samples_split=3, random_state=fs_seed)
        sel = RFE(clf, n_features_to_select=k_cnt, step=.1)
        print('Stepwise Recursive Backwards - Random Forest: ')
        fit_mod = sel.fit(data, target)
        print(sel.ranking_)
        sel_idx = fit_mod.get_support()

    elif fs_type==2:
        #Wrapper Select via model
        clf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=None, min_samples_split=3, random_state=fs_seed)
        sel = SelectFromModel(clf, prefit=False, threshold='mean', max_features=None)
        print ('Wrapper Select: ')
        fit_mod = sel.fit(data, target)
        sel_idx = fit_mod.get_support()

    elif fs_type==3:
        #Univariate Feature Selection - Chi-squared
        #chi2 raises on negative feature values, so keep feature normalization
        #off for this mode (or switch the score function to mutual_info_classif)
        sel = SelectKBest(chi2, k=k_cnt)
        fit_mod = sel.fit(data, target)
        print ('Univariate Feature Selection - Chi2: ')
        sel_idx = fit_mod.get_support()

        #Print every feature with its score, best first
        ranked = sorted(
            ([name, float(score)] for name, score in zip(header[feat_start:], fit_mod.scores_)),
            key=itemgetter(1), reverse=True)
        print('Ranked Features')
        for rank, (name, score) in enumerate(ranked):
            print(rank, name, ':', score)
        print('\n')

    elif fs_type==4:
        #Keep features whose random-forest importance is at or above the mean importance
        clf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=None, min_samples_split=3, random_state=fs_seed)
        clf.fit(data, target)
        sel_idx = clf.feature_importances_ >= np.mean(clf.feature_importances_)

    else:
        raise ValueError('Unsupported fs_type: %r (expected 1, 2, 3 or 4)' % (fs_type,))

    ##2) + 3) Resolve selected names and filter the dataset #######
    total_features = data.shape[1]
    data, header, temp = _keep_selected(data, header, sel_idx, feat_start)

    print('Selected:', temp)
    print('Features (total/selected):', total_features, len(temp))
    print('\n')

In [None]:
#@title
# Train models

print('--ML Model Output--', '\n')

#Fixed seed so that repeated runs of this cell report the same scores
cv_seed = 42

####Cross-Val Classifiers####
#Setup Crossval classifier scorers
scorers = {'Accuracy': 'accuracy', 'roc_auc': 'roc_auc_ovo', 'f1': 'f1_macro'}


def _report_cv(label, estimator, X, y, scoring, folds=5):
    """Cross-validate `estimator` and print its accuracy, AUC and F1.

    Parameters
    ----------
    label : display name printed in front of every metric line.
    estimator : unfitted scikit-learn classifier.
    X, y : feature matrix and class labels passed to `cross_validate`.
    scoring : scorer mapping; it must define the keys 'Accuracy', 'roc_auc'
        and 'f1', because the matching 'test_<key>' results are read below.
    folds : value passed as `cv` to `cross_validate`.

    Returns
    -------
    The dictionary returned by `cross_validate`.

    Each metric is printed as the mean over the folds followed by twice the
    standard deviation; the wall-clock time of the cross-validation is
    printed last.
    """
    start_ts = time.time()
    scores = cross_validate(estimator=estimator, X=X, y=y, scoring=scoring, cv=folds)

    scores_Acc = scores['test_Accuracy']
    print("%s Acc: %0.2f (+/- %0.2f)" % (label, scores_Acc.mean(), scores_Acc.std() * 2))
    scores_AUC = scores['test_roc_auc']
    print("%s AUC: %0.2f (+/- %0.2f)" % (label, scores_AUC.mean(), scores_AUC.std() * 2))
    scores_F1 = scores['test_f1']
    print("%s F1:%0.2f (+/- %0.2f)" % (label, scores_F1.mean(), scores_F1.std() * 2))
    print("CV Runtime:", time.time()-start_ts)
    return scores


#Classifiers to compare, in reporting order.
#GradientBoostingClassifier: `loss` is left at its default because the old
#spelling 'deviance' has been removed from recent scikit-learn releases.
#AdaBoostClassifier: `base_estimator=None` is omitted for the same reason
#(the parameter was renamed); None was the default anyway.
#NOTE(review): the neural network is sensitive to feature scale and the
#features are unscaled while norm_features=0 - confirm this is intended.
models = [
    ('Decision Tree', DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=3,
                                             min_samples_leaf=1, max_features=None, random_state=cv_seed)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=None, min_samples_split=3,
                                             random_state=cv_seed)),
    ('Gradient Boosting', GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3,
                                                     min_samples_split=3, random_state=cv_seed)),
    ('Ada Boosting', AdaBoostClassifier(n_estimators=100, learning_rate=0.1, random_state=cv_seed)),
    ('Neural Network', MLPClassifier(activation='logistic', solver='adam', alpha=0.0001, max_iter=1000,
                                     hidden_layer_sizes=(10,), random_state=cv_seed)),
]

#Evaluate every model with the same 5-fold cross-validation and keep the raw
#per-fold results by model name for later inspection.
cv_results = {}
for position, (label, clf) in enumerate(models):
    scores = _report_cv(label, clf, data, target, scorers)
    cv_results[label] = scores
    if position < len(models) - 1:
        print("\n")

--ML Model Output-- 

Decision Tree Acc: 0.80 (+/- 0.13)
Decision Tree AUC: 0.84 (+/- 0.13)
Decision Tree F1:0.78 (+/- 0.15)
CV Runtime: 0.14917874336242676


Random Forest Acc: 0.84 (+/- 0.10)
Random Forest AUC: 0.95 (+/- 0.07)
Random Forest F1:0.84 (+/- 0.08)
CV Runtime: 1.9644887447357178


Gradient Boosting Acc: 0.82 (+/- 0.13)
Gradient Boosting AUC: 0.91 (+/- 0.09)
Gradient Boosting F1:0.81 (+/- 0.16)
CV Runtime: 2.9386110305786133


Ada Boosting Acc: 0.72 (+/- 0.26)
Ada Boosting AUC: 0.92 (+/- 0.05)
Ada Boosting F1:0.68 (+/- 0.37)
CV Runtime: 2.261735677719116






Neural Network Acc: 0.56 (+/- 0.26)
Neural Network AUC: 0.76 (+/- 0.22)
Neural Network F1:0.37 (+/- 0.29)
CV Runtime: 2.0405333042144775
