# Models

### Data Processing

In [10]:
import pandas as pd
import numpy as np
! pip install graphviz



Keep the top four features (selected by PCA) together with the target column, `Happiness_Score_Percentile`.

In [15]:
# Columns kept for modelling: the four selected features followed by the target.
selected_columns = [
    'Social support',
    'Log GDP per capita',
    'Healthy life expectancy at birth',
    'Freedom to make life choices',
    'Happiness_Score_Percentile',
]

# Read the CSV, restrict it to the selected columns, and discard any row
# that has a missing value in one of them.
data = pd.read_csv('../normalized_data.csv')[selected_columns].dropna()

# Show the first remaining row as a quick sanity check.
data.iloc[0]

Social support                     -2.888644
Log GDP per capita                 -1.565348
Healthy life expectancy at birth   -1.697393
Freedom to make life choices       -1.111509
Happiness_Score_Percentile          8.000000
Name: 0, dtype: float64

Split the data into training (80%) and test (20%) sets

In [16]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. Fixing random_state makes the split,
# and therefore every score computed from it, reproducible across kernel restarts.
train, test = train_test_split(data, test_size=0.2, random_state=0)

# Separate the target column from the feature columns in each split.
target = 'Happiness_Score_Percentile'
y_train = train[target]
x_train = train.drop(columns=[target])
y_test = test[target]
x_test = test.drop(columns=[target])

# Sanity checks: no rows lost by the split, features and targets aligned,
# and both splits expose the same feature columns in the same order.
assert len(train) + len(test) == len(data)
assert len(x_train) == len(y_train)
assert len(x_test) == len(y_test)
assert list(x_train.columns) == list(x_test.columns)
print(len(x_train), 'train examples')
print(len(x_test), 'test examples')

931 train examples
233 test examples


### Models

Regular Linear Regression

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score

# Fit an ordinary least-squares model on the training split and predict the test split.
reg = LinearRegression().fit(x_train, y_train)
y_pred = reg.predict(x_test)
# For a regressor, score() is the coefficient of determination (R^2).
print(reg.score(x_test, y_test))

# Root-mean-squared error of the raw (continuous) predictions on the test set.
print("RMS: ", (mean_squared_error(y_test, y_pred))**0.5)

# Accuracy needs discrete labels: round to the nearest label, then clip into the
# label range seen in training. An unclipped regression output can round to a
# value outside that range, which could never match a true label.
y_pred_labels = np.clip(y_pred.round(), y_train.min(), y_train.max())
print("Accuracy: ", accuracy_score(y_test, y_pred_labels))

0.6705702722746456
RMS:  1.2243946338832206
Accuracy:  0.30472103004291845


Linear regression with combined L1 and L2 regularization (elastic net), with the regularization strength selected by cross-validation

In [18]:
from sklearn.linear_model import ElasticNetCV

# Elastic-net regression (L1 + L2 penalty) with default settings, fitted on the
# training split.
regr = ElasticNetCV()
regr.fit(x_train, y_train)
# For a regressor, score() is the coefficient of determination (R^2).
print(regr.score(x_test, y_test))

y_pred = regr.predict(x_test)

# Root-mean-squared error of the raw (continuous) predictions on the test set.
print("RMS: ", (mean_squared_error(y_test, y_pred))**0.5)

# Accuracy needs discrete labels: round to the nearest label, then clip into the
# label range seen in training. An unclipped regression output can round to a
# value outside that range, which could never match a true label.
y_pred_labels = np.clip(y_pred.round(), y_train.min(), y_train.max())
print("Accuracy: ", accuracy_score(y_test, y_pred_labels))


0.6708913928280686
RMS:  1.2237977324633336
Accuracy:  0.30042918454935624


Ordinal Regression

In [19]:
! pip install statsmodels



In [20]:
from statsmodels.miscmodels.ordinal_model import OrderedModel

# Ordinal regression of the happiness percentile on the four features.
# Note: despite the `_prob` suffix in the variable names, the model is built
# with distr='logit' (an ordered logit, not a probit).
mod_prob = OrderedModel(endog=y_train, exog=x_train, distr='logit')

# Maximum-likelihood fit using the BFGS optimizer.
res_prob = mod_prob.fit(method='bfgs')

# Display the coefficient / threshold table for the fitted model.
res_prob.summary()

Optimization terminated successfully.
         Current function value: 1.538745
         Iterations: 34
         Function evaluations: 35
         Gradient evaluations: 35


0,1,2,3
Dep. Variable:,Happiness_Score_Percentile,Log-Likelihood:,-1432.6
Model:,OrderedModel,AIC:,2891.0
Method:,Maximum Likelihood,BIC:,2954.0
Date:,"Wed, 30 Nov 2022",,
Time:,14:41:32,,
No. Observations:,931,,
Df Residuals:,918,,
Df Model:,13,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Social support,-1.0166,0.097,-10.447,0.000,-1.207,-0.826
Log GDP per capita,-1.1692,0.130,-9.023,0.000,-1.423,-0.915
Healthy life expectancy at birth,-0.6049,0.120,-5.060,0.000,-0.839,-0.371
Freedom to make life choices,-0.8337,0.074,-11.331,0.000,-0.978,-0.690
0/1,-4.8383,0.180,-26.925,0.000,-5.190,-4.486
1/2,0.2572,0.100,2.568,0.010,0.061,0.453
2/3,0.2521,0.086,2.927,0.003,0.083,0.421
3/4,0.4995,0.070,7.175,0.000,0.363,0.636
4/5,0.3678,0.078,4.721,0.000,0.215,0.520


In [21]:
# Predicted probabilities for the test rows: one row per example, one column
# per outcome level.
class_probs = np.asarray(res_prob.model.predict(res_prob.params, x_test))

# argmax gives the *position* of the most likely level, not its label.
# The model is assumed to order its levels by the sorted unique values of the
# training target (statsmodels' behaviour for a non-categorical endog), so map
# positions back through that array. This keeps the comparison with y_test
# correct even when the labels are not exactly 0..k-1.
class_labels = np.unique(y_train)
y_pred = class_labels[np.argmax(class_probs, axis=1)]

print("RMS: ", (mean_squared_error(y_test, y_pred))**0.5)
print("Accuracy: ", accuracy_score(y_test, y_pred))

RMS:  1.3053209034349393
Accuracy:  0.3562231759656652


Decision Tree

In [28]:
#decision tree and visualization
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import graphviz

# Sweep the maximum tree depth: for each depth, fit a classifier, report its
# test accuracy, and export the fitted tree in a coloured and a plain version.
depths = list(range(3, 11))

# Take the feature names from the training frame so the labels in the rendered
# trees cannot drift from the columns the model was actually fitted on.
feature_names = list(x_train.columns)

# (output directory, filled) pairs: the same tree is rendered twice, once with
# filled (coloured) nodes and once without.
render_variants = [
    ("tree_visualizations_color", True),
    ("tree_visualizations", False),
]

for depth in depths:
    clf = DecisionTreeClassifier(random_state=0, max_depth=depth)
    clf = clf.fit(x_train, y_train)

    # Predict once and reuse the result; for a classifier this accuracy is the
    # same quantity that clf.score(x_test, y_test) reports.
    y_pred = clf.predict(x_test)
    print("Depth: ", depth, " Accuracy: ", accuracy_score(y_test, y_pred))

    for out_dir, filled in render_variants:
        dot_data = tree.export_graphviz(
            clf,
            out_file=None,
            feature_names=feature_names,
            filled=filled,
            rotate=True,
        )
        graph = graphviz.Source(dot_data)
        graph.render(out_dir + "/happiness_depth" + str(depth))



Depth:  3  Accuracy:  0.4034334763948498
Depth:  4  Accuracy:  0.3776824034334764
Depth:  5  Accuracy:  0.41201716738197425
Depth:  6  Accuracy:  0.4248927038626609
Depth:  7  Accuracy:  0.44635193133047213
Depth:  8  Accuracy:  0.43776824034334766
Depth:  9  Accuracy:  0.4034334763948498
Depth:  10  Accuracy:  0.40772532188841204


In [23]:
# Display the column labels of the modelling frame (the four features followed
# by the target column).
data.columns

Index(['Social support', 'Log GDP per capita',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Happiness_Score_Percentile'],
      dtype='object')