# Models

### Data Processing

In [1]:
import pandas as pd
import numpy as np

Choose top 4 features (chosen by PCA Analysis)

In [11]:
data = pd.read_csv('../normalized_data.csv')
data = data[['Social support', 'Log GDP per capita', 'Healthy life expectancy at birth', 'Freedom to make life choices', 'Happiness_Score_Percentile']]
data = data.dropna()
data.iloc[0]

Social support                     -2.888644
Log GDP per capita                 -1.565348
Healthy life expectancy at birth   -1.697393
Freedom to make life choices       -1.111509
Happiness_Score_Percentile          8.000000
Name: 0, dtype: float64

Split into testing and training

In [12]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2)

y_train = train['Happiness_Score_Percentile']
x_train = train.drop(['Happiness_Score_Percentile'], axis=1)
y_test = test['Happiness_Score_Percentile']
x_test = test.drop(['Happiness_Score_Percentile'], axis=1)

assert len(x_train) == len(y_train)
assert len(x_test) == len(y_test)
assert len(x_train.iloc[0]) == len(x_test.iloc[0])
print(len(x_train), 'train examples')
print(len(x_test), 'test examples')

931 train examples
233 test examples


### Models

Regular Linear Regression

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score

reg = LinearRegression().fit(x_train, y_train)
y_pred = reg.predict(x_test)
print(reg.score(x_test, y_test))

#Find accuracy and error on test dataset
print("RMS: ", (mean_squared_error(y_test, y_pred))**2)
print("Accuracy: ", accuracy_score(y_test, y_pred.round()))

0.7130553702399715
RMS:  1.8238466603760444
Accuracy:  0.33476394849785407


Linear Regression with L1 and L2 Regularization and Cross Validation selection

In [16]:
from sklearn.linear_model import ElasticNetCV

regr = ElasticNetCV(cv=5, random_state=0)
regr.fit(x_train, y_train)
print(regr.score(x_test, y_test))

y_pred = regr.predict(x_test)

# print(y_pred)
print("RMS: ", (mean_squared_error(y_test, y_pred))**2)
#Find accuracy
from sklearn.metrics import accuracy_score
print("Accuracy: ", accuracy_score(y_test, y_pred.round()))
# print("Accuracy: ", accuracy_score(y_test, y_pred))


0.7125884552912709
RMS:  1.8297869988663908
Accuracy:  0.33476394849785407


Ordinal Regression

In [17]:
! pip install statsmodels



In [18]:
from statsmodels.miscmodels.ordinal_model import OrderedModel

mod_prob = OrderedModel(y_train, x_train, distr='logit')
res_prob = mod_prob.fit(method='bfgs')
res_prob.summary()

Optimization terminated successfully.
         Current function value: 1.551978
         Iterations: 33
         Function evaluations: 34
         Gradient evaluations: 34


0,1,2,3
Dep. Variable:,Happiness_Score_Percentile,Log-Likelihood:,-1444.9
Model:,OrderedModel,AIC:,2916.0
Method:,Maximum Likelihood,BIC:,2979.0
Date:,"Mon, 28 Nov 2022",,
Time:,19:42:24,,
No. Observations:,931,,
Df Residuals:,918,,
Df Model:,13,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Social support,-0.9198,0.096,-9.579,0.000,-1.108,-0.732
Log GDP per capita,-1.1509,0.127,-9.036,0.000,-1.401,-0.901
Healthy life expectancy at birth,-0.6163,0.120,-5.153,0.000,-0.851,-0.382
Freedom to make life choices,-0.8348,0.073,-11.446,0.000,-0.978,-0.692
0/1,-4.7459,0.178,-26.673,0.000,-5.095,-4.397
1/2,0.2824,0.099,2.841,0.005,0.088,0.477
2/3,0.2248,0.087,2.579,0.010,0.054,0.396
3/4,0.4538,0.071,6.411,0.000,0.315,0.592
4/5,0.2964,0.080,3.698,0.000,0.139,0.453


In [19]:
y_pred = res_prob.model.predict(res_prob.params, x_test)
y_pred = np.argmax(y_pred, axis=1)

print("RMS: ", (mean_squared_error(y_test, y_pred))**2)
print("Accuracy: ", accuracy_score(y_test, y_pred))

RMS:  2.1168376650886924
Accuracy:  0.3776824034334764


  xb = xb[:, None]


Decision Tree

In [21]:
#decision tree and visualization
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import graphviz

# clf = DecisionTreeClassifier(random_state=0, max_depth=10)
# clf = clf.fit(x_train, y_train)
# y_pred = clf.predict(x_test)
# print(clf.score(x_test, y_test))

# #visualize tree
# dot_data = tree.export_graphviz(clf, out_file=None)
# graph = graphviz.Source(dot_data)
# graph.render("happiness_depth10")

depths = [3,4,5,6,7,8,9,10]

for depth in depths:
    clf = DecisionTreeClassifier(random_state=0, max_depth=depth)
    clf = clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("Depth: ", depth, " Accuracy: ", clf.score(x_test, y_test))

    dot_data = tree.export_graphviz(clf, out_file=None)
    graph = graphviz.Source(dot_data)
    graph.render("tree_visualizations/happiness_depth" + str(depth))



Depth:  3  Accuracy:  0.39914163090128757
Depth:  4  Accuracy:  0.40772532188841204
Depth:  5  Accuracy:  0.38626609442060084
Depth:  6  Accuracy:  0.4334763948497854
Depth:  7  Accuracy:  0.41201716738197425
Depth:  8  Accuracy:  0.3776824034334764
Depth:  9  Accuracy:  0.38626609442060084
Depth:  10  Accuracy:  0.4034334763948498
