## Predict Hapiness Rank using Ordinal Regression

In [1]:
# Install needed libraries
!pip install pandas
!pip install numpy
!pip install statsmodels
!pip install sklearn

import pandas as pd
import numpy as np
from statsmodels.miscmodels.ordinal_model import OrderedModel
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split



### Data Processing

#### Use Top 4 Features from PCA Results

In [2]:
data = pd.read_csv('../normalized_data.csv')
data = data[['Social support', 'Log GDP per capita', 'Healthy life expectancy at birth', 'Freedom to make life choices', 'Happiness_Score_Percentile']]
data = data.dropna()
data.iloc[0]

Social support                     -2.888644
Log GDP per capita                 -1.565348
Healthy life expectancy at birth   -1.697393
Freedom to make life choices       -1.111509
Happiness_Score_Percentile          8.000000
Name: 0, dtype: float64

#### Split Data into Testing and Training

In [3]:
train, test = train_test_split(data, test_size=0.2)

y_train = train['Happiness_Score_Percentile']
x_train = train.drop(['Happiness_Score_Percentile'], axis=1)
y_test = test['Happiness_Score_Percentile']
x_test = test.drop(['Happiness_Score_Percentile'], axis=1)

assert len(x_train) == len(y_train)
assert len(x_test) == len(y_test)
assert len(x_train.iloc[0]) == len(x_test.iloc[0])
print(len(x_train), 'train examples')
print(len(x_test), 'test examples')

931 train examples
233 test examples


### Run Ordinal Regression

In [4]:
mod_prob = OrderedModel(y_train, x_train, distr='logit')
res_prob = mod_prob.fit(method='bfgs')
res_prob.summary()

Optimization terminated successfully.
         Current function value: 1.558616
         Iterations: 35
         Function evaluations: 36
         Gradient evaluations: 36


0,1,2,3
Dep. Variable:,Happiness_Score_Percentile,Log-Likelihood:,-1451.1
Model:,OrderedModel,AIC:,2928.0
Method:,Maximum Likelihood,BIC:,2991.0
Date:,"Wed, 30 Nov 2022",,
Time:,14:23:37,,
No. Observations:,931,,
Df Residuals:,918,,
Df Model:,13,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Social support,-0.8836,0.096,-9.224,0.000,-1.071,-0.696
Log GDP per capita,-1.1333,0.128,-8.878,0.000,-1.384,-0.883
Healthy life expectancy at birth,-0.7139,0.119,-5.994,0.000,-0.947,-0.481
Freedom to make life choices,-0.8017,0.072,-11.167,0.000,-0.942,-0.661
0/1,-4.6349,0.174,-26.579,0.000,-4.977,-4.293
1/2,0.1914,0.104,1.839,0.066,-0.013,0.395
2/3,0.2251,0.087,2.581,0.010,0.054,0.396
3/4,0.4733,0.069,6.831,0.000,0.338,0.609
4/5,0.3209,0.078,4.126,0.000,0.168,0.473


In [5]:
y_pred = res_prob.model.predict(res_prob.params, x_test)
y_pred = np.argmax(y_pred, axis=1)

print("RMS: ", (mean_squared_error(y_test, y_pred))**0.5)
print("Accuracy: ", accuracy_score(y_test, y_pred))

RMS:  1.1719174706180953
Accuracy:  0.38626609442060084


  xb = xb[:, None]
