## Predict Hapiness Rank using Ordinal Regression

In [1]:
# Install needed libraries
!pip install pandas
!pip install numpy
!pip install statsmodels
!pip install sklearn

import pandas as pd
import numpy as np
from statsmodels.miscmodels.ordinal_model import OrderedModel
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split



### Data Processing

#### Use Top 4 Features from PCA Results

In [2]:
data = pd.read_csv('../normalized_data.csv')
data = data[['Social support', 'Log GDP per capita', 'Healthy life expectancy at birth', 'Freedom to make life choices', 'Happiness_Score_Percentile']]
data = data.dropna()
data.iloc[0]

Social support                     -2.888644
Log GDP per capita                 -1.565348
Healthy life expectancy at birth   -1.697393
Freedom to make life choices       -1.111509
Happiness_Score_Percentile          8.000000
Name: 0, dtype: float64

#### Split Data into Testing and Training

In [3]:
train, test = train_test_split(data, test_size=0.2)

y_train = train['Happiness_Score_Percentile']
x_train = train.drop(['Happiness_Score_Percentile'], axis=1)
y_test = test['Happiness_Score_Percentile']
x_test = test.drop(['Happiness_Score_Percentile'], axis=1)

assert len(x_train) == len(y_train)
assert len(x_test) == len(y_test)
assert len(x_train.iloc[0]) == len(x_test.iloc[0])
print(len(x_train), 'train examples')
print(len(x_test), 'test examples')

931 train examples
233 test examples


### Run Ordinal Regression

In [4]:
mod_prob = OrderedModel(y_train, x_train, distr='logit')
res_prob = mod_prob.fit(method='bfgs')
res_prob.summary()

Optimization terminated successfully.
         Current function value: 1.544257
         Iterations: 35
         Function evaluations: 36
         Gradient evaluations: 36


0,1,2,3
Dep. Variable:,Happiness_Score_Percentile,Log-Likelihood:,-1437.7
Model:,OrderedModel,AIC:,2901.0
Method:,Maximum Likelihood,BIC:,2964.0
Date:,"Wed, 30 Nov 2022",,
Time:,16:23:59,,
No. Observations:,931,,
Df Residuals:,918,,
Df Model:,13,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Social support,-0.9306,0.097,-9.633,0.000,-1.120,-0.741
Log GDP per capita,-1.1499,0.125,-9.164,0.000,-1.396,-0.904
Healthy life expectancy at birth,-0.6676,0.118,-5.659,0.000,-0.899,-0.436
Freedom to make life choices,-0.8349,0.071,-11.836,0.000,-0.973,-0.697
0/1,-4.7791,0.179,-26.662,0.000,-5.130,-4.428
1/2,0.3035,0.099,3.076,0.002,0.110,0.497
2/3,0.1830,0.090,2.035,0.042,0.007,0.359
3/4,0.4972,0.069,7.190,0.000,0.362,0.633
4/5,0.3762,0.077,4.913,0.000,0.226,0.526


In [5]:
y_pred = res_prob.model.predict(res_prob.params, x_test)
y_pred = np.argmax(y_pred, axis=1)

print("RMS: ", (mean_squared_error(y_test, y_pred))**0.5)
print("Accuracy: ", accuracy_score(y_test, y_pred))

RMS:  1.2256206275788528
Accuracy:  0.36909871244635195


  xb = xb[:, None]
