## Predict Hapiness Rank using a Decision Tree

In [29]:
# Install needed libraries
!pip install pandas
!pip install numpy
!pip install sklearn
!pip install graphviz

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import graphviz



### Data Processing

#### Select Features

In [45]:
data = pd.read_csv('../normalized_data.csv')

# Top 4 features from PCA
# data = data[['Social support', 'Log GDP per capita', 'Healthy life expectancy at birth', 'Freedom to make life choices', 'Happiness_Score_Percentile']]

# Top 7 features from PCA
# data = data[['Social support', 'Log GDP per capita', 'Healthy life expectancy at birth', 'Freedom to make life choices', 'Negative affect', 'Perceptions of corruption', 'Positive affect', 'Happiness_Score_Percentile']]

# All 10 features
data['Country'] = data['Country'].astype('category')
data['Country'] = data['Country'].cat.codes
data = data.drop(['Percentile Ranges'], axis=1)
data = data.drop(['Unnamed: 0'], axis=1)
data = data.drop(["Happiness Score"], axis=1)
data = data.drop(["Country"], axis=1)
data = data.drop(["Year"], axis=1)
data = data.drop(["Freedom_Rating"], axis=1)
data = data.drop(["Life Expectancy"], axis=1)

# Happiness Score from 1 - 10
data['Happiness_Score_Percentile'] = abs(data['Happiness_Score_Percentile'] - 9)

# Binary happiness score
data['Happiness_Score_Percentile'] = data['Happiness_Score_Percentile'].transform(lambda x: pd.cut(x, bins=2, labels=[0,1]))
data = data.dropna()
data.iloc[0]

Log GDP per capita                  -1.565348
Social support                      -2.888644
Healthy life expectancy at birth    -1.697393
Freedom to make life choices        -1.111509
Generosity                           0.371831
Perceptions of corruption            0.355938
Positive affect                     -0.992580
Negative affect                      0.034729
Confidence in national government    0.080438
GDP                                 -0.300283
Happiness_Score_Percentile           0.000000
Name: 0, dtype: float64

#### Split Data into Testing and Training

In [46]:
train, test = train_test_split(data, test_size=0.2)

y_train = train['Happiness_Score_Percentile']
x_train = train.drop(['Happiness_Score_Percentile'], axis=1)
y_test = test['Happiness_Score_Percentile']
x_test = test.drop(['Happiness_Score_Percentile'], axis=1)

assert len(x_train) == len(y_train)
assert len(x_test) == len(y_test)
assert len(x_train.iloc[0]) == len(x_test.iloc[0])
print(len(x_train), 'train examples')
print(len(x_test), 'test examples')

879 train examples
220 test examples


### Run Decision Tree

In [47]:
depths = [3,4,5,6,7,8,9,10,100]

for depth in depths:
    clf = DecisionTreeClassifier(random_state=0, max_depth=depth)
    clf = clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print("Depth: ", depth, " Accuracy: ", clf.score(x_test, y_test))

#     dot_data = tree.export_graphviz(clf, out_file=None, feature_names=['Social support', 'Log GDP per capita',
#        'Healthy life expectancy at birth', 'Freedom to make life choices'], filled=True, rotate=True)
#     graph = graphviz.Source(dot_data)
#     graph.render("tree_visualizations_color/happiness_depth" + str(depth))

#     dot_data1 = tree.export_graphviz(clf, out_file=None, feature_names=['Social support', 'Log GDP per capita',
#        'Healthy life expectancy at birth', 'Freedom to make life choices'], rotate=True)
#     graph = graphviz.Source(dot_data1)
#     graph.render("tree_visualizations/happiness_depth" + str(depth))

Depth:  3  Accuracy:  0.8454545454545455
Depth:  4  Accuracy:  0.8545454545454545
Depth:  5  Accuracy:  0.85
Depth:  6  Accuracy:  0.8318181818181818
Depth:  7  Accuracy:  0.85
Depth:  8  Accuracy:  0.8636363636363636
Depth:  9  Accuracy:  0.8454545454545455
Depth:  10  Accuracy:  0.8636363636363636
Depth:  100  Accuracy:  0.8363636363636363
