In [55]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import sqlite3
from sqlalchemy import create_engine
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline

## Loading and preprocessing data

In [56]:
# Create SQLite connection
happiness_path = Path('Resources/HappinessIndexScore.sqlite')
engine = create_engine(f'sqlite:///{happiness_path}')
conn = engine.connect()

In [57]:
# Create df
happiness_df = pd.read_sql('SELECT * FROM final_output', con=engine)

happiness_df.head()

Unnamed: 0,country,region,ladder_score,logged_GPD_per_capita,social_support,healthy_life_expectancy,freedom_life_choices,generosity,perceptions_corruption,population_density,unemployment_rate,median_age,gini_coefficient,avg_temperature,lt_alcohol_per_capita
0,Finland,Europe,7.804,10.792,0.969,71.15,0.961,-0.019,0.182,16.6,7.16,43.2,27.7,3.24,8.23
1,Denmark,Europe,7.586,10.962,0.954,71.25,0.934,0.134,0.196,138.0,5.14,42.2,27.7,9.77,9.16
2,Iceland,Europe,7.53,10.896,0.983,72.05,0.936,0.211,0.668,3.5,3.56,37.8,26.1,2.11,7.72
3,Israel,Middle East,7.473,10.639,0.943,72.697,0.809,-0.023,0.708,412.24,3.39,30.1,38.6,20.23,3.07
4,Netherlands,Europe,7.403,10.942,0.93,71.55,0.887,0.213,0.379,420.38,3.56,42.2,29.2,11.72,8.23


In [58]:
# Find the mid point of the happiness score
mid_point = (happiness_df['ladder_score'].min()+happiness_df['ladder_score'].max()) * 0.5
mid_point

4.8315

In [59]:
# Create a happiness column with values of 0 or 1 based on the ladder score
happiness_df['happiness'] = 0
happiness_df.loc[happiness_df['ladder_score'] >= mid_point, 'happiness'] = 1

In [60]:
# Drop unnecesary columns
happiness_df.drop(['country', 'region', "ladder_score"], inplace= True, axis= 1)

In [61]:
# Define features set
X = happiness_df.copy()

In [62]:
# Define target vector
y = happiness_df["happiness"].ravel()

# Split the data into training and testing sets.

In [63]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split (X, y, random_state=48)

In [64]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [65]:
# Fitting Standard Scaler
X_scaler = scaler.fit(X_train)

In [66]:
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Fitting the Random Forest Model

In [67]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=50, max_leaf_nodes=16, random_state=42)


In [68]:
# # Fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

## Making Predictions Using the Random Forest Model

In [69]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

## Model Evaluation

In [70]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [72]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,12,0
Actual 1,0,19


Accuracy Score : 1.0
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00        19

    accuracy                           1.00        31
   macro avg       1.00      1.00      1.00        31
weighted avg       1.00      1.00      1.00        31



In [73]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf_model.feature_importances_
# We can sort the features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.3045204855186525, 'happiness'),
 (0.13374190715318252, 'social_support'),
 (0.13194098024213044, 'healthy_life_expectancy'),
 (0.11187647043018363, 'median_age'),
 (0.08714124128433752, 'logged_GPD_per_capita'),
 (0.05109082894104355, 'freedom_life_choices'),
 (0.04528899952828081, 'lt_alcohol_per_capita'),
 (0.04263453608163819, 'avg_temperature'),
 (0.028910141361883784, 'unemployment_rate'),
 (0.024633690989328574, 'perceptions_corruption'),
 (0.017320579271364118, 'generosity'),
 (0.011109982509956231, 'population_density'),
 (0.009790156688018246, 'gini_coefficient')]