In [1]:
import os
import warnings
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
import psycopg2

warnings.filterwarnings('ignore')

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Connect to Database

In [3]:
# Open a psycopg2 connection to the project's PostgreSQL database.
#
# Connection settings are read from the standard libpq environment variables
# (PGDATABASE, PGUSER, PGPASSWORD, PGHOST, PGPORT) so that no secret is stored
# in the notebook. Only the password is mandatory: a missing PGPASSWORD raises
# KeyError up front instead of attempting a connection that cannot succeed.
# NOTE(review): the password that used to be hardcoded here has been shared
# with the notebook and should be rotated.
engine = psycopg2.connect(
    database=os.environ.get("PGDATABASE", "project-data"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ["PGPASSWORD"],
    host=os.environ.get(
        "PGHOST", "happinessproject.cqkpnj5dcmou.us-east-1.rds.amazonaws.com"
    ),
    port=os.environ.get("PGPORT", "5432"),
)

# Cursor on the same connection, for running statements directly.
cursor = engine.cursor()

In [4]:
# Load every row and column of the combined_data table into a DataFrame.
# NOTE(review): `engine` is a raw psycopg2 connection; pandas documents
# SQLAlchemy connectables as the supported `con` type - confirm this is intended.
query = "SELECT * from combined_data"

df = pd.read_sql(sql=query, con=engine)

In [5]:
# Preview the first rows of the table loaded from the database
df.head()

Unnamed: 0,country_name,region,ladder_score,stand_err,up_whisker,low_whisker,log_gdp_per_cap,social_support,healty_life_exp,freedom,...,ladder_2011,ladder_2012,ladder_2013,ladder_2014,ladder_2015,ladder_2016,ladder_2017,ladder_2018,ladder_2019,ladder_2020
0,Afghanistan,South Asia,2.523,0.038,2.596,2.449,7.695,0.463,52.493,0.382,...,3.832,3.783,3.572,3.131,3.983,4.22,2.662,2.694,2.375,
1,Albania,Central and Eastern Europe,5.117,0.059,5.234,5.001,9.52,0.697,68.999,0.785,...,5.867,5.51,4.551,4.814,4.607,4.511,4.64,5.004,4.995,5.365
2,Algeria,Middle East and North Africa,4.887,0.053,4.991,4.783,9.342,0.802,66.005,0.48,...,5.317,5.605,,6.355,,5.341,5.249,5.043,4.745,
3,Argentina,Latin America and Caribbean,5.929,0.056,6.04,5.819,9.962,0.898,69.0,0.828,...,6.776,6.468,6.582,6.671,6.697,6.427,6.039,5.793,6.086,5.901
4,Armenia,Commonwealth of Independent States,5.283,0.058,5.397,5.168,9.487,0.799,67.055,0.825,...,4.26,4.32,4.277,4.453,4.348,4.325,4.288,5.062,5.488,


# Clean master table for Machine Learning

In [6]:
# List every column name available in the loaded table
df.columns.tolist()

['country_name',
 'region',
 'ladder_score',
 'stand_err',
 'up_whisker',
 'low_whisker',
 'log_gdp_per_cap',
 'social_support',
 'healty_life_exp',
 'freedom',
 'generosity',
 'percept_corrupt',
 'ladder_score_distopia',
 'expby_log_gdp_per_cap',
 'expby_social_support',
 'expby_healty_life_exp',
 'expby_freedom',
 'expby_generosity',
 'expby_percept_corrupt',
 'distopia_plus_resid',
 'ac_both_sexes',
 'ac_male',
 'ac_female',
 'population_2021',
 'covid_total_cases',
 'covid_new_cases',
 'covid_total_deaths',
 'covid_new_deaths',
 'covid_total_recovered',
 'covid_new_recovered',
 'covid_active_cases',
 'covid_serious_critical',
 'covid_cases_per_mil',
 'covid_deaths_per_mil',
 'covid_total_tests',
 'covid_tests_per_mil',
 'covid_who_region',
 'population_2020',
 'land_area_skm',
 'density_skm',
 'meat_consumption',
 'median_age',
 'screen_time_avg',
 'suicide_rate',
 'ladder_2010',
 'ladder_2011',
 'ladder_2012',
 'ladder_2013',
 'ladder_2014',
 'ladder_2015',
 'ladder_2016',
 'ladde

In [7]:
# Keep only the target (ladder_score) and the 12 candidate predictor columns.
# DataFrame.filter keeps the labels that exist on the axis, so a misspelt
# name here is dropped silently rather than raising.
df2 = df.filter(
    items=[
        'ladder_score',
        'healty_life_exp',
        'covid_tests_per_mil',
        'freedom',
        'social_support',
        'percept_corrupt',
        'log_gdp_per_cap',
        'covid_total_tests',
        'meat_consumption',
        'median_age',
        'generosity',
        'land_area_skm',
        'covid_cases_per_mil',
    ],
    axis='columns',
)

In [8]:
# Count the missing values in each selected column
df2.isna().sum()

ladder_score            0
healty_life_exp         0
covid_tests_per_mil    17
freedom                 0
social_support          0
percept_corrupt         0
log_gdp_per_cap         0
covid_total_tests      17
meat_consumption        9
median_age              5
generosity              0
land_area_skm           8
covid_cases_per_mil     7
dtype: int64

In [9]:
# Total row count, to put the per-column null counts in proportion
len(df2)

149

In [10]:
# Discard every row that has a missing value in at least one selected column
df2 = df2.dropna(axis=0, how='any')

In [11]:
# Re-check the per-column null counts; every column should now report 0
df2.isna().sum()

ladder_score           0
healty_life_exp        0
covid_tests_per_mil    0
freedom                0
social_support         0
percept_corrupt        0
log_gdp_per_cap        0
covid_total_tests      0
meat_consumption       0
median_age             0
generosity             0
land_area_skm          0
covid_cases_per_mil    0
dtype: int64

In [12]:
# Number of rows left in the final dataset used for analysis
len(df2)

125

# Random Forest: Find the 10 variables that impact happiness scores the most 

In [13]:
# Names of the columns in the modelling table (target first, then predictors)
columns = [
    'ladder_score',
    'healty_life_exp',
    'covid_tests_per_mil',
    'freedom',
    'social_support',
    'percept_corrupt',
    'log_gdp_per_cap',
    'covid_total_tests',
    'meat_consumption',
    'median_age',
    'generosity',
    'land_area_skm',
    'covid_cases_per_mil',
]

# Name of the column to predict, kept as a one-element list
target = ['ladder_score']

In [14]:
# Turn the continuous happiness score into integer classes so it can serve as
# a classification target. Casting to 'int' truncates the fractional part
# (e.g. 2.523 -> 2, 5.929 -> 5); it does not round to the nearest integer.
df2 = df2.assign(ladder_score=df2['ladder_score'].astype('int'))

df2.head()

Unnamed: 0,ladder_score,healty_life_exp,covid_tests_per_mil,freedom,social_support,percept_corrupt,log_gdp_per_cap,covid_total_tests,meat_consumption,median_age,generosity,land_area_skm,covid_cases_per_mil
0,2,52.493,2317.0,0.382,0.463,0.924,7.695,90396.0,17.3,27.4,-0.102,652230.0,946.0
3,5,69.0,17564.0,0.828,0.898,0.834,9.962,794544.0,79.7,31.7,-0.182,2780400.0,5044.0
4,5,67.055,57898.0,0.825,0.799,0.629,9.487,171600.0,27.7,35.1,-0.168,29743.0,13435.0
5,7,73.9,181419.0,0.914,0.94,0.442,10.796,4631419.0,108.2,38.7,0.159,7741220.0,779.0
6,7,73.3,104008.0,0.908,0.934,0.481,10.906,937275.0,94.1,44.0,0.042,83871.0,2408.0


In [15]:
# Feature matrix: every column except the target. get_dummies one-hot encodes
# object/categorical columns and leaves numeric columns as they are.
X = pd.get_dummies(df2.drop('ladder_score', axis=1))


# Target vector: the ladder_score column
y = df2.loc[:, 'ladder_score']

In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for each feature
X.describe()

Unnamed: 0,healty_life_exp,covid_tests_per_mil,freedom,social_support,percept_corrupt,log_gdp_per_cap,covid_total_tests,meat_consumption,median_age,generosity,land_area_skm,covid_cases_per_mil
count,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0
mean,65.4872,76278.64,0.800048,0.821624,0.722704,9.520368,2105374.0,47.7176,30.9928,-0.022264,862006.0,3343.968
std,6.429815,125386.061457,0.11233,0.115561,0.188043,1.146408,6767493.0,35.43999,8.90933,0.148387,2197486.0,4456.51166
min,48.7,4.0,0.382,0.463,0.082,6.635,120.0,3.1,15.4,-0.288,298.0,3.0
25%,60.704,8677.0,0.731,0.764,0.667,8.755,125317.0,17.6,23.8,-0.133,56785.0,386.0
50%,66.9,36535.0,0.822,0.844,0.787,9.59,352546.0,39.0,30.2,-0.046,238533.0,1317.0
75%,70.0,92626.0,0.89,0.913,0.845,10.499,1383816.0,71.1,39.6,0.079,716550.0,5044.0
max,76.953,995282.0,0.97,0.983,0.939,11.647,63139600.0,145.9,47.3,0.542,17098240.0,25130.0


In [17]:
# Check the class balance of the target: number of rows per ladder_score value
y.value_counts()

5    39
6    31
4    28
7    17
3     9
2     1
Name: ladder_score, dtype: int64

In [18]:
from sklearn.model_selection import train_test_split

# Split into train and test sets using the default test fraction;
# random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

# Class counts in the training split
Counter(y_train)

Counter({3: 7, 5: 33, 6: 20, 4: 19, 7: 13, 2: 1})

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics leak into training.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

# Fit a plain random forest of 128 trees with a fixed seed.
# No resampling or class balancing is applied here.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78).fit(
    X_train_scaled, y_train
)

# Predict the ladder_score class for each held-out row
predictions = rf_model.predict(X_test_scaled)

In [20]:
# Balanced accuracy (the average of per-class recall) on the test split
acc_score = balanced_accuracy_score(y_true=y_test, y_pred=predictions)
acc_score

0.45353535353535357

In [21]:
# Confusion matrix for the test split: rows are true classes, columns are
# predicted classes
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm

array([[0, 1, 1, 0, 0],
       [1, 5, 3, 0, 0],
       [1, 0, 4, 1, 0],
       [0, 0, 4, 6, 1],
       [0, 0, 0, 2, 2]], dtype=int64)

In [22]:
# Per-class precision, recall, specificity, F1, geometric mean and index of
# balanced accuracy for the test split. The report is a plain string, so it
# is printed to keep its column alignment.
from imblearn.metrics import classification_report_imbalanced

print(classification_report_imbalanced(y_true=y_test, y_pred=predictions))

                   pre       rec       spe        f1       geo       iba       sup

          3       0.00      0.00      0.93      0.00      0.00      0.00         2
          4       0.83      0.56      0.96      0.67      0.73      0.51         9
          5       0.33      0.67      0.69      0.44      0.68      0.46         6
          6       0.67      0.55      0.86      0.60      0.68      0.45        11
          7       0.67      0.50      0.96      0.57      0.69      0.46         4

avg / total       0.61      0.53      0.87      0.55      0.65      0.44        32



In [23]:
# Write the cleaned modelling table to CSV in the working directory, without
# the row index. ladder_score is exported in its integer (truncated) form,
# because df2 was recast before modelling.
df2.to_csv(Path('happiness_top12_df.csv'), index=False)