In [1]:
import os
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
import psycopg2

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Connect to Database

In [3]:
# Connect to the PostgreSQL database (the host name points at an AWS RDS instance).
# The password is read from the PGPASSWORD environment variable so the secret is
# never stored in the notebook; a missing variable fails fast with a KeyError.
# NOTE(review): the password that was previously committed in this cell should be rotated.
# Note that `engine` is a raw psycopg2 connection object, not a SQLAlchemy engine.
engine = psycopg2.connect(
    database="project-data",
    user="postgres",
    password=os.environ["PGPASSWORD"],
    host="happinessproject.cqkpnj5dcmou.us-east-1.rds.amazonaws.com",
    port='5432'
)

# Cursor for issuing raw SQL statements on this connection.
cursor = engine.cursor()


In [4]:
# Load the full combined_data table from the database into a DataFrame.
query = "SELECT * from combined_data"
df = pd.read_sql(sql=query, con=engine)

In [5]:
# Preview the first rows of the table that was just pulled from the database.
df.head()

Unnamed: 0,country_name,region,ladder_score,stand_err,up_whisker,low_whisker,log_gdp_per_cap,social_support,healty_life_exp,freedom,...,ladder_2010,ladder_2012,ladder_2013,ladder_2014,ladder_2015,ladder_2016,ladder_2017,ladder_2018,ladder_2019,ladder_2020
0,Brazil,Latin America and Caribbean,6.33,0.043,6.415,6.245,9.577,0.882,66.601,0.804,...,6.837,7.038,7.14,6.981,6.547,6.375,6.333,6.191,6.451,6.11
1,India,South Asia,3.819,0.026,3.869,3.769,8.755,0.603,60.633,0.893,...,4.989,4.635,4.428,4.424,4.342,4.179,4.046,3.818,3.249,4.225
2,Russia,Commonwealth of Independent States,5.477,0.033,5.541,5.413,10.189,0.903,64.703,0.718,...,5.385,5.389,5.537,6.037,5.996,5.855,5.579,5.514,5.441,5.495
3,South Africa,Sub-Saharan Africa,4.956,0.06,5.074,4.839,9.403,0.86,56.904,0.749,...,4.652,4.931,3.661,4.828,4.887,4.77,4.514,4.884,5.035,4.947
4,Mexico,Latin America and Caribbean,6.317,0.053,6.42,6.213,9.859,0.831,68.597,0.862,...,6.802,6.91,7.443,6.68,6.236,6.824,6.41,6.55,6.432,5.964


# Clean master table for Machine Learning

In [6]:
# Function to drop sparse columns, then drop every row that still contains NA values

def remove_percent_missing(perc):
    """Clean the module-level ``df`` in place and print its new row count.

    Columns are removed first, then rows:

    1. A column is kept only if it has at least
       ``int((100 - perc) / 100 * n_rows + 1)`` non-NA values, so any column
       whose share of missing values is ``perc`` percent or more is dropped.
       Because of the ``+ 1``, ``perc=0`` drops every column.
    2. Any row that still contains an NA value in the remaining columns is
       dropped.

    Args:
        perc: Missing-value percentage as a whole number between 0 and 100.

    Raises:
        ValueError: If ``perc`` is outside the range 0-100. Such values used
            to silently drop every column (negative) or no column (above 100).

    Side effects:
        Rebinds the global ``df`` to the cleaned frame and prints ``len(df)``.
    """
    global df
    if not 0 <= perc <= 100:
        raise ValueError(f"perc must be between 0 and 100, got {perc!r}")

    # Minimum number of non-NA values a column needs in order to be kept.
    min_count = int(((100 - perc) / 100) * df.shape[0] + 1)
    dense_columns = df.dropna(axis=1, thresh=min_count)

    # Drop the rows that still have gaps, then re-wrap in a fresh DataFrame
    # object (the re-wrap is kept from the original implementation).
    df = pd.DataFrame(dense_columns.dropna())

    print(len(df))

In [7]:
#Drop columns not needed for analysis
# Columns that are not used in the analysis below.
columns_to_drop = [
    "population_2020",
    "ladder_2010", "ladder_2012", "ladder_2013", "ladder_2014", "ladder_2015",
    "ladder_2016", "ladder_2017", "ladder_2018", "ladder_2019", "ladder_2020",
    "country_name", "up_whisker", "low_whisker",
    "expby_log_gdp_per_cap", "expby_social_support", "expby_healty_life_exp",
    "expby_freedom", "expby_generosity", "expby_percept_corrupt",
    "stand_err", "distopia_plus_resid", "ac_both_sexes", "region",
]

# drop() raises KeyError if any of these columns is missing, so this cell
# cannot be re-run on an already-trimmed frame.
df = df.drop(columns_to_drop, axis="columns")

# Enter the missing-value percentage (as a whole number) at which a column is removed; the resulting dataset length is displayed

In [8]:
# Missing-value percentage (whole number) at or above which a column is removed.
# The function requires this argument; calling it with none raises TypeError.
# NOTE(review): the threshold used for the recorded run (84 rows) was not saved in
# the notebook -- 10 is a placeholder; confirm the value that reproduces that output.
MAX_MISSING_PERCENT = 10
remove_percent_missing(MAX_MISSING_PERCENT)

84


# Random Forest: Find the 12 variables that impact happiness scores the most 

In [9]:
# Column names of the combined table and the name of the target variable.
# Fixes over the previous list:
#   * "healty_life_exp" was missing -- "expby_social_support" had been typed in its
#     place (and therefore appeared twice);
#   * "land_area_skm" was listed twice.
# Spellings such as "healty_life_exp" match the database column names exactly.
# Several of these columns are dropped from df by an earlier cell; the cells that
# follow build X and y directly from df.
columns = [
    "country_name", "region", "ladder_score", "stand_err",
    "up_whisker", "low_whisker", "log_gdp_per_cap", "social_support",
    "healty_life_exp", "freedom", "generosity", "percept_corrupt",
    "ladder_score_distopia", "expby_log_gdp_per_cap", "expby_social_support", "expby_healty_life_exp",
    "expby_freedom", "expby_generosity", "expby_percept_corrupt", "distopia_plus_resid", "ac_both_sexes", "ac_male",
    "ac_female", "population_2021", "covid_total_cases", "covid_new_cases", "covid_total_deaths", "covid_total_recovered",
    "covid_new_recovered", "covid_active_cases", "covid_serious_critical", "covid_cases_per_mil", "covid_deaths_per_mil",
    "covid_total_tests", "covid_tests_per_mil", "covid_who_region", "population_2020", "land_area_skm",
    "density_skm", "meat_consumption", "median_age", "screen_time_avg", "suicide_rate", "ladder_2010", "ladder_2012",
    "ladder_2013", "ladder_2014", "ladder_2015", "ladder_2016", "ladder_2017", "ladder_2018", "ladder_2019", "ladder_2020"
]

target = ["ladder_score"]

In [10]:
# Turn the continuous target into discrete class labels by casting it to int.
# The cast truncates the decimals (e.g. 6.33 becomes 6).
df = df.assign(ladder_score=df["ladder_score"].astype("int"))

df.head()

Unnamed: 0,ladder_score,log_gdp_per_cap,social_support,healty_life_exp,freedom,generosity,percept_corrupt,ladder_score_distopia,ac_male,ac_female,...,covid_cases_per_mil,covid_deaths_per_mil,covid_total_tests,covid_tests_per_mil,covid_who_region,land_area_skm,density_skm,meat_consumption,median_age,suicide_rate
0,6,9.577,0.882,66.601,0.804,-0.071,0.756,2.43,11.65,3.22,...,13716.0,464.0,13206188.0,62085.0,Americas,8515770.0,25.06,82.4,32.6,6.1
1,3,8.755,0.603,60.633,0.893,0.089,0.774,2.43,9.06,1.89,...,1466.0,30.0,22149351.0,16035.0,South-EastAsia,3287263.0,454.94,5.2,28.1,16.5
2,5,10.189,0.903,64.703,0.718,-0.111,0.845,2.43,17.99,4.31,...,5974.0,100.0,29716907.0,203623.0,Europe,17098242.0,8.82,51.0,39.6,26.5
3,4,9.403,0.86,56.904,0.749,-0.067,0.86,2.43,15.74,3.46,...,9063.0,162.0,3149807.0,53044.0,Africa,1219090.0,47.63,39.0,27.1,12.8
4,6,9.859,0.831,68.597,0.862,-0.147,0.799,2.43,8.13,2.18,...,3585.0,391.0,1056915.0,8189.0,Americas,1964375.0,64.91,58.6,28.3,5.2


In [11]:
# Target: the integer ladder_score class of each row.
y = df["ladder_score"]

# Features: every other column, with non-numeric columns one-hot encoded
# (e.g. covid_who_region becomes one indicator column per region).
X = pd.get_dummies(df.drop(columns=["ladder_score"]))

In [12]:
# Summary statistics for every column of the encoded feature matrix.
X.describe()

Unnamed: 0,log_gdp_per_cap,social_support,healty_life_exp,freedom,generosity,percept_corrupt,ladder_score_distopia,ac_male,ac_female,population_2021,...,density_skm,meat_consumption,median_age,suicide_rate,covid_who_region_Africa,covid_who_region_Americas,covid_who_region_EasternMediterranean,covid_who_region_Europe,covid_who_region_South-EastAsia,covid_who_region_WesternPacific
count,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,...,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0,84.0
mean,9.627226,0.825512,65.97369,0.795107,-0.04969,0.75319,2.43,9.467762,2.65256,51626320.0,...,160.061786,48.544048,31.216667,9.504762,0.190476,0.214286,0.142857,0.357143,0.047619,0.047619
std,1.003613,0.112456,5.847626,0.113524,0.130379,0.164949,4.467564e-15,6.536607,2.040093,154404200.0,...,293.808761,33.012781,8.604648,5.216723,0.395035,0.41279,0.352029,0.482035,0.214238,0.214238
min,6.958,0.463,50.102,0.382,-0.288,0.179,2.43,0.0,0.0,541448.0,...,2.97,5.1,16.5,2.2,0.0,0.0,0.0,0.0,0.0,0.0
25%,9.05375,0.77375,61.9995,0.74075,-0.1475,0.72,2.43,3.605,0.7775,6523238.0,...,45.045,21.225,24.2,5.425,0.0,0.0,0.0,0.0,0.0,0.0
50%,9.6595,0.8505,67.0,0.818,-0.075,0.802,2.43,8.95,2.295,17226350.0,...,87.11,42.5,29.95,9.1,0.0,0.0,0.0,0.0,0.0,0.0
75%,10.37225,0.90375,69.7765,0.8775,0.03725,0.857,2.43,15.58,4.42,41155980.0,...,144.8925,70.4,39.375,12.275,0.0,0.0,0.0,1.0,0.0,0.0
max,11.647,0.954,75.1,0.97,0.311,0.939,2.43,20.61,6.34,1381345000.0,...,2012.1,145.9,47.3,26.5,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
# Check the balance of our target values: number of rows per integer
# ladder_score class, most frequent class first.
y.value_counts()

5    26
4    22
6    21
7    11
3     3
2     1
Name: ladder_score, dtype: int64

In [14]:
from sklearn.model_selection import train_test_split

# Split features and target into training and testing portions; the fixed
# random_state keeps the split reproducible between runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=1)

# Number of training rows per target class.
Counter(y_train)

Counter({5: 17, 4: 16, 6: 18, 7: 9, 2: 1, 3: 2})

In [15]:
# Fit a RandomForestClassifier on standardized features and predict the test set.
# No resampling is performed here: this is a plain RandomForestClassifier, not a
# BalancedRandomForestClassifier.
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training data only, then apply the same
# transformation to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

# Build and train the forest (fixed seed for reproducible trees).
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)
rf_model = rf_model.fit(X_train_scaled, y_train)

# Predicted class for every row of the test split.
predictions = rf_model.predict(X_test_scaled)

In [16]:
# Balanced accuracy of the test-set predictions.
acc_score = balanced_accuracy_score(y_true=y_test, y_pred=predictions)
acc_score

0.5222222222222221

In [17]:
# Confusion matrix of true test labels against predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm

array([[0, 1, 0, 0, 0],
       [0, 5, 1, 0, 0],
       [0, 2, 4, 3, 0],
       [0, 0, 2, 1, 0],
       [0, 0, 0, 0, 2]], dtype=int64)

In [18]:
# Build the imbalanced classification report for the test-set predictions
# and print it as text.
from imblearn.metrics import classification_report_imbalanced

report = classification_report_imbalanced(y_test, predictions)
print(report)

                   pre       rec       spe        f1       geo       iba       sup

          3       0.00      0.00      1.00      0.00      0.00      0.00         1
          4       0.62      0.83      0.80      0.71      0.82      0.67         6
          5       0.57      0.44      0.75      0.50      0.58      0.32         9
          6       0.25      0.33      0.83      0.29      0.53      0.26         3
          7       1.00      1.00      1.00      1.00      1.00      1.00         2

avg / total       0.55      0.57      0.81      0.55      0.65      0.46        21



In [19]:
# Pair every importance value with its feature name, then rank the pairs from
# the most to the least important feature.
importance_by_feature = list(zip(rf_model.feature_importances_, X.columns))
importance_by_feature.sort(reverse=True)
importance_by_feature

[(0.09145369519398287, 'freedom'),
 (0.08721176031971921, 'healty_life_exp'),
 (0.0735706148299859, 'log_gdp_per_cap'),
 (0.07156833530690546, 'meat_consumption'),
 (0.07140308235654569, 'percept_corrupt'),
 (0.05753020288245204, 'social_support'),
 (0.05521985879344281, 'covid_tests_per_mil'),
 (0.040222600120236984, 'ac_female'),
 (0.04003538838360513, 'generosity'),
 (0.03547429134794273, 'covid_deaths_per_mil'),
 (0.035418259737926706, 'covid_total_tests'),
 (0.030923970939754415, 'covid_active_cases'),
 (0.029681534205639067, 'covid_total_cases'),
 (0.02877732729211083, 'ac_male'),
 (0.02798580323968175, 'median_age'),
 (0.02795542809010666, 'covid_cases_per_mil'),
 (0.026439806405656106, 'land_area_skm'),
 (0.025180021387184975, 'density_skm'),
 (0.02408746291693555, 'suicide_rate'),
 (0.024071372072934236, 'covid_total_recovered'),
 (0.02268447469429737, 'population_2021'),
 (0.022099888583178972, 'covid_serious_critical'),
 (0.020424968738411722, 'covid_total_deaths'),
 (0.0103

# Export Machine Learning Dataset

In [20]:
##export DataFrame to CSV file
#df.to_csv('happiness_df.csv', index=False)