# Imports

In [1]:
import pandas as pd
import numpy as np
from numpy import where
import matplotlib.pyplot as plt

import sklearn as sk
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

from sklearn import metrics
import seaborn as sns
from sklearn.inspection import permutation_importance
from sklearn.ensemble import GradientBoostingClassifier
from pycm import *
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn import tree
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from collections import Counter
import xgboost as xgb
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from boruta import BorutaPy
import ppscore as pps

# Feature Selection

In [2]:
X_test = pd.read_excel(f'../processed_data/outliers_removed/imputed/knn/x_test.xlsx')
X_train = pd.read_excel(f'../processed_data/outliers_removed/imputed/knn/x_train.xlsx')
y_train = pd.read_excel(f'../processed_data/outliers_removed/imputed/knn/y_train.xlsx').values.ravel()
y_test = pd.read_excel(f'../processed_data/outliers_removed/imputed/knn/y_test.xlsx').values.ravel()

In [3]:
X_test.columns

Index(['Year', 'suicide_rates', 'crimes', 'population', 'population_density',
       'unemployment', 'mean_income_support', 'n_is_claimants', 'house_sales',
       'bankruptcy', 'processing', 'productivity', 'road_casualties', 'ofsted',
       'infant_mortality', 'childhood_tooth_decay', 'gp_access',
       'dental_access', 'Core City', 'Other City', 'Large Town', 'Medium Town',
       'Small Town', 'Village or smaller', 'gardens_perc',
       'income_support_rate', 'house_sales_pp', 'bankruptcy_pp',
       'no_rainy_days', 'inflation', 'previous_period_happiness_change',
       'Year_no_outliers', 'suicide_rates_no_outliers', 'crimes_no_outliers',
       'population_no_outliers', 'population_density_no_outliers',
       'unemployment_no_outliers', 'mean_income_support_no_outliers',
       'n_is_claimants_no_outliers', 'house_sales_no_outliers',
       'bankruptcy_no_outliers', 'processing_no_outliers',
       'productivity_no_outliers', 'road_casualties_no_outliers',
       'ofsted_no

# Barouta

In [4]:
# define random forest classifier, with utilising all cores and
# sampling in proportion to y labels


def run_barouta(model, X_test, y_test):
    # define Boruta feature selection method
    feat_selector = BorutaPy(model, n_estimators='auto', verbose=2, random_state=42)

    # find all relevant features - 5 features should be selected
    feat_selector.fit(X_test.values, y_test)

    # call transform() on X to filter it down to selected features
    X_filtered = feat_selector.transform(X_test.values)

    # zip my names, ranks, and decisions in a single iterable
    feature_ranks = list(zip(X_test.columns, 
                             feat_selector.ranking_, 
                             feat_selector.support_))

    display(feature_ranks)
    feature_ranks_dict = {}

    for i in feature_ranks:
        name, rank, valid = i
        feature_ranks_dict[name] = rank

    return feature_ranks_dict

In [5]:
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
barouta_rf_dict = run_barouta(rf, X_test, y_test)
barouta_rf_dict

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	62
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	62
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	62
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	62
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	62
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	62
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	62
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	0
Tentative: 	9
Rejected: 	53
Iteration: 	9 / 100
Confirmed: 	2
Tentative: 	7
Rejected: 	53
Iteration: 	10 / 100
Confirmed: 	2
Tentative: 	7
Rejected: 	53
Iteration: 	11 / 100
Confirmed: 	2
Tentative: 	7
Rejected: 	53
Iteration: 	12 / 100
Confirmed: 	2
Tentative: 	7
Rejected: 	53
Iteration: 	13 / 100
Confirmed: 	2
Tentative: 	7
Rejected: 	53
Iteration: 	14 / 100
Confirmed: 	2
Tentative: 	7
Rejected: 	53
Iteration: 	15 / 100
Confirmed: 	2
Tentative: 	7
Rejected: 	53
Iteration: 	16 / 100
Confirmed: 	2
Tentative: 	7
Rejected: 	53
I

[('Year', 2, False),
 ('suicide_rates', 51, False),
 ('crimes', 8, False),
 ('population', 42, False),
 ('population_density', 30, False),
 ('unemployment', 33, False),
 ('mean_income_support', 3, False),
 ('n_is_claimants', 21, False),
 ('house_sales', 10, False),
 ('bankruptcy', 41, False),
 ('processing', 17, False),
 ('productivity', 7, False),
 ('road_casualties', 16, False),
 ('ofsted', 37, False),
 ('infant_mortality', 32, False),
 ('childhood_tooth_decay', 1, True),
 ('gp_access', 25, False),
 ('dental_access', 4, False),
 ('Core City', 55, False),
 ('Other City', 56, False),
 ('Large Town', 52, False),
 ('Medium Town', 54, False),
 ('Small Town', 53, False),
 ('Village or smaller', 28, False),
 ('gardens_perc', 13, False),
 ('income_support_rate', 39, False),
 ('house_sales_pp', 28, False),
 ('bankruptcy_pp', 36, False),
 ('no_rainy_days', 46, False),
 ('inflation', 48, False),
 ('previous_period_happiness_change', 1, True),
 ('Year_no_outliers', 1, True),
 ('suicide_rates_no_

{'Year': 2,
 'suicide_rates': 51,
 'crimes': 8,
 'population': 42,
 'population_density': 30,
 'unemployment': 33,
 'mean_income_support': 3,
 'n_is_claimants': 21,
 'house_sales': 10,
 'bankruptcy': 41,
 'processing': 17,
 'productivity': 7,
 'road_casualties': 16,
 'ofsted': 37,
 'infant_mortality': 32,
 'childhood_tooth_decay': 1,
 'gp_access': 25,
 'dental_access': 4,
 'Core City': 55,
 'Other City': 56,
 'Large Town': 52,
 'Medium Town': 54,
 'Small Town': 53,
 'Village or smaller': 28,
 'gardens_perc': 13,
 'income_support_rate': 39,
 'house_sales_pp': 28,
 'bankruptcy_pp': 36,
 'no_rainy_days': 46,
 'inflation': 48,
 'previous_period_happiness_change': 1,
 'Year_no_outliers': 1,
 'suicide_rates_no_outliers': 50,
 'crimes_no_outliers': 12,
 'population_no_outliers': 43,
 'population_density_no_outliers': 32,
 'unemployment_no_outliers': 46,
 'mean_income_support_no_outliers': 3,
 'n_is_claimants_no_outliers': 24,
 'house_sales_no_outliers': 5,
 'bankruptcy_no_outliers': 44,
 'pro

# Sci Kit Learn

In [6]:
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
rf.fit(X_train, y_train)

scikitlearn_features = pd.DataFrame({'feature_importance': rf.feature_importances_}, index=X_train.columns)
scikitlearn_features['SciKitLearn: RandomForest'] = scikitlearn_features['feature_importance'].rank(method='dense', ascending=False)

scikitlearn_features = scikitlearn_features.drop('feature_importance', axis=1)
scikitlearn_features

Unnamed: 0,SciKitLearn: RandomForest
Year,3.0
suicide_rates,53.0
crimes,9.0
population,47.0
population_density,59.0
...,...
house_sales_pp_no_outliers,31.0
bankruptcy_pp_no_outliers,43.0
no_rainy_days_no_outliers,18.0
inflation_no_outliers,27.0


# Predictive Power Score

In [7]:
df = X_train.copy()
df['mean_happiness'] = y_train

pps_df = pps.predictors(df, "mean_happiness")
pps_df['pps_rank'] = pps_df['ppscore'].rank(method='dense', ascending=False)

# Correlation

In [8]:
corr_dict = {}

for col in X_train.columns:
    corr = np.corrcoef(X_train[col], y_train)[0][1]
    corr_dict[col] = corr

corr_dict = {key: rank for rank, key in enumerate(sorted(corr_dict, key=corr_dict.get), 1)}
corr_dict

{'previous_period_happiness_change': 1,
 'previous_period_happiness_change_no_outliers': 2,
 'Year_no_outliers': 3,
 'Year': 4,
 'mean_income_support_no_outliers': 5,
 'mean_income_support': 6,
 'unemployment_no_outliers': 7,
 'unemployment': 8,
 'no_rainy_days_no_outliers': 9,
 'no_rainy_days': 10,
 'bankruptcy_pp': 11,
 'n_is_claimants': 12,
 'bankruptcy_pp_no_outliers': 13,
 'n_is_claimants_no_outliers': 14,
 'Village or smaller': 15,
 'Village or smaller_no_outliers': 16,
 'bankruptcy': 17,
 'bankruptcy_no_outliers': 18,
 'Medium Town_no_outliers': 19,
 'income_support_rate_no_outliers': 20,
 'Small Town': 21,
 'productivity': 22,
 'productivity_no_outliers': 23,
 'Medium Town': 24,
 'road_casualties_no_outliers': 25,
 'gp_access': 26,
 'income_support_rate': 27,
 'gp_access_no_outliers': 28,
 'house_sales_pp': 29,
 'Small Town_no_outliers': 30,
 'road_casualties': 31,
 'house_sales_pp_no_outliers': 32,
 'Large Town': 33,
 'Other City': 34,
 'Large Town_no_outliers': 35,
 'Core Cit

# Comparison

In [9]:
feature_comparison_df = pps_df[['x', 'pps_rank']].set_index('x')
feature_comparison_df['Barouta: RandomForest'] = feature_comparison_df.index.map(barouta_rf_dict)

feature_comparison_df['correlation'] = feature_comparison_df.index.map(corr_dict)

feature_comparison_df['average_rank'] = feature_comparison_df.mean(axis=1)
feature_comparison_df = feature_comparison_df.sort_values('average_rank')

feature_comparison_df.head(35)

Unnamed: 0_level_0,pps_rank,Barouta: RandomForest,correlation,average_rank
x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
previous_period_happiness_change,1.0,1,1,1.0
previous_period_happiness_change_no_outliers,2.0,1,2,1.666667
Year,4.0,2,4,3.333333
Year_no_outliers,6.0,1,3,3.333333
mean_income_support_no_outliers,18.0,3,5,8.666667
mean_income_support,18.0,3,6,9.0
productivity,8.0,7,22,12.333333
productivity_no_outliers,7.0,19,23,16.333333
n_is_claimants,18.0,21,12,17.0
no_rainy_days_no_outliers,3.0,40,9,17.333333


In [10]:
textfile = open("features.txt", "w")
for element in feature_comparison_df.index:
    textfile.write(element + "\n")
textfile.close()