In [1]:
from numpy import array
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
import pandas as pd
import statistics
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.impute import KNNImputer
from sklearn.metrics import roc_auc_score
from sklearn import metrics
import numpy as np

# data sample
data = pd.read_csv('../clustering_data_with_noout_cols.csv')

with open("features_final.txt") as f:
    features = f.readlines()
    features = [f.replace('\n', '') for f in features]
    

In [2]:
features = features[:5]
features

['previous_period_happiness_change',
 'Year',
 'mean_income_support_no_outliers',
 'productivity',
 'n_is_claimants']

In [4]:
# prepare cross validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

imputed_full = SimpleImputer(strategy='most_frequent').fit_transform(data)
imputed_full = pd.DataFrame(imputed_full, columns=data.columns)

RF_scores = []
LR_scores = []
SVM_scores = []
XGB_scores = []
GB_scores = []

RF_aucs = []
LR_aucs = []
SVM_aucs = []
XGB_aucs = []
GB_aucs = []

RF_cm = []
LR_cm = []
SVM_cm = []
XGB_cm = []
GB_cm = []

i=0

# enumerate splits
for train, test in kfold.split(data):
    y_train = data['mean_happiness_binary'].iloc[train]
    X_train = data[features].iloc[train]
    y_test = data['mean_happiness_binary'].iloc[test]
    imputer = KNNImputer(n_neighbors=50, weights='distance')

    # imputing training
    imputer.fit(X_train)
    X_train_knn = imputer.transform(X_train)
    x_train_imputed = pd.DataFrame(X_train_knn, columns=X_train.columns)

    # imputing test data
    feature_data = data[features]
    X_full_imputed = imputer.fit_transform(feature_data)
    X_full_knn_df = pd.DataFrame(X_full_imputed, columns=feature_data.columns)
    x_test_imputed = X_full_knn_df.iloc[test]

    #run model
    LR = LogisticRegression(solver='lbfgs', multi_class='multinomial')
    LR.fit(x_train_imputed, y_train)
    LR_scores.append(LR.score(x_test_imputed, y_test))
    LR_y_pred = LR.predict(x_test_imputed)
    LR_aucs.append(roc_auc_score(y_test, LR_y_pred))
    LR_cm.append(metrics.confusion_matrix(y_test, LR_y_pred))

    RF = RandomForestClassifier(n_estimators=1000, max_depth=10)
    RF.fit(x_train_imputed, y_train)
    RF_scores.append(RF.score(x_test_imputed, y_test))
    RF_y_pred = RF.predict(x_test_imputed)
    RF_aucs.append(roc_auc_score(y_test, RF_y_pred))
    RF_cm.append(metrics.confusion_matrix(y_test, RF_y_pred))


    SVM_ovo = svm.SVC(kernel="linear")
    SVM_ovo.fit(x_train_imputed, y_train)
    SVM_scores.append(SVM_ovo.score(x_test_imputed, y_test))
    SVM_y_pred = SVM_ovo.predict(x_test_imputed)
    SVM_aucs.append(roc_auc_score(y_test, SVM_y_pred))
    SVM_cm.append(metrics.confusion_matrix(y_test, SVM_y_pred))

    GB = GradientBoostingClassifier(n_estimators=1000, max_depth=10)
    GB.fit(x_train_imputed, y_train)
    GB_scores.append(GB.score(x_test_imputed, y_test))
    GB_y_pred = GB.predict(x_test_imputed)
    GB_aucs.append(roc_auc_score(y_test, GB_y_pred))
    GB_cm.append(metrics.confusion_matrix(y_test, GB_y_pred))
    
    print(i)
    i+=1

0
1
2
3


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


4
5
6
7
8
9


# Accuracies

In [17]:
print('Linear Regression')
print(statistics.mean(LR_scores))

print('\nRandom Forest')
print(statistics.mean(RF_scores))

print('\nSVM ono')
print(statistics.mean(SVM_scores))

print('\nGradient Boost')
print(statistics.mean(GB_scores))

Linear Regression
0.6459507612568837

Random Forest
0.6535462806676555

SVM ono
0.6500366562665166

Gradient Boost
0.6343034456890526


# AUC ROC

In [18]:
print('Linear Regression')
print(statistics.mean(LR_aucs))

print('\nRandom Forest')
print(statistics.mean(RF_aucs))

print('\nSVM ono')
print(statistics.mean(SVM_aucs))

print('\nGradient Boost')
print(statistics.mean(GB_aucs))

Linear Regression
0.6453973206577325

Random Forest
0.6522344664617421

SVM ono
0.6495413790202341

Gradient Boost
0.6337605301593765


# Confusion Matrix

In [None]:
'''cm = ConfusionMatrix(y_test.to_numpy(), y_pred, digit=5)
cm.save_html('RF_pycm2.html')

from pycm import*

from pandas_profiling import ProfileReport
profile = ProfileReport(df_num, title="Pandas Profiling Report",vars={"num":{"low_categorical_threshold":0}})
profile

'''

In [21]:
for cm in [RF_cm, LR_cm, SVM_cm, GB_cm]:
    cm_flat = [testcase.ravel() for testcase in cm]
    tn, fp, fn, tp = np.mean(cm_flat, axis=0)
    print(tp, tn, fp, fn)

108.1 116.0 55.3 63.5
110.4 111.1 60.2 61.2
112.3 110.6 60.7 59.3
107.0 110.5 60.8 64.6


# Exploration

In [23]:
from pycm import *
cm = ConfusionMatrix(y_test.to_numpy(), RF_y_pred, digit=5)
cm.save_html('RF.html')

{'Status': True,
 'Message': 'C:\\Users\\gv9\\Uni\\Year 2\\2 - Applied ML\\Predicting Happiness\\Notebooks\\RF.html.html'}