In [1]:
import pandas as pd  # to handle data
import numpy as np 
import datetime  # to get the current year value
import statsmodels.api as sm

import matplotlib.pyplot as plt
import seaborn as sn

from sklearn.model_selection import train_test_split,GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, roc_curve, auc, roc_auc_score
from sklearn.linear_model import LogisticRegression

from preprocessing import *

In [2]:
def to_df_restore_dtypes(data_array, og_df, exclude):
    """Wrap ``data_array`` in a DataFrame and restore selected column dtypes.

    The array is labelled with the columns that remain in ``og_df`` once
    ``exclude`` has been dropped. Columns that are ``bool``, ``int64`` or
    ``string`` in ``og_df`` are cast back to that dtype; no other column is
    cast.

    Parameters
    ----------
    data_array : array-like
        2-D data whose columns follow the order of ``og_df`` without ``exclude``.
    og_df : pandas.DataFrame
        Frame that supplies the column names and the reference dtypes.
    exclude : label or list of labels
        Column(s) of ``og_df`` that are not present in ``data_array``.

    Returns
    -------
    pandas.DataFrame
        The labelled data with bool / int64 / string columns restored.
    """
    reference = og_df.drop(exclude, axis=1)
    restored = pd.DataFrame(data_array, columns=reference.columns.tolist())

    # Cast in a fixed order: booleans, then integers, then strings.
    for dtype_name in ('bool', 'int64', 'string'):
        matching = reference.select_dtypes(include=dtype_name).columns.tolist()
        restored[matching] = restored[matching].astype(dtype_name)

    return restored

In [3]:
def print_confusion_matrix(true, pred, t=0.5):
    """Threshold ``pred`` at ``t`` and print the binary confusion matrix.

    Parameters
    ----------
    true : array-like
        Ground-truth labels, encoded as 0 (negative) / 1 (positive).
    pred : array-like of float
        Scores or probabilities; values ``>= t`` are counted as class 1.
    t : float, default 0.5
        Decision threshold applied to ``pred``.

    Returns
    -------
    None
        The matrix and the TP / FP / TN / FN counts are printed.
    """
    pred_binary = (np.asarray(pred) >= t).astype(int)

    # Pin the label order so the matrix is always 2x2. Without `labels`,
    # inputs that contain a single class yield a 1x1 matrix and the
    # four-way unpacking below raises ValueError.
    cm = confusion_matrix(true, pred_binary, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    print('confusion matrix:')
    print(cm)
    print()
    print('true positives: ', tp)
    print('false positives:', fp)
    print('true negatives: ', tn)
    print('false negatives:', fn)
    print()

# Split the data

In [5]:
# Month-3 training data (it carries the `target` column) and the month-3 test file.
train_m3 = pd.read_csv('data/train_month_3_with_target.csv')
test_m3 = pd.read_csv('data/test_month_3.csv')

# Separate the label column from the features and continue with plain arrays.
y = train_m3['target'].to_numpy()
X = train_m3.drop(columns='target').to_numpy()

# Stratified 75/25 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

The features below were selected in other notebooks.

In [6]:
# Selected features under their original column names.
feature_set_oldNames = [
    'visits_distinct_so_areas',
    'prem_fire_car_other_insurance',
    'customer_self_employed',
    'customer_gender',
    'has_insurance_21',
    'customer_age',
    'bal_savings_account_starter',
    'has_current_account',
    'bal_current_account_starter',
    'bal_savings_account',
    'bal_mortgage_loan',
    'bal_personal_loan',
]

# The preprocessed frames are indexed with these names prefixed by "remainder__".
feature_set = [f"remainder__{name}" for name in feature_set_oldNames]
feature_set

['remainder__visits_distinct_so_areas',
 'remainder__prem_fire_car_other_insurance',
 'remainder__customer_self_employed',
 'remainder__customer_gender',
 'remainder__has_insurance_21',
 'remainder__customer_age',
 'remainder__bal_savings_account_starter',
 'remainder__has_current_account',
 'remainder__bal_current_account_starter',
 'remainder__bal_savings_account',
 'remainder__bal_mortgage_loan',
 'remainder__bal_personal_loan']

In [7]:
# Preprocess: rebuild typed DataFrames from the split arrays first.
X_train_df = to_df_restore_dtypes(X_train, train_m3, 'target')
X_test_df = to_df_restore_dtypes(X_test, train_m3, 'target')

# NOTE(review): test_m3 is an input here and is overwritten just below, so
# re-running this cell feeds the already-preprocessed frame to preprocess()
# again. Re-run the loading cell first if this cell has to be repeated.
data = preprocess(X_train_df, X_test_df, test_m3, 0)
X_train_df, X_test_df, test_m3 = data[0], data[1], data[2]

# Columns the model is trained on. To train on every column instead, use all
# of X_train_df's columns except 'remainder__client_id'.
used_features = feature_set

# Take the client ids out of the hold-out frame before narrowing it down.
test_client_id = X_test_df.pop('remainder__client_id')

# Keep only the model features and convert to arrays.
X_train_df = X_train_df[used_features]
X_test_df = X_test_df[used_features]
X_train_arr = X_train_df.to_numpy()
X_test_arr = X_test_df.to_numpy()

# Cast the targets to int.
y_train = y_train.astype(int)
y_test = y_test.astype(int)

# Logistic regression

Logistic regression without any sort of class weights.

In [8]:
# Baseline model: liblinear solver, no class weights.
LogReg = LogisticRegression(solver="liblinear", random_state=42)
LogReg.fit(X_train_arr, y_train)

# Hold-out predictions: hard labels and the second predict_proba column.
LR_pred_class = LogReg.predict(X_test_arr)
LR_pred_prob = LogReg.predict_proba(X_test_arr)[:, 1]

In [9]:
# Hold-out results of the baseline model, one row per observation.
prediction = pd.DataFrame({
    'prob': LR_pred_prob,
    'true': y_test.tolist(),
    'pred': LR_pred_class,
})

# Rank by predicted probability, highest first.
sorted_df = prediction.sort_values(by='prob', ascending=False)

# The 250 highest-ranked observations are classified as positives ...
top_250 = sorted_df.head(250)
y_prob_250 = top_250['prob'].to_numpy()
y_true_250 = top_250['true'].to_numpy()
y_pred_250 = np.ones(y_true_250.shape)

# ... and every observation below that cut-off as a negative.
rest = sorted_df.iloc[250:]
y_prob_rest = rest['prob'].to_numpy()
y_true_rest = rest['true'].to_numpy()
y_pred_rest = np.zeros(y_true_rest.shape)

# Stitch both parts back together, in ranked order, and evaluate.
new_y_true = np.concatenate([y_true_250, y_true_rest])
new_y_pred = np.concatenate([y_pred_250, y_pred_rest])

print_confusion_matrix(new_y_true, new_y_pred)

confusion matrix:
[[15214   233]
 [  461    17]]

true positives:  17
false positives: 233
true negatives:  15214
false negatives: 461



# Class weights

Below we run a grid search over the weights assigned to the two classes. Each value in `weights` is used as the weight of class 0, and class 1 receives the complement (`1 - weight`). For every candidate we run 5-fold stratified cross-validation and keep the candidate with the highest mean ROC AUC (`scoring="roc_auc"`).

In [30]:
# Candidate weights for class 0: 200 evenly spaced values from 0.0 to 0.99.
# NOTE(review): the first candidate is {0: 0.0, 1: 1.0}, i.e. class 0 carries
# no weight at all -- confirm that this candidate is intended.
weights = np.linspace(0.0, 0.99, 200)

# One class_weight dictionary per candidate; class 1 gets the complement.
param_grid = {
    'class_weight': [{0: x, 1: 1.0 - x} for x in weights],
}

# Cross-validated search on the training split (default StratifiedKFold,
# i.e. 5 folds), ranking the candidates by ROC AUC.
cv_splitter = StratifiedKFold()
gridsearch = GridSearchCV(
    estimator=LogReg,
    param_grid=param_grid,
    cv=cv_splitter,
    n_jobs=-1,
    scoring="roc_auc",
    verbose=2,
).fit(X_train_arr, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


In [31]:
# Unpack the winning class-weight pair from the grid search.
best_class_weight = gridsearch.best_params_["class_weight"]
weight_0 = best_class_weight[0]
weight_1 = best_class_weight[1]

print(f'Best score: {gridsearch.best_score_} with param: {gridsearch.best_params_}')

Best score: 0.7133453386657517 with param: {'class_weight': {0: 0.024874371859296484, 1: 0.9751256281407035}}


In [32]:
# Refit on the training split with the class weights chosen by the grid search.
# NOTE(review): random_state is 10 here, while the estimator handed to the grid
# search was built with random_state=42 -- confirm the difference is intended.
LogReg_w = LogisticRegression(
    solver='liblinear',
    class_weight={0: weight_0, 1: weight_1},
    random_state=10,
)
LogReg_w.fit(X_train_arr, y_train)

# Hold-out predictions: hard labels and the second predict_proba column.
LR_pred_class_w = LogReg_w.predict(X_test_arr)
LR_pred_prob_w = LogReg_w.predict_proba(X_test_arr)[:, 1]

In [98]:
# Hold-out results of the class-weighted model, one row per observation.
prediction_w = pd.DataFrame({'prob': LR_pred_prob_w})
prediction_w['true'] = y_test.tolist()
prediction_w['pred'] = LR_pred_class_w

# Highest predicted probability first.
sorted_df_w = prediction_w.sort_values('prob', ascending=False)

# Split the ranking at position 250.
top_250_w = sorted_df_w.iloc[:250]
rest_w = sorted_df_w.iloc[250:]

y_prob_250_w = top_250_w['prob'].to_numpy()
y_true_250_w = top_250_w['true'].to_numpy()
y_prob_rest_w = rest_w['prob'].to_numpy()
y_true_rest_w = rest_w['true'].to_numpy()

# Top 250 -> predicted positive (1.0); everything else -> predicted negative (0.0).
y_pred_250_w = np.ones_like(y_true_250_w, dtype=float)
y_pred_rest_w = np.zeros_like(y_true_rest_w, dtype=float)

# Re-assemble labels and predictions in ranked order, then evaluate.
new_y_true_w = np.concatenate((y_true_250_w, y_true_rest_w))
new_y_pred_w = np.concatenate((y_pred_250_w, y_pred_rest_w))

print_confusion_matrix(new_y_true_w, new_y_pred_w)

confusion matrix:
[[15225   222]
 [  450    28]]

true positives:  28
false positives: 222
true negatives:  15225
false negatives: 450

