In [None]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, auc, f1_score, accuracy_score, precision_score, recall_score, roc_curve, precision_recall_curve 
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

# Load the modelling dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('data/model_data.csv')


# Print a summary of the loaded frame (columns, dtypes, non-null counts) as a sanity check.
data.info()


In [None]:
# Categorical columns that are expanded into one-hot indicator columns.
cat_vars=['job','marital','education','default','housing','loan','contact','month','day_of_week', 'contacted_before', 'poutcome']

# Encode all categorical columns in one call. Passing `columns=` forces encoding of
# every listed column regardless of dtype, drops the original columns, and appends
# the indicators (named "<column>_<value>") after the remaining columns in the order
# given by `cat_vars`.
# `data` itself is left untouched, so this cell can be re-run safely; the previous
# loop re-joined the dummies onto `data` and failed on a second execution because
# the indicator columns already existed.
data_final = pd.get_dummies(data, columns=cat_vars)

data_final.info()

In [None]:
#initial cols
#cols=['previous', 'emp.var.rate', 'euribor3m', 'job_blue-collar', 'job_entrepreneur', 'job_management', 'job_retired', 'job_services', 'job_student', 'education_basic.9y', 'default_unknown', 'contact_cellular', 'contact_telephone', 'month_dec', 'month_jul', 'month_jun', 'month_may', 'month_nov', 'day_of_week_fri', 'day_of_week_mon']

#adjusted with p values cols
#cols=['emp.var.rate', 'cons.price.idx', 'euribor3m', 'nr.employed', 'job_retired', 'job_student', 'default_no', 'contact_cellular', 'month_apr', 'month_dec', 'month_jul', 'month_jun', 'month_mar', 'month_may', 'month_oct', 'day_of_week_mon', 'contacted_before_0', 'contacted_before_1', 'poutcome_failure', 'poutcome_success']
#X=data_X[cols]
#y=data_y['subscribed']

In [None]:
def filter_strings(string_arr, boolean_arr):
    """Keep the entries of ``string_arr`` whose positional flag in ``boolean_arr`` is truthy.

    Entries are matched by index and returned, in their original order, as a
    new list. ``boolean_arr`` must be at least as long as ``string_arr``;
    otherwise indexing it raises ``IndexError``.
    """
    return [string_arr[pos] for pos in range(len(string_arr)) if boolean_arr[pos]]


# Select the entries of `X` flagged by the selector's boolean support mask and print them.
# NOTE(review): neither `X` nor `rfe` is assigned in any earlier cell of the visible
# notebook, so this cell raises NameError on Restart & Run All. Presumably `rfe` was a
# fitted feature selector created in a cell that has since been removed -- confirm and
# restore it (or delete this cell).
filtered_strings = filter_strings(X, rfe.support_.tolist())
print(filtered_strings)

In [None]:
# Initial feature subset used to build X.
cols=['euribor3m', 'marital_divorced', 'marital_married', 'marital_single', 'education_basic.4y', 'education_basic.6y', 'education_basic.9y', 'education_high.school', 'education_professional.course', 'education_university.degree', 'education_unknown', 'housing_no', 'housing_yes', 'loan_no', 'loan_yes', 'day_of_week_fri', 'day_of_week_mon', 'day_of_week_thu', 'day_of_week_tue', 'day_of_week_wed']
# Alternative subset, described by the author as adjusted using p-values (kept for reference, disabled).
#cols=['previous', 'emp.var.rate', 'euribor3m', 'job_blue-collar','job_retired', 'job_services', 'job_student', 'default_unknown', 'contact_cellular', 'contact_telephone', 'month_jul', 'month_jun', 'month_may','day_of_week_fri', 'day_of_week_mon']
# NOTE(review): `data_X` and `data_y` are not defined anywhere in the visible notebook,
# so these two lines raise NameError on a fresh kernel. Presumably they were a
# feature/target split of `data_final` from a removed cell -- confirm.
# The final cell rebuilds X from `data_final` and passes the target directly, so the
# X and y assigned here are not what the model is trained on.
X=data_X[cols]
y=data_y['subscribed']

In [None]:
######################
# Rebinds `cols` to a second feature subset (the same list that is commented out in
# the previous cell).
# NOTE(review): no later cell in the visible notebook reads `cols`; the final cell
# trains on every column of `data_final` except the target. Either apply this subset
# there or remove this cell -- confirm which was intended.
cols=['previous', 'emp.var.rate', 'euribor3m', 'job_blue-collar','job_retired', 'job_services', 'job_student', 'default_unknown', 'contact_cellular', 'contact_telephone', 'month_jul', 'month_jun', 'month_may','day_of_week_fri', 'day_of_week_mon']


In [None]:
def LogRegOptimizer(X, y, random_state=1):
    """Tune a logistic regression with randomized search and report hold-out metrics.

    Steps:
      1. Split ``X``/``y`` 75/25 into train and test sets.
      2. Oversample the training split with SMOTE.
      3. Run ``RandomizedSearchCV`` (10-fold, scored by ROC AUC) over ``C``,
         ``penalty`` and ``solver``.
      4. Evaluate the refit best model on the test split: print the predicted
         probabilities, confusion matrix, classification report and scalar
         metrics, then draw the ROC curve.

    Parameters
    ----------
    X : feature table accepted by scikit-learn estimators (e.g. a DataFrame).
    y : target aligned with ``X``. The precision/recall/F1 calls use their
        default binary averaging, so a two-class target is expected.
    random_state : int, default 1
        Seed used for the train/test split, SMOTE and the randomized search so
        that repeated runs give the same result. The default matches the seed
        the split has always used.

    Returns
    -------
    None
        All results are printed or plotted.
    """
    print('**We will optimize the hyper-parameters of a Logistic Regression model using Randomized Search**\n')

    #function to help us display metrics in a percentage format
    def percentage(x):
        x = round(x*100,2)
        return (str(x) + "%")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=random_state)

    # Oversample the training split only; the test split keeps its natural class balance.
    # NOTE(review): resampling happens before cross-validation, so synthetic points
    # built from a sample can land in a different CV fold than that sample. Wrapping
    # SMOTE and the model in an imblearn Pipeline would avoid that, at the cost of
    # renaming the keys reported in best_params_.
    over = SMOTE(random_state=random_state)
    X_train, y_train = over.fit_resample(X_train, y_train)

    # Only penalty/solver pairs that LogisticRegression accepts are listed:
    # newton-cg, lbfgs and sag support l2 only; l1 needs liblinear or saga;
    # elasticnet needs saga plus an explicit l1_ratio. Sampling from the full cross
    # product spent search iterations on candidates whose fit always fails.
    c_values = [0.001,.009,0.01,.09,1,5,10,25]
    grid_params = [
        {'C': c_values, 'penalty': ["l2"], 'solver': ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]},
        {'C': c_values, 'penalty': ["l1"], 'solver': ["liblinear", "saga"]},
        {'C': c_values, 'penalty': ["elasticnet"], 'solver': ["saga"], 'l1_ratio': [0.25, 0.5, 0.75]},
    ]

    logreg = LogisticRegression()
    # Randomized search as opposed to grid search, to improve run time.
    logreg_cv = RandomizedSearchCV(logreg, grid_params, cv = 10, verbose = True, n_jobs = -1,
                                   scoring = "roc_auc", random_state = random_state)
    logreg_cv.fit(X_train, y_train)

    y_pred = logreg_cv.predict(X_test)
    # One column per class, ordered as in best_estimator_.classes_.
    proba = logreg_cv.best_estimator_.predict_proba(X_test)
    # Ranking metrics need a continuous score: use the probability of the class with
    # the greater label. Hard 0/1 predictions collapse the ROC curve to one threshold.
    y_score = proba[:, 1]

    print("----------------------------------- Predict proba-----------------------------------")

    # Column 0 is the probability of the first entry of classes_.
    print(proba[:,0])
    print("----------------------------------- Confusion Matrix-----------------------------------")
    print(confusion_matrix(y_test, y_pred))

    print("--------------------------------- Classification Report---------------------------------")
    print(classification_report(y_test, y_pred))

    roc_auc = roc_auc_score(y_test, y_score)

    print("-----------------------------------------Metrics----------------------------------------\n")
    print("tuned hyperparameters :(best parameters) ",logreg_cv.best_params_)
    print("ROC AUC SCORE:" + str(roc_auc))
    print("Gini (Somer's D) coefficient:" + str((roc_auc*2-1)))
    print('Accuracy Score : ' + percentage(accuracy_score(y_test,y_pred)))
    print('Precision Score : ' + percentage(precision_score(y_test,y_pred)))
    print('Recall Score : ' + percentage(recall_score(y_test,y_pred)))
    print('F1 Score : ' + percentage(f1_score(y_test,y_pred)))

    # get the values required to plot a ROC curve (one point per score threshold)
    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    # NOTE(review): `plt` is not imported in the visible imports cell; this needs
    # `import matplotlib.pyplot as plt` there, otherwise the lines below raise NameError.
    # plot the ROC curve
    plt.plot(fpr, tpr)
    # plot the diagonal, i.e. the ROC curve of a random classifier
    plt.plot([0, 1], [0, 1], linestyle = '--', color = 'k')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')

In [None]:
# Features are every encoded column except the target, which is passed separately.
X = data_final.drop(columns=['subscribed'])

LogRegOptimizer(X=X, y=data_final['subscribed'])