In [None]:
%matplotlib inline  
# To make data visualisations display in Jupyter Notebooks 
import numpy as np   # linear algebra
import pandas as pd  # Data processing, Input & Output load
import matplotlib.pyplot as plt # Visualization & plotting
import datetime
from sklearn.linear_model import LogisticRegression #  Logistic Regression (aka logit) classifier in linear model
import joblib  #Joblib is a set of tools to provide lightweight pipelining in Python (Avoid computing twice the same thing)

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
                                    # GridSearchCV - Implements a “fit” and a “score” method
                                    # train_test_split - Split arrays or matrices into random train and test subsets
                                    # cross_val_score - Evaluate a score by cross-validation
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score, make_scorer, accuracy_score, roc_curve, confusion_matrix, classification_report
                                    # Different metrics to evaluate the model
import pandas_profiling as pp   # simple and fast exploratory data analysis of a Pandas Datafram

import warnings   # To avoid warning messages in the code run
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
def plot_roc_curve(y_train_actual, train_pred_prob, y_test_actual, test_pred_prob, *args):
    '''
    Print the AUC of every data split and draw their ROC curves on one plot.

    The train and test splits are always reported. When extra positional
    arguments are given, the first two are used as the actual labels and the
    predicted scores of a validation split, which is reported and plotted too.
    '''
    with_validation = len(args) > 0

    # All AUC values are computed before anything is printed.
    train_auc = roc_auc_score(y_train_actual, train_pred_prob)
    test_auc = roc_auc_score(y_test_actual, test_pred_prob)
    if with_validation:
        valid_auc = roc_auc_score(args[0], args[1])

    print("Train AUC = ", train_auc)
    print("Test AUC = ", test_auc)
    if with_validation:
        print("Validation AUC = ", valid_auc)

    # (actual labels, predicted scores) pairs, in the order roc_plot expects them
    curve_inputs = [(y_train_actual, train_pred_prob), (y_test_actual, test_pred_prob)]
    if with_validation:
        curve_inputs.append((args[0], args[1]))

    # Collect fpr/tpr of each split as consecutive positional arguments;
    # the thresholds returned by roc_curve are not needed for plotting.
    coordinates = []
    for actual, scores in curve_inputs:
        false_pos_rate, true_pos_rate, _ = roc_curve(actual, scores)
        coordinates += [false_pos_rate, true_pos_rate]

    roc_plot(*coordinates)

In [None]:
def roc_plot(fpr, tpr, fpr_tst, tpr_tst, *args):
    '''
    Draw the train and test ROC curves, plus a validation curve when given.

    fpr / tpr and fpr_tst / tpr_tst are the false and true positive rates of
    the train and test splits. If extra positional arguments are passed, the
    first two are plotted as the validation curve.
    '''
    plt.plot(fpr, tpr, label='Train')
    plt.plot(fpr_tst, tpr_tst, label='Test')
    if len(args) != 0:
        plt.plot(args[0], args[1], label='Validation')

    # Axis limits, labels and legend are the same with or without validation
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.title("ROC curve using ")
    plt.xlabel('False Positive Rate (1 - Specificity)')
    plt.ylabel('True Positive Rate (Sensitivity)')
    plt.legend(loc='lower right')
    plt.grid(True)
    plt.show()

In [None]:
# Read-in the dataset
Insurance_Data = pd.read_csv('carInsurance_train.csv')
print('Train Data Shape - ', Insurance_Data.shape)
Insurance_Data.head()

In [None]:
# What type of values are stored in the columns?
Insurance_Data.info()

In [None]:
pp.ProfileReport(Insurance_Data)

In [None]:
# Let's look at some statistical information about our dataframe.
Insurance_Data.describe(include='all')

In [None]:
# Summary of the categorical (object-dtype) columns only.
# The alias `np.object` has been removed from NumPy (it raises AttributeError on
# current releases); the builtin `object` is the type it used to point to.
Insurance_Data.describe(include=[object])

In [None]:
Target = 'CarInsurance'
pd.crosstab(Insurance_Data[Target], columns='N', normalize=True)
# pd.crosstab(Insurance_Data[Target], columns='N')

In [None]:
# Ratio between the two class shares (0.599 / 0.401, roughly 1.5).
# Original note: count every row of class 1 as 2 rows of class 1 -- presumably the
# weight a class 1 row would need for the two classes to balance out.
# NOTE(review): both shares are hard-coded, presumably copied from the normalized
# crosstab of the target -- confirm they still match the data.
0.599/0.401

In [None]:
num_cols = Insurance_Data.select_dtypes(include=[np.number]).columns.tolist()
non_num_cols = Insurance_Data.select_dtypes(exclude=[np.number]).columns.tolist()

In [None]:
num_cols

In [None]:
non_num_cols

In [None]:
# Rebuild both column lists without the columns we will not use as features:
# Id and CarInsurance on the numeric side, CallStart and CallEnd on the other.
num_cols = (
    Insurance_Data
    .drop(columns=['Id', 'CarInsurance'])
    .select_dtypes(include=[np.number])
    .columns
    .tolist()
)
non_num_cols = (
    Insurance_Data
    .drop(columns=['CallStart', 'CallEnd'])
    .select_dtypes(exclude=[np.number])
    .columns
    .tolist()
)

In [None]:
print('Numeric Columns \n', num_cols)
print('Non-Numeric Columns \n', non_num_cols)

In [None]:
# Box plots of the numeric features, grouped by the target.
# CarLoan, HHInsurance and Default are left out because they are dummies.
num_cols_viz = ['DaysPassed', 'Age', 'NoOfContacts', 'PrevAttempts', 'LastContactDay', 'Balance']

# 3 x 2 grid of panels (one per feature), with independent axis scales
fig, axes = plt.subplots(nrows=3, ncols=2, sharex=False, sharey=False, figsize=(15, 15))
Insurance_Data[[Target] + num_cols_viz].boxplot(by=Target, ax=axes, return_type='axes')

In [None]:
# Stacked bar chart of the target counts for every categorical column,
# plus the CarLoan, HHInsurance and Default columns.
non_num_cols_viz = non_num_cols + ['CarLoan', 'HHInsurance', 'Default']
fig, axes = plt.subplots(len(non_num_cols_viz), sharex=False, sharey=False, figsize=(15, 50))

# One panel per column: rows are the column's categories, stacks are the target classes
for panel, column in zip(axes, non_num_cols_viz):
    target_counts = pd.crosstab(Insurance_Data[column], Insurance_Data[Target])
    target_counts.plot(kind='bar', stacked=True, grid=False, ax=panel, rot=0)

In [None]:
Insurance_Data.isnull().sum()

In [None]:
Insurance_Data_Org = Insurance_Data.copy()

In [None]:
Insurance_Data['Job'].value_counts(dropna=False)

In [None]:
Insurance_Data['Job'] = Insurance_Data['Job'].fillna('None')
Insurance_Data['Job'].isnull().sum()

In [None]:
Insurance_Data['Job'].value_counts()

In [None]:
# Fill missing education with the most common education level by job type

# Accumulator for the job -> most common education level mapping;
# it is filled by the loop over job_types in a later cell.
edu_mode=[]

# The distinct job types, taken from the index of the Job value counts
job_types = Insurance_Data.Job.value_counts().index
job_types

In [None]:
# Build a mapping from each job type to the most common education level for that job.
# For example, for the people in a management job it records which education level
# occurs most often among them.

# Start from a fresh list on every run, so that re-running this cell cannot append
# duplicate entries and break the length match with job_types.
edu_mode = []
for job in job_types:
    education_counts = Insurance_Data.loc[Insurance_Data['Job'] == job, 'Education'].value_counts()
    # value_counts() is sorted by frequency, so its first label is the mode.
    # A job without any recorded education gets NaN instead of silently
    # shortening the list (which would make the Series construction fail).
    edu_mode.append(education_counts.index[0] if len(education_counts) else np.nan)
edu_map = pd.Series(edu_mode, index=job_types, dtype=object)

edu_map

In [None]:
# Apply the mapping to the observations with missing education: each of them gets
# the most common education level of its job.
# Mapping the Job column through edu_map handles all rows at once and avoids the
# position-based `[0]` lookup on a label-indexed Series, which pandas has deprecated.
education_by_job = Insurance_Data['Job'].map(edu_map)

# Rows that still have no education after the mapping go into a new 'None' category.
# The result is assigned back to the column (instead of fillna(..., inplace=True) on
# the selected column) so that the update also takes effect under copy-on-write.
Insurance_Data['Education'] = Insurance_Data['Education'].fillna(education_by_job).fillna('None')

In [None]:
Insurance_Data.isnull().sum()

In [None]:
# Fill missing communication with none 
Insurance_Data['Communication'].value_counts(dropna=False)

In [None]:
Insurance_Data['Communication'] = Insurance_Data['Communication'].fillna('None')

In [None]:
# Check for missing value in Outcome
Insurance_Data['Outcome'].value_counts(dropna=False)

In [None]:
# Fill missing outcome as "not in previous campaign" by adding one category, 'NoPrev', to Outcome.
# The category is applied where DaysPassed is -1. Note that the assignment below sets
# Outcome for every such row, whether or not its Outcome was missing.

Insurance_Data.loc[Insurance_Data['DaysPassed']==-1,'Outcome']= 'NoPrev'
# Re-check the category counts (NaN included) to see what is still missing
Insurance_Data['Outcome'].value_counts(dropna=False)

In [None]:
# Check if we have any missing values left
Insurance_Data.isnull().sum()

In [None]:
Insurance_Data_num = Insurance_Data[num_cols+['Id', 'CarInsurance']]

In [None]:
# Categorical columns data
Insurance_Data_cat = Insurance_Data[non_num_cols]
non_num_cols

In [None]:
# Create dummies
Insurance_Data_cat_dummies = pd.get_dummies(Insurance_Data_cat)  #One-Hot Embedding
print(Insurance_Data_cat_dummies.shape)
Insurance_Data_cat_dummies.head()

In [None]:
Insurance_Data_final = pd.concat([Insurance_Data_num, Insurance_Data_cat_dummies], axis=1)
print(Insurance_Data_final.shape)
Insurance_Data_final.head()

In [None]:
# Checking if there are missing values before we run model
Insurance_Data_final.isnull().sum(axis = 0)

In [None]:
train_df = Insurance_Data_final.drop(['Id', 'CarInsurance'], axis=1) #X
train_label = Insurance_Data_final['CarInsurance'] #y

In [None]:
#random_state is the seed used by the random number generator. It can be any integer.
# Train test split
X_train, X_test, y_train, y_test = train_test_split(train_df, train_label, train_size=0.7 , stratify=train_label, random_state=100)

In [None]:
# Example: 10 rows of data, 7 rows class 0, 3 rows class 1

In [None]:
# With a purely random 70-30 split, train could end up with all 7 rows of class 0 and test with all 3 rows of class 1

In [None]:
# Stratify option will make sure that train has both the classes, and also test has both the classes in 70-30
# Guarantee that:
# train 7 rows(5 rows class 0, 2 rows class 1)
# test 3 rows (2 rows class 0, 1 row of class1)

In [None]:
print(y_train.shape[0]) # 2800, 1123 are 1s and the rest (2800-1123) 0s
print(np.sum(y_train))
print(y_test.shape[0]) # 1200, 481 are 1s and the rest (1200-481) 0s
print(np.sum(y_test))

In [None]:
print('Train shape - ', X_train.shape)
print('Test shape  - ', X_test.shape)

In [None]:
# Define Model parameters to tune
# This grid holds 1 penalty setting x 2 class_weight settings = 2 candidate models.
model_parameters = {
        # NOTE(review): newer scikit-learn releases spell "no penalty" as None and
        # reject the string 'none' -- confirm against the installed version.
        'penalty':['none'],
#         'penalty':['l1', 'l2', None],
        'class_weight': ['balanced', None]
    }

In [None]:
# Grid-search the parameters to find the best combination.
model = LogisticRegression(random_state=1)

gscv = GridSearchCV(estimator=model, 
                    param_grid=model_parameters, 
                    cv=5,  # 5-Fold Cross Validation
                    verbose=1, #To print what it is doing
                    n_jobs=-1, #fastest possible depending on the laptop
                    scoring='f1') # candidates are compared by F1 score; model_parameters defines 2 combinations (1 penalty x 2 class_weight), not 12

gscv.fit(X_train, y_train)

In [None]:
print('The best parameter are -', gscv.best_params_)

In [None]:
print(gscv.best_score_)
print(gscv.best_estimator_)
print(gscv.scorer_)

In [None]:
# Re-fit the model with the best parameters
final_mod = LogisticRegression(**gscv.best_params_, random_state=1)
final_mod.fit(X_train, y_train)

In [None]:
print('AUC on test by final_mod =', roc_auc_score(y_true=y_test,
                                                        y_score=final_mod.predict_proba(X_test)[:, 1]))

In [None]:
print('AUC on test by gscv =', roc_auc_score(y_true=y_test,
                                                        y_score=gscv.predict_proba(X_test)[:, 1]))

In [None]:
# View the model coefficients
list(zip(X_train.columns, final_mod.coef_[0]))

In [None]:
# Prediction
train_pred = final_mod.predict(X_train)
test_pred = final_mod.predict(X_test)

In [None]:
print('Classification report for train data is : \n',
      classification_report(y_train, train_pred))
print('Classification report for test data is : \n',
      classification_report(y_test, test_pred))

In [None]:
# Save the variables used in the model as it will be required in future for new datasets prediction
final_mod.variables = X_train.columns

In [None]:
joblib.dump(final_mod, 'best_model.joblib')

In [None]:
# Generate ROC
plt.subplots(figsize=(10, 5))
train_prob = final_mod.predict_proba(X_train)[:, 1]
test_prob = final_mod.predict_proba(X_test)[:, 1]

plot_roc_curve(y_train, train_prob,
               y_test, test_prob)

In [None]:
# One row per model feature with its fitted coefficient, largest coefficient first
coefs = pd.DataFrame({
    'feature': list(X_train.columns),
    'coefficient': list(final_mod.coef_.ravel()),
})
coefs = coefs.sort_values(by='coefficient', ascending=False)
coefs

In [None]:
# First 5 rows of coefs (the top of the coefficient ranking)
top_features = coefs.head(5).sort_values(by='coefficient', ascending=False)

# Last 5 rows of coefs (the bottom of the coefficient ranking)
bottom_features = coefs.tail(5).sort_values(by='coefficient', ascending=False)

# Horizontal bar chart of these 10 features
pd.concat([top_features, bottom_features]).plot.barh(x='feature', figsize=(10, 5))

In [None]:
# Load the saved model

best_model = joblib.load('best_model.joblib')

In [None]:
# Load the test data
Insurance_test = pd.read_csv('carInsurance_test.csv')
print('Test Data Shape  - ', Insurance_test.shape)
Insurance_test.head()

In [None]:
# Handle missing values on the test data
# The function takes the dataframe and does the same preprocessing that was done for train data

def handle_missing_values(df):
    '''
    Fill the missing values of `df` the same way as was done for the train data.

    Uses the notebook-level `edu_map` Series (job -> most common education
    level), which must already be defined.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Job, Education, Communication, Outcome and
        DaysPassed. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, so the caller can re-assign it.
    '''
    # Job: a missing job becomes its own 'None' category
    df['Job'] = df['Job'].fillna('None')

    # Education: a missing education gets the most common education level of the
    # row's job. Mapping Job through edu_map avoids the position-based `[0]` lookup
    # on a label-indexed Series, which pandas has deprecated. Jobs that have no
    # entry in edu_map are left unmapped and fall into the 'None' category.
    education_by_job = df['Job'].map(edu_map)
    df['Education'] = df['Education'].fillna(education_by_job).fillna('None')

    # Communication: a missing value becomes its own 'None' category
    df['Communication'] = df['Communication'].fillna('None')

    # Outcome: rows with DaysPassed == -1 are labelled 'NoPrev'
    df.loc[df['DaysPassed'] == -1, 'Outcome'] = 'NoPrev'

    return df

In [None]:
Insurance_test_Org = Insurance_test.copy()

In [None]:
# Handle the missing values the same we had done for Train
Insurance_test = handle_missing_values(Insurance_test)

In [None]:
Insurance_test.isnull().sum()

In [None]:
# Convert Categorical to dummies
dummy_cols = pd.get_dummies(Insurance_test[non_num_cols])
dummy_cols.head()

In [None]:
# Append the columns
new_data = pd.concat([Insurance_test[num_cols], dummy_cols], axis=1)
print(new_data.shape)
new_data.head()

In [None]:
# Check if all the variables of train are present in test
# Variables in model
best_model.variables

In [None]:
# Variables missing in test data. This happens sometimes because of some categories not present in the new data
vars_missing = list(set(best_model.variables) - set(new_data.columns))
vars_missing

In [None]:
# Create the missing columns in the dataset and fill them with 0
# The loop only adds columns when vars_missing is not empty
for i in vars_missing:
    new_data[i] = 0
    
print(new_data.shape)
new_data.head()

In [None]:
# Get the new dataset in the same order of the variables used in train.
# .copy() makes this an independent frame, so adding the prediction column to it
# later does not write into a slice of new_data.
new_data_final = new_data[best_model.variables].copy()
# Show the re-ordered frame (the cell used to display new_data instead)
new_data_final.head()

In [None]:
# Predict on the new data.
# Only the model's own variables are passed to predict(), so re-running this cell
# after 'Predicted' has been added does not feed the prediction back in as a feature.
# assign() returns a new frame instead of writing into a possible slice of new_data.
new_data_final = new_data_final.assign(
    Predicted=best_model.predict(new_data_final[best_model.variables])
)
new_data_final.head()

In [None]:
# Export the results
new_data_final.to_csv('Predicted.csv', index=False)