# 1. Import Data

In [None]:
from pyspark.context import SparkContext
from pyspark.sql import HiveContext, SparkSession


# Creating a SparkContext is a must: every other Spark handle below is built on it.
# NOTE(review): "<app_name" looks like an unfilled placeholder (the closing ">" is missing) — confirm the intended application name.
sc = SparkContext(appName="<app_name")
# Optional creation of a HiveContext wrapped around the same SparkContext.
# NOTE(review): `sql_context` is not referenced again in the visible part of the notebook.
sql_context = HiveContext(sc)
# Optional creation of a SparkSession directly from the context.
# This binding is replaced by the assignment on the next line.
spark = SparkSession(sc)
# Final `spark` handle for this cell: a builder-based session with Hive support requested.
spark = (SparkSession.builder.enableHiveSupport().getOrCreate())

In [None]:
# Restart Spark with larger driver-side limits before the `toPandas()` calls below
# collect whole tables onto the driver.
spark.sparkContext.stop()
# Hive support is requested explicitly: the following cells query Hive tables, and the
# rebuilt session should not depend on options left over from the previous builder call.
spark = (
    SparkSession.builder
    .config('spark.kryoserializer.buffer.max', '1g')
    .config('spark.driver.maxResultSize', '25g')
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.offline as py#visualization
py.init_notebook_mode(connected=True)#visualization
import plotly.graph_objs as go#visualization
import plotly.tools as tls#visualization
import plotly.figure_factory as ff#visualization
import seaborn as sns

In [None]:
# Load the five partitions of the TMI table (suffixes a-e) one at a time and stack
# them into a single pandas frame.
TMI_DATABASE = "anp_catdidai1_sandbox"
TMI_TABLE_SUFFIXES = ["a", "b", "c", "d", "e"]

context = spark.sql("use " + TMI_DATABASE)

tmi_parts = []
for table_suffix in TMI_TABLE_SUFFIXES:
    spark_df = spark.sql(
        "select * from {}.tmi_data_all_in_{}".format(TMI_DATABASE, table_suffix)
    )
    # toPandas() collects the whole partition onto the driver.
    tmi_parts.append(spark_df.toPandas())

# Keep the per-partition names the notebook used before, in case other cells read them.
df_1, df_2, df_3, df_4, df_5 = tmi_parts

# ignore_index=True gives the combined frame unique row labels. Without it every
# partition restarts at 0, and the duplicated labels break index-aligned operations
# (merge/join on index) later in the notebook.
tmi_data = pd.concat(tmi_parts, ignore_index=True)

# 1.1 Data Overview & Manipulation

In [None]:
# Order the rows by customer id, then preview the first five.
tmi_data = tmi_data.sort_values("cust_id")
tmi_data.head(5)

In [None]:
tmi_data.info()

# 1.2.1 Data Manipulation - Converting variables into the right type

In [None]:
# Amount columns are cast to float. The column names are assembled from their
# period suffixes instead of being typed out one by one.
_amount_periods = (
    ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
     'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    + ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
    + ['spring', 'summer', 'fall', 'winter']
    + ['xmas']
    + ['last_3mo', 'last_6mo', 'last_12mo', 'last_24mo']
    + ['in_3yr']
)
convert_cols = ['cust_mnyin_prtbal_am'] + ['sum_tran_am_' + period for period in _amount_periods]

for amount_col in convert_cols:
    tmi_data[amount_col] = tmi_data[amount_col].astype(float)

# 1.2.2 Data Manipulation - Missing value imputation

In [None]:
## Null Counting Function
def null_values(df):
    """Summarise missing values per column of ``df``.

    Returns a DataFrame indexed by column name with three columns:
    ``Total`` (number of rows), ``Null_Count`` (number of null entries) and
    ``Percent`` (null share in percent, rounded to 2 decimals). Rows are sorted
    by ``Null_Count`` in descending order and columns without any null are omitted.
    """
    missing_mask = df.isnull()
    report = pd.DataFrame(
        {'Total': missing_mask.count(), 'Null_Count': missing_mask.sum()},
        columns=['Total', 'Null_Count'],
    )
    report['Percent'] = (100 * report['Null_Count'] / report['Total']).round(2)
    report = report.sort_values(by='Null_Count', ascending=False)
    return report[report['Null_Count'] > 0]

In [None]:
#Find missing value percentage (before imputation)
null_values(tmi_data)

We divided the variables that contains NA into three major groups.

Group 1 -- demographics -- such as age, tenure, beacon score, etc. Instead of deleting all NA entries (which may cause information loss), I have categorized NAs as 'Unknown'. For example, if the age is NA, the customer is placed in the 'Unknown' Age_Group. We keep the original data to calculate the statistical summary in a later step. 

Group 2 -- transactional (counts and amounts) -- such as num_trav_tran_jan, sum_tran_am_jan, etc. Most of the NAs come from customers who have never used TD to book travel in the past three years. Logically speaking, if a customer has never booked travel with TD, the number of travel transactions in January should be zero. Therefore, I imputed all the NAs that fall into this group as 0. 

Group 3 -- transactional (recency) -- such as num_day_since_last_book, avg_day_between_travel, etc. These variables measure recency: for example, num_day_since_last_book is the number of days since the last travel booking, and avg_day_between_travel is the average time interval between travel bookings. The NAs come from customers who have never booked travel with TD, who booked only once with TD (so the average cannot be calculated), or who booked fewer than three times. Therefore, I imputed all of these NAs as 1096 (365 * 3 + 1 = 1096) to represent that the last booking was more than three years ago, i.e. a very long time ago. 

Please note: this imputation method makes sense logically; however, with this imputation our dataset is heavily skewed. 

In [None]:
#NA imputation based on assumptions above

# Period suffixes shared by the transaction-count and transaction-amount columns.
_tran_periods = (
    ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
     'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    + ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday']
    + ['spring', 'summer', 'fall', 'winter']
    + ['xmas']
    + ['last_3mo', 'last_6mo', 'last_12mo', 'last_24mo']
    + ['in_3yr']
)

# Group 2: counts/amounts -> a missing value means "no travel transaction", i.e. 0.
zero_fill_cols = [prefix + period
                  for prefix in ('num_trav_tran_', 'sum_tran_am_')
                  for period in _tran_periods]

# Group 3: recency -> a missing value is replaced by "one day more than three years".
NEVER_BOOKED_DAYS = 365 * 3 + 1  # 1096
recency_cols = ['num_day_since_last_book', 'num_day_since_2nd_last_book',
                'num_day_since_3rd_last_book', 'avg_day_between_travel']

# Fill column by column (same pattern as the float-conversion loop above) rather than
# round-tripping a filled copy of the sub-frame through DataFrame.update.
for col in zero_fill_cols:
    tmi_data[col] = tmi_data[col].fillna(0)

for col in recency_cols:
    tmi_data[col] = tmi_data[col].fillna(NEVER_BOOKED_DAYS)


In [None]:
# Remove the raw columns that still contain NAs; per the notebook's approach they are
# represented by binned buckets instead (eg. age --> age_group).
to_drop = [
    'id',
    'beacon_sc',
    'cust_mnyin_prtbal_am',
    'customer_age',
    'total_hhold_member_ct',
    'customer_tenure',
    'num_td_rewards_visa_card',
]

tmi_data_copy = tmi_data.drop(to_drop, axis=1)

In [None]:
# For now, we have imputed all the NAs. 
print ("\nMissing values :  ", tmi_data_copy.isnull().sum().values.sum())

# 2. Exploratory Data Analysis

This section is under construction.

# 2.1 Travel Booking in Data

In [None]:
# Split customers on the target flag: 1 = travel booking, 0 = no travel booking.
target_flag = tmi_data_copy["target_istravel"]
trav_book_next3mo = tmi_data_copy.loc[target_flag == 1]
no_trav_book_next3mo = tmi_data_copy.loc[target_flag == 0]

In [None]:
# Column roles used by the exploratory cells below.
Id_col     = ['cust_id']
target_col = ['target_istravel']
# NOTE(review): 'customer_tenure' and 'total_hhold_member_ct' are in the `to_drop` list that was
# removed when tmi_data_copy was created, so they cannot appear in tmi_data_copy.columns — confirm
# which column names are intended here.
cat_cols   = ['is_staff', 'gender', 'age_group', 'province', 'customer_tenure', 'ins_life_health_solict_in',
              'beacon_credit_score', 'total_hhold_member_ct', 'income_group']
# Every remaining column (not categorical, not the target, not the id) is treated as numeric.
num_cols   = [x for x in tmi_data_copy.columns if x not in cat_cols + target_col + Id_col]

In [None]:
# Count each target value once and reuse the result for both lists.
target_counts = tmi_data_copy["target_istravel"].value_counts()
#labels
lab = target_counts.index.tolist()
#values
val = target_counts.tolist()

In [None]:
# Donut chart of the target distribution, using the `lab`/`val` lists built from value_counts().
trace = go.Pie(labels = lab ,
               values = val ,
               marker = dict(colors =  [ 'royalblue' ,'lime'],
                             line = dict(color = "white",
                                         width =  1.3)
                            ),
               rotation = 90,
               hoverinfo = "label+value+text",
               hole = .5
              )
# Title and background colours for the figure.
layout = go.Layout(dict(title = "Customer Travel Booking in Next 3 Months",
                        plot_bgcolor  = "rgb(243,243,243)",
                        paper_bgcolor = "rgb(243,243,243)",
                       )
                  )

# Assemble the figure and render it inline in the notebook.
data = [trace]
fig = go.Figure(data = data,layout = layout)
py.iplot(fig)

# 2.2 Variable Distribution in Travel Booking

In [None]:
def plot_pie(column) :
    """Render two side-by-side donut charts of ``column``.

    The left donut shows the value distribution of ``column`` among rows of
    ``trav_book_next3mo``; the right donut shows it among rows of
    ``no_trav_book_next3mo``. The figure is displayed inline via ``py.iplot``.
    """
    background = "rgb(243,243,243)"

    def _donut(frame, trace_name, x_domain):
        # One value_counts() call feeds both the slice labels and the slice sizes.
        counts = frame[column].value_counts()
        return go.Pie(values    = counts.values.tolist(),
                      labels    = counts.keys().tolist(),
                      hoverinfo = "label+percent+name",
                      domain    = dict(x = x_domain),
                      name      = trace_name,
                      marker    = dict(line = dict(width = 2, color = background)),
                      hole      = .6)

    def _caption(text, x_pos):
        # Text annotation placed at mid-height of the figure.
        return dict(text = text,
                    font = dict(size = 13),
                    showarrow = False,
                    x = x_pos, y = .5)

    booked_trace     = _donut(trav_book_next3mo, "Travel Booking Customers", [0,.48])
    not_booked_trace = _donut(no_trav_book_next3mo, "No Travel Booking Customers", [.52,1])

    layout = go.Layout(dict(title = column + " distribution in travel booking ",
                            plot_bgcolor  = background,
                            paper_bgcolor = background,
                            annotations = [_caption("Travel Booking Customers", .15),
                                           _caption("No Travel Booking Customers", .88)]
                           )
                      )
    fig = go.Figure(data = [booked_trace, not_booked_trace], layout = layout)
    py.iplot(fig)


#function  for histogram for travel booking 
def histogram(column) :
    """Render percent-normalised histograms of ``column`` for both customer groups.

    One trace is drawn from ``trav_book_next3mo`` and one from
    ``no_trav_book_next3mo``; the figure is displayed inline via ``py.iplot``.
    """
    def _percent_hist(frame, trace_name):
        # Histogram trace normalised to percent, with a thin black bar outline.
        return go.Histogram(x        = frame[column],
                            histnorm = "percent",
                            name     = trace_name,
                            marker   = dict(line = dict(width = .5, color = "black")),
                            opacity  = .9)

    def _axis(axis_title):
        # Shared axis styling; only the title differs between x and y.
        return dict(gridcolor = 'rgb(255, 255, 255)',
                    title = axis_title,
                    zerolinewidth = 1,
                    ticklen = 5,
                    gridwidth = 2)

    traces = [_percent_hist(trav_book_next3mo, "Travel Booking Customers"),
              _percent_hist(no_trav_book_next3mo, "No Travel Booking Customers")]

    layout = go.Layout(dict(title = column + " distribution in travel booking ",
                            plot_bgcolor  = "rgb(243,243,243)",
                            paper_bgcolor = "rgb(243,243,243)",
                            xaxis = _axis(column),
                            yaxis = _axis("percent"),
                           )
                      )
    fig = go.Figure(data = traces, layout = layout)

    py.iplot(fig)

In [None]:
# One pair of donut charts per categorical column.
# NOTE(review): 'total_hhold_member_ct' is in the `to_drop` list removed when tmi_data_copy was
# created, and the frames plot_pie reads are slices of tmi_data_copy — confirm the column name.
cat_cols_1 = [
    'is_staff', 'gender', 'province', 'ins_life_health_solict_in',
    'beacon_credit_score', 'total_hhold_member_ct', 'income_group',
]
for cat_col in cat_cols_1:
    plot_pie(cat_col)

In [None]:
#customer travel booking in age groups
# Count rows per age_group separately for customers who booked and who did not.
ag_tb  =  trav_book_next3mo["age_group"].value_counts().reset_index()
ag_tb.columns  = ["age_group","count"]
ag_ntb =  no_trav_book_next3mo["age_group"].value_counts().reset_index()
ag_ntb.columns = ["age_group","count"]

#bar - travel booking
trace1 = go.Bar(x = ag_tb["age_group"]  , y = ag_tb["count"],
                name = "Travel Booking Customers",
                marker = dict(line = dict(width = .5,color = "black")),
                opacity = .9)

#bar - no travel booking
trace2 = go.Bar(x = ag_ntb["age_group"] , y = ag_ntb["count"],
                name = "No Travel Booking Customers",
                marker = dict(line = dict(width = .5,color = "black")),
                opacity = .9)

# Titles, background colours and axis styling.
layout = go.Layout(dict(title = "Travel Booking in Age Groups",
                        plot_bgcolor  = "rgb(243,243,243)",
                        paper_bgcolor = "rgb(243,243,243)",
                        xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = "Age Group",
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                        yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = "Count",
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                       )
                  )
# Both bar traces go into one figure, rendered inline.
data = [trace1,trace2]
fig  = go.Figure(data=data,layout=layout)
py.iplot(fig)

In [None]:
#customer travel booking in tenure groups
# NOTE(review): 'customer_tenure' is in the `to_drop` list removed when tmi_data_copy was created,
# and both frames used here are slices of tmi_data_copy — confirm the intended column name.
# Count rows per customer_tenure value separately for customers who booked and who did not.
ct_tb  =  trav_book_next3mo["customer_tenure"].value_counts().reset_index()
ct_tb.columns  = ["customer_tenure","count"]
ct_ntb =  no_trav_book_next3mo["customer_tenure"].value_counts().reset_index()
ct_ntb.columns = ["customer_tenure","count"]

#bar - travel booking
trace1 = go.Bar(x = ct_tb["customer_tenure"]  , y = ct_tb["count"],
                name = "Travel Booking Customers",
                marker = dict(line = dict(width = .5,color = "black")),
                opacity = .9)

#bar - no travel booking
trace2 = go.Bar(x = ct_ntb["customer_tenure"] , y = ct_ntb["count"],
                name = "No Travel Booking Customers",
                marker = dict(line = dict(width = .5,color = "black")),
                opacity = .9)

# Titles, background colours and axis styling.
layout = go.Layout(dict(title = "Travel Booking in Customer Tenure",
                        plot_bgcolor  = "rgb(243,243,243)",
                        paper_bgcolor = "rgb(243,243,243)",
                        xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = "Customer Tenure Group",
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                        yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = "Count",
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                       )
                  )
# Both bar traces go into one figure, rendered inline.
data = [trace1,trace2]
fig  = go.Figure(data=data,layout=layout)
py.iplot(fig)

In [None]:
# Numeric columns to plot as histograms (one figure per column).
num_cols_1 = [
    'num_acct',
    'num_money_builder_account',
    'num_investment_builder_account',
    'num_selectline_line',
    'num_td_rsp_plan',
    'num_usd_daily_interest_chequing_account',
    'num_td_first_class_travel_visa_infinite',
    'num_td_aeroplan_visa_infinite',
    'num_td_cash_back_visa',
    'num_td_cash_back_visa_infinite',
    'num_td_business_travel',
    'num_td_emerald_flex_rate_visa',
]

# Columns currently left out of the histogram loop:
              #'num_day_since_last_book',
              #'num_day_since_2nd_last_book', 'num_day_since_3rd_last_book', 'avg_day_between_travel', 'num_trav_in_3yr', 
              #'sum_trav_tran_am_in_3yr', 'num_trav_in_last_3mo', 'sum_trav_tran_am_in_last_3mo', 'num_trav_in_last_6mo', 
              #'sum_trav_tran_am_in_last_6mo', 'num_trav_in_last_12mo', 'sum_trav_tran_am_in_last_12mo', 'num_trav_in_last_24mo',
              #'sum_trav_tran_am_in_last_24mo', 'num_trav_around_xmas', 'sum_trav_tran_am_around_xmas', 'num_trav_in_spring', 
              #'sum_trav_tran_am_in_spring', 'num_trav_in_summer', 'sum_trav_tran_am_in_summer', 'num_trav_in_fall', 
              #'sum_trav_tran_am_in_fall', 'num_trav_in_winter', 'sum_trav_tran_am_in_winter', 'target_istravel']
for numeric_col in num_cols_1:
    histogram(numeric_col)

# 3. Statistical Summary

In [None]:
# Descriptive statistics for every non-identifier column of the raw frame.
# The cell previously referenced `Id2_col`, which is never defined in the notebook and
# raised NameError on a fresh run. The second identifier is taken to be 'id' (the row id
# listed in `to_drop`).
# NOTE(review): confirm that 'id' is the column `Id2_col` was meant to hold.
identifier_cols = set(Id_col) | {'id'}
summary = (tmi_data[[i for i in tmi_data.columns if i not in identifier_cols]].
           describe().transpose().reset_index())

summary = summary.rename(columns = {"index" : "feature"})
summary = np.around(summary,3)

# One list entry per table column, in display order.
val_lst = [summary['feature'], summary['count'],
           summary['mean'],summary['std'],
           summary['min'], summary['25%'],
           summary['50%'], summary['75%'], summary['max']]

trace  = go.Table(header = dict(values = summary.columns.tolist(),
                                line = dict(color = ['#506784']),
                                fill = dict(color = ['#119DFF']),
                               ),
                  cells  = dict(values = val_lst,
                                line = dict(color = ['#506784']),
                                fill = dict(color = ["lightgrey",'#F5F8FF'])
                               ),
                  columnwidth = [200,60,100,100,60,60,80,80,80])
layout = go.Layout(dict(title = "Variable Summary"))
figure = go.Figure(data=[trace],layout=layout)
py.iplot(figure)

# 4. Data Preprocessing

In [None]:
# Column roles for preprocessing. These assignments rebind the lists of the same names
# defined in the exploratory section.
#customer id col
Id_col     = ['cust_id']


#target variable col
target_col = ['target_istravel']
#categorical columns
# NOTE(review): several names differ from the exploratory list (e.g. 'gender_ds' here vs
# 'gender' there, 'tenure_group' vs 'customer_tenure') — confirm which names exist in tmi_data_copy.
cat_cols   = ['is_staff', 'gender_ds', 'age_group', 'contry_region_ds',
              'tenure_group', 'ins_life_health_solict_in', 'beacon_score_group', 'family_size_group', 'income_group']
    
#numerical columns
# Everything that is not categorical, the target, or the id.
num_cols   = [x for x in tmi_data_copy.columns if x not in cat_cols + target_col + Id_col]

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


#Binary columns with 2 values
bin_cols   = ['is_staff', 'ins_life_health_solict_in']
#Columns more than 2 values
multi_cols = ['gender_ds','age_group', 'contry_region_ds', 'tenure_group','beacon_score_group', 'family_size_group', 'income_group']

#Label encoding Binary columns
le = LabelEncoder()
for i in bin_cols :
    tmi_data_copy[i] = le.fit_transform(tmi_data_copy[i])
    
#One-hot encode the multi value columns (each is replaced by one indicator column per value)
tmi_data_copy = pd.get_dummies(data = tmi_data_copy,columns = multi_cols)

#Scaling Numerical columns
std = StandardScaler()
# The scaled frame must carry the SAME row labels as tmi_data_copy. The scaler returns a
# bare array, and wrapping it without `index=` creates labels 0..n-1, which do not match
# the sorted frame, so an index-based merge attaches scaled values to the wrong rows.
scaled = pd.DataFrame(std.fit_transform(tmi_data_copy[num_cols]),
                      columns=num_cols,
                      index=tmi_data_copy.index)

#keep an unscaled copy, then swap the numerical columns for their scaled versions
df_tmi_data_copy_og = tmi_data_copy.copy()
# Both pieces share an identical index, so the column-wise concat is a positional
# side-by-side join; scaled columns end up last, as before.
tmi_data_copy = pd.concat([tmi_data_copy.drop(columns = num_cols), scaled], axis = 1)

In [None]:
tmi_data_copy.head(10)

# 5. Correlation Analysis

In [None]:
#correlation
# Pairwise correlation of the columns of the preprocessed frame.
correlation = tmi_data_copy.corr()
#tick labels
matrix_cols = correlation.columns.tolist()
#convert to array
corr_array  = np.array(correlation)

#Plotting
# Heatmap with the same column list on both axes.
trace = go.Heatmap(z = corr_array,
                   x = matrix_cols,
                   y = matrix_cols,
                   colorscale = "Viridis",
                   colorbar   = dict(title = "Pearson Correlation coefficient",
                                     titleside = "right"
                                    ) ,
                  )

# Fixed-size figure with wide left/bottom margins and small tick fonts for the column names.
layout = go.Layout(dict(title = "Correlation Matrix for variables",
                        autosize = False,
                        height  = 720,
                        width   = 800,
                        margin  = dict(r = 0 ,l = 210,
                                       t = 25,b = 210,
                                      ),
                        yaxis   = dict(tickfont = dict(size = 9)),
                        xaxis   = dict(tickfont = dict(size = 9))
                       )
                  )

data = [trace]
fig = go.Figure(data=data,layout=layout)
py.iplot(fig)

In [None]:
# Threshold for removing correlated variables
threshold = 0.7

# Absolute value correlation matrix.
# Computed on the modelling frame (tmi_data_copy), because the resulting list is dropped
# from that frame; the id and the target are excluded so they can never be selected.
feature_cols = [c for c in tmi_data_copy.columns if c not in Id_col + target_col]
corr_matrix = (tmi_data_copy[feature_cols]
               .select_dtypes(include=['number', 'bool'])
               .corr()
               .abs())

# Upper triangle of correlations (k=1 leaves out the diagonal), so each pair is seen once.
# The builtin `bool` replaces the `np.bool` alias, which recent NumPy releases no longer provide.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Select columns with correlations above threshold
to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]

print('There are %d columns to remove.' % (len(to_drop)))
to_drop

In [None]:
to_drop

In [None]:
tmi_data_copy=tmi_data_copy.drop(columns=to_drop)

# 6. Model Building

In [None]:
#Import libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from imblearn.over_sampling import SMOTE
from imblearn.metrics import classification_report_imbalanced
import xgboost as xgb
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

This is a regular train_test_split, the ratio of target variable 0s:1s approx. = 6.4 : 1 (86.5% vs. 13.5%) 

In [None]:
from sklearn.model_selection import train_test_split
# 70/30 split of the preprocessed frame with a fixed seed.
train, test = train_test_split(tmi_data_copy, test_size=.30, random_state=123)

# Features are all columns except the id and the target.
non_feature_cols = Id_col + target_col
cols = [name for name in tmi_data_copy.columns if name not in non_feature_cols]
train_X, train_Y = train[cols], train[target_col]
test_X, test_Y = test[cols], test[target_col]

To address the class imbalance, I use random undersampling to undersample the majority class (0s), reducing the dataset from 12 million records down to roughly 3.2 million records. The new 0s:1s ratio is 1:1.

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

id_col = ['cust_id']
target_col = ['target_istravel']
cols    = [i for i in tmi_data_copy.columns if i not in target_col + id_col]

# Undersample ONLY the training split created by the regular train/test split above.
# The models below are trained on rus_train_X and scored on test_X; resampling the full
# frame would put hold-out rows into the training data and inflate the reported metrics.
rus_X = train[cols]
rus_Y = train[target_col]

rus = RandomUnderSampler(random_state=0)
# `fit_sample` was renamed to `fit_resample` and later removed from imbalanced-learn;
# use the new name when available and fall back to the old one on legacy installs.
_resample = getattr(rus, "fit_resample", None) or rus.fit_sample
us_rus_X,us_rus_Y = _resample(rus_X,rus_Y.values.ravel())
print('Resampled dataset shape %s' % Counter(us_rus_Y))

# Wrap the resampled data in DataFrames with the original column names.
us_rus_X = pd.DataFrame(data = us_rus_X,columns=cols)
us_rus_Y = pd.DataFrame(data = us_rus_Y,columns=target_col)



#Split train and test data
rus_train_X,rus_test_X,rus_train_Y,rus_test_Y = train_test_split(us_rus_X,us_rus_Y,
                                                                         test_size = .30 ,
                                                                         random_state = 123)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,roc_curve,auc

def _plot_feature_weights(weights, feature_names):
    """Print and bar-plot one weight per feature, largest first.

    ``weights`` is a 1-D sequence aligned with ``feature_names``. Returns the
    sorted two-column table (``features``, ``coefficients``).
    """
    dataframe = pd.DataFrame(weights,feature_names).reset_index()
    dataframe = dataframe.rename(columns={"index":"features",0:"coefficients"})
    dataframe = dataframe.sort_values(by="coefficients",ascending = False)
    plt.figure(figsize=(50,80))
    plt.subplot(223)
    ax = sns.barplot(x = "coefficients" ,y ="features",data=dataframe,palette="husl")
    plt.title("FEATURE IMPORTANCES",fontsize =22)
    # Write the numeric value next to each bar.
    for i,j in enumerate(dataframe["coefficients"]):
        ax.text(.011,i,j,weight = "bold")
    print(dataframe)
    return dataframe


def model(algorithm,dtrain_x,dtrain_y,dtest_x,dtest_y,of_type):
    """Fit ``algorithm`` and report its performance on the test data.

    Parameters
    ----------
    algorithm : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    dtrain_x, dtrain_y : training features and labels. ``dtrain_x`` must expose
        ``.columns`` when ``of_type`` is "feat" or "coef".
    dtest_x, dtest_y : test features and labels.
    of_type : "feat" to plot ``algorithm.feature_importances_``, "coef" to plot
        ``algorithm.coef_``, "none" to skip the plot and return the estimator.

    Prints accuracy and the classification report, and draws the confusion
    matrix and the ROC curve.

    Returns
    -------
    The fitted ``algorithm`` when ``of_type == "none"``; otherwise ``None``.
    """
    print ("*****************************************************************************************")
    print ("MODEL - OUTPUT")
    print ("*****************************************************************************************")
    algorithm.fit(dtrain_x,dtrain_y)
    predictions = algorithm.predict(dtest_x)

    print (algorithm)
    print ("\naccuracy_score :",accuracy_score(dtest_y,predictions))

    print ("\nclassification report :\n",(classification_report(dtest_y,predictions)))

    plt.figure(figsize=(13,10))
    plt.subplot(221)
    sns.heatmap(confusion_matrix(dtest_y,predictions),annot=True,fmt = "d",linecolor="k",linewidths=3)
    plt.title("CONFUSION MATRIX",fontsize=20)

    # Probability of the positive class drives the ROC curve.
    predicting_probabilites = algorithm.predict_proba(dtest_x)[:,1]
    fpr,tpr,thresholds = roc_curve(dtest_y,predicting_probabilites)
    plt.subplot(222)
    # The legend label must be a string; a tuple was passed before, so the legend
    # showed the tuple's repr instead of a readable caption.
    plt.plot(fpr,tpr,label = "Area_under the curve : %.4f" % auc(fpr,tpr),color = "r")
    plt.plot([1,0],[1,0],linestyle = "dashed",color ="k")
    plt.legend(loc = "best")
    plt.title("ROC - CURVE & AREA UNDER CURVE",fontsize=20)

    if  of_type == "feat":
        _plot_feature_weights(algorithm.feature_importances_,dtrain_x.columns)

    elif of_type == "coef" :
        _plot_feature_weights(algorithm.coef_.ravel(),dtrain_x.columns)

    elif of_type == "none" :
        return (algorithm)

# 6.1 Random Forest

In [None]:
#random forest rus_removed highly correlated variables + num_days_since_last_X, avg_num_days
# NOTE(review): despite the "rus" wording above, this call trains on train_X/train_Y
# (the regular split), not on the undersampled rus_* frames.

from sklearn.ensemble import RandomForestClassifier
# Default hyper-parameters; no random_state is passed here.
rf =RandomForestClassifier()
# "feat" makes model() also print and plot rf.feature_importances_.
model(rf,train_X,train_Y.values.ravel(),test_X,test_Y.values.ravel(),"feat")

In [None]:
#random forest rus_removed highly correlated variables + num_days_since_last_X, avg_num_days
#tuning parameters - need to adjust weight for classes , adjust randomness, max feature = 50
# Trains on the undersampled training frames and evaluates on test_X/test_Y from the regular split.
# `rf` is rebound to a fresh estimator with default hyper-parameters; no random_state is passed.
rf =RandomForestClassifier()
model(rf,rus_train_X,rus_train_Y.values.ravel(),test_X,test_Y.values.ravel(),"feat")

This random forest model will be selected as our baseline model for the following reasons:
-- AUC = 0.8. The higher the AUC, the better the model is at predicting 0s as 0s and 1s as 1s; in other words, the higher the AUC, the better the model is at distinguishing between customers who will book travel and those who won't.
-- recall/true positive rate = 0.79: among those who actually book travel with TD in the next 3 months, the model predicts positive 79% of the time. 

However, further improvements: 
-- instead of using undersampled datasets for training(this may cause information loss), use regular full datasets by adjusting weights for classes. Give a higher weight for 1s, and lower weights for 0s. 
-- hyperparameters tuning: 
adjust n_estimators, to increase the number of trees. 
adjust randomness. right now, it's by default, try to adjust the randomness to a higher number. 
adjust max_feature, try to select more variables for each trees, to have a better understanding of feature importance. 
-- cross validation 

In [None]:
#random forest rus_with all variables
# NOTE(review): the call below passes the same frames as the previous random-forest cell
# (rus_train_X / test_X); confirm that an "all variables" feature set was intended here.

from sklearn.ensemble import RandomForestClassifier
# `rf` is rebound again; the feature-importance cells that follow read this instance.
rf =RandomForestClassifier()
model(rf,rus_train_X,rus_train_Y.values.ravel(),test_X,test_Y.values.ravel(),"feat")

In [None]:
# Feature names, in training-column order.
feature_list = list(rus_train_X.columns)
# Importances reported by the fitted forest, in the same order.
importances = list(rf.feature_importances_)
# (feature, importance rounded to 2 decimals) pairs, most important first.
feature_importances = sorted(
    ((name, round(score, 2)) for name, score in zip(feature_list, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# One x position per feature.
x_values = list(range(len(importances)))
# Names and rounded importances, already ordered from most to least important.
sorted_features = [name for name, _ in feature_importances]
sorted_importances = [score for _, score in feature_importances]
# Running total of importance across the ranked features.
cumulative_importances = np.cumsum(sorted_importances)
# Cumulative curve in green.
plt.plot(x_values, cumulative_importances, 'g-')
# Dashed red reference line at 95% of importance retained.
plt.hlines(y = 0.95, xmin=0, xmax=len(sorted_importances), color = 'r', linestyles = 'dashed')
# Feature names as vertical tick labels.
plt.xticks(x_values, sorted_features, rotation = 'vertical')
# Axis labels and title.
plt.xlabel('Variable')
plt.ylabel('Cumulative Importance')
plt.title('Cumulative Importances');

In [None]:
# Find number of features for cumulative importance of 70%
# (first position where the running total exceeds 0.7).
# NOTE(review): the plot above marks 95%, while this cell uses 0.7 — confirm the intended threshold.
# Add 1 because Python is zero-indexed
print('Number of features for 70% importance:', np.where(cumulative_importances > 0.7)[0][0] + 1)

In [None]:
# Extract the names of the most important features
# (the first 45 entries of the ranked list).
important_feature_names = [feature[0] for feature in feature_importances[0:45]]
# Find the columns of the most important features
# (positions of those names in feature_list).
important_indices = [feature_list.index(feature) for feature in important_feature_names]
# Create training and testing sets with only the important features
# NOTE(review): the subsetting below is commented out, so `important_indices` is not used
# in the visible part of the notebook.

#important_train_features = rus_train_X[:,important_indices]
#important_test_features = test_X[:,important_indices]

#important_train_features

# Sanity check on operations
#print('Important train features shape:', important_train_features.shape)
#print('Important test features shape:', important_test_features.shape)

In [None]:
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from time import time
import matplotlib.pyplot as plt
from operator import itemgetter

In [None]:
# Search space: number of trees x maximum depth.
# Options currently left out of the grid:
#   'min_samples_split' : [2, 5, 10]
#   'min_samples_leaf'  : [1, 3, 5]
param_grid = [
    {
        "n_estimators": [10, 20, 25],
        "max_depth": [3, 6, None],
    }
]

# 3-fold cross-validated grid search on the undersampled training data.
grid_search_forest = GridSearchCV(rf, param_grid, cv=3)
grid_search_forest.fit(rus_train_X, rus_train_Y.values.ravel())

In [None]:
grid_search_forest.best_params_

# 6.2 XGBOOST

XGBoost is considered as a baseline model because it works well with imbalanced data and can handle missing values (two major problems with our dataset). 

In [None]:
def xgb_classifier(X_train, X_test, y_train, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Train an XGBoost classifier, report hold-out metrics and plot feature importances.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Training / evaluation features (``X_train.values`` and
        ``X_train.columns`` are read).
    y_train : pandas object exposing ``.values``
        Training labels.
    y_test : array-like
        Evaluation labels; every printed metric is computed against these.
    useTrainCV : bool, default True
        When True, run ``xgb.cv`` with early stopping and set ``n_estimators``
        to the number of boosting rounds it returns.
    cv_folds : int, default 5
        Number of folds passed to ``xgb.cv``.
    early_stopping_rounds : int, default 50
        Early-stopping patience passed to ``xgb.cv``.

    Returns
    -------
    None
        Results are printed and plotted.
    """
    alg = XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
                        min_child_weight=3, gamma=0.2, subsample=0.6, colsample_bytree=1.0,
                        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)

    if useTrainCV:
        print("Start Feeding Data")
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
        # The initial n_estimators is only an upper bound; early stopping
        # truncates the CV result to the number of rounds actually kept.
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])

    print('Start Training')
    alg.fit(X_train, y_train, eval_metric='auc')

    print("Start Predicting")
    predictions = alg.predict(X_test)
    pred_proba = alg.predict_proba(X_test)[:, 1]

    # These metrics are computed on the hold-out (test) data, so label them
    # accordingly; they were previously mislabelled as "Train".
    print("Accuracy - Test : %.4g" % metrics.accuracy_score(y_test, predictions))
    print("AUC - Test: %f" % metrics.roc_auc_score(y_test, pred_proba))
    print("F1 Score - Test: %f" % metrics.f1_score(y_test, predictions))

    feat_imp = alg.feature_importances_
    feat = X_train.columns.tolist()
    res_df = pd.DataFrame({'Features': feat, 'Importance': feat_imp}).sort_values(by='Importance', ascending=False)
    res_df.plot('Features', 'Importance', kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
    print(res_df)

In [None]:
#xgb removed highly correlated variables + num_days_since_last_X, avg_num_days  
xgb_classifier(train_X, test_X, train_Y, test_Y)

In [None]:
#xgb with all variables
xgb_classifier(rus_train_X, test_X, rus_train_Y, test_Y)

Hyperparameter tuning

# 6.3 XGBOOST V2

https://github.com/ParrotPrediction/docker-course-xgboost/tree/master/notebooks/3.%20Going%20deeper

# 6.3.1 Spotting the important features 

In [None]:
#Specify training parameters - we are going to use 5 stump decision trees with average learning rate.
# specify training parameters
params = {
    'objective':'binary:logistic',
    'max_depth':1,
    'silent':1,
    'eta':0.5
}

num_rounds = 5

In [None]:
# Train the model and, at the same time, pass a watchlist to observe its
# performance on the test and train sets after every boosting round.
# xgb.train (native interface) expects a DMatrix, so train on `dtrain` -- the
# same object the watchlist evaluates -- rather than the raw `train_X` frame.
# NOTE(review): `dtrain` / `dtest` are assumed to be DMatrix objects built earlier -- confirm.
watchlist  = [(dtest,'test'), (dtrain,'train')] # native interface only
bst = xgb.train(params, dtrain, num_rounds, watchlist)

Fortunately, there are better ways to figure out which features really matter. We can use the built-in function `plot_importance`, which creates a plot of the most important features according to a chosen criterion. We will analyze the impact of each feature across all splits and all trees and visualize the results.

In [None]:
#See which feature provided the most gain:
xgb.plot_importance(bst, importance_type='gain', xlabel='Gain')

In [None]:
xgb.plot_importance(bst)

F-score - sums up how many times a split was performed on each feature.

In [None]:
importances = bst.get_fscore()
importances

In [None]:
# Tabulate the per-feature split counts, ordered from fewest to most splits,
# then draw them as a horizontal bar chart.
split_counts = {
    'Splits': list(importances.values()),
    'Feature': list(importances.keys())
}
importance_df = pd.DataFrame(split_counts).sort_values(by='Splits')
importance_df.plot(kind='barh', x='Feature', figsize=(8,6), color='orange')

# 6.3.2 Bias/Variance trade-off

There are two general types of errors made by classifiers - bias and variance errors.
Bias error is the overall difference between expected predictions made by the model and true values.
Variance error describes how much predictions for the given point vary.
The desired state is when both errors are as low as possible.


Knowing the errors introduced with bias and variance we can proceed to how these relate to training the model. We will use the plot taken from scikit-learn docs to help us visualize the underfitting and overfitting issues.
For underfitting we say that model suffers from high bias (too simple) (low variance)
For overfitting we say that model suffers from high variance (over-complicated, unstable) (low bias)


In [None]:
%matplotlib inline

from sklearn.learning_curve import validation_curve
from sklearn.datasets import load_svmlight_files
from sklearn.cross_validation import StratifiedKFold
from sklearn.datasets import make_classification
from xgboost.sklearn import XGBClassifier
from scipy.sparse import vstack

# reproducibility
seed = 123
np.random.seed(seed)

In [None]:
#We will divide the data into 5 stratified folds (the same distribution of labels in each fold) for testing

cv = StratifiedKFold(y, n_folds=5, shuffle=True, random_state=seed)

In [None]:
default_params = {
    'objective': 'binary:logistic',
    'max_depth': 1,
    'learning_rate': 0.3,
    'silent': 1.0
}

n_estimators_range = np.linspace(1, 200, 10).astype('int')

train_scores, test_scores = validation_curve(
    XGBClassifier(**default_params),
    X, y,
    param_name = 'n_estimators',
    param_range = n_estimators_range,
    cv=cv,
    scoring='accuracy'
)

In [None]:
#Show the validation curve plot

train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

fig = plt.figure(figsize=(10, 6), dpi=100)

plt.title("Validation Curve with XGBoost (eta = 0.3)")
plt.xlabel("number of trees")
plt.ylabel("Accuracy")
plt.ylim(0.7, 1.1)

plt.plot(n_estimators_range,
             train_scores_mean,
             label="Training score",
             color="r")

plt.plot(n_estimators_range,
             test_scores_mean, 
             label="Cross-validation score",
             color="g")

plt.fill_between(n_estimators_range, 
                 train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, 
                 alpha=0.2, color="r")

plt.fill_between(n_estimators_range,
                 test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std,
                 alpha=0.2, color="g")

plt.axhline(y=1, color='k', ls='dashed')

plt.legend(loc="best")
plt.show()

i = np.argmax(test_scores_mean)
print("Best cross-validation result ({0:.2f}) obtained for {1} trees".format(test_scores_mean[i], n_estimators_range[i]))

Dealing with high variance: In XGBoost you can try to:

reduce depth of each tree (max_depth),
increase min_child_weight parameter,
increase gamma parameter,
add more randomness using subsample, colsample_bytree parameters,
increase lambda and alpha regularization parameters

Dealing with high bias
In XGBoost you can do it by:

increase depth of each tree (max_depth),
decrease min_child_weight parameter,
decrease gamma parameter,
decrease lambda and alpha regularization parameters 

In [None]:
default_params = {
    'objective': 'binary:logistic',
    'max_depth': 2, # changed
    'learning_rate': 0.3,
    'silent': 1.0,
    'colsample_bytree': 0.6, # added
    'subsample': 0.7 # added
}

n_estimators_range = np.linspace(1, 200, 10).astype('int')

train_scores, test_scores = validation_curve(
    XGBClassifier(**default_params),
    X, y,
    param_name = 'n_estimators',
    param_range = n_estimators_range,
    cv=cv,
    scoring='accuracy'
)

In [None]:
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

fig = plt.figure(figsize=(10, 6), dpi=100)

plt.title("Validation Curve with XGBoost (eta = 0.3)")
plt.xlabel("number of trees")
plt.ylabel("Accuracy")
plt.ylim(0.7, 1.1)

plt.plot(n_estimators_range,
             train_scores_mean,
             label="Training score",
             color="r")

plt.plot(n_estimators_range,
             test_scores_mean, 
             label="Cross-validation score",
             color="g")

plt.fill_between(n_estimators_range, 
                 train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, 
                 alpha=0.2, color="r")

plt.fill_between(n_estimators_range,
                 test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std,
                 alpha=0.2, color="g")

plt.axhline(y=1, color='k', ls='dashed')

plt.legend(loc="best")
plt.show()

i = np.argmax(test_scores_mean)
print("Best cross-validation result ({0:.2f}) obtained for {1} trees".format(test_scores_mean[i], n_estimators_range[i]))

# 6.3.3 Hyperparameter Tuning

In [None]:
from xgboost.sklearn import XGBClassifier

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import make_classification
from sklearn.cross_validation import StratifiedKFold

from scipy.stats import randint, uniform

# reproducibility
seed = 123
np.random.seed(seed)

In [None]:
cv = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=seed)

Randomized Grid-Search
When the number of parameters and their candidate values grows large, the traditional grid-search approach quickly becomes ineffective. A possible solution is to randomly sample parameter values from their distributions. While this is not an exhaustive search, it is worth a try.

In [None]:

# Search space for the randomized search: list entries are candidate values,
# scipy.stats objects are distributions that values are drawn from.
# NOTE(review): uniform() can draw values arbitrarily close to 0, which may be
# unsuitable for subsample / colsample_bytree / learning_rate -- confirm bounds.
params_dist_grid = {
    'max_depth': [1, 2, 3, 4],
    'gamma': [0, 0.5, 1],
    'n_estimators': randint(1, 1001), # discrete uniform over the integers 1..1000
    'learning_rate': uniform(), # continuous uniform on [0, 1] (default loc=0, scale=1)
    'subsample': uniform(), # continuous uniform on [0, 1]
    'colsample_bytree': uniform() # continuous uniform on [0, 1]
}

In [None]:
# Randomized search over `params_dist_grid`, scored by area under the ROC curve.
# scikit-learn registers this scorer under the name 'roc_auc'; 'auc' is not a
# valid scoring string and is rejected by the search.
# NOTE(review): `params_fixed` is not defined in this part of the notebook --
# confirm it is created before this cell runs.
rs_grid = RandomizedSearchCV(
    estimator=XGBClassifier(**params_fixed, seed=seed),
    param_distributions=params_dist_grid,
    n_iter=10,
    cv=cv,
    scoring='roc_auc',
    random_state=seed
)

In [None]:
rs_grid.fit(train_X, train_Y)

In [None]:
rs_grid.grid_scores_

In [None]:
rs_grid.best_estimator_
rs_grid.best_params_
rs_grid.best_score_

# 6.3.4 Adjust weights for imbalanced data

In [None]:
# scale_pos_weight is the weight applied to the positive class; the usual
# choice is sum(negative instances) / sum(positive instances). The notebook
# reports a 0s:1s ratio of about 6.4:1, so the minority positive class must be
# weighted up by 6.4 (the previous 1/6.4 weighted it down instead).
params['scale_pos_weight'] = 6.4

In [None]:
bst = xgb.train(params, dtrain, num_rounds)
y_test_preds = (bst.predict(dtest) > 0.5).astype('int')

pd.crosstab(
    pd.Series(y_test, name='Actual'),
    pd.Series(y_test_preds, name='Predicted'),
    margins=True
)

# Other Methodologies Attempted 

# 6.1 Logistic Regression

In [None]:
#Logistic regression removed highly correlated variables + num_days_since_last_X, avg_num_days 
#using regular train_test datasets 
from sklearn.linear_model import LogisticRegression
log = LogisticRegression()
model(log,train_X,train_Y.values.ravel(),test_X,test_Y.values.ravel(),"coef")

A recall of 1.0 indicates that the model has started to cheat; the model is unusable.

In [None]:
#Logistic regression removed highly correlated variables + num_days_since_last_X, avg_num_days 
#using undersampled datasets (0s:1s = 1:1) for training, and regular datasets (0s:1s = 6.4:1) for testing.
log = LogisticRegression()
model(log,rus_train_X,rus_train_Y.values.ravel(),test_X,test_Y.values.ravel(),"coef")

# 6.1 Logistic Regression v2

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.metrics import roc_auc_score,roc_curve,scorer
from sklearn.metrics import f1_score
import statsmodels.api as sm
from sklearn.metrics import precision_score,recall_score
#from yellowbrick.classifier import DiscriminationThreshold
#splitting train and test data 
train,test = train_test_split(tmi_data_copy,test_size = .30 ,random_state = 111)
    
##seperating dependent and independent variables
cols    = [i for i in tmi_data_copy.columns if i not in Id_col + target_col]
train_X = train[cols]
train_Y = train[target_col]
test_X  = test[cols]
test_Y  = test[target_col]

#Function attributes
#dataframe     - processed dataframe
#Algorithm     - Algorithm used 
#training_x    - predictor variables dataframe(training)
#testing_x     - predictor variables dataframe(testing)
#training_y    - target variable(training)
#testing_y     - target variable(testing)
#cf - ["coefficients","features"](coefficients for logistic 
                                 #regression,features for tree based models)

#threshold_plot - if True returns threshold plot for model
    
def travel_booking_prediction(algorithm,training_x,testing_x,
                             training_y,testing_y,cols,cf) :
    """Fit a classifier, print its test-set metrics and plot a summary dashboard.

    Parameters
    ----------
    algorithm  : estimator exposing fit / predict / predict_proba, plus
                 ``coef_`` (when cf == "coefficients") or
                 ``feature_importances_`` (when cf == "features").
    training_x : predictor variables used for fitting.
    testing_x  : predictor variables used for evaluation.
    training_y : training target; must expose ``.values`` (it is flattened
                 with ``ravel()`` before fitting).
    testing_y  : evaluation target.
    cols       : feature names, paired by position with the fitted
                 coefficients / importances.
    cf         : "coefficients" or "features".

    Returns
    -------
    None. Prints the estimator, classification report, accuracy and AUC, and
    renders a plotly figure (confusion matrix, ROC curve, feature weights).

    Raises
    ------
    ValueError
        If ``cf`` is neither "coefficients" nor "features".
    """
    # Fail fast: previously an unknown `cf` only surfaced after fitting, as an
    # unbound-local error on `coefficients`.
    if cf not in ("coefficients", "features") :
        raise ValueError("cf must be 'coefficients' or 'features', got %r" % (cf,))

    #model
    algorithm.fit(training_x,training_y.values.ravel())
    predictions   = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)
    #coeffs
    if   cf == "coefficients" :
        coefficients  = pd.DataFrame(algorithm.coef_.ravel())
    else :
        coefficients  = pd.DataFrame(algorithm.feature_importances_)

    # Pair each coefficient with its feature name by row position.
    column_df     = pd.DataFrame(cols)
    coef_sumry    = (pd.merge(coefficients,column_df,left_index= True,
                              right_index= True, how = "left"))
    coef_sumry.columns = ["coefficients","features"]
    coef_sumry    = coef_sumry.sort_values(by = "coefficients",ascending = False)

    print (algorithm)
    print ("\n Classification report : \n",classification_report(testing_y,predictions))
    print ("Accuracy   Score : ",accuracy_score(testing_y,predictions))
    #confusion matrix
    conf_matrix = confusion_matrix(testing_y,predictions)
    #roc_auc_score
    # Score the positive-class probabilities, not the hard 0/1 predictions, so
    # the reported AUC matches the ROC curve plotted below.
    model_roc_auc = roc_auc_score(testing_y,probabilities[:,1])
    print ("Area under curve : ",model_roc_auc,"\n")
    fpr,tpr,thresholds = roc_curve(testing_y,probabilities[:,1])

    #plot confusion matrix
    trace1 = go.Heatmap(z = conf_matrix ,
                        x = ["No Travel Booking","Travel Booking"],
                        y = ["No Travel Booking","Travel Booking"],
                        showscale  = False,colorscale = "Picnic",
                        name = "matrix")

    #plot roc curve
    trace2 = go.Scatter(x = fpr,y = tpr,
                        name = "Roc : " + str(model_roc_auc),
                        line = dict(color = ('rgb(22, 96, 167)'),width = 2))
    # Diagonal reference line (chance-level classifier).
    trace3 = go.Scatter(x = [0,1],y=[0,1],
                        line = dict(color = ('rgb(205, 12, 24)'),width = 2,
                        dash = 'dot'))

    #plot coeffs
    trace4 = go.Bar(x = coef_sumry["features"],y = coef_sumry["coefficients"],
                    name = "coefficients",
                    marker = dict(color = coef_sumry["coefficients"],
                                  colorscale = "Picnic",
                                  line = dict(width = .6,color = "black")))

    #subplots
    fig = tls.make_subplots(rows=2, cols=2, specs=[[{}, {}], [{'colspan': 2}, None]],
                            subplot_titles=('Confusion Matrix',
                                            'Receiver operating characteristic',
                                            'Feature Importances'))

    fig.append_trace(trace1,1,1)
    fig.append_trace(trace2,1,2)
    fig.append_trace(trace3,1,2)
    fig.append_trace(trace4,2,1)

    fig['layout'].update(showlegend=False, title="Model performance" ,
                         autosize = False,height = 900,width = 800,
                         plot_bgcolor = 'rgba(240,240,240, 0.95)',
                         paper_bgcolor = 'rgba(240,240,240, 0.95)',
                         margin = dict(b = 195))
    fig["layout"]["xaxis2"].update(dict(title = "false positive rate"))
    fig["layout"]["yaxis2"].update(dict(title = "true positive rate"))
    fig["layout"]["xaxis3"].update(dict(showgrid = True,tickfont = dict(size = 10),
                                        tickangle = 90))
    py.iplot(fig)
    
    #if threshold_plot == True : 
        #visualizer = DiscriminationThreshold(algorithm)
        #visualizer.fit(training_x,training_y)
        #visualizer.poof()
        
logit  = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

travel_booking_prediction(logit,train_X,test_X,train_Y,test_Y,
                         cols,"coefficients")

# 6.2 Logistic Regression with Random Undersampling the Majority Class

- Randomly pick a point from the majority class.

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

id_col = ['cust_id']
target_col = ['target_istravel']
cols    = [i for i in tmi_data_copy.columns if i not in target_col + id_col]

rus_X = tmi_data_copy[cols]
rus_Y = tmi_data_copy[target_col]

rus = RandomUnderSampler(random_state=0)
us_rus_X,us_rus_Y = rus.fit_sample(rus_X,rus_Y.values.ravel())
print('Resampled dataset shape %s' % Counter(us_rus_Y))

us_rus_X = pd.DataFrame(data = us_rus_X,columns=cols)
us_rus_Y = pd.DataFrame(data = us_rus_Y,columns=target_col)



#Split train and test data
rus_train_X,rus_test_X,rus_train_Y,rus_test_Y = train_test_split(us_rus_X,us_rus_Y,
                                                                         test_size = .30 ,
                                                                         random_state = 123)

In [None]:
logit_rus = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

travel_booking_prediction(logit_rus,rus_train_X,test_X,rus_train_Y,test_Y,
                         cols,"coefficients")

# 6.3 Recursive Feature Elimination

Recursive Feature Elimination (RFE) is based on the idea to repeatedly construct a model and choose either the best or worst performing feature, setting the feature aside and then repeating the process with the rest of the features. This process is applied until all features in the dataset are exhausted. The goal of RFE is to select features by recursively considering smaller and smaller sets of features.

In [None]:
from sklearn.feature_selection import RFE

logit = LogisticRegression()

rfe = RFE(logit,10)
rfe = rfe.fit(us_rus_X,us_rus_Y.values.ravel())

rfe.support_
rfe.ranking_

#identified columns Recursive Feature Elimination
idc_rfe = pd.DataFrame({"rfe_support" :rfe.support_,
                       "columns" : [i for i in tmi_data_copy.columns if i not in Id_col + target_col],
                       "ranking" : rfe.ranking_,
                      })
cols = idc_rfe[idc_rfe["rfe_support"] == True]["columns"].tolist()


#separating train and test data
train_rf_X = us_rus_X[cols]
train_rf_Y = us_rus_Y
test_rf_X  = test[cols]
test_rf_Y  = test[target_col]

logit_rfe = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
#applying model
travel_booking_prediction(logit_rfe,train_rf_X,test_rf_X,train_rf_Y,test_rf_Y,
                         cols,"coefficients")

tab_rk = ff.create_table(idc_rfe)
py.iplot(tab_rk)

# 6.4 Univariate Selection

- Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
- uses the chi squared (chi^2) statistical test for non-negative features to select the best features

In [None]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

#select columns
cols = [i for i in tmi_data_copy.columns if i not in Id_col + target_col ]

#dataframe with non negative values
df_x = df_tmi_data_copy_og[cols]
df_y = df_tmi_data_copy_og[target_col]

#fit model with k= 3
select = SelectKBest(score_func = chi2,k = 3)
fit    = select.fit(df_x,df_y)

#Summarize scores
print ("scores")
print (fit.scores_)
print ("P - Values")
print (fit.pvalues_)

#create dataframe
score = pd.DataFrame({"features":cols,"scores":fit.scores_,"p_values":fit.pvalues_ })
score = score.sort_values(by = "scores" ,ascending =False)


#creating a new label for categorical and numerical columns
score["feature_type"] = np.where(score["features"].isin(num_cols),"Numerical","Categorical")

#plot
trace  = go.Scatter(x = score[score["feature_type"] == "Categorical"]["features"],
                    y = score[score["feature_type"] == "Categorical"]["scores"],
                    name = "Categorial",mode = "lines+markers",
                    marker = dict(color = "red",
                                  line = dict(width =1))
                   )

trace1 = go.Bar(x = score[score["feature_type"] == "Numerical"]["features"],
                y = score[score["feature_type"] == "Numerical"]["scores"],name = "Numerical",
                marker = dict(color = "royalblue",
                              line = dict(width =1)),
                xaxis = "x2",yaxis = "y2"
               )
layout = go.Layout(dict(title = "Scores for Categorical & Numerical features",
                        plot_bgcolor  = "rgb(243,243,243)",
                        paper_bgcolor = "rgb(243,243,243)",
                        xaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     tickfont = dict(size =10),
                                     domain=[0, 0.7],
                                     tickangle = 90,zerolinewidth=1,
                                     ticklen=5,gridwidth=2),
                        yaxis = dict(gridcolor = 'rgb(255, 255, 255)',
                                     title = "scores",
                                     zerolinewidth=1,ticklen=5,gridwidth=2),
                        margin = dict(b=200),
                        xaxis2=dict(domain=[0.8, 1],tickangle = 90,
                                    gridcolor = 'rgb(255, 255, 255)'),
                        yaxis2=dict(anchor='x2',gridcolor = 'rgb(255, 255, 255)')
                        )
                  )

data=[trace,trace1]
fig = go.Figure(data=data,layout=layout)
py.iplot(fig)

# 6.5 Decision Tree Based

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn import tree
from graphviz import Source
from IPython.display import SVG,display

In [None]:
#top 5 categorical features
features_cat  = score[score["feature_type"] == "Categorical"]["features"][:5].tolist()

#top 5 numerical features
features_num  = score[score["feature_type"] == "Numerical"]["features"][:5].tolist()


#Function attributes
#columns        - selected columns
#maximum_depth  - depth of tree
#criterion_type - ["gini" or "entropy"]
#split_type     - ["best" or "random"]
#Model Performance - True (gives model output)

def plot_decision_tree(columns,maximum_depth,criterion_type,
                       split_type,model_performance = None) :
    """Fit a decision tree on the selected columns and display it as a graph.

    columns           - feature columns taken from the module-level ``df_x``
    maximum_depth     - maximum depth of the tree
    criterion_type    - split criterion, "gini" or "entropy"
    split_type        - splitter strategy, "best" or "random"
    model_performance - when equal to True, also run
                        ``travel_booking_prediction`` against ``test_X`` / ``test_Y``
    """
    # Predictors and target come from the module-level frames.
    features = df_x[columns]
    target   = df_y[target_col]

    classifier = DecisionTreeClassifier(criterion = criterion_type,
                                        splitter  = split_type,
                                        max_depth = maximum_depth)
    classifier.fit(features,target)

    # Export the fitted tree to DOT before any re-fit triggered below.
    dot_source = tree.export_graphviz(classifier,
                                      out_file = None,
                                      feature_names = columns,
                                      class_names = ["No Travel Booking","Travel Booking"],
                                      filled = True,
                                      rounded = True,
                                      proportion = False,
                                      precision = 2)
    rendered_tree = Source(dot_source)

    if model_performance == True :
        travel_booking_prediction(classifier,
                                 features,test_X[columns],
                                 target,test_Y,
                                 columns,"features")

    display(rendered_tree)
    
plot_decision_tree(features_num,5,"gini","best")

In [None]:
plot_decision_tree(features_cat,5,"entropy","best",
                   model_performance = True ,)

Need to edit following code

In [None]:
#using contract,tenure and paperless billing variables
columns = ['tenure','Contract_Month-to-month', 'PaperlessBilling',
           'Contract_One year', 'Contract_Two year']

plot_decision_tree(columns,5,"gini","best",model_performance= True)

# 6.6 KNN Classifier

Applying knn algorithm to random undersampled data.

In [None]:
def travel_booking_prediction_alg(algorithm,training_x,testing_x,
                                 training_y,testing_y) :
    """Fit a classifier, print its test-set metrics and plot ROC + confusion matrix.

    Variant of the reporting routine for estimators that expose neither
    coefficients nor feature importances.

    Parameters
    ----------
    algorithm  : estimator exposing fit / predict / predict_proba.
    training_x : predictor variables used for fitting.
    testing_x  : predictor variables used for evaluation.
    training_y : training target (flattened to 1-D before fitting).
    testing_y  : evaluation target.

    Returns
    -------
    None. Prints the estimator, classification report, accuracy and AUC, and
    renders a plotly figure.
    """
    #model
    # Flatten the target so a single-column DataFrame is accepted as 1-D labels.
    algorithm.fit(training_x,np.ravel(training_y))
    predictions   = algorithm.predict(testing_x)
    probabilities = algorithm.predict_proba(testing_x)

    print (algorithm)
    print ("\n Classification report : \n",classification_report(testing_y,predictions))
    print ("Accuracy Score   : ",accuracy_score(testing_y,predictions))
    #confusion matrix
    conf_matrix = confusion_matrix(testing_y,predictions)
    #roc_auc_score
    # Score the positive-class probabilities, not the hard 0/1 predictions, so
    # the reported AUC matches the ROC curve plotted below.
    model_roc_auc = roc_auc_score(testing_y,probabilities[:,1])
    print ("Area under curve : ",model_roc_auc)
    fpr,tpr,thresholds = roc_curve(testing_y,probabilities[:,1])

    #plot roc curve
    trace1 = go.Scatter(x = fpr,y = tpr,
                        name = "Roc : " + str(model_roc_auc),
                        line = dict(color = ('rgb(22, 96, 167)'),width = 2),
                       )
    # Diagonal reference line (chance-level classifier).
    trace2 = go.Scatter(x = [0,1],y=[0,1],
                        line = dict(color = ('rgb(205, 12, 24)'),width = 2,
                        dash = 'dot'))

    #plot confusion matrix
    # The `y` label list was previously left unclosed, which made the whole
    # cell a SyntaxError.
    trace3 = go.Heatmap(z = conf_matrix ,x = ["No Travel Booking","Travel Booking"],
                        y = ["No Travel Booking","Travel Booking"],
                        showscale  = False,colorscale = "Blues",name = "matrix",
                        xaxis = "x2",yaxis = "y2"
                       )

    layout = go.Layout(dict(title="Model performance" ,
                            autosize = False,height = 500,width = 800,
                            showlegend = False,
                            plot_bgcolor  = "rgb(243,243,243)",
                            paper_bgcolor = "rgb(243,243,243)",
                            xaxis = dict(title = "false positive rate",
                                         gridcolor = 'rgb(255, 255, 255)',
                                         domain=[0, 0.6],
                                         ticklen=5,gridwidth=2),
                            yaxis = dict(title = "true positive rate",
                                         gridcolor = 'rgb(255, 255, 255)',
                                         zerolinewidth=1,
                                         ticklen=5,gridwidth=2),
                            margin = dict(b=200),
                            xaxis2=dict(domain=[0.7, 1],tickangle = 90,
                                        gridcolor = 'rgb(255, 255, 255)'),
                            yaxis2=dict(anchor='x2',gridcolor = 'rgb(255, 255, 255)')
                           )
                  )
    data = [trace1,trace2,trace3]
    fig = go.Figure(data=data,layout=layout)

    py.iplot(fig)
    
                             
    ##if threshold_plot == True : 
        ##visualizer = DiscriminationThreshold(algorithm)
        ##visualizer.fit(training_x,training_y)
        ##visualizer.poof()

    
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
travel_booking_prediction_alg(knn,rus_train_X,test_X,
                             rus_train_Y,test_Y)

# 6.7 Visualizing a decision tree from random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

#function attributes
#columns  - column used
#nf_estimators   - The number of trees in the forest.
#estimated_tree  - tree number to be displayed
#maximum_depth   - depth of the tree
#criterion_type  - split criterion type ["gini" or "entropy"]
#Model performance - prints performance of model

def plot_tree_randomforest(columns,nf_estimators,
                           estimated_tree,maximum_depth,
                           criterion_type,model_performance = None) :
    """Fit a random forest and display one of its trees as a graph.

    columns           - feature columns taken from ``df_tmi_data_copy_og``
    nf_estimators     - number of trees in the forest
    estimated_tree    - index (into ``estimators_``) of the tree to display
    maximum_depth     - maximum depth of each tree
    criterion_type    - split criterion, "gini" or "entropy"
    model_performance - when equal to True, also run
                        ``travel_booking_prediction`` against ``test_X`` / ``test_Y``
    """
    # Work on a copy restricted to the requested features plus the target.
    subset = df_tmi_data_copy_og[columns + target_col].copy()

    predictor_names = [name for name in columns if name not in target_col]
    predictors = subset[predictor_names]
    labels     = subset[target_col]

    forest = RandomForestClassifier(criterion = criterion_type,
                                    max_depth = maximum_depth,
                                    n_estimators = nf_estimators)
    forest.fit(predictors,labels.values.ravel())

    # Select the requested member of the ensemble and render it.
    chosen_tree = forest.estimators_[estimated_tree]
    dot_source = tree.export_graphviz(chosen_tree,
                                      out_file = None,
                                      feature_names = columns,
                                      class_names = ["No Travel Booking","Travel Booking"],
                                      filled = True,
                                      rounded = True,
                                      proportion = False,
                                      precision = 2)
    display(Source(dot_source))

    if model_performance == True :
        travel_booking_prediction(forest,
                                 predictors,test_X[columns],
                                 labels,test_Y,
                                 columns,"features")
        

cols1 = [ i for i in train_X.columns if i not in target_col + Id_col] 
plot_tree_randomforest(cols1,100,99,3,"entropy",True)

# 6.8 Random Forest Classifier

In [None]:
#making 10 trees with random forest.
n = np.arange(0,10).tolist()
cols1 = [ i for i in train_X.columns if i not in target_col + Id_col] 
for i in n :
    plot_tree_randomforest(cols1,10,i,3,"entropy",model_performance=False)

In [None]:
#making 10 trees with random forest for columns 
#selected from recursive feature elimination

n = np.arange(0,10).tolist()
cols = idc_rfe[idc_rfe["rfe_support"] == True]["columns"].tolist() 
for i in n :
    plot_tree_randomforest(cols,10,i,3,"gini",model_performance=False)

# 6.9 Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB(priors=None)

travel_booking_prediction_alg(gnb,rus_train_X,test_X,rus_train_Y,test_Y)

# 6.10 Support Vector Machine 

A "Support Vector Machine" (SVM) is a supervised machine learning algorithm that can be used for both classification and regression problems, although it is mostly used for classification. In this algorithm, we plot each data item as a point in n-dimensional space (where n is the number of features), with the value of each feature being the value of a particular coordinate. Then, we perform classification by finding the hyperplane that best separates the two classes.

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear hyperplane.
# probability=True is what makes predict_proba available on the fitted model.
svc_lin = SVC(
    kernel='linear',
    C=1.0,
    gamma=1.0,
    degree=3,
    coef0=0.0,
    tol=0.001,
    shrinking=True,
    probability=True,
    class_weight=None,
    decision_function_shape='ovr',
    cache_size=200,
    max_iter=-1,
    random_state=None,
    verbose=False,
)

# Feature names: every column except the identifier and target columns.
cols = [column for column in tmi_data_copy.columns
        if column not in Id_col + target_col]
travel_booking_prediction(
    svc_lin,
    rus_train_X, test_X,
    rus_train_Y, test_Y,
    cols, "coefficients",
)

# 6.11 Tuning Parameters for Support Vector Machine

In [None]:
# Tuned support vector classifier using a non-linear ("rbf") hyperplane.
# probability=True is what makes predict_proba available on the fitted model.

svc_rbf = SVC(
    kernel='rbf',
    C=1.0,
    gamma=1.0,
    degree=3,
    coef0=0.0,
    tol=0.001,
    shrinking=True,
    probability=True,
    class_weight=None,
    cache_size=200,
    max_iter=-1,
    random_state=None,
    verbose=False,
)

travel_booking_prediction_alg(
    svc_rbf,
    rus_train_X, test_X,
    rus_train_Y, test_Y,
)

# 6.12 LightGBMClassifier

In [None]:
from lightgbm import LGBMClassifier

# LightGBM gradient-boosted trees for the binary travel-booking target.
lgbm_c = LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    n_estimators=100,
    learning_rate=0.5,
    max_depth=7,
    num_leaves=500,
    min_child_samples=20,
    min_child_weight=0.001,
    min_split_gain=0.0,
    colsample_bytree=1.0,
    subsample=1.0,
    subsample_for_bin=200000,
    subsample_freq=0,
    reg_alpha=0.0,
    reg_lambda=0.0,
    class_weight=None,
    random_state=None,
    n_jobs=-1,
    silent=True,
)

# Feature names: every column except the identifier and target columns.
cols = [column for column in tmi_data_copy.columns
        if column not in Id_col + target_col]
travel_booking_prediction(
    lgbm_c,
    rus_train_X, test_X,
    rus_train_Y, test_Y,
    cols, "features",
)

# 6.13 XGBoost Classifier

In [None]:
from xgboost import XGBClassifier

# XGBoost gradient-boosted trees for the binary travel-booking target.
xgc = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                    colsample_bytree=1, gamma=0, learning_rate=0.9, max_delta_step=0,
                    max_depth = 7, min_child_weight=1, missing=None, n_estimators=100,
                    n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
                    reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
                    silent=True, subsample=1)

# Build the feature-name list here rather than relying on `cols` left over
# from an earlier cell, so this cell gives the same result when run alone
# or out of order.
cols = [i for i in tmi_data_copy.columns if i not in Id_col + target_col]
travel_booking_prediction(xgc,rus_train_X,test_X,rus_train_Y,test_Y,
                         cols,"features")

# 7 Model Performance

# 7.1 Model Performance Metrics

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score

#gives model report in dataframe
def model_report(model,training_x,testing_x,training_y,testing_y,name) :
    """Fit ``model`` and summarise its test-set performance in a one-row DataFrame.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``; ``predict_proba`` or
        ``decision_function`` is used for the ROC AUC when present.
    training_x, training_y : features and labels the model is fitted on.
    testing_x, testing_y : features and labels the metrics are computed on.
    name : label written to the "Model" column.

    Returns
    -------
    pandas.DataFrame
        One row with the columns Model, Accuracy_score, Recall_score,
        Precision, f1_score, Area_under_curve and Kappa_metric.

    Notes
    -----
    The model is (re)fitted in place, so a previously fitted estimator passed
    in is overwritten by this call.
    """
    model.fit(training_x,training_y)
    predictions  = model.predict(testing_x)

    # ROC AUC is a ranking metric: it needs continuous scores, not hard 0/1
    # labels. Use the positive-class probability when the model provides one,
    # otherwise the decision function, and only then fall back to the labels.
    if hasattr(model,"predict_proba") :
        scores = model.predict_proba(testing_x)[:,1]
    elif hasattr(model,"decision_function") :
        scores = model.decision_function(testing_x)
    else :
        scores = predictions

    accuracy     = accuracy_score(testing_y,predictions)
    recallscore  = recall_score(testing_y,predictions)
    precision    = precision_score(testing_y,predictions)
    roc_auc      = roc_auc_score(testing_y,scores)
    f1score      = f1_score(testing_y,predictions) 
    kappa_metric = cohen_kappa_score(testing_y,predictions)
    
    df = pd.DataFrame({"Model"           : [name],
                       "Accuracy_score"  : [accuracy],
                       "Recall_score"    : [recallscore],
                       "Precision"       : [precision],
                       "f1_score"        : [f1score],
                       "Area_under_curve": [roc_auc],
                       "Kappa_metric"    : [kappa_metric],
                      })
    return df

# Fit and score every model; each call returns a one-row report.
model1 = model_report(
    logit, train_X, test_X, train_Y, test_Y,
    "Logistic Regression(Baseline_model)",
)
model2 = model_report(
    logit_rus, rus_train_X, test_X, rus_train_Y, test_Y,
    "Logistic Regression(Random Undersampling)",
)
model3 = model_report(
    logit_rfe, train_rf_X, test_rf_X, train_rf_Y, test_rf_Y,
    "Logistic Regression(RFE)",
)

decision_tree = DecisionTreeClassifier(
    criterion="gini",
    splitter="best",
    max_depth=9,
    random_state=123,
)
model4 = model_report(
    decision_tree, train_X, test_X, train_Y, test_Y,
    "Decision Tree",
)
model5 = model_report(
    knn, rus_train_X, test_X, rus_train_Y, test_Y,
    "KNN Classifier",
)

rfc = RandomForestClassifier(
    criterion="gini",
    n_estimators=1000,
    max_depth=9,
    random_state=123,
)
model6 = model_report(
    rfc, train_X, test_X, train_Y, test_Y,
    "Random Forest Classifier",
)
model7 = model_report(
    gnb, rus_train_X, test_X, rus_train_Y, test_Y,
    "Naive Bayes",
)
model8 = model_report(
    svc_lin, rus_train_X, test_X, rus_train_Y, test_Y,
    "SVM Classifier Linear",
)
model9 = model_report(
    svc_rbf, rus_train_X, test_X, rus_train_Y, test_Y,
    "SVM Classifier RBF",
)
model10 = model_report(
    lgbm_c, rus_train_X, test_X, rus_train_Y, test_Y,
    "LGBM Classifier",
)
model11 = model_report(
    xgc, rus_train_X, test_X, rus_train_Y, test_Y,
    "XGBoost Classifier",
)

# Stack the one-row reports into a single table with a fresh 0..n-1 index.
model_performances = pd.concat(
    [model1, model2, model3, model4, model5, model6,
     model7, model8, model9, model10, model11],
    axis=0,
).reset_index(drop=True)

# Render the metrics, rounded to 4 decimals, as a plotly table.
table = ff.create_table(model_performances.round(4))

py.iplot(table)

# 7.2 Compare Model Metrics

In [None]:
def output_tracer(metric,color) :
    """Build a horizontal bar trace of one metric across all models.

    Parameters
    ----------
    metric : name of a column in the global ``model_performances`` frame.
    color : bar fill colour passed to the plotly marker.

    Returns
    -------
    plotly.graph_objs.Bar
        Bars with model names on the y axis and the metric values on the
        x axis; the trace is named after ``metric``.
    """
    tracer = go.Bar(y = model_performances["Model"] ,
                    x = model_performances[metric],
                    orientation = "h",name = metric ,
                    marker = dict(line = dict(width =.7),
                                  color = color)
                   )
    return tracer

# Shared figure layout: light grey background, white grid lines, and a wide
# left margin so the long model names fit next to the horizontal bars.
layout = go.Layout({
    "title": "Model performances",
    "plot_bgcolor": "rgb(243,243,243)",
    "paper_bgcolor": "rgb(243,243,243)",
    "xaxis": {
        "gridcolor": 'rgb(255, 255, 255)',
        "title": "metric",
        "zerolinewidth": 1,
        "ticklen": 5,
        "gridwidth": 2,
    },
    "yaxis": {
        "gridcolor": 'rgb(255, 255, 255)',
        "zerolinewidth": 1,
        "ticklen": 5,
        "gridwidth": 2,
    },
    "margin": {"l": 250},
    "height": 780,
})

# One bar trace per metric, each with its own colour.
trace1, trace2, trace3, trace4, trace5 = (
    output_tracer(metric_name, bar_color)
    for metric_name, bar_color in [
        ("Accuracy_score", "#6699FF"),
        ('Recall_score', "red"),
        ('Precision', "#33CC99"),
        ('f1_score', "lightgrey"),
        ('Kappa_metric', "#FFCC99"),
    ]
)

data = [trace1, trace2, trace3, trace4, trace5]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

# 7.3 Confusion Matrices for models 

In [None]:
# Fitted models and their display names, in matching order.
lst    = [logit,logit_rus,decision_tree,knn,rfc,
          gnb,svc_lin,svc_rbf,lgbm_c,xgc]

length = len(lst)

mods   = ['Logistic Regression(Baseline_model)','Logistic Regression(Random Undersampling)',
          'Decision Tree','KNN Classifier','Random Forest Classifier',"Naive Bayes",
          'SVM Classifier Linear','SVM Classifier RBF', 'LGBM Classifier',
          'XGBoost Classifier']

# Guard against the two lists drifting apart when a model is added or removed.
assert length == len(mods), "each model needs exactly one display name"

fig = plt.figure(figsize=(13,15))
fig.set_facecolor("#F3F3F3")
for position,(model,model_name) in enumerate(zip(lst,mods),start = 1) :
    plt.subplot(4,3,position)
    predictions = model.predict(test_X)
    # sklearn's signature is confusion_matrix(y_true, y_pred):
    # rows are the actual classes, columns are the predicted classes.
    conf_matrix = confusion_matrix(test_Y,predictions)
    sns.heatmap(conf_matrix,annot=True,fmt = "d",square = True,
                xticklabels=["No Travel Booking","Travel Booking"],
                yticklabels=["No Travel Booking","Travel Booking"],
                linewidths = 2,linecolor = "w",cmap = "Set1")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(model_name,color = "b")

# Spacing only needs to be applied once, after every panel exists.
plt.subplots_adjust(wspace = .3,hspace = .3)

# 7.4 ROC-Curve for models 

In [None]:
# Fitted models and their display names, in matching order.
lst    = [logit,logit_rus,decision_tree,knn,rfc,
          gnb,svc_lin,svc_rbf,lgbm_c,xgc]

length = len(lst)

mods   = ['Logistic Regression(Baseline_model)','Logistic Regression(Random Undersampling)',
          'Decision Tree','KNN Classifier','Random Forest Classifier',"Naive Bayes",
          'SVM Classifier Linear','SVM Classifier RBF', 'LGBM Classifier',
          'XGBoost Classifier']

# Guard against the two lists drifting apart when a model is added or removed.
assert length == len(mods), "each model needs exactly one display name"

plt.style.use("dark_background")
fig = plt.figure(figsize=(12,16))
fig.set_facecolor("#F3F3F3")
for position,(model,model_name) in enumerate(zip(lst,mods),start = 1) :
    qx = plt.subplot(4,3,position)
    # Positive-class probability drives both the curve and the AUC, so the
    # number in the legend describes the curve that is actually drawn.
    positive_scores = model.predict_proba(test_X)[:,1]
    fpr,tpr,_ = roc_curve(test_Y,positive_scores)
    auc_value = roc_auc_score(test_Y,positive_scores)
    plt.plot(fpr,tpr,linestyle = "dotted",
             color = "royalblue",linewidth = 2,
             label = "AUC = " + str(np.around(auc_value,3)))
    # Diagonal reference line: the performance of a random classifier.
    plt.plot([0,1],[0,1],linestyle = "dashed",
             color = "orangered",linewidth = 1.5)
    # Without a legend the AUC label is never rendered.
    plt.legend(loc = "lower right",
               prop = {"size" : 10})
    plt.title(model_name,color = "b")
    plt.xlabel("false positive rate",fontsize =7)
    plt.ylabel("true positive rate",fontsize =7)

# 7.5 Precision recall curves

In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score


# Fitted models and their display names, in matching order.
lst    = [logit,logit_rus,decision_tree,knn,rfc,
          gnb,svc_lin,svc_rbf,lgbm_c,xgc]

length = len(lst)

mods   = ['Logistic Regression(Baseline_model)','Logistic Regression(Random Undersampling)',
          'Decision Tree','KNN Classifier','Random Forest Classifier',"Naive Bayes",
          'SVM Classifier Linear','SVM Classifier RBF', 'LGBM Classifier',
          'XGBoost Classifier']

# Guard against the two lists drifting apart when a model is added or removed.
assert length == len(mods), "each model needs exactly one display name"

fig = plt.figure(figsize=(13,17))
fig.set_facecolor("#F3F3F3")
for position,(model,model_name) in enumerate(zip(lst,mods),start = 1) :

    qx = plt.subplot(4,3,position)
    positive_scores = model.predict_proba(test_X)[:,1]
    # precision_recall_curve returns (precision, recall, thresholds) in that
    # order; unpacking it the other way round swaps the plotted axes.
    precision,recall,_ = precision_recall_curve(test_Y,positive_scores)
    # Average precision summarises the curve, so it is computed from the same
    # continuous scores rather than from hard class predictions.
    avg_precision = average_precision_score(test_Y,positive_scores)
    plt.plot(recall,precision,linewidth = 1.5,
             label = ("avg_pcn : " +
                      str(np.around(avg_precision,3))))
    plt.plot([0,1],[0,0],linestyle = "dashed")
    plt.fill_between(recall,precision,alpha = .2)
    plt.legend(loc = "lower left",
               prop = {"size" : 10})
    qx.set_facecolor("k")
    plt.grid(True,alpha = .15)
    plt.title(model_name,color = "b")
    plt.xlabel("recall",fontsize =7)
    plt.ylabel("precision",fontsize =7)
    plt.xlim([0.25,1])
    plt.yticks(np.arange(0,1,.3))
    

# 8 Lift Analysis 

https://stackoverflow.com/questions/42699243/how-to-build-a-lift-chart-a-k-a-gains-chart-in-python