# Model Development Script

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
import pickle

In [2]:
# Loading the data
df = pd.read_csv("trainDataFromS3/churn_train.csv")

In [3]:
# Replacing empty spaces with Null values
df = df.replace(r'^\s+$', np.nan, regex=True)

In [4]:
# Dropping NA values
df = df.dropna()

In [5]:
# Deleting the custumerID column
del df["customerID"]
# Removing TotalCharges variable from the data
del df["TotalCharges"]
#Converting SeniorCitizen variable into categorical and mapping values of Yes & No to 1 & 0 respectively
df['SeniorCitizen'] = df.SeniorCitizen.map({0:'No', 1:'Yes'})

In [6]:
all_columns_list = df.columns.tolist()
numerical_columns_list = ['tenure','MonthlyCharges']
categorical_columns_list = [e for e in all_columns_list if e not in numerical_columns_list]
for index in categorical_columns_list:
    df[index] = pd.Categorical(df[index])
for index in numerical_columns_list:
    df[index] = pd.to_numeric(df[index])

In [7]:
# Splitting data according to datatypes
num = ['float64', 'int64']
num_df = df.select_dtypes(include=num)
obj_df = df.select_dtypes(exclude=num)

In [8]:
# Creating bins for numerical variables for extensive prediction of churn
tenure_bins = pd.cut(num_df["tenure"], bins=[0,20,60,80], labels=['low','medium','high'])
MonthlyCharges_bins = pd.cut(num_df["MonthlyCharges"], bins=[0,35,60,130], labels=['low','medium','high'])

In [9]:
# Saving numeric variable bins into a dataframe
bins = pd.DataFrame([tenure_bins, MonthlyCharges_bins]).T

In [10]:
# Concatenate bins with object variables
transformed_df = pd.concat([bins,obj_df],axis=1)

In [11]:
dummy_columns = [e for e in transformed_df.columns if e != 'Churn']

In [12]:
# Creating dataframe of dummy variables
df_dummies = pd.get_dummies(data=transformed_df, columns=dummy_columns)

In [13]:
df_dummies_features = df_dummies.drop(["Churn"], axis=1).columns
X_all = df_dummies[df_dummies_features]
y_all = df_dummies["Churn"]

In [14]:
# Change X and y to its values
X_boruta = X_all.values
y_boruta = y_all.values

# Define random forest classifier, with utilising all cores and sampling in proportion to y labels
rfc = RandomForestClassifier(n_jobs = -1)
 
# Define Boruta feature selection method
feature_selector = BorutaPy(rfc, n_estimators='auto', verbose=2, random_state=1)
 
# Find all relevant features
feature_selector.fit(X_boruta, y_boruta)

#Transposing dataframe for ranking
df_features_rank = df_dummies.drop(['Churn'],axis=1).T

# Check ranking of features
df_features_rank['Boruta_Rank'] = feature_selector.ranking_
df_features_rank

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	49
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	49
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	49
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	49
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	49
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	49
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	49
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	7
Tentative: 	6
Rejected: 	36


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	9 / 100
Confirmed: 	7
Tentative: 	6
Rejected: 	36


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	10 / 100
Confirmed: 	7
Tentative: 	6
Rejected: 	36


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	11 / 100
Confirmed: 	7
Tentative: 	6
Rejected: 	36


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	12 / 100
Confirmed: 	7
Tentative: 	5
Rejected: 	37


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	13 / 100
Confirmed: 	7
Tentative: 	5
Rejected: 	37


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	14 / 100
Confirmed: 	7
Tentative: 	5
Rejected: 	37


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	15 / 100
Confirmed: 	7
Tentative: 	5
Rejected: 	37


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	16 / 100
Confirmed: 	7
Tentative: 	4
Rejected: 	38


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	17 / 100
Confirmed: 	7
Tentative: 	4
Rejected: 	38


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	18 / 100
Confirmed: 	7
Tentative: 	4
Rejected: 	38


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	19 / 100
Confirmed: 	7
Tentative: 	1
Rejected: 	41


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	20 / 100
Confirmed: 	7
Tentative: 	1
Rejected: 	41


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	21 / 100
Confirmed: 	7
Tentative: 	1
Rejected: 	41


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	22 / 100
Confirmed: 	7
Tentative: 	1
Rejected: 	41


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	23 / 100
Confirmed: 	7
Tentative: 	1
Rejected: 	41


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	24 / 100
Confirmed: 	7
Tentative: 	1
Rejected: 	41


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	25 / 100
Confirmed: 	7
Tentative: 	1
Rejected: 	41


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	26 / 100
Confirmed: 	7
Tentative: 	1
Rejected: 	41


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	27 / 100
Confirmed: 	7
Tentative: 	1
Rejected: 	41


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Iteration: 	28 / 100
Confirmed: 	7
Tentative: 	1
Rejected: 	41
Iteration: 	29 / 100
Confirmed: 	7
Tentative: 	0
Rejected: 	42


BorutaPy finished running.

Iteration: 	30 / 100
Confirmed: 	7
Tentative: 	0
Rejected: 	42


  hits = np.where(cur_imp[0] > imp_sha_max)[0]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5625,5626,5627,5628,5629,5630,5631,5632,5633,Boruta_Rank
tenure_high,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7
tenure_low,0,1,0,1,0,1,1,0,1,0,...,0,0,0,0,1,0,0,1,0,1
tenure_medium,0,0,1,0,1,0,0,1,0,1,...,1,1,1,1,0,1,1,0,1,3
MonthlyCharges_high,0,1,1,0,0,0,0,0,1,1,...,1,0,1,1,1,0,0,1,0,31
MonthlyCharges_low,1,0,0,0,1,0,1,1,0,0,...,0,1,0,0,0,1,0,0,1,34
MonthlyCharges_medium,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,32
gender_Female,0,0,0,0,1,1,1,1,1,0,...,0,1,1,1,0,1,1,0,1,11
gender_Male,1,1,1,1,0,0,0,0,0,1,...,1,0,0,0,1,0,0,1,0,10
SeniorCitizen_No,1,1,1,1,0,1,1,1,1,1,...,1,1,1,0,1,1,1,1,1,20
SeniorCitizen_Yes,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,19


In [15]:
# Adding a variable 'Feature' in the dataframe
df_features_rank['Feature']=  df_features_rank.index

In [16]:
# Sort the dataframe as per Rank
df_features_rank = df_features_rank.sort_values('Boruta_Rank')

In [17]:
# Selecting important featutres
selected_features = df_features_rank.head(9).index

In [18]:
X_selected = df_dummies[selected_features]
y_selected = df_dummies["Churn"]

In [19]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_selected, y_selected, test_size=0.20, random_state=7)

In [20]:
scoring = 'accuracy'
models = []
models.append(('LR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('RF', RandomForestClassifier()))
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=7)
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.791064 (0.013238)
KNN: 0.756837 (0.022710)
CART: 0.788175 (0.008664)
RF: 0.782397 (0.010558)


In [21]:
# Make predictions on test dataset
models = []
accuracy_list = []
trained_models = {}
models.append(('LogisticRegression', LogisticRegression()))
models.append(('KNeighborsClassifier', KNeighborsClassifier()))
models.append(('DecisionTreeClassifier', DecisionTreeClassifier()))
models.append(('RandomForestClassifier', RandomForestClassifier()))

for name, model in models:
    model.fit(X_train, y_train)
    trained_models[name] = model
    predictions = model.predict(X_test)
    acc = accuracy_score(y_test, predictions)
    accuracy_list.append((name,acc))

In [22]:
#Creating a dataframe for the models metrics
models_metrics = pd.DataFrame(accuracy_list, columns=["Model", "Accuracy"]) 

In [23]:
models_metrics['Model_Rank'] = models_metrics['Accuracy'].rank(ascending=False, method='first')
models_metrics

Unnamed: 0,Model,Accuracy,Model_Rank
0,LogisticRegression,0.791111,2.0
1,KNeighborsClassifier,0.780444,4.0
2,DecisionTreeClassifier,0.792,1.0
3,RandomForestClassifier,0.790222,3.0


In [29]:
# Store the result into csv
models_metrics.to_csv('metrics_score.csv', index=False)

In [25]:
rank_dict = pd.Series(models_metrics.Model_Rank.values, index=models_metrics.Model.values).to_dict()

In [26]:
trained_models_with_rank = {}
for key, value in rank_dict.items():
    trained_models_with_rank[rank_dict[key]] = [value1 for key1, value1 in trained_models.items() if key == key1]
    trained_models_with_rank[rank_dict[key]].append(key)

In [27]:
trained_models_with_rank

{1.0: [DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, presort=False, random_state=None,
              splitter='best'), 'DecisionTreeClassifier'],
 2.0: [LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
            penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
            verbose=0, warm_start=False),
  'LogisticRegression'],
 3.0: [RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_

In [28]:
# Save the model to disk
filename = 'pickled_models.pkl'
pickle.dump(trained_models_with_rank, open(filename, 'wb'), protocol=2)