In [1]:
import sklearn
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
import pickle

np.random.seed(42)

#Set the rc params for axes, xtick, and ytick
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

#Finally, we will import warnings and ensure that the non-essential warnings are ignored
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

#Load the dataset
churn = pd.read_csv('churn_train.csv')

#Drop basic attributes like State, Area code
churn = churn.drop(['State','Area code'], axis=1)

'''
for column in churn.select_dtypes(include=["object"]).columns:
    if column != "Churn":
        display(pd.crosstab(index=churn[column], columns=churn["Churn"], normalize="columns"))

for column in churn.select_dtypes(exclude=["object"]).columns:
    print(column)
    hist = churn[[column, "Churn"]].hist(by="Churn", bins=30)
    plt.show()
  
display(churn.corr(numeric_only=True))
pd.plotting.scatter_matrix(churn, figsize=(12, 12))
plt.show()

df['color'] = df['color'].astype('category')
df['color_encoded'] = df['color'].cat.codes
'''

#Let us convert categorical fields into numerics - Voice mail plan, International plan, Churn
churn['Voicemail_plan_upd'] = churn['Voice mail plan'].astype('category').cat.codes
churn['International_plan_upd'] = churn['International plan'].astype('category').cat.codes
churn['Churn_upd'] = churn['Churn'].astype('category').cat.codes

# Run the Correlation matrix of each feature against the label and Identify what attributes are unnecessary
#print(churn.corr())

#Going by the Correlation matrix, Total day charge, Total eve charge, Total night charge are just repetitive. Hence dropping them off
churn = churn.drop(['Voice mail plan','International plan','Churn','Total day charge','Total eve charge','Total night charge'],axis=1)

'''
from scipy.stats import pearsonr

l1=churn['Voice mail plan']
l2=churn['Churn']

corr, _ = pearsonr(l1, l2)
print('Pearsons correlation: %.3f' % corr)
'''

#Checking the data scatter, nulls & other metrics of data
#churn.describe()

# Store the labels in a variable
churn_label = churn['Churn_upd'].copy()

#Drop Churn_upd from dataframe
churn = churn.drop(['Churn_upd'],axis=1)

#Split the data into Train, Validation & Test data sets
#First split the data into Train & Test. Next split the Train data into Train & Validation

X_train, X_test, y_train, y_test = train_test_split(churn, churn_label, test_size=0.4, random_state=42)

#Start evaluating different ML models for best performance



In [2]:
#Start with Logistic Regression

lr = LogisticRegression()

# Fit the model on the training data
lr.fit(X_train, y_train)

# Make predictions on the testing data
lr_y_pred = lr.predict(X_test)

# Evaluate the model
lr_accuracy = accuracy_score(y_test, lr_y_pred)
print("Accuracy:", lr_accuracy)

#AUC Score for Logistic Regression
#First, we need to use predic_prob method

lr_y_pred_prob = lr.predict_proba(X_test)

lr_auc_score = roc_auc_score(y_test, lr_y_pred_prob[:,1])
print("Logistic Regression AUC Score is",lr_auc_score)

#Steps to save the LR model
#lr_model = pickle.dumps(lr)

lr_model_pkl_file = "lr_model_pkl_file.pkl"  

with open(lr_model_pkl_file, 'wb') as file:  
    pickle.dump(lr, file)


Accuracy: 0.8375
Logistic Regression AUC Score is 0.7168666294642857


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [3]:
from sklearn.linear_model import SGDClassifier
import numpy as np

# Using SGD Classifier algorithm
sgd_clf = SGDClassifier(random_state=42, max_iter=10)

#Train the classifier
sgd_clf.fit(X_train, y_train)

#Predict the Test data
y_pred_sgd = sgd_clf.predict(X_test.values)
#y_pred_sgd = sgd_clf.predict([X_test.iloc[251]])

#print(y_pred_sgd)

#Predict Probability
#y_pred_prob_sgd = sgd_clf.predict_proba(X_test.values)

# Evaluate the model
sgd_accuracy = accuracy_score(y_test, y_pred_sgd)
print("SGD Accuracy:", sgd_accuracy)

sgd_auc_score = roc_auc_score(y_test, y_pred_sgd)
print("SGD Predict AUC Score is",sgd_auc_score)

#sgd_auc_score_prob = roc_auc_score(y_test, y_pred_prob_sgd[:,1])
#print("SGD Predict Probability AUC Score is",sgd_auc_score_prob)

#steps to Save the model
sgd_model_pkl_file = "sgd_model_pkl_file.pkl"  

with open(sgd_model_pkl_file, 'wb') as file:  
    pickle.dump(sgd_clf, file)

SGD Accuracy: 0.84
SGD Predict AUC Score is 0.5




In [4]:
#DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

#Now let's train the DecisionTreeRegressor
dt_clf = DecisionTreeClassifier(random_state=42,max_depth=3)
dt_clf.fit(X_train, y_train)

#Now let's predict using our model using the predict method
y_pred_dt = dt_clf.predict(X_test)

#cv_score = cross_val_score(dt_clf, X_train, y_train, cv=10)

#print(cv_score)

# Evaluate the model
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print("DT Accuracy:", dt_accuracy)

dt_auc_score = roc_auc_score(y_test, y_pred_dt)
print("DT Predict AUC Score is",dt_auc_score)


#steps to Save the model
dt_model_pkl_file = "dt_model_pkl_file.pkl"  

with open(dt_model_pkl_file, 'wb') as file:  
    pickle.dump(dt_clf, file)


DT Accuracy: 0.85875
DT Predict AUC Score is 0.5617559523809523


In [5]:
#RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

#Now let's train the DecisionTreeRegressor
rf_clf = RandomForestClassifier(random_state=42, max_depth=10)
rf_clf.fit(X_train, y_train)

#Now let's predict using our model using the predict method
y_pred_rf = rf_clf.predict(X_test)

# Evaluate the model
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("RF Accuracy:", rf_accuracy)

rf_auc_score = roc_auc_score(y_test, y_pred_rf)
print("RF Predict AUC Score is",rf_auc_score)

#Steps to save the model
rf_model_pkl_file = "rf_model_pkl_file.pkl"  

with open(rf_model_pkl_file, 'wb') as file:  
    pickle.dump(rf_clf, file)


RF Accuracy: 0.91
RF Predict AUC Score is 0.7219122023809523


In [6]:
#XGBoost
import xgboost as xgb

# Create an XGBoost classifier
xgb_pretrain_clf = xgb.XGBClassifier(
    objective='binary:logistic',  # For Binary-class classification
    learning_rate=0.1,
    max_depth=3,
    n_estimators=50,
    eval_metric="auc"
       
)

#eval_metric=["auc", "error", "error@0.6"]

xgb_pretrain_clf.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_test, y_test)])

y_pred = xgb_pretrain_clf.predict(X_test)

xgb_pretrain_accuracy = accuracy_score(y_test, y_pred)
print("XGB Accuracy is", xgb_pretrain_accuracy)

#Steps to save the model
xgb_model_PreTrain_pkl_file = "xgb_model_PreTrain_pkl_file.pkl"  

with open(xgb_model_PreTrain_pkl_file, 'wb') as file:  
    pickle.dump(xgb_pretrain_clf, file)


[0]	validation_0-auc:0.847726
Will train until validation_0-auc hasn't improved in 5 rounds.
[1]	validation_0-auc:0.863758
[2]	validation_0-auc:0.868908
[3]	validation_0-auc:0.870582
[4]	validation_0-auc:0.872332
[5]	validation_0-auc:0.876529
[6]	validation_0-auc:0.879668
[7]	validation_0-auc:0.880226
[8]	validation_0-auc:0.878738
[9]	validation_0-auc:0.878476
[10]	validation_0-auc:0.882394
[11]	validation_0-auc:0.88344
[12]	validation_0-auc:0.883859
[13]	validation_0-auc:0.887381
[14]	validation_0-auc:0.885905
[15]	validation_0-auc:0.893125
[16]	validation_0-auc:0.893683
[17]	validation_0-auc:0.894432
[18]	validation_0-auc:0.894816
[19]	validation_0-auc:0.894525
[20]	validation_0-auc:0.903378
[21]	validation_0-auc:0.903367
[22]	validation_0-auc:0.904053
[23]	validation_0-auc:0.904925
[24]	validation_0-auc:0.904431
[25]	validation_0-auc:0.905611
[26]	validation_0-auc:0.910987
[27]	validation_0-auc:0.910238
[28]	validation_0-auc:0.909575
[29]	validation_0-auc:0.908035
[30]	validation_0-

In [7]:
#XGBoost
#Since XGBoost's base model performance was better than the rest, tune the XGB model to find the best set of hyperparameters
#Find the best set of hyperparameters for XGBoost

import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.5, 0.7, 1]
    }

# Create an XGBoost classifier
xgb_hyper_clf = xgb.XGBClassifier(objective='binary:logistic',  # For Binary-class classification
    n_estimators=50, eval_metric="auc")

#eval_metric=["auc", "error", "error@0.6"]

# Create the GridSearchCV object
grid_search = GridSearchCV(xgb_hyper_clf, param_grid, cv=5, scoring='accuracy')

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Print the best set of hyperparameters and the corresponding score
print("Best set of hyperparameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Best set of hyperparameters:  {'learning_rate': 0.1, 'max_depth': 5, 'subsample': 0.5}
Best score:  0.9516213389121339


In [8]:
#XGBoost
#Apply the best hyperparameters from above and use it for train & predict
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Create an XGBoost classifier
xgb_clf = xgb.XGBClassifier(objective='binary:logistic',  # For Binary-class classification
    learning_rate= 0.1, max_depth =5, subsample=0.5, n_estimators=50, eval_metric="auc")

# Fit the GridSearchCV object to the training data
xgb_clf.fit(X_train, y_train)

# Predict the data for X_test
y_pred_xgb = xgb_clf.predict(X_test)

xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
print("XGB Accuracy is", xgb_accuracy)

#Steps to save the model
xgb_model_pkl_file = "xgb_model_pkl_file.pkl"  

with open(xgb_model_pkl_file, 'wb') as file:  
    pickle.dump(xgb_clf, file)


XGB Accuracy is 0.94125
