In [1]:
#Import libraries 

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, recall_score, plot_confusion_matrix, classification_report, f1_score, accuracy_score

from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE

from HelperFunctions import compare_models, transform_split_data

import warnings
warnings.filterwarnings('ignore')

In [2]:
#Import the CSV with clean data
# NOTE(review): the file name ends in "cata" - possibly a typo for "data";
# confirm it matches the file that actually exists under data/.
churn = pd.read_csv('data/clean_churn_cata.csv')

#Preview
churn.head()

Unnamed: 0,state,account_length,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,...,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,customer_service_calls,churn,total_calls,total_minutes,total_charge
0,KS,128,0,1,25,265.1,110,45.07,197.4,99,...,91,11.01,10.0,3,2.7,1,0,303,717.2,75.56
1,OH,107,0,1,26,161.6,123,27.47,195.5,103,...,103,11.45,13.7,3,3.7,1,0,332,625.2,59.24
2,NJ,137,0,0,0,243.4,114,41.38,121.2,110,...,104,7.32,12.2,5,3.29,0,0,333,539.4,62.29
3,OH,84,1,0,0,299.4,71,50.9,61.9,88,...,89,8.86,6.6,7,1.78,2,0,255,564.8,66.8
4,OK,75,1,0,0,166.7,113,28.34,148.3,122,...,121,8.41,10.1,3,2.73,3,0,359,512.0,52.09


### Pre-Processing: One-Hot Encoding the Categorical Features, Scaling

In [3]:
# One-hot encode the non-numeric columns (the `state_*` columns in the preview).
# drop_first=True drops one dummy level per encoded column so the remaining
# dummies are not perfectly collinear.
df = pd.get_dummies(churn, drop_first=True)
df.head()

Unnamed: 0,account_length,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,...,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY
0,128,0,1,25,265.1,110,45.07,197.4,99,16.78,...,0,0,0,0,0,0,0,0,0,0
1,107,0,1,26,161.6,123,27.47,195.5,103,16.62,...,0,0,0,0,0,0,0,0,0,0
2,137,0,0,0,243.4,114,41.38,121.2,110,10.3,...,0,0,0,0,0,0,0,0,0,0
3,84,1,0,0,299.4,71,50.9,61.9,88,5.26,...,0,0,0,0,0,0,0,0,0,0
4,75,1,0,0,166.7,113,28.34,148.3,122,12.61,...,0,0,0,0,0,0,0,0,0,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 71 columns):
account_length            3333 non-null int64
international_plan        3333 non-null int64
voice_mail_plan           3333 non-null int64
number_vmail_messages     3333 non-null int64
total_day_minutes         3333 non-null float64
total_day_calls           3333 non-null int64
total_day_charge          3333 non-null float64
total_eve_minutes         3333 non-null float64
total_eve_calls           3333 non-null int64
total_eve_charge          3333 non-null float64
total_night_minutes       3333 non-null float64
total_night_calls         3333 non-null int64
total_night_charge        3333 non-null float64
total_intl_minutes        3333 non-null float64
total_intl_calls          3333 non-null int64
total_intl_charge         3333 non-null float64
customer_service_calls    3333 non-null int64
churn                     3333 non-null int64
total_calls               3333 non-null int64
tot

In [5]:
# Split the data into train and test sets
# Features are every column except the target `churn`.
X = df.drop(['churn'], axis=1)
y = df['churn']
# Keep the feature names: the scaler below returns plain arrays without column labels.
x_columns = X.columns

# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [8]:
type(y_test)

pandas.core.series.Series

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
#Scaling with StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training split only, then reuse the same fitted
# transformation for the test split so no test-set statistics leak into training.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Wrap the scaled arrays back into DataFrames labelled with the original feature names.
# No index is passed, so both frames get a fresh default index rather than the
# row labels of X_train / X_test.
scaled_X_train = pd.DataFrame(X_train_scaled, columns = x_columns)
scaled_X_test = pd.DataFrame(X_test_scaled, columns = x_columns)

In [None]:
# Preview only the first rows instead of rendering the whole scaled training frame.
scaled_X_train.head()

### Balancing the Dataset using SMOTE

In [9]:
# Balance the training set with SMOTE.
# Oversampling is applied to the *scaled* training features so that the
# resampled data lives in the same feature space as `scaled_X_test`, which every
# model below is evaluated on. Only the training split is resampled; the test
# split keeps its original class distribution.
os = SMOTE(random_state=0)  # NOTE: this name would shadow the stdlib `os` module if it were imported

# `fit_resample` is the supported API; `fit_sample` was only an alias and has
# been removed from newer imbalanced-learn releases.
os_data_X, os_data_y = os.fit_resample(scaled_X_train, y_train)

# Keep explicit feature names / target name so the fitted models and
# `scaled_X_test` agree on columns regardless of what type SMOTE returned.
os_data_X = pd.DataFrame(data=os_data_X, columns=scaled_X_train.columns)
os_data_y = pd.Series(data=os_data_y, name='churn')

In [10]:
os_data_y

0       0
1       0
2       0
3       0
4       0
       ..
4277    1
4278    1
4279    1
4280    1
4281    1
Name: churn, Length: 4282, dtype: int64

### Logistic Regression

In [None]:
# Baseline: default logistic regression fit on the original training split
# (neither scaled nor oversampled) and evaluated on the matching unscaled test split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_hat = logreg.predict(X_test)
plot_confusion_matrix(logreg, X_test, y_test)
# Recall on the positive (churn == 1) class.
print(recall_score(y_test, y_hat))

In [None]:
print(classification_report(y_test, y_hat))

In [None]:
# Report the class balance after oversampling.
# `os_data_y` is a Series, so the class masks are built by comparing the Series
# itself; indexing it with ['churn'] would be a label lookup and raise KeyError.
n_oversampled = len(os_data_X)
n_loyal = int((os_data_y == 0).sum())
n_churn = int((os_data_y == 1).sum())

print("The length of oversampled data is ", n_oversampled)
print("Number of loyal customers in oversampled data", n_loyal)
print("Number of churn customers", n_churn)
print("Proportion of loyal customers in oversampled data is ", n_loyal / n_oversampled)
print("Proportion of churn customers in oversampled data is ", n_churn / n_oversampled)

In [None]:
# Logistic regression trained on the SMOTE-balanced training data.
# fit_intercept=False fits the model without a bias term; C is the inverse
# regularisation strength, so C=1e12 makes the penalty negligible.
log_reg = LogisticRegression(fit_intercept = False, C = 1e12)
log_reg.fit(os_data_X, os_data_y)

In [None]:
y_pred = log_reg.predict(scaled_X_test)

In [None]:
plot_confusion_matrix(log_reg, scaled_X_test, y_test)
print(recall_score(y_test, y_pred))

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
#def plot_conf_matrix(y_true, y_pred):
    
   # """
   # Plots a prettier confusion matrix than matplotlib.
   # """
    
    #cm = confusion_matrix(y_true, y_pred)
    #plt.figure(figsize=(10, 7))
    #sns.heatmap(cm, annot=True, cmap=sns.color_palette('Blues_d'), fmt='0.5g', annot_kws={"size": 16})
    #plt.xlabel('Predicted')
    #plt.ylabel('Actual')
    #plt.ylim([0,2])
    #plt.show()

### KNN: Vanilla Model

In [None]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(os_data_X, os_data_y)

In [None]:
y_predict = knn.predict(scaled_X_test)
plot_confusion_matrix(knn, scaled_X_test, y_test)

In [None]:
print("Here's how the KNN model performed with the TRAINING data: \n")
print(classification_report(os_data_y, knn.predict(os_data_X)))
print('-'*60)
print("Here's how the KNN model performed with the TESTING data: \n")
print(classification_report(y_test, knn.predict(scaled_X_test)))

### Bayes Classification Model

In [None]:
nb_model = GaussianNB()
nb_model.fit(os_data_X, os_data_y)

In [None]:
bayes_predict = nb_model.predict(scaled_X_test)

In [None]:
plot_confusion_matrix(nb_model, scaled_X_test, y_test)

In [None]:
print("Here's how the Naive Bayes model performed with the TRAINING data: \n")
print(classification_report(os_data_y, nb_model.predict(os_data_X)))
print('-'*60)
print("Here's how the Naive Bayes model performed with the TESTING data: \n")
print(classification_report(y_test, nb_model.predict(scaled_X_test)))

### Decision Tree

In [None]:
#Train the Model
# A fixed random_state makes the fitted tree (and therefore the reported scores)
# reproducible across kernel restarts, matching the seeded models elsewhere in the notebook.
clf = DecisionTreeClassifier(random_state=42)
clf = clf.fit(os_data_X, os_data_y)

#Make a prediction
clf_pred = clf.predict(scaled_X_test)

In [None]:
plot_confusion_matrix(clf, scaled_X_test, y_test)

In [None]:
print("Here's how the DecisionTree model performed with the TRAINING data: \n")
print(classification_report(os_data_y, clf.predict(os_data_X)))
print('-'*60)
print("Here's how the DecisionTree performed with the TESTING data: \n")
print(classification_report(y_test, clf.predict(scaled_X_test)))

### Ensemble Methods

##### Random Forest

In [None]:
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(os_data_X, os_data_y)

In [None]:
rf_pred = rf.predict(scaled_X_test)

In [None]:
plot_confusion_matrix(rf, scaled_X_test, y_test)

In [None]:
print("Here's how the Random Forest model performed with the TRAINING data: \n")
print(classification_report(os_data_y, rf.predict(os_data_X)))
print('-'*60)
print("Here's how the Random Forest performed with the TESTING data: \n")
print(classification_report(y_test, rf.predict(scaled_X_test)))

##### Bagging Classifier

In [None]:
# Bag 20 depth-limited decision trees. The random_state fixes the bootstrap
# resampling (and the seeds handed to the base trees) so results are reproducible.
bagged_tree = BaggingClassifier(DecisionTreeClassifier(criterion='gini', max_depth=5), n_estimators=20, random_state=42)
bagged_tree.fit(os_data_X, os_data_y)

In [None]:
bagged_pred = bagged_tree.predict(scaled_X_test)

In [None]:
plot_confusion_matrix(bagged_tree, scaled_X_test, y_test)

In [None]:
print("Here's how the Bagged Classifier performed with the TRAINING data: \n")
print(classification_report(os_data_y, bagged_tree.predict(os_data_X)))
print('-'*60)
print("Here's how the Bagged Classifier performed with the TESTING data: \n")
print(classification_report(y_test, bagged_tree.predict(scaled_X_test)))

##### Adaboost

In [None]:
adb = AdaBoostClassifier(random_state=42)
adb.fit(os_data_X, os_data_y)

In [None]:
plot_confusion_matrix(adb, scaled_X_test, y_test)

In [None]:
print("Here's how the AdaBoost performed with the TRAINING data: \n")
print(classification_report(os_data_y, adb.predict(os_data_X)))
print('-'*60)
print("Here's how the AdaBoost performed with the TESTING data: \n")
print(classification_report(y_test, adb.predict(scaled_X_test)))

##### Gradient Boosting

In [None]:
gb = GradientBoostingClassifier(random_state=42)
gb.fit(os_data_X, os_data_y)

In [None]:
plot_confusion_matrix(gb, scaled_X_test, y_test)

In [None]:
print("Here's how the Gradient Booster performed with the TRAINING data: \n")
print(classification_report(os_data_y, gb.predict(os_data_X)))
print('-'*60)
print("Here's how the Gradient Booster performed with the TESTING data: \n")
print(classification_report(y_test, gb.predict(scaled_X_test)))

##### XGBoost

In [None]:
# XGBClassifier is already imported in the notebook's import cell, so it is not re-imported here.
# Fit a default-parameter XGBoost classifier on the SMOTE-balanced training data.
xgb = XGBClassifier()
xgb.fit(os_data_X, os_data_y)

In [None]:
plot_confusion_matrix(xgb, scaled_X_test, y_test)

In [None]:
print("Here's how the XGBoost model performed with the TRAINING data: \n")
print(classification_report(os_data_y, xgb.predict(os_data_X)))
print('-'*60)
print("Here's how the XGBoost model performed with the TESTING data: \n")
print(classification_report(y_test, xgb.predict(scaled_X_test)))

### SVM

In [None]:
svc = SVC(C=1000)
svc.fit(os_data_X, os_data_y)

In [None]:
plot_confusion_matrix(svc, scaled_X_test, y_test)

In [None]:
print("Here's how the SVC model performed with the TRAINING data: \n")
print(classification_report(os_data_y, svc.predict(os_data_X)))
print('-'*60)
print("Here's how the SVC model performed with the TESTING data: \n")
print(classification_report(y_test, svc.predict(scaled_X_test)))

In [None]:
recall_score(y_test, svc.predict(scaled_X_test))

In [None]:
# Collect every classifier fitted in the sections above, keyed by display name.
# The random forest is included so the summary covers all trained models.
models = {"Logistic Regression" : log_reg,
          "K-NN" : knn,
          "Naive Bayes" : nb_model,
          "Decision Tree": clf,
          "Random Forest": rf,
          "Bagging Classifier" : bagged_tree,
          "Adaboost" : adb,
          "Gradient Boosting": gb,
          "XGBoost" : xgb,
          "SVM": svc}

models_list = list(models.keys())
recall_results = []
accuracy_results = []
f1_scores = []
for model in models.values():
    # Predict once per model and reuse the predictions for all three metrics.
    test_pred = model.predict(scaled_X_test)
    recall_results.append(recall_score(y_test, test_pred))
    accuracy_results.append(accuracy_score(y_test, test_pred))
    f1_scores.append(f1_score(y_test, test_pred))

In [None]:
zippedResults =  list(zip(models_list,recall_results,accuracy_results,f1_scores))

In [None]:
results_df = pd.DataFrame(zippedResults, columns = ['Model','Recall' , 'Accuracy', 'F1'])

In [None]:
display(results_df)

In [None]:
def compare_models(X_tr, X_tst, y_tr, y_tst):
    """Fit a fixed set of default classifiers and tabulate their test-set scores.

    NOTE(review): this definition rebinds the `compare_models` name imported
    from `HelperFunctions` in the import cell; confirm which version is meant
    to be used and drop the other.

    Parameters
    ----------
    X_tr, y_tr
        Training features and labels, passed to each model's ``fit``.
    X_tst, y_tst
        Test features and labels, used for the confusion matrix, the
        classification report and the summary scores.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns 'Model', 'Recall Score',
        'Accuracy(Test)' and 'F1 Score' (all computed on the test data).

    Side effects
    ------------
    For every model, prints its name and the train/test classification reports
    and shows a confusion-matrix plot for the test data.
    """
    # Name/estimator pairs kept together so the two can never drift out of sync.
    named_models = [
        ('Log Regression', LogisticRegression()),
        ('KNN', KNeighborsClassifier()),
        ('Decision Tree', DecisionTreeClassifier()),
        ('Random Forest', RandomForestClassifier()),
        ('AdaBoost', AdaBoostClassifier()),
        ('Gradient Boosting', GradientBoostingClassifier()),
        ('XGB', XGBClassifier()),
        ('SVM', SVC()),
    ]

    rows = []
    for name, model in named_models:
        clf = model.fit(X_tr, y_tr)
        # Predict on the test set once and reuse it for every metric and report.
        test_pred = clf.predict(X_tst)

        print("\n")
        print('-'*100)
        print(name)
        plot_confusion_matrix(clf, X_tst, y_tst)
        plt.show()

        print("Here's how the {} model performed with the TRAINING data: \n".format(name))
        print(classification_report(y_tr, clf.predict(X_tr)))
        print('-'*75)
        print("Here's how the {} model performed with the TESTING data: \n".format(name))
        print(classification_report(y_tst, test_pred))

        rows.append((name,
                     recall_score(y_tst, test_pred),
                     accuracy_score(y_tst, test_pred),
                     f1_score(y_tst, test_pred)))

    return pd.DataFrame(rows, columns=['Model', 'Recall Score', "Accuracy(Test)", "F1 Score"])

In [None]:
os_data_X.shape

In [None]:
os_data_y.shape

In [None]:
scaled_X_test.shape

In [None]:
y_test.shape

In [None]:
compare_models(os_data_X, scaled_X_test, os_data_y, y_test)