In [2]:
import boto3
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, classification_report

# Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'gabriel-predictive-analytics'
bucket = s3.Bucket(bucket_name)

# Defining the file to be read from s3 bucket
file_key = "telecom_churn.csv"

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# Reading the csv file
churn = pd.read_csv(file_content_stream)
churn.head(1)

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0


In [3]:
# Relative Frequency table
churn['Churn'].value_counts() / churn.shape[0]

0    0.855086
1    0.144914
Name: Churn, dtype: float64

In [4]:
# Defining the input and target variables
X = churn[['AccountWeeks', 'ContractRenewal', 'CustServCalls', 'MonthlyCharge', 'DayMins']]
Y = churn['Churn']

# Splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y)

In [5]:
RF_md = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_train, Y_train)

# Predicting on test dataset
RF_pred = RF_md.predict_proba(X_test)[:,1]

# Computing the ROC curve
fpr, tpr, threshold = roc_curve(Y_test, RF_pred)

# Creating a data-frame
cutoffs = pd.DataFrame({'False_Positive': fpr, 'True_Positive': tpr, 'Cutoff': threshold})
cutoffs.head()

Unnamed: 0,False_Positive,True_Positive,Cutoff
0,0.0,0.0,1.693042
1,0.0,0.010309,0.693042
2,0.0,0.226804,0.57232
3,0.001754,0.226804,0.57229
4,0.001754,0.28866,0.551659


In [6]:
# Finding the optimal cut-off
cutoffs['True_Positive_minus_1'] = cutoffs['True_Positive'] - 1
cutoffs['Euclidean_dist'] = np.sqrt(cutoffs['False_Positive']**2 + cutoffs['True_Positive_minus_1']**2)

## Sorting based on the Euclidean distance
cutoffs = cutoffs.sort_values(by = 'Euclidean_dist').reset_index(drop = True)

## Changing likelihoods to labels
RF_pred = np.where(RF_pred < cutoffs['Cutoff'][0], 0, 1)

# Printing classification report
print(classification_report(Y_test, RF_pred))

              precision    recall  f1-score   support

           0       0.98      0.84      0.90       570
           1       0.49      0.90      0.63        97

    accuracy                           0.85       667
   macro avg       0.73      0.87      0.77       667
weighted avg       0.91      0.85      0.86       667



In [8]:
#### AdaBoost Classifier
Ada_md = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), n_estimators = 500, learning_rate = 0.01).fit(X_train, Y_train)
    
# Predicting on test
ADA_pred = Ada_md.predict_proba(X_test)[:,1]

# Computing the ROC curve
fpr, tpr, threshold = roc_curve(Y_test, ADA_pred)

# Creating a data-frame
cutoffs_ADA = pd.DataFrame({'False_Positive': fpr, 'True_Positive': tpr, 'Cutoff': threshold})

# Finding the optimal cut-off
cutoffs_ADA['True_Positive_minus_1'] = cutoffs_ADA['True_Positive'] - 1
cutoffs_ADA['Euclidean_dist'] = np.sqrt(cutoffs_ADA['False_Positive']**2 + cutoffs_ADA['True_Positive_minus_1']**2)

## Sorting based on the Euclidean distance
cutoffs_ADA = cutoffs_ADA.sort_values(by = 'Euclidean_dist').reset_index(drop = True)


## Changing likelihoods to labels
ADA_pred = np.where(ADA_pred < cutoffs_ADA['Cutoff'][0], 0, 1)

# Printing classification report
print(classification_report(Y_test, ADA_pred))

              precision    recall  f1-score   support

           0       0.97      0.87      0.91       570
           1       0.52      0.82      0.63        97

    accuracy                           0.86       667
   macro avg       0.74      0.85      0.77       667
weighted avg       0.90      0.86      0.87       667



In [None]:
# I would choose the RandomForestClassifier model because it had a higher recall socore in the observtion 1.