In [1]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, classification_report

# where the churn dataset lives in S3
bucket_name = 'grant-gonnerman-data-445'
file_key = 'telecom_churn.csv'

# resolve the bucket, then the object inside it
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
bucket_object = bucket.Object(file_key)

# fetch the object and take the 'Body' entry of the response
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# parse the CSV straight from the response body and preview the first rows
churn_data = pd.read_csv(file_content_stream)
churn_data.head()

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0
1,0,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7
2,0,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2
3,0,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6
4,0,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1


In [4]:
# relative frequency of each Churn class (class count divided by number of rows)
churn_data['Churn'].value_counts().div(len(churn_data))

0    0.855086
1    0.144914
Name: Churn, dtype: float64

In [5]:
# defining the input and target variables
x = churn_data[['AccountWeeks', 'ContractRenewal', 'CustServCalls', 'MonthlyCharge', 'DayMins']]
y = churn_data['Churn']

# splitting the data: 80% train / 20% test, stratified on the target so both
# parts keep the same churn proportion as the full data.
# random_state fixes the shuffle so the split (and every number derived from
# it further down) is identical on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = 42)

In [6]:
# class proportions in the training part of the split
y_train.value_counts().div(len(y_train))

0    0.855214
1    0.144786
Name: Churn, dtype: float64

In [7]:
# class proportions in the test part of the split
y_test.value_counts().div(len(y_test))

0    0.854573
1    0.145427
Name: Churn, dtype: float64

Random Forest

In [10]:
# building the random forest model
# random_state pins the randomness used while growing the trees, so the fitted
# forest and the predicted likelihoods below are the same on every run
rf_md = RandomForestClassifier(n_estimators = 500, max_depth = 3, random_state = 42).fit(x_train, y_train)

# predicting on the test set: keep the second predict_proba column,
# i.e. the likelihood of Churn = 1
rf_pred = rf_md.predict_proba(x_test)[:,1]

# ROC curve points: false/true positive rate at each candidate threshold
# (this is the curve itself, not the AUC summary)
fpr, tpr, threshold = roc_curve(y_test, rf_pred)

In [11]:
# one row per candidate cutoff returned by roc_curve
cutoffs = pd.DataFrame(dict(fpr = fpr, tpr = tpr, threshold = threshold))
cutoffs

Unnamed: 0,fpr,tpr,threshold
0,0.000000,0.000000,1.719000
1,0.000000,0.010309,0.719000
2,0.000000,0.113402,0.603439
3,0.001754,0.113402,0.603233
4,0.001754,0.206186,0.585676
...,...,...,...
154,0.971930,1.000000,0.050853
155,0.980702,1.000000,0.050840
156,0.987719,1.000000,0.050750
157,0.991228,1.000000,0.050735


In [12]:
# Remove the ROC start point (row label 0: fpr = 0, tpr = 0), whose threshold
# is a placeholder above every predicted probability and is not a usable cutoff.
# Dropping by label with errors='ignore' keeps this cell idempotent: re-running
# it (even after the table has been sorted) never removes a real cutoff row.
cutoffs = cutoffs.drop(index = 0, errors = 'ignore')
cutoffs

Unnamed: 0,fpr,tpr,threshold
1,0.000000,0.010309,0.719000
2,0.000000,0.113402,0.603439
3,0.001754,0.113402,0.603233
4,0.001754,0.206186,0.585676
5,0.003509,0.206186,0.582446
...,...,...,...
154,0.971930,1.000000,0.050853
155,0.980702,1.000000,0.050840
156,0.987719,1.000000,0.050750
157,0.991228,1.000000,0.050735


In [14]:
# distance of each ROC point from the ideal corner (fpr = 0, tpr = 1)
cutoffs['Euclidean_dist'] = np.sqrt(
    cutoffs['fpr'] ** 2
    + (1 - cutoffs['tpr']) ** 2
)
cutoffs

Unnamed: 0,fpr,tpr,threshold,Euclidean_dist
1,0.000000,0.010309,0.719000,0.989691
2,0.000000,0.113402,0.603439,0.886598
3,0.001754,0.113402,0.603233,0.886600
4,0.001754,0.206186,0.585676,0.793816
5,0.003509,0.206186,0.582446,0.793822
...,...,...,...,...
154,0.971930,1.000000,0.050853,0.971930
155,0.980702,1.000000,0.050840,0.980702
156,0.987719,1.000000,0.050750,0.987719
157,0.991228,1.000000,0.050735,0.991228


In [15]:
# order the candidate cutoffs from closest to farthest from the ideal corner
cutoffs = cutoffs.sort_values('Euclidean_dist', ascending = True)
cutoffs

Unnamed: 0,fpr,tpr,threshold,Euclidean_dist
65,0.171930,0.824742,0.180677,0.245510
63,0.164912,0.814433,0.206923,0.248256
61,0.157895,0.804124,0.229031,0.251591
64,0.171930,0.814433,0.190506,0.252972
62,0.164912,0.804124,0.208991,0.256054
...,...,...,...,...
155,0.980702,1.000000,0.050840,0.980702
156,0.987719,1.000000,0.050750,0.987719
1,0.000000,0.010309,0.719000,0.989691
157,0.991228,1.000000,0.050735,0.991228


In [16]:
# changing likelihoods to labels
# Read the optimal cutoff from the table (the row with the smallest distance to
# the ideal ROC corner) instead of hard-coding a number copied from a previous
# run's display: a pasted value goes stale whenever the split or model changes,
# and its rounding can flip the label of the observation sitting on the cutoff.
best_cutoff = cutoffs.loc[cutoffs['Euclidean_dist'].idxmin(), 'threshold']
rf_labels = np.where(rf_pred < best_cutoff, 0 , 1)

print(classification_report(y_test, rf_labels))

              precision    recall  f1-score   support

           0       0.97      0.83      0.89       570
           1       0.45      0.82      0.58        97

    accuracy                           0.83       667
   macro avg       0.71      0.83      0.74       667
weighted avg       0.89      0.83      0.85       667

