In [1]:
import boto3
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import roc_curve, classification_report

## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'craig-shaffer-data-445-bucket'
bucket = s3.Bucket(bucket_name)

## Defining the file to be read from s3 bucket
file_key = 'telecom_churn.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# reading the datafile
churn_data = pd.read_csv(file_content_stream)
churn_data.head()

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0
1,0,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7
2,0,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2
3,0,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6
4,0,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1


In [2]:
#creating the frequency table of churn
churn_data['Churn'].value_counts()/churn_data.shape[0]

0    0.855086
1    0.144914
Name: Churn, dtype: float64

In [3]:
#defining the input and target variables
x=churn_data[['AccountWeeks','ContractRenewal','CustServCalls','MonthlyCharge','DayMins']]
y=churn_data['Churn']

#splitting the data
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = .2,stratify=y)

In [4]:
y_train.value_counts()/y_train.shape[0]

0    0.855214
1    0.144786
Name: Churn, dtype: float64

In [6]:
y_test.value_counts()/y_test.shape[0]

0    0.854573
1    0.145427
Name: Churn, dtype: float64

**Random Forest**

In [7]:
Rf_md=RandomForestClassifier(n_estimators=500,max_depth=3).fit(x_train,y_train)

#predicting on test
Rf_pred=Rf_md.predict_proba(x_test)[:,1]

#ROC_AUC
fpr,tpr,threshold=roc_curve(y_test,Rf_pred)

In [12]:
cutoffs=pd.DataFrame({'fpr':fpr,'tpr':tpr,'threshold':threshold})
cutoffs

Unnamed: 0,fpr,tpr,threshold
0,0.000000,0.000000,1.649393
1,0.000000,0.010309,0.649393
2,0.000000,0.154639,0.591806
3,0.005263,0.154639,0.590321
4,0.005263,0.164948,0.590291
...,...,...,...
136,0.984211,1.000000,0.055339
137,0.991228,1.000000,0.055336
138,0.992982,1.000000,0.055334
139,0.996491,1.000000,0.055292


In [13]:
cutoffs=cutoffs.drop(cutoffs.index[0],axis=0)
cutoffs

Unnamed: 0,fpr,tpr,threshold
1,0.000000,0.010309,0.649393
2,0.000000,0.154639,0.591806
3,0.005263,0.154639,0.590321
4,0.005263,0.164948,0.590291
5,0.008772,0.164948,0.586535
...,...,...,...
136,0.984211,1.000000,0.055339
137,0.991228,1.000000,0.055336
138,0.992982,1.000000,0.055334
139,0.996491,1.000000,0.055292


In [14]:
cutoffs['Euclidean_dist']=np.sqrt(cutoffs['fpr']**2 + (1-(cutoffs['tpr']**2)))
cutoffs=cutoffs.sort_values(by='Euclidean_dist')
cutoffs             

Unnamed: 0,fpr,tpr,threshold,Euclidean_dist
70,0.182456,0.907216,0.124396,0.458529
68,0.178947,0.896907,0.128526,0.477053
69,0.182456,0.896907,0.126811,0.478380
66,0.161404,0.886598,0.156475,0.489893
67,0.178947,0.886598,0.129264,0.495950
...,...,...,...,...
137,0.991228,1.000000,0.055336,0.991228
138,0.992982,1.000000,0.055334,0.992982
139,0.996491,1.000000,0.055292,0.996491
1,0.000000,0.010309,0.649393,0.999947


In [16]:
#changing likelihoods to labels
Rf_labels=np.where(Rf_pred<0.124396,0,1)

print(classification_report(y_test,Rf_labels))

              precision    recall  f1-score   support

           0       0.98      0.82      0.89       570
           1       0.46      0.90      0.60        97

    accuracy                           0.83       667
   macro avg       0.72      0.86      0.75       667
weighted avg       0.90      0.83      0.85       667

