In [32]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

In [3]:
df = pd.read_csv('tel_churn.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,seniorcitizen,monthlycharges,totalcharges,churn,gender_Female,gender_Male,partner_No,partner_Yes,dependents_No,...,deviceprotection_Yes,techsupport_No,techsupport_No internet service,techsupport_Yes,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,65.6,593.3,0,True,False,False,True,False,...,False,False,False,True,True,False,False,False,False,False
1,1,0,59.9,542.4,0,False,True,True,False,True,...,False,True,False,False,True,False,False,False,False,False
2,2,0,73.9,280.85,1,False,True,True,False,True,...,True,True,False,False,True,False,False,False,False,False
3,3,1,98.0,1237.85,1,False,True,False,True,True,...,True,True,False,False,False,True,False,False,False,False
4,4,1,83.9,267.4,1,True,False,False,True,True,...,False,False,False,True,True,False,False,False,False,False


In [5]:
df  = df.drop("Unnamed: 0", axis=1)

In [7]:
# We create our feature columns
x = df.drop('churn', axis=1)
x.head()

Unnamed: 0,seniorcitizen,monthlycharges,totalcharges,gender_Female,gender_Male,partner_No,partner_Yes,dependents_No,dependents_Yes,contract_Month-to-month,...,deviceprotection_Yes,techsupport_No,techsupport_No internet service,techsupport_Yes,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,65.6,593.3,True,False,False,True,False,True,False,...,False,False,False,True,True,False,False,False,False,False
1,0,59.9,542.4,False,True,True,False,True,False,True,...,False,True,False,False,True,False,False,False,False,False
2,0,73.9,280.85,False,True,True,False,True,False,True,...,True,True,False,False,True,False,False,False,False,False
3,1,98.0,1237.85,False,True,False,True,True,False,True,...,True,True,False,False,False,True,False,False,False,False
4,1,83.9,267.4,True,False,False,True,True,False,True,...,False,False,False,True,True,False,False,False,False,False


In [13]:
# create our target column
y = df['churn']
y.head()

0    0
1    0
2    1
3    1
4    1
Name: churn, dtype: int64

In [17]:
# splitting data set 
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [18]:
# create our model
model_dt=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [19]:
# fit our model
model_dt.fit(x_train, y_train)

In [21]:
# predict the model
y_pred = model_dt.predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [22]:
# check our model accuracy
model_dt.score(x_test, y_test)

0.7910447761194029

In [23]:
print(classification_report(y_test, y_pred, labels=[0, 1]))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1039
           1       0.63      0.48      0.54       368

    accuracy                           0.79      1407
   macro avg       0.73      0.69      0.70      1407
weighted avg       0.78      0.79      0.78      1407


In [33]:
# our accuracy is low so we will use SMOTETomek to balance our data
smt = SMOTEENN()
X_resampled, y_resampled = smt.fit_resample(x, y)

In [34]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled,y_resampled,test_size=0.2)

In [35]:
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [36]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))

0.9278350515463918
              precision    recall  f1-score   support

           0       0.93      0.90      0.92       520
           1       0.92      0.95      0.94       644

    accuracy                           0.93      1164
   macro avg       0.93      0.93      0.93      1164
weighted avg       0.93      0.93      0.93      1164


In [37]:
print(metrics.confusion_matrix(yr_test, yr_predict))

[[469  51]
 [ 33 611]]


In [38]:
from sklearn.ensemble import RandomForestClassifier
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [42]:
model_rf.fit(x_train,y_train)

In [43]:
y_pred=model_rf.predict(x_test)

In [44]:
model_rf.score(x_test,y_test)

0.8038379530916845

In [45]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.84      0.91      0.87      1039
           1       0.66      0.51      0.57       368

    accuracy                           0.80      1407
   macro avg       0.75      0.71      0.72      1407
weighted avg       0.79      0.80      0.79      1407


In [48]:
sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(x, y)

In [49]:
xr_train1,xr_test1,yr_train1,yr_test1=train_test_split(X_resampled1, y_resampled1,test_size=0.2)

In [51]:
model_rf_smote=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [52]:
model_rf_smote.fit(xr_train1,yr_train1)

In [53]:
yr_predict1 = model_rf_smote.predict(xr_test1)

In [55]:
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)
print(model_score_r1)

0.9269328802039083


In [56]:
print(confusion_matrix(yr_test1, yr_predict1))

[[470  58]
 [ 28 621]]


In [58]:
from sklearn.decomposition import PCA
pca = PCA(0.9)
xr_train_pca = pca.fit_transform(xr_train1)
xr_test_pca = pca.transform(xr_test1)
explained_variance = pca.explained_variance_ratio_

In [59]:
model=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [60]:
model.fit(xr_train_pca,yr_train1)

In [61]:
RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [62]:
yr_predict_pca = model.predict(xr_test_pca)

In [63]:
model_score_r_pca = model.score(xr_test_pca, yr_test1)

In [64]:
print(model_score_r_pca)
print(metrics.classification_report(yr_test1, yr_predict_pca))

0.7204757858963466
              precision    recall  f1-score   support

           0       0.72      0.62      0.66       528
           1       0.72      0.80      0.76       649

    accuracy                           0.72      1177
   macro avg       0.72      0.71      0.71      1177
weighted avg       0.72      0.72      0.72      1177


In [65]:
import pickle

In [67]:
filename = "model.sav"

In [68]:
pickle.dump(model_rf_smote, open(filename, 'wb'))

In [69]:
loaded_model = pickle.load(open(filename, 'rb'))

In [71]:
model_score_r1 = loaded_model.score(xr_test1, yr_test1)

In [72]:
model_score_r1

0.9269328802039083