In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

In [2]:
df=pd.read_csv("tel_churn.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,True,False,False,True,True,...,False,False,True,False,True,False,False,False,False,False
1,1,0,56.95,1889.5,0,False,True,True,False,True,...,False,False,False,True,False,False,True,False,False,False
2,2,0,53.85,108.15,1,False,True,True,False,True,...,False,False,False,True,True,False,False,False,False,False
3,3,0,42.3,1840.75,0,False,True,True,False,True,...,True,False,False,False,False,False,False,True,False,False
4,4,0,70.7,151.65,1,True,False,True,False,True,...,False,False,True,False,True,False,False,False,False,False


In [3]:
df=df.drop('Unnamed: 0',axis=1)

In [4]:
x=df.drop('Churn',axis=1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,True,False,False,True,True,False,True,...,False,False,True,False,True,False,False,False,False,False
1,0,56.95,1889.50,False,True,True,False,True,False,False,...,False,False,False,True,False,False,True,False,False,False
2,0,53.85,108.15,False,True,True,False,True,False,False,...,False,False,False,True,True,False,False,False,False,False
3,0,42.30,1840.75,False,True,True,False,True,False,True,...,True,False,False,False,False,False,False,True,False,False
4,0,70.70,151.65,True,False,True,False,True,False,False,...,False,False,True,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84.80,1990.50,False,True,False,True,False,True,False,...,False,False,False,True,False,True,False,False,False,False
7028,0,103.20,7362.90,True,False,False,True,False,True,False,...,False,True,False,False,False,False,False,False,False,True
7029,0,29.60,346.45,True,False,False,True,False,True,True,...,False,False,True,False,True,False,False,False,False,False
7030,1,74.40,306.60,False,True,False,True,True,False,False,...,False,False,False,True,True,False,False,False,False,False


In [5]:
y=df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

In [6]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

## Decision Tree Classifier

In [8]:
model_dt=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)


In [9]:
model_dt.fit(x_train,y_train)

In [10]:
y_pred=model_dt.predict(x_test)
y_pred

array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

In [11]:
model_dt.score(x_test,y_test)

0.7945984363894811

In [12]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1026
           1       0.67      0.48      0.56       381

    accuracy                           0.79      1407
   macro avg       0.75      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407



In [13]:
# the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.
#  we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.

In [15]:
from imblearn.combine import SMOTEENN

sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x, y)


In [16]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [17]:
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [18]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))

0.921280276816609
              precision    recall  f1-score   support

           0       0.91      0.92      0.92       540
           1       0.93      0.93      0.93       616

    accuracy                           0.92      1156
   macro avg       0.92      0.92      0.92      1156
weighted avg       0.92      0.92      0.92      1156



In [19]:
print(metrics.confusion_matrix(yr_test, yr_predict))

[[495  45]
 [ 46 570]]


In [20]:
#Now we can see quite better results, i.e. Accuracy: 92 %, and a very good recall, precision & f1 score for minority class.
#Let's try with some other classifier.

## Random Forest Classifier

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [23]:
model_rf.fit(x_train,y_train)

In [24]:
y_pred=model_rf.predict(x_test)

In [25]:
model_rf.score(x_test,y_test)

0.7995735607675906

In [26]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.82      0.94      0.87      1026
           1       0.72      0.43      0.54       381

    accuracy                           0.80      1407
   macro avg       0.77      0.68      0.70      1407
weighted avg       0.79      0.80      0.78      1407



In [30]:
from imblearn.combine import SMOTEENN

sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(x, y)


In [31]:
xr_train1,xr_test1,yr_train1,yr_test1=train_test_split(X_resampled1, y_resampled1,test_size=0.2)

In [32]:
model_rf_smote=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [33]:
model_rf_smote.fit(xr_train1,yr_train1)

In [34]:
yr_predict1 = model_rf_smote.predict(xr_test1)

In [35]:
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)

In [36]:
print(model_score_r1)
print(metrics.classification_report(yr_test1, yr_predict1))

0.9354561101549054
              precision    recall  f1-score   support

           0       0.96      0.90      0.93       540
           1       0.92      0.96      0.94       622

    accuracy                           0.94      1162
   macro avg       0.94      0.93      0.93      1162
weighted avg       0.94      0.94      0.94      1162



In [37]:
print(metrics.confusion_matrix(yr_test1, yr_predict1))

[[487  53]
 [ 22 600]]


In [38]:
# With RF Classifier, also we are able to get quite good results, infact better than Decision Tree.
# We can now further go ahead and create multiple classifiers to see how the model performance is, but that's not covered here, so you can do it by yourself :)


## Performing PCA

In [39]:
# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(0.9)
xr_train_pca = pca.fit_transform(xr_train1)
xr_test_pca = pca.transform(xr_test1)
explained_variance = pca.explained_variance_ratio_

In [40]:
model=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [41]:
model.fit(xr_train_pca,yr_train1)

In [42]:

yr_predict_pca = model.predict(xr_test_pca)

In [43]:

model_score_r_pca = model.score(xr_test_pca, yr_test1)

In [44]:
print(model_score_r_pca)
print(metrics.classification_report(yr_test1, yr_predict_pca))

0.7495697074010327
              precision    recall  f1-score   support

           0       0.78      0.65      0.71       540
           1       0.73      0.84      0.78       622

    accuracy                           0.75      1162
   macro avg       0.75      0.74      0.74      1162
weighted avg       0.75      0.75      0.75      1162



In [45]:
# With PCA, we couldn't see any better results, hence let's finalise the model which was created by RF Classifier, and save the model so that we can use it in a later stage :)

## Pickling the model

In [46]:

import pickle

In [47]:
filename = 'model.sav'

In [48]:
pickle.dump(model_rf_smote, open(filename, 'wb'))

In [49]:
load_model = pickle.load(open(filename, 'rb'))

In [50]:
model_score_r1 = load_model.score(xr_test1, yr_test1)

In [51]:
model_score_r1

0.9354561101549054