In [30]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

In [31]:
df=pd.read_csv("tel_churn.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,True,False,False,True,True,...,False,False,True,False,True,False,False,False,False,False
1,1,0,56.95,1889.5,0,False,True,True,False,True,...,False,False,False,True,False,False,True,False,False,False
2,2,0,53.85,108.15,1,False,True,True,False,True,...,False,False,False,True,True,False,False,False,False,False
3,3,0,42.3,1840.75,0,False,True,True,False,True,...,True,False,False,False,False,False,False,True,False,False
4,4,0,70.7,151.65,1,True,False,True,False,True,...,False,False,True,False,True,False,False,False,False,False


In [32]:
x=df.drop('Churn',axis=1)
x

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,True,False,False,True,True,False,...,False,False,True,False,True,False,False,False,False,False
1,1,0,56.95,1889.50,False,True,True,False,True,False,...,False,False,False,True,False,False,True,False,False,False
2,2,0,53.85,108.15,False,True,True,False,True,False,...,False,False,False,True,True,False,False,False,False,False
3,3,0,42.30,1840.75,False,True,True,False,True,False,...,True,False,False,False,False,False,False,True,False,False
4,4,0,70.70,151.65,True,False,True,False,True,False,...,False,False,True,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,7038,0,84.80,1990.50,False,True,False,True,False,True,...,False,False,False,True,False,True,False,False,False,False
7028,7039,0,103.20,7362.90,True,False,False,True,False,True,...,False,True,False,False,False,False,False,False,False,True
7029,7040,0,29.60,346.45,True,False,False,True,False,True,...,False,False,True,False,True,False,False,False,False,False
7030,7041,1,74.40,306.60,False,True,False,True,True,False,...,False,False,False,True,True,False,False,False,False,False


In [33]:
y=df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

In [34]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [35]:
model_dt=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [36]:
model_dt.fit(x_train,y_train) # "fit" ML method to train the models(The data will get trained after compiltion of fit)

In [37]:
y_pred=model_dt.predict(x_test)#the trained decision tree model (model_dt) to make predictions on a new dataset (x_test) that it hasn't seen before.
y_pred #y_pred, is a list of all the answers (predictions) the model came up with for the new problems in x_test

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

In [38]:
model_dt.score(x_test,y_test)#calculates the accuracy of your trained DecisionTreeClassifier model.

0.7860696517412935

In [39]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1030
           1       0.63      0.50      0.56       377

    accuracy                           0.79      1407
   macro avg       0.73      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407



##### As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.
##### Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.
##### Hence, moving ahead to call SMOTEENN (UpSampling + ENN)

In [40]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [41]:
from imblearn.combine import SMOTEENN

# Set random state for reproducibility
sm = SMOTEENN(random_state=42)
X_resampled, y_resampled = sm.fit_resample(x, y)

# Add random_state here too
xr_train, xr_test, yr_train, yr_test = train_test_split(
    X_resampled, y_resampled, 
    test_size=0.2, 
    random_state=42
)

model_dt_smote = DecisionTreeClassifier(
    criterion="gini", 
    random_state=100,  # Good - you already have this
    max_depth=6, 
    min_samples_leaf=8
)

model_dt_smote.fit(xr_train, yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)

print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))
print(metrics.confusion_matrix(yr_test, yr_predict))

0.9012096774193549
              precision    recall  f1-score   support

           0       0.90      0.89      0.89       452
           1       0.91      0.91      0.91       540

    accuracy                           0.90       992
   macro avg       0.90      0.90      0.90       992
weighted avg       0.90      0.90      0.90       992

[[401  51]
 [ 47 493]]


#### Random Forest Clasiifier 

In [42]:
from sklearn.ensemble import RandomForestClassifier

In [43]:
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8) # n_estimator(number of trees)

In [44]:
model_rf.fit(x_train,y_train)

In [45]:
RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)


In [46]:
y_pred=model_rf.predict(x_test)

In [47]:
model_rf.score(x_test,y_test)

0.7995735607675906

In [48]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.83      0.92      0.87      1030
           1       0.68      0.47      0.56       377

    accuracy                           0.80      1407
   macro avg       0.75      0.70      0.71      1407
weighted avg       0.79      0.80      0.79      1407



In [49]:
sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(x,y)

In [50]:
xr_train1,xr_test1,yr_train1,yr_test1=train_test_split(X_resampled1, y_resampled1,test_size=0.2)

In [51]:
model_rf_smote=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [52]:
model_rf_smote.fit(xr_train1,yr_train1)

In [53]:
yr_predict1 = model_rf_smote.predict(xr_test1)

In [54]:
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)

In [55]:
print(model_score_r1)
print(metrics.classification_report(yr_test1, yr_predict1))

0.8918650793650794
              precision    recall  f1-score   support

           0       0.88      0.86      0.87       421
           1       0.90      0.91      0.91       587

    accuracy                           0.89      1008
   macro avg       0.89      0.89      0.89      1008
weighted avg       0.89      0.89      0.89      1008



In [56]:
print(metrics.confusion_matrix(yr_test1, yr_predict1))


[[364  57]
 [ 52 535]]


##### With RF Classifier, also we are able to get quite good results, infact better than Decision Tree.
##### We can now further go ahead and create multiple classifiers to see how the model performance is, but that's not covered here, so you can do it by yourself :)

#### Pickel the Model

In [57]:
import pickle # save the file and reuse it whenever we want
filename = 'model.sav'
pickle.dump(model_rf_smote, open(filename, 'wb')) # wb-opens a write binary mode and saves the model into that file
load_model = pickle.load(open(filename, 'rb')) # rb-opens read binary mode and loads the model from that file into python
model_score_r1 = load_model.score(xr_test1, yr_test1) # calulates the accuracy of model
model_score_r1

0.8918650793650794