# Model Building
### Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.combine import SMOTEENN #Used to Handle the IMBALANCED Data-Set

In [2]:
df = pd.read_csv('tel_churn.csv') #Read the telecom data file || That was converted to Dummies = For Model Building

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56.95,1889.5,0,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53.85,108.15,1,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0
3,3,0,42.3,1840.75,0,0,1,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,4,0,70.7,151.65,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,0


In [4]:
df = df.drop('Unnamed: 0', axis=1)

In [5]:
df.head()

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,0,1,0,0,1,1,0,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.5,0,0,1,1,0,1,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,1,0,1,1,0,1,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.3,1840.75,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,1,0,0
4,0,70.7,151.65,1,1,0,1,0,1,0,...,0,0,1,0,1,0,0,0,0,0


In [6]:
#Creating X, & Y variables

x=df.drop('Churn',axis=1)
x.head()

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.5,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.3,1840.75,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70.7,151.65,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0


In [7]:
y=df['Churn']
y.head()

0    0
1    0
2    1
3    0
4    1
Name: Churn, dtype: int64

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

### Decision Tree Classifier

In [9]:
model_dt = DecisionTreeClassifier(criterion='gini', random_state=100, max_depth=6, min_samples_leaf=8)

In [10]:
model_dt.fit(x_train, y_train)

DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [11]:
#Predictions
y_pred = model_dt.predict(x_test)
y_pred

array([0, 0, 1, ..., 0, 0, 0], dtype=int64)

In [12]:
#Model Evaluation
print(confusion_matrix(y_test,y_pred))
print('\n')
print(classification_report(y_test,y_pred))

[[911 114]
 [189 193]]


              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1025
           1       0.63      0.51      0.56       382

    accuracy                           0.78      1407
   macro avg       0.73      0.70      0.71      1407
weighted avg       0.77      0.78      0.78      1407



* Observation: Accuracy is Cursed, as data is Imbalanced
* Resolution: Use SMOTE Analysis using SMOTEENN
### SMOTE+ENN (Imbalanced Data Handling) - Decision Tree based
* To improve the model accuracy

In [13]:
sm = SMOTEENN()

In [14]:
X_resampled, y_resampled = sm.fit_resample(x,y) #Did Over sampling using SMOTE and Data Cleaning using ENN
#Handled the Imbalanced Dataset

In [15]:
xr_train, xr_test, yr_train, yr_test = train_test_split(X_resampled, y_resampled, test_size=0.2)

In [16]:
model_dt_r = DecisionTreeClassifier(criterion='gini', random_state=100, max_depth=6, min_samples_leaf=8)

In [17]:
model_dt_r.fit(xr_train, yr_train)

DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [18]:
#Prediction of resampled data
y_pred_smote = model_dt_r.predict(xr_test)
y_pred_smote

array([1, 0, 0, ..., 1, 1, 1], dtype=int64)

In [19]:
#Model Evaluation
print(confusion_matrix(yr_test,y_pred_smote))
print('\n')
print(classification_report(yr_test,y_pred_smote))

[[495  51]
 [ 20 606]]


              precision    recall  f1-score   support

           0       0.96      0.91      0.93       546
           1       0.92      0.97      0.94       626

    accuracy                           0.94      1172
   macro avg       0.94      0.94      0.94      1172
weighted avg       0.94      0.94      0.94      1172



##### Conclusion: Decision Tree Classifier provides || Accuracy : 93.75%
* Accuracy using Confusion Matrix =  Sum of (TP+TN)/ Sum (TP+TN+FP+FN)

### Random Forest Classifier

In [20]:
model_rf = RandomForestClassifier(n_estimators=100, criterion='gini', random_state=100, max_depth=6, min_samples_leaf=8)

In [21]:
#Model Fitting || Resampled Features
model_rf.fit(xr_train, yr_train)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [22]:
#Predictions || Use resampled x test feature
y_pred_rf = model_rf.predict(x_test)

In [23]:
print(confusion_matrix(y_test, y_pred_rf))
print('\n')
print(classification_report(y_test, y_pred_rf))

[[712 313]
 [ 62 320]]


              precision    recall  f1-score   support

           0       0.92      0.69      0.79      1025
           1       0.51      0.84      0.63       382

    accuracy                           0.73      1407
   macro avg       0.71      0.77      0.71      1407
weighted avg       0.81      0.73      0.75      1407



* Observation: Accuracy is Cursed, as data is Imbalanced
* Resolution: Use SMOTE Analysis using SMOTEENN
### SMOTE+ENN (Imbalanced Data Handling) - Random Forest based
* To improve the model accuracy

In [24]:
sm_rf = SMOTEENN()

In [25]:
#Resample the variables
X_resampled, y_resampled = sm_rf.fit_resample(x,y)

In [26]:
#Trained the model || Used Resampled features
xr_train, xr_test, yr_train, yr_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=100)

In [27]:
model_rf_smote = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=6, min_samples_leaf=8)

In [28]:
#Model Fitting
model_rf_smote.fit(xr_train, yr_train)

RandomForestClassifier(max_depth=6, min_samples_leaf=8)

In [29]:
y_pred_rf_smote = model_rf_smote.predict(xr_test)

In [30]:
print(confusion_matrix(yr_test, y_pred_rf_smote))
print('\n')
print(classification_report(yr_test, y_pred_rf_smote))

[[504  44]
 [ 22 608]]


              precision    recall  f1-score   support

           0       0.96      0.92      0.94       548
           1       0.93      0.97      0.95       630

    accuracy                           0.94      1178
   macro avg       0.95      0.94      0.94      1178
weighted avg       0.94      0.94      0.94      1178



##### Conclusion: Random Forest Classifier provides || Accuracy : 94.39%
* Accuracy using Confusion Matrix =  Sum of (TP+TN)/ Sum (TP+TN+FP+FN)

### Save Model

In [31]:
import pickle

In [32]:
filename = 'churn_telecom_model.sav'

In [33]:
pickle.dump(model_rf_smote, open(filename, 'wb')) #Write Mode

### Load the Model || perform Checks

In [34]:
load_model = pickle.load(open(filename, 'rb'))

In [35]:
load_model.score(xr_test, yr_test)

0.9439728353140917