In [1]:
# Importing libraries
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

In [2]:
df = pd.read_csv("data_mdl.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,0,29.85,29.85,0,1,0,0,1,1,...,0,0,1,0,1,0,0,0,0,0
1,1,0,56.95,1889.5,0,0,1,1,0,1,...,0,0,0,1,0,0,1,0,0,0
2,2,0,53.85,108.15,1,0,1,1,0,1,...,0,0,0,1,1,0,0,0,0,0
3,3,0,42.3,1840.75,0,0,1,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,4,0,70.7,151.65,1,1,0,1,0,1,...,0,0,1,0,1,0,0,0,0,0


In [3]:
df = df.drop('Unnamed: 0',axis=1)

In [4]:
x = df.drop('Churn',axis=1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_group_1 - 12,tenure_group_13 - 24,tenure_group_25 - 36,tenure_group_37 - 48,tenure_group_49 - 60,tenure_group_61 - 72
0,0,29.85,29.85,1,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,0
1,0,56.95,1889.50,0,1,1,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
2,0,53.85,108.15,0,1,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,42.30,1840.75,0,1,1,0,1,0,1,...,1,0,0,0,0,0,0,1,0,0
4,0,70.70,151.65,1,0,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84.80,1990.50,0,1,0,1,0,1,0,...,0,0,0,1,0,1,0,0,0,0
7028,0,103.20,7362.90,1,0,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,1
7029,0,29.60,346.45,1,0,0,1,0,1,1,...,0,0,1,0,1,0,0,0,0,0
7030,1,74.40,306.60,0,1,0,1,1,0,0,...,0,0,0,1,1,0,0,0,0,0


In [5]:
y = df['Churn']
y

0       0
1       0
2       1
3       0
4       1
       ..
7027    0
7028    0
7029    0
7030    1
7031    0
Name: Churn, Length: 7032, dtype: int64

## Train Test Split

In [6]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

## Classification algorithms

Classification algorithms are a type of supervised learning algorithm that are used to predict the category of a new observation based on a set of training data. The training data consists of a set of observations, each of which has been labeled with its corresponding category. The classification algorithm learns from the training data to identify the features that are most important for predicting the category of a new observation

There are many different types of classification algorithms, each with its own strengths and weaknesses. Some of the most common classification algorithms include:

Decision trees

Random forests

Logistic regression

naive Bayes classifier

## Decision Tree Classifier

Decision trees: Decision tree is a popular classification algorithm. They are easy to understand and interpret, and they can be used to model complex relationships between features and labels. However, decision trees can be sensitive to overfitting, and they can be difficult to generalize to new data

In [7]:
model_dt = DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth = 6, min_samples_leaf=8)

In [8]:
model_dt.fit(x_train,y_train)

DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [9]:
y_pred = model_dt.predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [10]:
model_dt.score(x_test,y_test)

0.7903340440653873

In [11]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.84      0.89      0.86      1049
           1       0.61      0.49      0.54       358

    accuracy                           0.79      1407
   macro avg       0.72      0.69      0.70      1407
weighted avg       0.78      0.79      0.78      1407



As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.

Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.

Hence, moving ahead to call SMOTEENN (UpSampling + ENN)

In [12]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x,y)

In [13]:
xr_train,xr_test,yr_train,yr_test = train_test_split(X_resampled, y_resampled,test_size=0.2)

In [14]:
model_dt_smote = DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [15]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)

0.9402220324508966


In [16]:
print(metrics.classification_report(yr_test, yr_predict))

              precision    recall  f1-score   support

           0       0.95      0.92      0.94       550
           1       0.93      0.96      0.94       621

    accuracy                           0.94      1171
   macro avg       0.94      0.94      0.94      1171
weighted avg       0.94      0.94      0.94      1171



In [17]:
print(metrics.confusion_matrix(yr_test, yr_predict))

[[506  44]
 [ 26 595]]


Now we can see quite better results, i.e. Accuracy: 94%, and a very good recall, precision & f1 score for minority class.
Let's try with some other classifier.

## Random Forest Classifier

random forest classifier is a type of machine learning algorithm that can be used for classification tasks. A random forest classifier is an ensemble of decision trees, which means that it is made up of multiple decision trees. Each decision tree is trained on a different subset of the data, and the predictions from all of the decision trees are then combined to make a final prediction.

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
model_rf = RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

The RandomForestClassifier class has a number of parameters that can be used to control its behavior. Some of the most important parameters include:

n_estimators: The number of decision trees in the forest. 

max_depth: The maximum depth of the decision trees.

min_samples_split: The minimum number of samples required to split a node in a decision tree.

random_state: A random number generator seed that can be used to ensure that the results are reproducible.

In [20]:
model_rf.fit(x_train,y_train)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [21]:
y_pred = model_rf.predict(x_test)

In [22]:
model_rf.score(x_test,y_test)

0.798862828713575

In [23]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.84      0.91      0.87      1049
           1       0.64      0.49      0.55       358

    accuracy                           0.80      1407
   macro avg       0.74      0.70      0.71      1407
weighted avg       0.79      0.80      0.79      1407



As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.

Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.

Hence, moving ahead to call SMOTEENN (UpSampling + ENN)

In [24]:
sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_resample(x,y)

SMOTEENN is a combination of the SMOTE and Edited Nearest Neighbors algorithms for dealing with imbalanced datasets. The SMOTE algorithm generates synthetic data points for the minority class, while the Edited Nearest Neighbors algorithm removes noisy data points from both the minority and majority classes

The fit_sample() method on the SMOTEENN class takes two arguments: the training data and the training labels. The method returns two outputs: the resampled training data and the resampled training labels

The resampled data and labels can then be used to train a machine learning model. The model will be less likely to be biased towards the majority class because the resampling process has helped to balance the dataset

Here is a more detailed explanation of the SMOTE and Edited Nearest Neighbors algorithms:

SMOTE (Synthetic Minority Oversampling Technique) is an oversampling algorithm that generates synthetic data points for the minority class. The synthetic data points are created by finding the k nearest neighbors of a minority data point and then creating a new data point that is a linear combination of the k nearest neighbors

Edited Nearest Neighbors (ENN) is an undersampling algorithm that removes noisy data points from both the minority and majority classes. The ENN algorithm works by finding the k nearest neighbors of each data point and then removing the data point if it is misclassified by its k nearest neighbors


The combination of SMOTE and ENN can be very effective for dealing with imbalanced datasets. The SMOTE algorithm helps to balance the dataset by generating synthetic data points for the minority class, while the ENN algorithm helps to remove noisy data points from both the minority and majority classes. This can lead to a more accurate machine learning model

In [25]:
xr_train1,xr_test1,yr_train1,yr_test1 = train_test_split(X_resampled1, y_resampled1,test_size=0.2)

In [26]:
model_rf_smote = RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [27]:
model_rf_smote.fit(xr_train1,yr_train1)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [28]:
yr_predict1 = model_rf_smote.predict(xr_test1)

In [29]:
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)
print(model_score_r1)

0.9358196010407632


In [30]:
print(metrics.classification_report(yr_test1, yr_predict1))

              precision    recall  f1-score   support

           0       0.94      0.91      0.93       503
           1       0.93      0.95      0.94       650

    accuracy                           0.94      1153
   macro avg       0.94      0.93      0.93      1153
weighted avg       0.94      0.94      0.94      1153



In [31]:
print(metrics.confusion_matrix(yr_test1, yr_predict1))

[[459  44]
 [ 30 620]]


With RF Classifier, also we are able to get quite good results  ie Accuracy = 94%, infact better than Decision Tree.
We can now further go ahead and create multiple classifiers to see how the model performance is.

## Performing PCA

principal component analysis (PCA), is a dimensionality reduction technique that can be used to reduce the number of features in a dataset while preserving as much of the information as possible.

In [32]:
# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(0.9)
xr_train_pca = pca.fit_transform(xr_train1)
xr_test_pca = pca.transform(xr_test1)
explained_variance = pca.explained_variance_ratio_

In [33]:
model = RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [34]:
model.fit(xr_train_pca,yr_train1)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [35]:
yr_predict_pca = model.predict(xr_test_pca)

In [36]:
model_score_r_pca = model.score(xr_test_pca, yr_test1)

In [37]:
print(model_score_r_pca)

0.7267996530789246


In [38]:
print(metrics.classification_report(yr_test1, yr_predict_pca))

              precision    recall  f1-score   support

           0       0.71      0.63      0.67       503
           1       0.74      0.80      0.77       650

    accuracy                           0.73      1153
   macro avg       0.72      0.72      0.72      1153
weighted avg       0.73      0.73      0.72      1153



With PCA, we couldn't see any better results 

## Logistic regression

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [40]:
scaler = StandardScaler()
X_train_lg = scaler.fit_transform(xr_train1)
X_test_lg = scaler.transform(xr_test1)

In [41]:
model_lg_smote = LogisticRegression(random_state = 100,max_iter = 1000)

In [42]:
model_lg_smote.fit(X_train_lg,yr_train1)

LogisticRegression(max_iter=1000, random_state=100)

In [43]:
yr_predict_lg = model_lg_smote.predict(X_test_lg )

In [44]:
model_score_lg = model_lg_smote.score(X_test_lg , yr_test1)

In [45]:
print(model_score_lg)

0.9401561144839549


In [46]:
print(metrics.classification_report(yr_test1, yr_predict_lg))

              precision    recall  f1-score   support

           0       0.93      0.94      0.93       503
           1       0.95      0.94      0.95       650

    accuracy                           0.94      1153
   macro avg       0.94      0.94      0.94      1153
weighted avg       0.94      0.94      0.94      1153



In [47]:
print(metrics.confusion_matrix(yr_test1, yr_predict_lg))

[[472  31]
 [ 38 612]]


Logistic regression classifier Accuracy = 94%

## Naive Bayes classifier

In [48]:
from sklearn.naive_bayes import GaussianNB

In [49]:
clf = GaussianNB()

In [56]:
X_train_nb = X_train_lg
X_test_nb = X_test_lg

In [57]:
clf.fit(X_train_nb,yr_train1)

GaussianNB()

In [58]:
yr_predict_nb = clf.predict(X_test_nb)

In [59]:
model_score_nb = clf.score(X_test_nb , yr_test1)

In [60]:
print(model_score_nb)

0.9011274934952298


In [61]:
print(metrics.classification_report(yr_test1, yr_predict_nb))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89       503
           1       0.91      0.91      0.91       650

    accuracy                           0.90      1153
   macro avg       0.90      0.90      0.90      1153
weighted avg       0.90      0.90      0.90      1153



In [62]:
print(metrics.confusion_matrix(yr_test1, yr_predict_nb))

[[447  56]
 [ 58 592]]


Naive Bayes classifier Accuracy = 90%

### Pickling the model

let's finalise the model which was created by RF Classifier, and save the model so that we can use it in a later stage

In [63]:
import pickle

In [64]:
filename = 'model.sav'

In [65]:
pickle.dump(model_rf_smote, open(filename, 'wb'))

In [66]:
load_model = pickle.load(open(filename, 'rb'))

In [67]:
model_score_r1 = load_model.score(xr_test1, yr_test1)

In [68]:
model_score_r1

0.9358196010407632

Our final model i.e. RF Classifier with SMOTEENN, is now ready and dumped in model.sav, which we will use and prepare API's so that we can access our model from UI.