In [1]:
#!pip uninstall scikit-learn --yes
#!pip uninstall imblearn --yes
#!pip install scikit-learn==1.2.2
#!pip install imblearn

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import sklearn as sk
import warnings
import re
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, confusion_matrix, ConfusionMatrixDisplay, auc
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from scipy import stats

warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv(r"C:\Users\angel\Desktop\Data Analysis\Ironhack\Labs\lab-imbalanced-data\files_for_lab\customer_churn.csv")
data.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
data.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [20]:
#Checking that all values in id have the same format

pattern = re.compile(r'^\d{4}-[A-Z]{1,}$')
checking_id = data["customerID"].apply(lambda x: bool(re.match(pattern, x)))
checking_id[checking_id.values == False].count()

0

In [6]:
# Checking the values of the rest of features

for i in data.columns:
    print(data[i].value_counts())

customerID
7590-VHVEG    1
3791-LGQCY    1
6008-NAIXK    1
5956-YHHRX    1
5365-LLFYV    1
             ..
9796-MVYXX    1
2637-FKFSY    1
1552-AAGRX    1
4304-TSPVK    1
3186-AJIEK    1
Name: count, Length: 7043, dtype: int64
gender
Male      3555
Female    3488
Name: count, dtype: int64
SeniorCitizen
0    5901
1    1142
Name: count, dtype: int64
Partner
No     3641
Yes    3402
Name: count, dtype: int64
Dependents
No     4933
Yes    2110
Name: count, dtype: int64
tenure
1     613
72    362
2     238
3     200
4     176
     ... 
28     57
39     56
44     51
36     50
0      11
Name: count, Length: 73, dtype: int64
PhoneService
Yes    6361
No      682
Name: count, dtype: int64
MultipleLines
No                  3390
Yes                 2971
No phone service     682
Name: count, dtype: int64
InternetService
Fiber optic    3096
DSL            2421
No             1526
Name: count, dtype: int64
OnlineSecurity
No                     3498
Yes                    2019
No internet service    15

Here we spot that the dependent variable seems to be imbalanced

In [22]:
#Showing the imbalance in the target variable

data["Churn"].value_counts() 

Churn
No     5174
Yes    1869
Name: count, dtype: int64

Implementing the model

In [7]:
X = data[["tenure","SeniorCitizen", "MonthlyCharges"]]
y = data["Churn"]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [13]:
scaler = StandardScaler()
scaler.fit(X_train)

In [14]:
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
model = LogisticRegression()

In [16]:
model.fit(X_train_scaled, y_train)

We see really good results, but thats mainly because of the imbalances that we pointed before

In [17]:
prediction = model.predict(X_test_scaled)
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

          No       0.82      0.91      0.86      1294
         Yes       0.64      0.45      0.53       467

    accuracy                           0.79      1761
   macro avg       0.73      0.68      0.70      1761
weighted avg       0.77      0.79      0.77      1761



Resampling the data

In [23]:
smote = SMOTE()

In [24]:
X_train, y_train = smote.fit_resample(X_train, y_train)
model.fit(X_train, y_train)

Even if the precision is worse now, the model is performing better since its not predicting from imbalanced data

In [25]:
prediction = model.predict(X_test)
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

          No       0.88      0.72      0.79      1294
         Yes       0.48      0.73      0.58       467

    accuracy                           0.72      1761
   macro avg       0.68      0.72      0.69      1761
weighted avg       0.77      0.72      0.74      1761



In [26]:
tomek = TomekLinks()

In [27]:
X_train, y_train = tomek.fit_resample(X_train, y_train)
model.fit(X_train, y_train)

Smote results seems to be better

In [28]:
prediction = model.predict(X_test)
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

          No       0.87      0.74      0.80      1294
         Yes       0.50      0.70      0.58       467

    accuracy                           0.73      1761
   macro avg       0.68      0.72      0.69      1761
weighted avg       0.77      0.73      0.74      1761

