# Lab | Imbalanced data

In [1]:
# Importing the libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Reading the CSV file
churnData = pd.read_csv('customer_churn.csv')
churnData.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Exploring the data
churnData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [4]:
# Erasing the 'blank' entries on the 'TotalCharges' variable
display(churnData['TotalCharges'].value_counts())
rows_to_drop = list(churnData[churnData['TotalCharges'] == ' '].index)
churnData.drop(churnData.index[rows_to_drop],inplace=True)
display(churnData['TotalCharges'].value_counts())
churnData['TotalCharges'] = list(map(float,churnData['TotalCharges']))

20.2       11
           11
19.75       9
19.65       8
19.9        8
           ..
455.3       1
1415.55     1
6511.25     1
1187.05     1
471.85      1
Name: TotalCharges, Length: 6531, dtype: int64

20.2       11
19.75       9
20.05       8
19.9        8
19.65       8
           ..
455.3       1
1415.55     1
6511.25     1
1187.05     1
324.25      1
Name: TotalCharges, Length: 6530, dtype: int64

In [5]:
# Checking the results
churnData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   object 
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   object 
 4   Dependents        7032 non-null   object 
 5   tenure            7032 non-null   int64  
 6   PhoneService      7032 non-null   object 
 7   MultipleLines     7032 non-null   object 
 8   InternetService   7032 non-null   object 
 9   OnlineSecurity    7032 non-null   object 
 10  OnlineBackup      7032 non-null   object 
 11  DeviceProtection  7032 non-null   object 
 12  TechSupport       7032 non-null   object 
 13  StreamingTV       7032 non-null   object 
 14  StreamingMovies   7032 non-null   object 
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 


In [6]:
# Verifying the counts for 'Churn'
churnData['Churn'].value_counts()

No     5163
Yes    1869
Name: Churn, dtype: int64

There is a high imbalance between the two categories 

In [7]:
# Droping the useless columns
churnData_d = churnData.drop(['customerID', 'gender','Partner','Dependents','PhoneService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies','PaperlessBilling','MultipleLines', 'InternetService', 'Contract', 'PaymentMethod', 'TotalCharges'], axis=1)
churnData_d.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,Churn
0,0,1,29.85,No
1,0,34,56.95,No
2,0,2,53.85,Yes
3,0,45,42.3,No
4,0,2,70.7,Yes


In [8]:
# Checking the info
churnData_d.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   SeniorCitizen   7032 non-null   int64  
 1   tenure          7032 non-null   int64  
 2   MonthlyCharges  7032 non-null   float64
 3   Churn           7032 non-null   object 
dtypes: float64(1), int64(2), object(1)
memory usage: 274.7+ KB


In [9]:
# Checking the collinearity
churnData_d[churnData_d.columns[:-1]].corr()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
SeniorCitizen,1.0,0.015683,0.219874
tenure,0.015683,1.0,0.246862
MonthlyCharges,0.219874,0.246862,1.0


It looks okay, no strong correlation among them, we can proceed using this variables

In [10]:
# Let's check our numerical data for outliers, need for scaling, etc

churnData_d.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7032.0,7032.0,7032.0
mean,0.1624,32.421786,64.798208
std,0.368844,24.54526,30.085974
min,0.0,1.0,18.25
25%,0.0,9.0,35.5875
50%,0.0,29.0,70.35
75%,0.0,55.0,89.8625
max,1.0,72.0,118.75


We have different ranges,we need to perform the scalling later

In [11]:
# Creating a df for categorical variables
cat_cols = churnData_d.select_dtypes(include='object')
cat_cols.head()

Unnamed: 0,Churn
0,No
1,No
2,Yes
3,No
4,Yes


In [12]:
# Creating a df for numerical variables
num_cols = churnData_d.select_dtypes(include=['int64','float64'])
num_cols.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
0,0,1,29.85
1,0,34,56.95
2,0,2,53.85
3,0,45,42.3
4,0,2,70.7


In [13]:
# Getting dummies for 'Churn'
cat_cols['Churn'] = np.where(cat_cols['Churn'] == 'Yes',1,0)
cat_cols.head()

Unnamed: 0,Churn
0,0
1,0
2,1
3,0
4,1


In [14]:
# Combining the data into X
X = num_cols
X.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
0,0,1,29.85
1,0,34,56.95
2,0,2,53.85
3,0,45,42.3
4,0,2,70.7


In [15]:
# Defining the target
y = pd.DataFrame(data=cat_cols)
transformer = StandardScaler().fit(X)
scaled_x = transformer.transform(X)

In [16]:
# Checking the result (Target y)
display(y)

Unnamed: 0,Churn
0,0
1,0
2,1
3,0
4,1
...,...
7038,0
7039,0
7040,0
7041,1


In [17]:
# Checking the results (Scaled x)
display(scaled_x)

array([[-0.44032709, -1.28024804, -1.16169394],
       [-0.44032709,  0.06430269, -0.26087792],
       [-0.44032709, -1.23950408, -0.36392329],
       ...,
       [-0.44032709, -0.87280842, -1.17000405],
       [ 2.27103902, -1.15801615,  0.31916782],
       [-0.44032709,  1.36810945,  1.35793167]])

In [18]:
# Applying the X-y split
X_train, X_test, y_train, y_test = train_test_split(scaled_x, y, test_size=0.3, random_state=100)

In [19]:
classification = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr').fit(X_train, y_train)
y_pred = classification.predict(X_test)
print("The accuracy of the logistic_regression model is: %4.2f "% (classification.score(X_test, y_test)))

The accuracy of the logistic_regression model is: 0.77 


With this imbalance, when we tried to fit the model it still gave us an accuracy of 77%

In [20]:
print("The accuracy of a blind guess is: %4.2f " % (5163/(5163+1869)))

The accuracy of a blind guess is: 0.73 


In [21]:
print("The kappa of the logistic regression model is: %4.2f " %(cohen_kappa_score(y_pred,y_test)) )

The kappa of the logistic regression model is: 0.38 


As we can see the difference between the 'blind guess' and the 'logistic regression' results are too low so every model will give us an accuracy of at least 70%

# Upsampling

In [22]:
#Upsampling
counts = churnData_d['Churn'].value_counts()
yes = churnData_d[churnData_d['Churn']=='Yes'].sample(counts[0], replace=True)
no = churnData_d[churnData_d['Churn']=='No']
churnData_du = pd.concat([yes,no], axis=0)
churnData_du = churnData_du.sample(frac=1)
churnData_du['Churn'].value_counts()

Yes    5163
No     5163
Name: Churn, dtype: int64

In [23]:
#Checking the results
display(churnData_du)

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,Churn
5692,0,9,54.80,No
2300,0,48,103.25,Yes
4329,0,17,94.40,Yes
5052,0,18,45.65,No
326,1,11,82.90,No
...,...,...,...,...
609,1,65,105.25,Yes
4693,0,52,25.60,No
889,0,63,100.55,Yes
687,1,2,49.25,Yes


In contrast, before we had.

In [24]:
counts = churnData_d['Churn'].value_counts()
counts

No     5163
Yes    1869
Name: Churn, dtype: int64

In [25]:
# Creating a new df for categorical variables
cat_cols_u = churnData_du.select_dtypes(include='object')
cat_cols_u.head()

Unnamed: 0,Churn
5692,No
2300,Yes
4329,Yes
5052,No
326,No


In [26]:
# Creating a new df for numerical variables
num_cols_u = churnData_du.select_dtypes(include=['int64','float64'])
num_cols_u.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
5692,0,9,54.8
2300,0,48,103.25
4329,0,17,94.4
5052,0,18,45.65
326,1,11,82.9


In [27]:
# Getting dummies for 'Churn'
cat_cols_u['Churn'] = np.where(cat_cols_u['Churn'] == 'Yes',1,0)
cat_cols_u.head()

Unnamed: 0,Churn
5692,0
2300,1
4329,1
5052,0
326,0


In [28]:
# Combining the data into X again
X = num_cols_u
X.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
5692,0,9,54.8
2300,0,48,103.25
4329,0,17,94.4
5052,0,18,45.65
326,1,11,82.9


In [29]:
# Defining the target
y = pd.DataFrame(data=cat_cols_u)
transformer = StandardScaler().fit(X)
scaled_x = transformer.transform(X)

In [30]:
# Checking the result (new Target y)
display(y)

Unnamed: 0,Churn
5692,0
2300,1
4329,1
5052,0
326,0
...,...
609,1
4693,0
889,1
687,1


In [31]:
# Checking the result (Scaled x)
display(scaled_x)

array([[-0.48920169, -0.78181906, -0.45242087],
       [-0.48920169,  0.83876072,  1.23416295],
       [-0.48920169, -0.44939244,  0.92608727],
       ...,
       [-0.48920169,  1.46206064,  1.14017376],
       [ 2.04414667, -1.07269236, -0.64562088],
       [-0.48920169,  1.00497404, -1.64121009]])

In [32]:
# Applying the X-y split
X_train, X_test, y_train, y_test = train_test_split(scaled_x, y, test_size=0.3, random_state=100)

In [33]:
classification = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr').fit(X_train, y_train)
y_pred = classification.predict(X_test)
print("The accuracy of the logistic_regression model is: %4.2f "% (classification.score(X_test, y_test)))

The accuracy of the logistic_regression model is: 0.74 


In [34]:
print("The kappa of the logistic regression model is: %4.2f " %(cohen_kappa_score(y_pred,y_test)) )

The kappa of the logistic regression model is: 0.48 


# RandomUnderSampling

In [35]:
num = churnData_d.select_dtypes(include=['float64','int64'])
X = pd.concat([num], axis=1, sort=False)
X.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
0,0,1,29.85
1,0,34,56.95
2,0,2,53.85
3,0,45,42.3
4,0,2,70.7


In [36]:
# Creating another df for categorical variables
cat_cols_d = churnData_d.select_dtypes(include='object')
cat_cols_d.head()

Unnamed: 0,Churn
0,No
1,No
2,Yes
3,No
4,Yes


In [37]:
# Getting dummies for 'Churn'
cat_cols_d['Churn'] = np.where(cat_cols_d['Churn'] == 'Yes',1,0)
cat_cols_d.head()

Unnamed: 0,Churn
0,0
1,0
2,1
3,0
4,1


In [38]:
# Downsampling
rus = RandomUnderSampler() 
transformer = StandardScaler().fit(X)
X = transformer.transform(X)
y = cat_cols_d['Churn']
X_rus, y_rus = rus.fit_sample(X, y)

In [39]:
y.value_counts()

0    5163
1    1869
Name: Churn, dtype: int64

In [40]:
y_rus = pd.DataFrame(data=np.array(y_rus).flatten())
y_rus[0].value_counts()

1    1869
0    1869
Name: 0, dtype: int64

In [41]:
transformer = StandardScaler().fit(X_rus)
X = transformer.transform(X_rus)

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y_rus, test_size=0.3, random_state=100)

In [43]:
classification = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr').fit(X_train, y_train)
y_pred = classification.predict(X_test)
print("The accuracy of the logistic_regression model after undersampling is: %4.2f "% (classification.score(X_test, y_test)))

The accuracy of the logistic_regression model after undersampling is: 0.72 


In [44]:
from sklearn.metrics import cohen_kappa_score

print("The kappa of the logistic regression model after undersampling is: %4.2f " %(cohen_kappa_score(y_pred,y_test)) )

The kappa of the logistic regression model after undersampling is: 0.45 


# RandomOverSampling

In [45]:
num = churnData_d.select_dtypes(include=['float64','int64'])
X = pd.concat([num], axis=1, sort=False)
X.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
0,0,1,29.85
1,0,34,56.95
2,0,2,53.85
3,0,45,42.3
4,0,2,70.7


In [46]:
# Creating another df for categorical variables
cat_cols_ros = churnData_d.select_dtypes(include='object')
cat_cols_ros.head()

Unnamed: 0,Churn
0,No
1,No
2,Yes
3,No
4,Yes


In [47]:
# Getting dummies for 'Churn'
cat_cols_ros['Churn'] = np.where(cat_cols_ros['Churn'] == 'Yes',1,0)
cat_cols_ros.head()

Unnamed: 0,Churn
0,0
1,0
2,1
3,0
4,1


In [48]:
ros = RandomOverSampler()
transformer = StandardScaler().fit(X)
X = transformer.transform(X)
y = cat_cols_ros['Churn']
X_ros, y_ros = ros.fit_sample(X, y)

In [49]:
y.value_counts()

0    5163
1    1869
Name: Churn, dtype: int64

In [50]:
y_ros = pd.DataFrame(data=np.array(y_ros).flatten())
y_ros[0].value_counts()

1    5163
0    5163
Name: 0, dtype: int64

In [51]:
transformer = StandardScaler().fit(X_ros)
X = transformer.transform(X_ros)

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y_ros, test_size=0.3, random_state=100)

In [53]:
classification = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr').fit(X_train, y_train)
y_pred = classification.predict(X_test)
print("The accuracy of the logistic_regression model after oversampling is: %4.2f "% (classification.score(X_test, y_test)))

The accuracy of the logistic_regression model after oversampling is: 0.73 


In [54]:
print("The kappa of the logistic regression model after undersampling is: %4.2f " %(cohen_kappa_score(y_pred,y_test)) )

The kappa of the logistic regression model after undersampling is: 0.46 


# Synthetic Minority Oversampling Technique (SMOTE)

In [55]:
num = churnData_d.select_dtypes(include=['float64','int64'])
X = pd.concat([num], axis=1, sort=False)
X.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
0,0,1,29.85
1,0,34,56.95
2,0,2,53.85
3,0,45,42.3
4,0,2,70.7


In [56]:
smote = SMOTE()

transformer = StandardScaler().fit(X)
X = transformer.transform(X)
y = churnData_d['Churn']
X_sm, y_sm = smote.fit_sample(X, y)
y_sm = pd.DataFrame(data=np.array(y_sm).flatten())
y_sm[0].value_counts()

Yes    5163
No     5163
Name: 0, dtype: int64

In [57]:
transformer = StandardScaler().fit(X_sm)
X = transformer.transform(X_sm)

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y_sm, test_size=0.3, random_state=100)

In [59]:
classification = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr').fit(X_train, y_train)
y_pred = classification.predict(X_test)
print("The accuracy of the logistic_regression model after oversampling is: %4.2f "% (classification.score(X_test, y_test)))

The accuracy of the logistic_regression model after oversampling is: 0.73 


In [60]:
print("The kappa of the logistic regression model after undersampling is: %4.2f " %(cohen_kappa_score(y_pred,y_test)) )

The kappa of the logistic regression model after undersampling is: 0.46 


# UnderSampling using TomekLinks 

In [61]:
num = churnData_d.select_dtypes(include=['float64','int64'])
X = pd.concat([num], axis=1, sort=False)
X.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
0,0,1,29.85
1,0,34,56.95
2,0,2,53.85
3,0,45,42.3
4,0,2,70.7


In [62]:
tl = TomekLinks('majority')
X_tl, y_tl = tl.fit_sample(X, y)
y_tl = pd.DataFrame(data=np.array(y_tl).flatten())
y_tl[0].value_counts()

No     4700
Yes    1869
Name: 0, dtype: int64

In [63]:
transformer = StandardScaler().fit(X_tl)
X = transformer.transform(X_tl)

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y_tl, test_size=0.3, random_state=100)

In [65]:
classification = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr').fit(X_train, y_train)
y_pred = classification.predict(X_test)
print("The accuracy of the logistic_regression model after oversampling is: %4.2f "% (classification.score(X_test, y_test)))

The accuracy of the logistic_regression model after oversampling is: 0.79 


In [66]:
print("The kappa of the logistic regression model after undersampling is: %4.2f " %(cohen_kappa_score(y_pred,y_test)) )

The kappa of the logistic regression model after undersampling is: 0.46 


In [67]:
X_tl2, y_tl2 = tl.fit_sample(X_tl, y_tl)
y_tl2 = pd.DataFrame(data=np.array(y_tl2).flatten())
y_tl2[0].value_counts()

No     4557
Yes    1869
Name: 0, dtype: int64

In [68]:
transformer = StandardScaler().fit(X_tl2)
X = transformer.transform(X_tl2)

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y_tl2, test_size=0.3, random_state=100)

In [70]:
classification = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr').fit(X_train, y_train)
y_pred = classification.predict(X_test)
print("The accuracy of the logistic_regression model after oversampling is: %4.2f "% (classification.score(X_test, y_test)))

The accuracy of the logistic_regression model after oversampling is: 0.79 


In [71]:
print("The kappa of the logistic regression model after undersampling is: %4.2f " %(cohen_kappa_score(y_pred,y_test)) )

The kappa of the logistic regression model after undersampling is: 0.45 


As we can see, none of the techniques used has significantly improved the intended model.

Model 1:
    The accuracy of the logistic_regression model is: 0.77 \
    The kappa of the logistic regression model is: 0.38

Model 2 (UpSampling):
    The accuracy of the logistic_regression model is: 0.74 \
    The kappa of the logistic regression model is: 0.48

Model 3 (RUS):
    The accuracy of the logistic_regression model after undersampling is: 0.72 \
    The kappa of the logistic regression model after undersampling is: 0.45 

Model 4 (ROS):
    The accuracy of the logistic_regression model after oversampling is: 0.73 \
    The kappa of the logistic regression model after undersampling is: 0.46 

Model 5 (SMOTE):
    The accuracy of the logistic_regression model after oversampling is: 0.73 \
    The kappa of the logistic regression model after undersampling is: 0.46 

Model 6 (TomekLinks):
    The accuracy of the logistic_regression model after oversampling is: 0.79 \
    The kappa of the logistic regression model after undersampling is: 0.45 

Everything will depend on the reality in which the model will be applied but, overall, we can consider the TomekLinks approach the best technique with an Accuracy of 0.79 and a kappa of 0.45.