# Handling Data Imbalance in Classification Models

## Importing libraries

In [116]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from warnings import simplefilter

In [117]:
simplefilter(action = 'ignore', category = Warning)

## Reading the file

In [118]:
churnData = pd.read_csv('Customer-Churn.csv')
churnData.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


## Checking the datatypes

In [119]:
churnData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

## Converting the feature Total Chrges to numeric

In [120]:
churnData['TotalCharges']= pd.to_numeric(churnData['TotalCharges'], errors = 'coerce')

In [121]:
churnData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [122]:
churnData['TotalCharges']

0         29.85
1       1889.50
2        108.15
3       1840.75
4        151.65
         ...   
7038    1990.50
7039    7362.90
7040     346.45
7041     306.60
7042    6844.50
Name: TotalCharges, Length: 7043, dtype: float64

## Dealing with NaN values

In [123]:
churnData.isna().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [124]:
churnData['TotalCharges'] = churnData['TotalCharges'].fillna(churnData['TotalCharges'].mean())

In [125]:
churnData['TotalCharges']

0         29.85
1       1889.50
2        108.15
3       1840.75
4        151.65
         ...   
7038    1990.50
7039    7362.90
7040     346.45
7041     306.60
7042    6844.50
Name: TotalCharges, Length: 7043, dtype: float64

In [126]:
churnData.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

## Selecting features for the model 

In [127]:
model_data = churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges', 'Churn']]

In [128]:
model_data

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges,Churn
0,1,0,29.85,29.85,No
1,34,0,56.95,1889.50,No
2,2,0,53.85,108.15,Yes
3,45,0,42.30,1840.75,No
4,2,0,70.70,151.65,Yes
...,...,...,...,...,...
7038,24,0,84.80,1990.50,No
7039,72,0,103.20,7362.90,No
7040,11,0,29.60,346.45,No
7041,4,1,74.40,306.60,Yes


In [129]:
def bool_to_numeric(x):
    if x == 'No':
        return 0
    else:
        return 1


In [130]:
model_data['Churn'] = model_data['Churn'].apply(bool_to_numeric)

In [131]:
model_data['Churn']

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7043, dtype: int64

## X-y split

In [132]:
y = model_data['Churn']
X = model_data.drop(['Churn'], axis=1)

## Train-test split

In [133]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.50, random_state = 42)

## Standard Scaler

In [134]:
from sklearn.preprocessing import StandardScaler

transformer = StandardScaler().fit(X_train)
x_standardized_train = transformer.transform(X_train)
print(x_standardized_train.shape)
pd.DataFrame(x_standardized_train, columns=X_train.columns)

(3521, 4)


Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges
0,0.556477,-0.439954,0.675524,0.744238
1,0.720057,-0.439954,1.450611,1.397120
2,-1.283794,-0.439954,-0.630061,-0.990906
3,-0.343211,-0.439954,0.511912,-0.123811
4,1.456165,-0.439954,1.663801,2.469385
...,...,...,...,...
3516,-1.283794,-0.439954,1.012662,-0.968871
3517,-0.384106,-0.439954,0.883756,-0.036334
3518,-0.833950,-0.439954,-1.428285,-0.875298
3519,-0.833950,2.272966,1.159747,-0.478882


In [135]:
x_standardized_test = transformer.transform(X_test)
print(x_standardized_test.shape)
pd.DataFrame(x_standardized_test, columns=X_test.columns)

(3522, 4)


Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges
0,-1.283794,-0.439954,-1.307642,-0.999995
1,0.352003,-0.439954,-1.292769,-0.569196
2,0.801847,-0.439954,-1.487780,-0.553568
3,-1.283794,-0.439954,0.396228,-0.977140
4,1.415271,-0.439954,-0.456534,0.434435
...,...,...,...,...
3517,-1.283794,-0.439954,0.136763,-0.980620
3518,0.065738,-0.439954,-0.056595,-0.002372
3519,0.679162,-0.439954,0.059089,0.407345
3520,-0.384106,-0.439954,-1.477864,-0.810788


## Fitting a logistic regression model

In [136]:
from sklearn.linear_model import LogisticRegression

linear_regression = LogisticRegression()
linear_regression.fit(X_train, y_train)

LogisticRegression()

In [137]:
print("Test data accuracy was",linear_regression.score(X_test,y_test))
print("Train data accuracy was",linear_regression.score(X_train,y_train))

Test data accuracy was 0.7969903463940943
Train data accuracy was 0.786708321499574


## Cheking for unbalanced data

In [138]:
model_data['Churn'].value_counts()

0    5174
1    1869
Name: Churn, dtype: int64

We can see that there is a huge inbalance in the target data

In [139]:
from sklearn.utils import resample

category_0 = model_data[model_data['Churn'] == 0]
category_1 = model_data[model_data['Churn'] == 1]

## Downsampling

In [140]:
category_0_undersampled = resample(category_0, 
                                   replace=False, 
                                   n_samples = len(category_1))

In [141]:
print(category_0_undersampled.shape)
print(category_1.shape)

(1869, 5)
(1869, 5)


In [142]:
data_downsampled = pd.concat([category_0_undersampled, category_1], axis=0)

## Applying the model to the downsampled data

In [143]:
y = data_downsampled['Churn']
X = data_downsampled.drop(['Churn'], axis=1)

In [144]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.50, random_state = 42)

In [145]:
from sklearn.preprocessing import StandardScaler

transformer = StandardScaler().fit(X_train)
x_standardized_train = transformer.transform(X_train)
print(x_standardized_train.shape)
pd.DataFrame(x_standardized_train, columns=X_train.columns)

(1869, 4)


Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges
0,0.406068,2.026464,1.179745,0.827371
1,-1.035657,2.026464,-1.283193,-0.912023
2,-0.314795,-0.493470,-1.659548,-0.752964
3,0.618086,-0.493470,0.456793,0.652467
4,-0.442006,-0.493470,0.104945,-0.374603
...,...,...,...,...
1864,1.890197,-0.493470,0.407780,1.771206
1865,1.466160,-0.493470,-0.833317,0.354492
1866,-1.035657,-0.493470,-1.678803,-0.918484
1867,0.660490,2.026464,0.775382,0.880913


In [146]:
x_standardized_test = transformer.transform(X_test)
print(x_standardized_test.shape)
pd.DataFrame(x_standardized_test, columns=X_test.columns)

(1869, 4)


Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges
0,-0.272391,-0.493470,-1.673552,-0.762774
1,1.508564,-0.493470,0.483051,1.407478
2,1.126931,-0.493470,0.089191,0.888087
3,1.126931,2.026464,-1.696308,-0.461520
4,-0.229987,-0.493470,-1.475747,-0.687520
...,...,...,...,...
1864,0.575683,-0.493470,-1.489751,-0.494230
1865,-0.187584,-0.493470,0.218727,-0.170933
1866,-0.526813,-0.493470,0.712365,-0.361229
1867,0.787701,2.026464,1.100973,1.217347


In [147]:
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, y_train)

LogisticRegression()

In [148]:
print("Test data accuracy was",logistic_regression.score(X_test,y_test))
print("Train data accuracy was",logistic_regression.score(X_train,y_train))

Test data accuracy was 0.7158908507223114
Train data accuracy was 0.7180310326377742


Scores are a little bit lower in comparison to the model with unbalanced data. However the model becomes better, because it works with balanced data and the balance of the data doesn't influence the predictions.

## Upsampling

In [149]:
category_1_oversampled = resample(category_1, 
                                  replace=True, 
                                  n_samples = len(category_0))

In [150]:
print(category_0.shape)
print(category_1_oversampled.shape)

(5174, 5)
(5174, 5)


In [151]:
data_upsampled = pd.concat([category_0, category_1_oversampled], axis=0)

## Applying the model to the upsampled data

In [152]:
y = data_upsampled['Churn']
X = data_upsampled.drop(['Churn'], axis=1)

In [153]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.50, random_state = 42)

In [154]:
x_standardized_train = transformer.transform(X_train)
print(x_standardized_train.shape)
pd.DataFrame(x_standardized_train, columns=X_train.columns)

(5174, 4)


Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges
0,-0.866043,2.026464,0.279994,-0.698138
1,-1.120465,-0.493470,-1.668300,-0.942001
2,-0.187584,-0.493470,0.976689,0.090437
3,-0.696428,-0.493470,-0.817563,-0.721584
4,-0.993254,-0.493470,0.451542,-0.797361
...,...,...,...,...
5169,-0.866043,-0.493470,0.068185,-0.737072
5170,-0.823639,-0.493470,0.598583,-0.636567
5171,-1.120465,-0.493470,0.248485,-0.915990
5172,1.678179,-0.493470,0.173214,1.420020


In [155]:
x_standardized_test = transformer.transform(X_test)
print(x_standardized_test.shape)
pd.DataFrame(x_standardized_test, columns=X_test.columns)

(5174, 4)


Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges
0,-1.120465,2.026464,-0.780802,-0.929958
1,1.126931,-0.493470,-0.073605,0.717554
2,-0.866043,-0.493470,-0.574245,-0.783179
3,-0.357198,-0.493470,-0.434206,-0.454204
4,-1.120465,-0.493470,-1.654297,-0.941811
...,...,...,...,...
5169,0.024435,-0.493470,-0.105114,-0.095203
5170,-0.399602,2.026464,0.631842,-0.235972
5171,1.635775,-0.493470,1.316284,2.437541
5172,-0.866043,-0.493470,1.072966,-0.612646


In [156]:

logistic_regression.fit(X_train, y_train)

LogisticRegression()

In [157]:
print("Test data accuracy was",logistic_regression.score(X_test,y_test))
print("Train data accuracy was",logistic_regression.score(X_train,y_train))

Test data accuracy was 0.7311557788944724
Train data accuracy was 0.738113645148821


The results of the model are better for the upsampled data than for the undersampled. However they are still