# Lab | Handling Data Imbalance in Classification Models

In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE

In [37]:
churn_data = pd.read_csv('files_for_lab/Customer-Churn.csv')
churn_data

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [38]:
churn_data.columns = [x.lower() for x in churn_data.columns]

In [39]:
churn_data.dtypes

gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

In [40]:
churn_data['totalcharges'] = pd.to_numeric(churn_data['totalcharges'], errors='coerce')

In [41]:
churn_data.isna().sum()/len(churn_data)

gender              0.000000
seniorcitizen       0.000000
partner             0.000000
dependents          0.000000
tenure              0.000000
phoneservice        0.000000
onlinesecurity      0.000000
onlinebackup        0.000000
deviceprotection    0.000000
techsupport         0.000000
streamingtv         0.000000
streamingmovies     0.000000
contract            0.000000
monthlycharges      0.000000
totalcharges        0.001562
churn               0.000000
dtype: float64

In [42]:
churn_data = churn_data.dropna().reset_index(drop=True)
churn_data.head()

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,monthlycharges,totalcharges,churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [43]:
x = churn_data[['tenure', 'seniorcitizen', 'monthlycharges', 'totalcharges']]
y = churn_data['churn']

In [44]:
x.dtypes

tenure              int64
seniorcitizen       int64
monthlycharges    float64
totalcharges      float64
dtype: object

In [45]:
scaled = StandardScaler().fit_transform(x)
scaled = pd.DataFrame(scaled)
scaled

Unnamed: 0,0,1,2,3
0,-1.280248,-0.440327,-1.161694,-0.994194
1,0.064303,-0.440327,-0.260878,-0.173740
2,-1.239504,-0.440327,-0.363923,-0.959649
3,0.512486,-0.440327,-0.747850,-0.195248
4,-1.239504,-0.440327,0.196178,-0.940457
...,...,...,...,...
7027,-0.343137,-0.440327,0.664868,-0.129180
7028,1.612573,-0.440327,1.276493,2.241056
7029,-0.872808,-0.440327,-1.170004,-0.854514
7030,-1.158016,2.271039,0.319168,-0.872095


In [46]:
X_train, X_test, y_train, y_test = train_test_split(scaled, y, test_size=0.33, random_state=42)

In [47]:
model = LogisticRegression().fit(X_train, y_train)

In [48]:
model.score(X_test, y_test)

0.7858681602757432

## Managing Imbalance

In [49]:
y.value_counts()

No     5163
Yes    1869
Name: churn, dtype: int64

### Upsampling - SMOTE

In [50]:
smote = SMOTE()
X_sm, y_sm = smote.fit_resample(scaled, y)
y_sm.value_counts()

No     5163
Yes    5163
Name: churn, dtype: int64

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.33, random_state=42)

In [52]:
model_sm = LogisticRegression().fit(X_train, y_train)
model_sm.score(X_test, y_test)

0.7294600938967136

### Downsampling - TomekLinks

In [53]:
tl = TomekLinks(sampling_strategy='majority')
X_tl, y_tl = tl.fit_resample(scaled, y)
y_tl.value_counts()

No     4651
Yes    1869
Name: churn, dtype: int64

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X_tl, y_tl, test_size=0.33, random_state=42)

In [55]:
model_tl = LogisticRegression().fit(X_train, y_train)
model_tl.score(X_test, y_test)

0.7908921933085502