# Lab | Handling Data Imbalance in Classification Models

### Import the required libraries and modules that you will need

In [34]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample

### Read the data into Python and call the dataframe churnData.

In [3]:
# Load the customer churn CSV (path is relative to the notebook) and preview
# the first five rows.
churnData = pd.read_csv(
    'files_for_lab/Customer-Churn.csv'
)
churnData.head(5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


### Check the datatypes of all the columns in the data. You will see that the column TotalCharges is of object type. Convert this column to a numeric type using the `pd.to_numeric` function.

In [4]:
# Inspect the dtype pandas inferred for every column on load.
churnData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [7]:
# Frequency of each raw TotalCharges value; used to spot non-numeric entries.
churnData['TotalCharges'].value_counts()

20.2       11
           11
19.75       9
19.9        8
19.65       8
           ..
6224.8      1
2033.85     1
7104.2      1
1442        1
389.95      1
Name: TotalCharges, Length: 6531, dtype: int64

In [11]:
# Convert TotalCharges to a numeric dtype.
# Spaces are stripped first so blank entries become empty strings, which
# pd.to_numeric parses as NaN. The dtype guard makes this cell safe to re-run:
# once the column is already numeric, string methods are skipped instead of
# raising AttributeError on float values.
total_charges = churnData['TotalCharges']
if not pd.api.types.is_numeric_dtype(total_charges):
    total_charges = total_charges.str.replace(" ", "", regex=False)
churnData['TotalCharges'] = pd.to_numeric(total_charges)

In [12]:
# Re-check the dtypes to confirm the TotalCharges conversion took effect.
churnData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

### Check for null values in the dataframe. Replace the null values.

In [20]:
# Fraction of missing values per column, highest first.
nan_share = churnData.isna().sum() / len(churnData)
pd.DataFrame(nan_share, columns=['NaN']).sort_values(by='NaN', ascending=False)

Unnamed: 0,NaN
TotalCharges,0.001562
gender,0.0
SeniorCitizen,0.0
Partner,0.0
Dependents,0.0
tenure,0.0
PhoneService,0.0
OnlineSecurity,0.0
OnlineBackup,0.0
DeviceProtection,0.0


In [21]:
# Value frequencies for TotalCharges, keeping NaN as its own bucket.
churnData['TotalCharges'].value_counts(dropna=False)

NaN        11
20.20      11
19.75       9
19.90       8
20.05       8
           ..
6668.35     1
6096.90     1
140.70      1
797.10      1
5867.00     1
Name: TotalCharges, Length: 6531, dtype: int64

In [22]:
# Impute missing TotalCharges with the column mean (NaN entries are skipped
# when the mean is computed).
mean_total_charges = churnData['TotalCharges'].mean()
churnData['TotalCharges'] = churnData['TotalCharges'].fillna(mean_total_charges)

### Use the following features: tenure, SeniorCitizen, MonthlyCharges and TotalCharges:

#### - Scale the features using either a normalizer or a standard scaler.

In [71]:
# Keep only the four numeric features requested for the model.
feature_columns = ['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']
data = churnData[feature_columns]
data.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges
0,1,0,29.85,29.85
1,34,0,56.95,1889.5
2,2,0,53.85,108.15
3,45,0,42.3,1840.75
4,2,0,70.7,151.65


In [72]:
# Learn each feature's min/max, then rescale every feature to [0, 1].
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so test rows contribute to the min/max - confirm this is intended.
scaler = MinMaxScaler().fit(data)
data_scaled = scaler.transform(data)
data_scaled

array([[0.01388889, 0.        , 0.11542289, 0.0012751 ],
       [0.47222222, 0.        , 0.38507463, 0.21586661],
       [0.02777778, 0.        , 0.35422886, 0.01031041],
       ...,
       [0.15277778, 0.        , 0.11293532, 0.03780868],
       [0.05555556, 1.        , 0.55870647, 0.03321025],
       [0.91666667, 0.        , 0.86965174, 0.78764136]])

#### - Split the data into a training set and a test set.

In [73]:
# Hold out a test set (train_test_split's default proportions); the fixed
# seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data_scaled,
    churnData['Churn'],
    random_state=0,
)

#### - Fit a logistic regression model on the training data.

In [74]:
# Fit a logistic regression on the training split.
# multi_class is intentionally left at its default: the target has two classes,
# and the parameter is deprecated since scikit-learn 1.5 (passing it emits a
# FutureWarning and will stop working once it is removed).
model = LogisticRegression(random_state=0, solver='saga').fit(X_train, y_train)

#### - Check the accuracy on the test data.

In [75]:
# Mean accuracy on the held-out split first, then on the data the model was
# fitted on, for comparison.
test_accuracy = model.score(X_test, y_test)
train_accuracy = model.score(X_train, y_train)
print("test data accuracy was ", test_accuracy)
print("train data accuracy was ", train_accuracy)

test data accuracy was  0.7825099375354913
train data accuracy was  0.7925028398333964


## Managing imbalance in the dataset

#### Check for the imbalance.

In [76]:
# Row count per target class, to see how imbalanced the classes are.
churnData['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

#### Use the resampling strategies used in class for upsampling and downsampling to create a balance between the two classes.

In [77]:
# Split the rows by target class so each class can be resampled on its own.
category_No = churnData[churnData['Churn'] == 'No']
category_Yes = churnData[churnData['Churn'] == 'Yes']

# random_state is fixed in both resample calls so the balanced frames (and
# every score computed from them) are identical after Restart & Run All.

# Downsampling: draw, without replacement, as many 'No' rows as there are 'Yes' rows.
category_No_undersampled = resample(category_No, replace=False, n_samples=len(category_Yes), random_state=0)
churnData_downsampled = pd.concat([category_No_undersampled, category_Yes], axis=0)

# Upsampling: draw, with replacement, as many 'Yes' rows as there are 'No' rows.
category_Yes_oversampled = resample(category_Yes, replace=True, n_samples=len(category_No), random_state=0)
churnData_upsampled = pd.concat([category_No, category_Yes_oversampled], axis=0)

#### Each time, fit the model and check its accuracy.

In [78]:
# Features and target taken from the downsampled (balanced) frame.
X_down = churnData_downsampled[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']].copy()
y_down = churnData_downsampled['Churn'].copy()

# Re-fits the shared `scaler` on the downsampled features.
X_down_scaled = scaler.fit_transform(X_down)

X_train, X_test, y_train, y_test = train_test_split(X_down_scaled, y_down, random_state=0)

# multi_class is left at its default: the target has two classes, and the
# parameter is deprecated since scikit-learn 1.5.
model = LogisticRegression(random_state=0, solver='saga').fit(X_train, y_train)

# NOTE(review): the test split comes from the balanced frame, so its accuracy is
# not directly comparable to the accuracy measured on the original class mix.
print("test data accuracy was ",model.score(X_test,y_test))
print("train data accuracy was ",model.score(X_train,y_train))

test data accuracy was  0.7165775401069518
train data accuracy was  0.7381377095968605


In [79]:
# Features and target taken from the upsampled (balanced) frame.
X_up = churnData_upsampled[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']].copy()
y_up = churnData_upsampled['Churn'].copy()

# Re-fits the shared `scaler` on the upsampled features.
X_up_scaled = scaler.fit_transform(X_up)

# NOTE(review): the 'Yes' rows were duplicated (sampled with replacement) before
# this split, so copies of the same customer can land in both train and test and
# make the test accuracy optimistic. Splitting before resampling would avoid this.
X_train, X_test, y_train, y_test = train_test_split(X_up_scaled, y_up, random_state=0)

# multi_class is left at its default: the target has two classes, and the
# parameter is deprecated since scikit-learn 1.5.
model = LogisticRegression(random_state=0, solver='saga').fit(X_train, y_train)

print("test data accuracy was ",model.score(X_test,y_test))
print("train data accuracy was ",model.score(X_train,y_train))

test data accuracy was  0.7197526091998454
train data accuracy was  0.7343125885839453
