In [1]:
## Lab | Handling Data Imbalance in Classification Models

In [2]:
# who will donate (TargetB)
# how much they will give (TargetD)

In [3]:

# Import the required libraries and modules that you would need.

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [4]:
#the data is already cleand 
numerical=pd.read_csv("numerical.csv")
categorical=pd.read_csv("categorical.csv")
target=pd.read_csv("target.csv")

In [5]:
numerical.dtypes

TCODE         int64
AGE         float64
INCOME        int64
WEALTH1       int64
HIT           int64
             ...   
AVGGIFT     float64
CONTROLN      int64
HPHONE_D      int64
RFA_2F        int64
CLUSTER2      int64
Length: 315, dtype: object

In [6]:
categorical.dtypes

STATE           object
CLUSTER          int64
HOMEOWNR        object
GENDER          object
DATASRCE         int64
RFA_2R          object
RFA_2A          object
GEOCODE2        object
DOMAIN_A        object
DOMAIN_B         int64
ODATEW_YR        int64
ODATEW_MM        int64
DOB_YR           int64
DOB_MM           int64
MINRDATE_YR      int64
MINRDATE_MM      int64
MAXRDATE_YR      int64
MAXRDATE_MM      int64
LASTDATE_YR      int64
LASTDATE_MM      int64
FIRSTDATE_YR     int64
FIRSTDATE_MM     int64
dtype: object

In [7]:
#target_B is our first target to calculate
target=target.drop(columns="TARGET_D")

In [8]:
numerical.head(1)

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,0,60.0,5,9,0,0,39,34,18,10,...,14,5.0,12.0,10.0,4,7.741935,95515,0,4,39


In [9]:
# Scale the features either by using normalizer or a standard scaler.

scaler = MinMaxScaler()

for col in numerical.columns:
    numerical[col] = scaler.fit_transform(numerical[[col]])

numerical

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,0.000000,0.608247,0.666667,1.000000,0.000000,0.000000,0.393939,0.343434,0.181818,0.101010,...,0.341463,0.005,0.001401,0.010,0.003676,0.006465,0.498045,0.0,1.000000,0.622951
1,0.000014,0.463918,0.833333,1.000000,0.066390,0.000000,0.151515,0.555556,0.111111,0.060606,...,0.024390,0.010,0.004004,0.025,0.016544,0.014399,0.774510,0.0,0.333333,0.000000
2,0.000014,0.624862,0.333333,0.111111,0.008299,0.000000,0.202020,0.292929,0.333333,0.060606,...,0.341463,0.002,0.002202,0.005,0.011029,0.006204,0.078617,1.0,1.000000,0.967213
3,0.000000,0.711340,0.000000,0.444444,0.008299,0.000000,0.232323,0.141414,0.313131,0.030303,...,0.170732,0.002,0.001201,0.010,0.008272,0.005534,0.899764,1.0,1.000000,0.655738
4,0.000000,0.793814,0.333333,0.222222,0.248963,0.010101,0.282828,0.090909,0.535354,0.262626,...,0.195122,0.003,0.002002,0.015,0.012868,0.005586,0.037079,1.0,0.333333,0.409836
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,0.000014,0.624862,0.666667,1.000000,0.000000,0.141414,0.363636,0.474747,0.111111,0.070707,...,0.000000,0.025,0.004004,0.025,0.008272,0.023745,0.962399,0.0,0.000000,0.180328
95408,0.000014,0.484536,1.000000,1.000000,0.004149,0.000000,0.313131,0.434343,0.191919,0.040404,...,0.000000,0.020,0.003003,0.020,0.008272,0.018738,0.639828,1.0,0.000000,0.016393
95409,0.000014,0.608247,0.666667,1.000000,0.000000,0.000000,0.181818,0.464646,0.202020,0.070707,...,0.097561,0.003,0.001001,0.010,0.002757,0.007009,0.988852,1.0,0.666667,0.540984
95410,0.000000,0.587629,1.000000,1.000000,0.000000,0.000000,0.282828,0.353535,0.202020,0.090909,...,0.439024,0.005,0.003203,0.018,0.003676,0.010875,0.024466,1.0,1.000000,0.163934


In [10]:
# Encode the categorical features using One-Hot Encoding or Ordinal Encoding. 

categorical.nunique()

STATE           12
CLUSTER         53
HOMEOWNR         2
GENDER           3
DATASRCE         3
RFA_2R           1
RFA_2A           4
GEOCODE2         4
DOMAIN_A         5
DOMAIN_B         4
ODATEW_YR       15
ODATEW_MM       12
DOB_YR          96
DOB_MM          12
MINRDATE_YR     20
MINRDATE_MM     12
MAXRDATE_YR     18
MAXRDATE_MM     12
LASTDATE_YR      3
LASTDATE_MM     12
FIRSTDATE_YR    26
FIRSTDATE_MM    12
dtype: int64

In [11]:
categorical.columns

Index(['STATE', 'CLUSTER', 'HOMEOWNR', 'GENDER', 'DATASRCE', 'RFA_2R',
       'RFA_2A', 'GEOCODE2', 'DOMAIN_A', 'DOMAIN_B', 'ODATEW_YR', 'ODATEW_MM',
       'DOB_YR', 'DOB_MM', 'MINRDATE_YR', 'MINRDATE_MM', 'MAXRDATE_YR',
       'MAXRDATE_MM', 'LASTDATE_YR', 'LASTDATE_MM', 'FIRSTDATE_YR',
       'FIRSTDATE_MM'],
      dtype='object')

In [12]:
cols_to_drop = list(('ODATEW_YR', 'ODATEW_MM',
       'DOB_YR', 'DOB_MM', 'MINRDATE_YR', 'MINRDATE_MM', 'MAXRDATE_YR',
       'MAXRDATE_MM', 'LASTDATE_YR', 'LASTDATE_MM', 'FIRSTDATE_YR',
       'FIRSTDATE_MM','CLUSTER'))

In [13]:
categorical=categorical.drop(columns=cols_to_drop,axis=1)

In [14]:
categorical.nunique()

STATE       12
HOMEOWNR     2
GENDER       3
DATASRCE     3
RFA_2R       1
RFA_2A       4
GEOCODE2     4
DOMAIN_A     5
DOMAIN_B     4
dtype: int64

In [15]:
one_hot_names = []
for col in categorical.columns:
    col_uniques = sorted(categorical[col].astype(str).unique())
    for unique in col_uniques:
        one_hot_names.append(col+"_"+unique)
        
categorical = pd.DataFrame(OneHotEncoder().fit_transform(categorical.astype(str)).toarray())
categorical.columns = one_hot_names
categorical

Unnamed: 0,STATE_CA,STATE_FL,STATE_GA,STATE_IL,STATE_IN,STATE_MI,STATE_MO,STATE_NC,STATE_TX,STATE_WA,...,GEOCODE2_D,DOMAIN_A_C,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,DOMAIN_B_1,DOMAIN_B_2,DOMAIN_B_3,DOMAIN_B_4
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
95408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
95409,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
95410,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [16]:
# CONCATENTE
donors=pd.DataFrame(np.concatenate((numerical,categorical),axis=1)).reset_index(drop=True)
donors.columns = list(numerical.columns) + list(categorical.columns)
donors

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,GEOCODE2_D,DOMAIN_A_C,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,DOMAIN_B_1,DOMAIN_B_2,DOMAIN_B_3,DOMAIN_B_4
0,0.000000,0.608247,0.666667,1.000000,0.000000,0.000000,0.393939,0.343434,0.181818,0.101010,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,0.000014,0.463918,0.833333,1.000000,0.066390,0.000000,0.151515,0.555556,0.111111,0.060606,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.000014,0.624862,0.333333,0.111111,0.008299,0.000000,0.202020,0.292929,0.333333,0.060606,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.000000,0.711340,0.000000,0.444444,0.008299,0.000000,0.232323,0.141414,0.313131,0.030303,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.000000,0.793814,0.333333,0.222222,0.248963,0.010101,0.282828,0.090909,0.535354,0.262626,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,0.000014,0.624862,0.666667,1.000000,0.000000,0.141414,0.363636,0.474747,0.111111,0.070707,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
95408,0.000014,0.484536,1.000000,1.000000,0.004149,0.000000,0.313131,0.434343,0.191919,0.040404,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
95409,0.000014,0.608247,0.666667,1.000000,0.000000,0.000000,0.181818,0.464646,0.202020,0.070707,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
95410,0.000000,0.587629,1.000000,1.000000,0.000000,0.000000,0.282828,0.353535,0.202020,0.090909,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [17]:
## apply train-test split and Logistic regression model

X_train, X_test, y_train, y_test = train_test_split(donors, target, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [18]:
## checking the accuracy on the test data

accuracy = model.score(X_test, y_test)
accuracy

0.9487501965099827

In [19]:
#Managing imbalance in the dataset

In [20]:
target.value_counts()

TARGET_B
0           90569
1            4843
dtype: int64

In [21]:
## undersampling

rus=RandomUnderSampler(random_state=0)
x_resampled,y_resampled=rus.fit_resample(donors,target)

y_resampled.value_counts()

TARGET_B
0           4843
1           4843
dtype: int64

In [22]:
# checking accuracy
X_train, X_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.5773993808049536

In [23]:
### oversampling

smote = SMOTE()

x_resampled,y_resampled=smote.fit_resample(donors,target)
y_resampled.value_counts()

TARGET_B
0           90569
1           90569
dtype: int64

In [24]:
# checking accuracy
X_train, X_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.6077895550403003