# Binary Classification with a Bank Dataset
link: https://www.kaggle.com/competitions/playground-series-s5e8/data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

import csv
import os
import pickle

import warnings
warnings.filterwarnings('ignore')


In [2]:
df = pd.read_csv("train.csv", index_col="id")
df.head()

Unnamed: 0_level_0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown,0
1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown,0
2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown,0
3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown,0
4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown,1


In [3]:
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y
count,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0
mean,40.926395,1204.067397,16.117209,256.229144,2.577008,22.412733,0.298545,0.120651
std,10.098829,2836.096759,8.250832,272.555662,2.718514,77.319998,1.335926,0.325721
min,18.0,-8019.0,1.0,1.0,1.0,-1.0,0.0,0.0
25%,33.0,0.0,9.0,91.0,1.0,-1.0,0.0,0.0
50%,39.0,634.0,17.0,133.0,2.0,-1.0,0.0,0.0
75%,48.0,1390.0,21.0,361.0,3.0,-1.0,0.0,0.0
max,95.0,99717.0,31.0,4918.0,63.0,871.0,200.0,1.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 17 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   age        750000 non-null  int64 
 1   job        750000 non-null  object
 2   marital    750000 non-null  object
 3   education  750000 non-null  object
 4   default    750000 non-null  object
 5   balance    750000 non-null  int64 
 6   housing    750000 non-null  object
 7   loan       750000 non-null  object
 8   contact    750000 non-null  object
 9   day        750000 non-null  int64 
 10  month      750000 non-null  object
 11  duration   750000 non-null  int64 
 12  campaign   750000 non-null  int64 
 13  pdays      750000 non-null  int64 
 14  previous   750000 non-null  int64 
 15  poutcome   750000 non-null  object
 16  y          750000 non-null  int64 
dtypes: int64(8), object(9)
memory usage: 103.0+ MB


In [5]:
df.shape

(750000, 17)

In [6]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [7]:
df['poutcome'].unique()

array(['unknown', 'other', 'failure', 'success'], dtype=object)

In [8]:
df_unknown_col = [col for col in df.columns if "unknown" in df[col].unique()]
df_unknown_col

['job', 'education', 'contact', 'poutcome']

In [9]:
[df[col].value_counts() for col in df_unknown_col]

[job
 management       175541
 blue-collar      170498
 technician       138107
 admin.            81492
 services          64209
 retired           35185
 self-employed     19020
 entrepreneur      17718
 unemployed        17634
 housemaid         15912
 student           11767
 unknown            2917
 Name: count, dtype: int64,
 education
 secondary    401683
 tertiary     227508
 primary       99510
 unknown       21299
 Name: count, dtype: int64,
 contact
 cellular     486655
 unknown      231627
 telephone     31718
 Name: count, dtype: int64,
 poutcome
 unknown    672450
 failure     45115
 success     17691
 other       14744
 Name: count, dtype: int64]

The value "unknown" seems significant in features such as Education, contact, poutcome.

In [10]:
df["y"].value_counts()

y
0    659512
1     90488
Name: count, dtype: int64

In [11]:
df["y"].value_counts() / df.shape[0] *100

y
0    87.934933
1    12.065067
Name: count, dtype: float64

There exist a imbalance of data. Majority of the data consist of y label 0 comprising an astonishing 87%. 

In [12]:
[f"{col}: {len(df[col].unique())}" for col in df.columns]

['age: 78',
 'job: 12',
 'marital: 3',
 'education: 4',
 'default: 2',
 'balance: 8217',
 'housing: 2',
 'loan: 2',
 'contact: 3',
 'day: 31',
 'month: 12',
 'duration: 1760',
 'campaign: 52',
 'pdays: 596',
 'previous: 50',
 'poutcome: 4',
 'y: 2']

In [13]:
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y             int64
dtype: object

In [14]:
numerical_features = [feature for feature in df.columns if df[feature].dtype != 'O' and feature != "y"]
categorical_features = [feature for feature in df.columns if df[feature].dtype == "O" and feature != "y"]
numerical_features, categorical_features

(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'],
 ['job',
  'marital',
  'education',
  'default',
  'housing',
  'loan',
  'contact',
  'month',
  'poutcome'])

In [15]:
df.shape

(750000, 17)

In [16]:
X = df.drop(["y"], axis=1)
y = df["y"]
X.shape, y.shape

((750000, 16), (750000,))

In [17]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

## Numerical

In [18]:
# for feature in numerical_features:
#     sns.kdeplot(data=df, x=feature, hue='y')
#     plt.show()

Normalization is required for most of them as they are left-skewed

In [19]:
# for feature in numerical_features:
#   data = df.copy()
#   data[feature].hist(bins=25)
#   plt.xlabel(feature)
#   plt.ylabel("Count")
#   plt.title(feature)
#   plt.show()

In [20]:
# for feature in numerical_features:
#   data = df.copy()

#   if 0 in data[feature].unique(): 
#     pass
#   else:
#     data[feature] = data[feature]
#     data.boxplot(column=feature)
#     plt.ylabel(feature)
#     plt.title(feature)
#     plt.show()

As evident there exist several outliers in the nuemrical columns, we have to remove them

In [21]:
df[numerical_features].describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0
mean,40.926395,1204.067397,16.117209,256.229144,2.577008,22.412733,0.298545
std,10.098829,2836.096759,8.250832,272.555662,2.718514,77.319998,1.335926
min,18.0,-8019.0,1.0,1.0,1.0,-1.0,0.0
25%,33.0,0.0,9.0,91.0,1.0,-1.0,0.0
50%,39.0,634.0,17.0,133.0,2.0,-1.0,0.0
75%,48.0,1390.0,21.0,361.0,3.0,-1.0,0.0
max,95.0,99717.0,31.0,4918.0,63.0,871.0,200.0


In [22]:
# Scaling Standard scaler 
scaler = StandardScaler()
scaler.fit(df[numerical_features])


In [23]:
def create_dir(dir):
    if os.path.exists(dir):
        pass
    else:
        os.makedirs(dir)

In [24]:
def save_preprocessing_tools(tool, file_path):
    create_dir('preprocessing')
    with open(file_path, 'wb') as file:
        pickle.dump(tool, file)

In [25]:
save_preprocessing_tools(
    scaler, 
    "preprocessing/scaler.pkl"
)

## Categorical

In [26]:
# for feature in categorical_features:
#     plt.figure(figsize=(12,5))
#     sns.histplot(data=df, x=feature, hue='y')
#     plt.show()

In [27]:
label_encoder = LabelEncoder()

for feature in categorical_features:
    df[feature] = label_encoder.fit_transform(df[feature])


In [28]:
save_preprocessing_tools(
    label_encoder, 
    "preprocessing/label_encoder.pkl"
)

## Misbalanced Dataset

In [29]:
df.head()

Unnamed: 0_level_0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,42,9,1,1,0,7,0,0,0,25,1,117,3,-1,0,3,0
1,38,1,1,1,0,514,0,0,2,18,6,185,1,-1,0,3,0
2,36,1,1,1,0,602,1,0,2,14,8,111,2,-1,0,3,0
3,27,8,2,1,0,34,1,0,2,28,8,10,2,-1,0,3,0
4,26,9,1,1,0,889,1,0,0,3,3,902,1,-1,0,3,1


In [30]:
# df = df.reset_index()
X = df.drop(["y"], axis=1)
X.reset_index()
y = df[ 'y']

In [53]:
y.head(), type(y)

(id
 0    0
 1    0
 2    0
 3    0
 4    1
 Name: y, dtype: int64,
 pandas.core.series.Series)

In [49]:
X.head()

Unnamed: 0_level_0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,42,9,1,1,0,7,0,0,0,25,1,117,3,-1,0,3
1,38,1,1,1,0,514,0,0,2,18,6,185,1,-1,0,3
2,36,1,1,1,0,602,1,0,2,14,8,111,2,-1,0,3
3,27,8,2,1,0,34,1,0,2,28,8,10,2,-1,0,3
4,26,9,1,1,0,889,1,0,0,3,3,902,1,-1,0,3


In [65]:
# creating a new folder for storing modified x_train and y_train data

def save_training_data(x_train, y_train, name ,dir='modified_training_data'):
    folder_dir = os.path.join(dir, name)
    
    create_dir(folder_dir)

    path_xtrain = os.path.join(folder_dir, 'x_train.csv')
    path_ytrain = os.path.join(folder_dir, 'y_train.csv')

    x_train.to_csv(path_xtrain, index=False) 
    y_train.to_csv(path_ytrain, index=False)

    print('Data saved successfully')

In [66]:
# train test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.1)

save_training_data(X_train, y_train, 'base')
save_training_data(X_test, y_test, 'test')

X_train.shape, X_test.shape

Data saved successfully
Data saved successfully


((675000, 16), (75000, 16))

In [67]:
Counter(y_test), Counter(y_train)

(Counter({0: 65951, 1: 9049}), Counter({0: 593561, 1: 81439}))

This is notably imbalanced, employing imbalanced data handling techniques

In [68]:
X.columns 

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome'],
      dtype='object')

In [69]:
y.name

'y'

In [70]:
X.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome'],
      dtype='object')

In [71]:
# Approach 1: Random Sampling
# Oversampling using RandomOverSampler
oversample = RandomOverSampler(sampling_strategy='minority')
X_over, y_over = oversample.fit_resample(X, y)
print("Oversampled class distribution:", Counter(y_over))

save_training_data(X_over, y_over, "oversampling")

# Undersampling using RandomUnderSampler
undersample = RandomUnderSampler(sampling_strategy='majority')
X_under, y_under = undersample.fit_resample(X, y)
print("Undersampled class distribution:", Counter(y_under))

save_training_data(X_under, y_under, "undersampling")

Oversampled class distribution: Counter({0: 659512, 1: 659512})
Data saved successfully
Undersampled class distribution: Counter({0: 90488, 1: 90488})
Data saved successfully


In [72]:
X.head()

Unnamed: 0_level_0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,42,9,1,1,0,7,0,0,0,25,1,117,3,-1,0,3
1,38,1,1,1,0,514,0,0,2,18,6,185,1,-1,0,3
2,36,1,1,1,0,602,1,0,2,14,8,111,2,-1,0,3
3,27,8,2,1,0,34,1,0,2,28,8,10,2,-1,0,3
4,26,9,1,1,0,889,1,0,0,3,3,902,1,-1,0,3


In [73]:
y_over.head()

0    0
1    0
2    0
3    0
4    1
Name: y, dtype: int64

In [74]:
X_under.head()

Unnamed: 0_level_0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
277281,54,4,0,2,0,990,0,1,0,14,5,71,3,-1,0,3
174899,29,1,1,1,0,1761,0,1,0,21,5,50,1,-1,0,3
26155,33,9,0,1,0,4,1,0,2,21,8,76,27,-1,0,3
391775,54,5,1,1,0,13,1,0,2,5,8,117,1,-1,0,3
536518,57,9,1,3,0,526,0,1,0,11,1,97,8,-1,0,3


In [75]:
# Approach 2: SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

save_training_data(X_train_resampled, y_train_resampled, 'smote')

Data saved successfully


In [76]:
X_train_resampled.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,32,4,1,2,0,1820,1,1,0,17,5,493,3,-1,0,3
1,45,9,0,3,0,456,0,0,0,7,5,1448,2,-1,0,3
2,46,4,1,2,0,705,0,0,0,5,3,108,3,-1,0,3
3,36,0,2,1,0,-158,1,0,2,28,8,399,1,-1,0,3
4,34,4,1,2,0,4928,1,0,2,21,8,577,1,-1,0,3


In [77]:
Counter(y_train_resampled)

Counter({0: 593561, 1: 593561})

In [78]:
print("Pipeline is successfully executed!")

Pipeline is successfully executed!
