In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
bankdata=pd.read_csv('bank-full.csv')
# u can use delimiter=';' or  sep=';'

In [3]:
bankdata.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,Target
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
bankdata.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [5]:
bankdata.shape

(45211, 17)

In [6]:
bankdata.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'Target'],
      dtype='object')

In [7]:
bankdata.isna().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
Target       0
dtype: int64

In [8]:
bankdata.nunique()

age            77
job            12
marital         3
education       4
default         2
balance      7168
housing         2
loan            2
contact         3
day            31
month          12
duration     1573
campaign       48
pdays         559
previous       41
poutcome        4
Target          2
dtype: int64

In [9]:
bankdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  Target     45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [10]:
# One-Hot Encoding of categrical variables
bank1=pd.get_dummies(bankdata,columns=['job','marital','default','housing','loan','education','contact','poutcome','month'],dtype=int)
bank1['Target']=bank1['Target'].replace({'yes':1, 'no':0})

bank1

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,Target,job_admin.,job_blue-collar,...,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep
0,58,2143,5,261,1,-1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,44,29,5,151,1,-1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,33,2,5,76,1,-1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,47,1506,5,92,1,-1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,33,1,5,198,1,-1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,825,17,977,3,-1,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
45207,71,1729,17,456,2,-1,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
45208,72,5715,17,1127,5,184,3,1,0,0,...,0,0,0,0,0,0,0,1,0,0
45209,57,668,17,508,4,-1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [11]:
bank1['Target'].value_counts()

Target
0    39922
1     5289
Name: count, dtype: int64

In [12]:
bank1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 52 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   age                  45211 non-null  int64
 1   balance              45211 non-null  int64
 2   day                  45211 non-null  int64
 3   duration             45211 non-null  int64
 4   campaign             45211 non-null  int64
 5   pdays                45211 non-null  int64
 6   previous             45211 non-null  int64
 7   Target               45211 non-null  int64
 8   job_admin.           45211 non-null  int32
 9   job_blue-collar      45211 non-null  int32
 10  job_entrepreneur     45211 non-null  int32
 11  job_housemaid        45211 non-null  int32
 12  job_management       45211 non-null  int32
 13  job_retired          45211 non-null  int32
 14  job_self-employed    45211 non-null  int32
 15  job_services         45211 non-null  int32
 16  job_student          4

In [13]:
# Dividing our data into input and output variables
x=bank1.drop("Target",axis=1)
y=bank1["Target"]

In [14]:
print(x.shape)
print(y.shape)

(45211, 51)
(45211,)


### COnvert to standard scaler


In [15]:
scaler = StandardScaler()

In [16]:
x_tranform = scaler.fit_transform(x)

In [17]:
x_tranform

array([[ 1.60696496,  0.25641925, -1.29847633, ..., -0.31026348,
        -0.12881901, -0.113898  ],
       [ 0.28852927, -0.43789469, -1.29847633, ..., -0.31026348,
        -0.12881901, -0.113898  ],
       [-0.74738448, -0.44676247, -1.29847633, ..., -0.31026348,
        -0.12881901, -0.113898  ],
       ...,
       [ 2.92540065,  1.42959305,  0.14341818, ...,  3.22306705,
        -0.12881901, -0.113898  ],
       [ 1.51279098, -0.22802402,  0.14341818, ...,  3.22306705,
        -0.12881901, -0.113898  ],
       [-0.37068857,  0.52836436,  0.14341818, ...,  3.22306705,
        -0.12881901, -0.113898  ]])

In [18]:
# Divide the data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(x_tranform, y, test_size=0.2, stratify=y, random_state=2)

In [19]:
print(X_train.shape, X_test.shape)
print(Y_train.shape, Y_test.shape)

(36168, 51) (9043, 51)
(36168,) (9043,)


### Model BUilding

In [20]:
classifier=LogisticRegression()
classifier.fit(X_train,Y_train)

In [21]:
# Predict for x dataset
y_predict=classifier.predict(X_train)
y_predict

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [22]:
y_pred_df=pd.DataFrame({'actual_y':Y_train,'y_pred_prob':y_predict})
y_pred_df

Unnamed: 0,actual_y,y_pred_prob
20300,0,0
37983,0,0
29466,0,0
43517,1,1
9388,0,0
...,...,...
19797,0,0
37672,0,0
4869,0,0
12426,0,0


#### Testing Model Accuracy

In [23]:
# Confusion Matrix for the model accuracy
conf_mat=confusion_matrix(Y_train,y_predict)
conf_mat

array([[31158,   779],
       [ 2752,  1479]], dtype=int64)

In [24]:
acc = accuracy_score(Y_train, y_predict)
acc

0.9023722627737226

In [25]:
# Predict for x dataset
y_predict=classifier.predict(X_test)
y_predict

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [26]:
acc = accuracy_score(Y_test, y_predict)
acc

0.9004755059161783

## Handling the Imbalanced dataset

In [27]:
class_0, class_1 = bank1["Target"].value_counts()
print(class_0, class_1)
class_0_df = bank1[bank1['Target']== 0]
class_1_df = bank1[bank1['Target']== 1]
print(class_0_df.shape, class_1_df.shape)

39922 5289
(39922, 52) (5289, 52)


### Undersampling: 
Undersampling can be defined as removing some observations of the majority class. This is done until the majority and minority class is balanced out.

In [28]:
class_0_under = class_0_df.sample(class_1)
print(class_0_under.shape)
bank_under = pd.concat([class_0_under, class_1_df],axis = 0)

(5289, 52)


In [29]:
bank_under['Target'].value_counts()

Target
0    5289
1    5289
Name: count, dtype: int64

In [30]:
X = bank_under.drop("Target", axis = 1)
Y = bank_under["Target"]

In [31]:
X.shape, Y.shape

((10578, 51), (10578,))

In [32]:
# Perform Standardization
X_tranform_under = scaler.fit_transform(X)
X_tranform_under.shape

(10578, 51)

In [33]:
# Model Training
# Divide the data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X_tranform_under, Y, test_size=0.2, stratify=Y, random_state=2)

In [34]:
classifier_under=LogisticRegression()
classifier_under.fit(X_train,Y_train)

In [35]:
# Predict for x dataset
y_predict=classifier.predict(X_test)
y_predict

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [36]:
acc_under = accuracy_score(Y_test, y_predict)
acc_under

0.5458412098298677

### Over-smapling
Oversampling can be defined as adding more copies to the minority class. Oversampling can be a good choice when you don’t have a ton of data to work with.

In [37]:
class_1_over = class_1_df.sample(class_0, replace =True)
print(class_1_over.shape)
bank_over = pd.concat([class_1_over, class_0_df],axis = 0)

(39922, 52)


In [38]:
bank_over['Target'].value_counts()

Target
1    39922
0    39922
Name: count, dtype: int64

In [39]:
X = bank_over.drop("Target", axis = 1)
Y = bank_over["Target"]

In [40]:
# Perform Standardization
X_tranform_over = scaler.fit_transform(X)
X_tranform_over.shape

(79844, 51)

In [41]:
# Model Training
# Divide the data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X_tranform_over, Y, test_size=0.2, stratify=Y, random_state=2)

In [42]:
classifier_over=LogisticRegression()
classifier_over.fit(X_train,Y_train)

In [43]:
y_predict=classifier.predict(X_test)
acc_over = accuracy_score(Y_test, y_predict)
acc_over

0.5421754649633665

In [44]:
from xgboost import XGBClassifier

In [45]:
xgb_model = XGBClassifier().fit(X_train, Y_train)

In [46]:
xgb_pred = xgb_model.predict(X_test)
xgb_acc = accuracy_score(Y_test,xgb_pred)
xgb_acc

0.9247917840816582

### Synthetic Minority Oversampling Technique (SMOTE)
SMOTE (Synthetic Minority Oversampling Technique) works by randomly picking a point from the minority class and computing the k-nearest neighbors for this point. The synthetic points are added between the chosen point and its neighbors.

SMOTE algorithm works in 4 simple steps:

- Choose a minority class as the input vector.
- Find its k nearest neighbors (k_neighbors is specified as an argument in the SMOTE() function).
- Choose one of these neighbors and place a synthetic point anywhere on the line joining the point under consideration and its chosen neighbor.
- Repeat the steps until the data is balanced.

In [None]:
from imblearn.over_sampling import SMOTE