In [1]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [2]:
# Load the semicolon-delimited bank file; the literal string "unknown"
# is read as a missing value (NaN) in every column.
bank = pd.read_csv(
    "bank-full.csv",
    sep=";",
    na_values="unknown",
)
bank.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,,no
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,,no
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,,no


In [3]:
# (rows, columns) of the frame as loaded, before any cleaning.
bank.shape

(45211, 17)

In [4]:
# Column labels of the loaded frame; "y" is the target used later on.
bank.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [5]:
# Encode the yes/no columns (including the target "y") as 0/1.
yes_no_codes = {"no": 0, "yes": 1}
for binary_col in ("default", "housing", "loan", "y"):
    bank[binary_col] = bank[binary_col].map(yes_no_codes)

# Education is encoded as an ordinal level; values outside the mapping
# (here the missing ones) stay NaN.
education_levels = {"primary": 0, "secondary": 1, "tertiary": 2}
bank["education"] = bank["education"].map(education_levels)

# Turn the abbreviated month name (e.g. "may") into its month number.
bank["month"] = pd.to_datetime(bank["month"], format="%b").dt.month

# Missing-value count per column.
bank.isna().sum()


age              0
job            288
marital          0
education     1857
default          0
balance          0
housing          0
loan             0
contact      13020
day              0
month            0
duration         0
campaign         0
pdays            0
previous         0
poutcome     36959
y                0
dtype: int64

In [6]:
# Remove the 'poutcome' and 'contact' columns, then discard every row
# that still contains a missing value.
bank = bank.drop(columns=["poutcome", "contact"]).dropna()

# Convert the remaining categorical columns to dummy variables,
# dropping the first level of each.
bank = pd.get_dummies(bank, drop_first=True)

# Class counts of the target after cleaning.
bank["y"].value_counts()

0    38172
1     5021
Name: y, dtype: int64

## Imbalanced

In [7]:
# Separate the predictors from the 0/1 target column "y".
X = bank.drop(columns="y")
y = bank["y"]

# Seeded, stratified split so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)

# Class counts in the training part.
y_train.value_counts()

0    28628
1     3766
Name: y, dtype: int64

In [8]:
# Class counts in the held-out test part of the split.
y_test.value_counts()

0    9544
1    1255
Name: y, dtype: int64

In [9]:
# Baseline: fit an SVM classifier on the original (imbalanced) training split.
# SVC is imported once in the notebook's top import cell instead of per cell.
model = SVC()
model.fit(X_train, y_train)

# Predict on the untouched test split.
y_pred = model.predict(X_test)

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_test, y_pred)

array([[9537,    7],
       [1244,   11]])

In [10]:
# Accuracy and recall of the baseline model's predictions on the test split.
baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_recall = recall_score(y_test, y_pred)
print("Accuracy: ", baseline_accuracy)
print("Recall: ", baseline_recall)

Accuracy:  0.8841559403648486
Recall:  0.008764940239043825


## SMOTE

In [11]:
# Re-create the same seeded split so this cell always starts from the
# original training data, regardless of which cells ran before it.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, stratify=y)

# Oversample the training split only; the test split is left untouched.
# random_state makes the synthetic samples reproducible across runs.
smt = SMOTE(random_state = 1)
# fit_resample replaces fit_sample, which imbalanced-learn no longer provides.
X_train, y_train = smt.fit_resample(X_train, y_train)

# Class counts of the training labels after oversampling.
np.bincount(y_train)

array([28628, 28628])

In [12]:
# Test-set class counts: only the training split was resampled above.
np.bincount(y_test)

array([9544, 1255])

In [13]:
# Fit a fresh SVM classifier on the SMOTE-resampled training data.
# SVC is imported once in the notebook's top import cell instead of per cell.
model = SVC()
model.fit(X_train, y_train)

# Predict on the untouched test split.
y_pred = model.predict(X_test)

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_test, y_pred)

array([[7574, 1970],
       [ 379,  876]])

In [14]:
# Accuracy and recall of the SMOTE-trained model's predictions on the test split.
smote_accuracy = accuracy_score(y_test, y_pred)
smote_recall = recall_score(y_test, y_pred)
print("Accuracy: ", smote_accuracy)
print("Recall: ", smote_recall)

Accuracy:  0.7824798592462265
Recall:  0.69800796812749


## NearMiss

In [15]:
# Re-create the same seeded split so this cell starts from the original
# training data rather than the resampled data left by an earlier section.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, stratify=y)

# Undersample the training split only; the test split is left untouched.
nr = NearMiss()
# fit_resample replaces fit_sample, which imbalanced-learn no longer provides.
X_train, y_train = nr.fit_resample(X_train, y_train)

# Class counts of the training labels after undersampling.
np.bincount(y_train)

array([3766, 3766])

In [16]:
# Test-set class counts: only the training split was resampled above.
np.bincount(y_test)

array([9544, 1255])

In [17]:
# Fit a fresh SVM classifier on the NearMiss-undersampled training data.
# SVC is imported once in the notebook's top import cell instead of per cell.
model = SVC()
model.fit(X_train, y_train)

# Predict on the untouched test split.
y_pred = model.predict(X_test)

# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_test, y_pred)


array([[4949, 4595],
       [ 212, 1043]])

In [18]:
# Accuracy and recall of the NearMiss-trained model's predictions on the test split.
nearmiss_accuracy = accuracy_score(y_test, y_pred)
nearmiss_recall = recall_score(y_test, y_pred)
print("Accuracy: ", nearmiss_accuracy)
print("Recall: ", nearmiss_recall)

Accuracy:  0.5548661913140106
Recall:  0.8310756972111554
