## Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
from imblearn.over_sampling import SMOTE
from joblib import dump, load

## Read Data

In [2]:
# Source CSV, hosted on S3. The file is semicolon-delimited, hence sep=";".
BANK_DATA_URL = 'https://sagemaker-brae-data.s3.eu-west-2.amazonaws.com/bank-full.csv'
df = pd.read_csv(BANK_DATA_URL, sep=";")

## Clean Data

In [3]:
# Summary statistics for every column: include='all' reports the numeric
# aggregates (mean, std, quartiles) alongside the categorical ones
# (unique, top, freq) in a single table.
df.describe(include='all')

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
count,45211.0,45211,45211,45211,45211,45211.0,45211,45211,45211,45211.0,45211,45211.0,45211.0,45211.0,45211.0,45211,45211
unique,,12,3,4,2,,2,2,3,,12,,,,,4,2
top,,blue-collar,married,secondary,no,,yes,no,cellular,,may,,,,,unknown,no
freq,,9732,27214,23202,44396,,25130,37967,29285,,13766,,,,,36959,39922
mean,40.93621,,,,,1362.272058,,,,15.806419,,258.16308,2.763841,40.197828,0.580323,,
std,10.618762,,,,,3044.765829,,,,8.322476,,257.527812,3.098021,100.128746,2.303441,,
min,18.0,,,,,-8019.0,,,,1.0,,0.0,1.0,-1.0,0.0,,
25%,33.0,,,,,72.0,,,,8.0,,103.0,1.0,-1.0,0.0,,
50%,39.0,,,,,448.0,,,,16.0,,180.0,2.0,-1.0,0.0,,
75%,48.0,,,,,1428.0,,,,21.0,,319.0,3.0,-1.0,0.0,,


In [4]:
# Preview the first five rows to eyeball the raw values.
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [5]:
# Inspect the dtype pandas inferred for each column, to see which columns
# were read as numeric and which as object (string) data.
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [6]:
# Cast the day column to object dtype: it is listed among the categorical
# features (cat_feature_cols) rather than the numeric ones.
day_as_object = df['day'].astype(object)
df['day'] = day_as_object

In [7]:
# Count missing values per column (axis=0 sums down each column).
null_mask = df.isnull()
null_mask.sum(axis=0)

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [8]:
# Columns that are one-hot encoded further down.
cat_feature_cols = [
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "day",
    "month",
    "poutcome",
]

# Columns that are standardised further down.
# NOTE(review): "duration" is not in either list - presumably deliberate; confirm.
num_feature_cols = [
    "age",
    "balance",
    "pdays",
    "campaign",
    "previous",
]

# Full feature list: categorical columns first, numeric columns after.
feature_cols = [*cat_feature_cols, *num_feature_cols]

In [9]:
# Feature matrix: an independent copy of the selected columns, so later
# changes to X never write back into df.
X = df[feature_cols].copy()

# Binary target: 1 where the y column equals 'yes', 0 for anything else.
# The vectorised comparison replaces a row-by-row Python lambda; it already
# returns a new Series, so no extra .copy() is needed. int64 matches the
# dtype the original apply() produced.
y = (df['y'] == 'yes').astype('int64')

## Train / Test Split

In [10]:
# Hold out 40% of the rows for evaluation; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

## Feature Engineering

In [11]:
# Learn the category vocabulary from the training split only, then apply the
# same encoding to both splits. handle_unknown='ignore' stops transform from
# raising on a category that did not appear in the training data.
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X_train[cat_feature_cols])
X_train_cat_encoded, X_test_cat_encoded = (
    enc.transform(split[cat_feature_cols]) for split in (X_train, X_test)
)

In [12]:
# Fit the scaler on the training split only, so the test split is scaled
# with the training statistics.
scaler = StandardScaler().fit(X_train[num_feature_cols])
X_train_num_scaled = scaler.transform(X_train[num_feature_cols])
X_test_num_scaled = scaler.transform(X_test[num_feature_cols])

In [13]:
def _assemble_features(cat_sparse, num_dense):
    """Densify the one-hot block and append the scaled numeric columns to its right."""
    return np.concatenate((cat_sparse.toarray(), num_dense), axis=1)


# NOTE: this rebinds X_train / X_test from DataFrames to plain ndarrays, so
# the encoder and scaler cells (which index them by column name) cannot be
# re-run after this cell without first re-running the train/test split.
X_train = _assemble_features(X_train_cat_encoded, X_train_num_scaled)
X_test = _assemble_features(X_test_cat_encoded, X_test_num_scaled)

## Imbalanced Data

In [14]:
# Oversample the minority class on the TRAINING split only; the test split
# is left untouched.
# random_state is fixed (same seed as the train/test split) so the synthetic
# samples, and therefore the trained model and its metrics, are reproducible
# across Restart & Run All.
sm = SMOTE(random_state=42)
X_balanced, y_balanced = sm.fit_resample(X_train, y_train)

## Train Simple Model

In [15]:
# Random forest trained on the SMOTE-balanced training data.
# random_state pins the bootstrap samples and feature sub-sampling so the
# fitted model (and the scores reported below) are reproducible.
# n_jobs=-1 builds the 1000 trees on all available cores; it affects speed
# only, not the fitted result.
rfc = RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1)
rfc.fit(X_balanced, y_balanced)

RandomForestClassifier(n_estimators=1000)

In [16]:
# Score on the untouched (not resampled) test split.
# Predict once and reuse the result: each predict() call runs every tree in
# the forest, so calling it once per metric doubled the inference cost.
y_pred = rfc.predict(X_test)
print('F1 Score:', f1_score(y_test.values, y_pred))
print('Accuracy:', accuracy_score(y_test.values, y_pred))

F1 Score: 0.3812405446293495
Accuracy: 0.8869228642521426


## Save Encoder, Scaler & Model

In [17]:
# Persist the three fitted objects side by side: the encoder and scaler are
# saved with the model because it was trained on their output.
for _artifact, _path in (
    (rfc, 'rfc_model.joblib'),
    (enc, 'encoder.joblib'),
    (scaler, 'scaler.joblib'),
):
    dump(_artifact, _path)