## Imports

In [1]:

import pandas as pd
import warnings

In [2]:
warnings.filterwarnings("ignore")

#### Model Imports

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, precision_score, accuracy_score, recall_score, f1_score, precision_recall_curve, auc
from sklearn.model_selection import StratifiedKFold,GridSearchCV,cross_val_score
from sklearn.model_selection import train_test_split

#### Preprocessing Imports

In [5]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

## Data Initialization

In [6]:
df = pd.read_csv("bank-additional.csv", header=0, delimiter=";")

In [7]:
missing_values = df.isnull().sum()
print(missing_values)
duplicate_count = df.duplicated().sum()
print('\nDuplicated row count: ' , duplicate_count)

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

Duplicated row count:  0


This dataset doesn't have any missing values and duplicated rows.

## Data Processing

### Categorizing Columns

In [8]:
categorical_cols = ["job", "marital", "education" ,"default", "housing", "loan", "contact","poutcome"]
ordinal_cols = [ "month", "day_of_week"]
numerical_cols = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
target_col = 'y'

### Encodings

In [9]:
month_order = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
day_order = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

preprocessor = ColumnTransformer(transformers=[
   ('onehot',  OneHotEncoder(handle_unknown='ignore'), categorical_cols),
   ('ordinal',  OrdinalEncoder(categories=[month_order, day_order]), ordinal_cols),
   ('scaler', StandardScaler(), numerical_cols)])

### Data Seperation

In [10]:
X = df.drop(columns=[target_col],axis=1)
y = df[target_col].map({'yes': 1, 'no': 0})
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Model Comparison

In [11]:
logreg = LogisticRegression(random_state=42,max_iter=1000)
rf= RandomForestClassifier(random_state=42)
mlp=MLPClassifier(random_state=42,max_iter=1000)
svc=SVC(random_state=42)

X_transformed= preprocessor.fit_transform(X_train)

model_dict = {0: 'Logistic Regression', 1: 'Random Forest',2:'MLP',3:'SVC'}
models= [logreg,rf,mlp,svc]

for i,model in enumerate(models):
    print("{} Test Recall: {}".format(model_dict[i],cross_val_score(model, X_transformed, y_train, cv=10, scoring ='recall').mean()))

Logistic Regression Test Recall: 0.44269841269841265
Random Forest Test Recall: 0.39531746031746035
MLP Test Recall: 0.46198412698412705
SVC Test Recall: 0.3258730158730159


### Model Scoring

PR-AUC

In [11]:
def pr_auc_scorer(y_true, y_pred):
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    return auc(recall, precision)

In [12]:
scoring = {
    'roc_auc': 'roc_auc',
    'pr_auc' : make_scorer(pr_auc_scorer),
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score)
}

In [13]:
def score(grid):
    print("Best Parameters:", grid.best_params_)
    print("Best PR-AUC Score:", grid.best_score_)

    results = grid.cv_results_
    
    for metric in scoring.keys():
        metric_key = f'mean_test_{metric}'
        mean_score = results[metric_key][grid.best_index_]
        print(f"Best {metric.capitalize()}: {mean_score:.4f}")

In [14]:
pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('model', LogisticRegression())
])

param_grid = {
    'model__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'model__max_iter': [100, 1000, 2500, 5000],
    'model__penalty': ['l1', 'l2', 'none'],
    'model__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}


grid_lr = GridSearchCV(pipeline, param_grid, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42), scoring=scoring, refit="pr_auc")
grid_lr.fit(X_train, y_train)


In [15]:
score(grid_lr)

Best Parameters: {'model__C': 100, 'model__max_iter': 100, 'model__penalty': 'l1', 'model__solver': 'liblinear'}
Best PR-AUC Score: 0.6094833808196584
Best Roc_auc: 0.9282
Best Pr_auc: 0.6095
Best Accuracy: 0.9199
Best Precision: 0.7101
Best Recall: 0.4487
Best F1: 0.5455


In [16]:
mlp_pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('mlp', MLPClassifier(random_state=42,max_iter=1000))
])

mlp_param_grid = {
    'mlp__hidden_layer_sizes': [(64,),(64, 32), (128, 64)],
    'mlp__activation': ['tanh', 'relu'],
    'mlp__solver': ['adam', 'sgd'],
    'mlp__alpha': [0.0001, 0.001, 0.01],
    'mlp__learning_rate': ['constant', 'adaptive'],
    'mlp__max_iter': [300, 500]
}


mlp_grid = GridSearchCV(mlp_pipeline, mlp_param_grid, cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42), scoring=scoring, refit="pr_auc")
mlp_grid.fit(X_train, y_train)

In [17]:
score(mlp_grid)

Best Parameters: {'mlp__activation': 'relu', 'mlp__alpha': 0.01, 'mlp__hidden_layer_sizes': (128, 64), 'mlp__learning_rate': 'constant', 'mlp__max_iter': 300, 'mlp__solver': 'sgd'}
Best PR-AUC Score: 0.6092751111885695
Best Roc_auc: 0.9315
Best Pr_auc: 0.6093
Best Accuracy: 0.9178
Best Precision: 0.6701
Best Recall: 0.4932
Best F1: 0.5605


In [None]:
import pickle

In [None]:
pickle.dump(mlp_grid.best_estimator_, open('tuned_best_model_.pkl', 'wb'))