## Imports

In [19]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, precision_score, accuracy_score,recall_score,classification_report
from sklearn.ensemble import RandomForestClassifier



#### Preprocessing Imports

In [2]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

## Data Initialization

In [3]:
df = pd.read_csv("bank-additional/bank-additional.csv", header=0, delimiter=";")

In [4]:
missing_values = df.isnull().sum()
print(missing_values)
duplicate_count = df.duplicated().sum()
print('\nDuplicated row count: ' , duplicate_count)

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

Duplicated row count:  0


This dataset doesn't have any missing values and duplicated rows.

In [21]:
column_values = df['y'].value_counts()
print(column_values)

y
no     3668
yes     451
Name: count, dtype: int64


## Data Processing

### Categorizing Columns

In [5]:
categorical_cols = ["job", "marital", "default", "housing", "loan", "contact", "education"]
ordinal_cols = [ "month", "day_of_week"]
numerical_cols = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
target_col = 'y'

### Imputation

In [6]:
df.replace('unknown', pd.NA, inplace=True)
df.replace(pd.NA, np.nan, inplace=True)
numerical_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

Unknown values replaced with nan. This replacement enables data to be imputed.

### Encodings

In [11]:
month_order = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
day_order = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', Pipeline(steps=[
            ('imputer', categorical_imputer),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_cols),
        ('ordinal', Pipeline(steps=[
            ('imputer', categorical_imputer),
            ('encoder', OrdinalEncoder(categories=[month_order, day_order]))
        ]), ordinal_cols),
        ('scaler', Pipeline(steps=[
            ('imputer', numerical_imputer),
            ('scaler', StandardScaler())
        ]), numerical_cols)
    ])

In [None]:
'''y_encoded = y.map({'yes': 1, 'no': 0})
X_processed = preprocessor.fit_transform(X)

smote = SMOTE(random_state=42)
X_sampled, y_sampled = smote.fit_resample(X_processed, y_encoded)

print("Original class distribution:")
print(pd.Series(y_encoded).value_counts())
print("Resampled class distribution:")
print(pd.Series(y_sampled).value_counts())

onehot_cols = preprocessor.transformers_[0][1].named_steps['encoder'].get_feature_names_out(categorical_cols)
all_columns = np.concatenate([onehot_cols, ordinal_cols, numerical_cols])
X_sampled_df = pd.DataFrame(X_sampled, columns=all_columns)
y_sampled_df = pd.DataFrame(y_sampled, columns=[target_col])

df_sampled = pd.concat([X_sampled_df, y_sampled_df], axis=1)
print(df_sampled.head())

('preprocessor', preprocessor),
('smote', SMOTE(random_state=42)),'''

### Data Seperation

In [15]:
X = df.drop(columns=[target_col],axis=1)
y = df[target_col].map({'yes': 1, 'no': 0})

## Random Forest


### Random Forest Metrics

In the pipeline data is imputated and encoded. After the imputation and encoding, data is oversampled using minority class with SMOTE.

In [22]:

pipeline_rf = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', RandomForestClassifier(random_state=42))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
param_grid_rf = {
    'model__n_estimators':[ 50,100,200,300,400,500],
    'model__max_depth': [4, 6, 8, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10, 15]
}

scoring_rf = {
    'roc_auc': 'roc_auc',
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score)
}

grid_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=KFold(n_splits=5, shuffle=True, random_state=42), scoring=scoring_rf, refit="accuracy")


grid_rf.fit(X_train, y_train)

print("Best Parameters:", grid_rf.best_params_)
print("Best Roc AUC Score:", grid_rf.best_score_)

best = grid_rf.best_estimator_

y_pred=best.predict(X_test)

results = grid_rf.cv_results_
mean_accuracy = results['mean_test_accuracy'][grid_rf.best_index_]
mean_precision = results['mean_test_precision'][grid_rf.best_index_]


print("Mean Accuracy:", mean_accuracy)
print("Mean Precision:", mean_precision)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'model__max_depth': 20, 'model__min_samples_split': 10, 'model__n_estimators': 300}
Best Roc AUC Score: 0.9192716236722307
Mean Accuracy: 0.9192716236722307
Mean Precision: 0.7144593437276365
Accuracy: 0.9004854368932039

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.97      0.95       732
           1       0.58      0.38      0.46        92

    accuracy                           0.90       824
   macro avg       0.75      0.67      0.70       824
weighted avg       0.89      0.90      0.89       824

