In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, precision_score, accuracy_score,recall_score

In [3]:
# Read the semicolon-delimited CSV; header=0 takes the column names from the first row.
df = pd.read_csv("bank-additional/bank-additional.csv", header=0, delimiter=";")

In [6]:
# Summarise every column's non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4119 entries, 0 to 4118
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             4119 non-null   int64  
 1   job             4119 non-null   object 
 2   marital         4119 non-null   object 
 3   education       4119 non-null   object 
 4   default         4119 non-null   object 
 5   housing         4119 non-null   object 
 6   loan            4119 non-null   object 
 7   contact         4119 non-null   object 
 8   month           4119 non-null   object 
 9   day_of_week     4119 non-null   object 
 10  duration        4119 non-null   int64  
 11  campaign        4119 non-null   int64  
 12  pdays           4119 non-null   int64  
 13  previous        4119 non-null   int64  
 14  poutcome        4119 non-null   object 
 15  emp.var.rate    4119 non-null   float64
 16  cons.price.idx  4119 non-null   float64
 17  cons.conf.idx   4119 non-null   f

In [7]:
# Frequency of each level of the `default` column.
df['default'].value_counts()

default
no         3315
unknown     803
yes           1
Name: count, dtype: int64

The "default" column is extremely imbalanced (3,315 "no", 803 "unknown" and only a single "yes"), so it carries almost no usable signal and can be excluded from the dataset.

In [4]:
import pandas as pd
def clean_data(df : pd.DataFrame, includeDefault = False, includeDate=False) -> pd.DataFrame:
    """Return a cleaned copy of ``df``; the input frame itself is not modified.

    The ``y`` column is recoded from 'yes'/'no' to 1/0 (any other value has no
    entry in the mapping and becomes NaN).

    Args:
        df: Frame containing at least the ``y`` column, plus every column that
            is going to be dropped.
        includeDefault: Keep the ``default`` column when True; drop it otherwise.
        includeDate: Keep ``day_of_week`` and ``month`` when True; drop them otherwise.

    Returns:
        The recoded frame without the excluded columns.
    """
    # Decide up front which columns have to go, then drop them in one pass.
    excluded = []
    if not includeDefault:
        excluded.append('default')
    if not includeDate:
        excluded += ['day_of_week', 'month']

    cleaned = df.copy()
    cleaned['y'] = cleaned['y'].map({'yes': 1, 'no': 0})
    return cleaned.drop(columns=excluded) if excluded else cleaned


## Data Preprocessing

### Encoding Categorical Data

In [5]:
import pandas as pd
def preprocess_data(data : pd.DataFrame, includeDate = False) -> pd.DataFrame:
    """One-hot encode the categorical columns of a cleaned frame.

    ``contact``, ``housing``, ``loan``, ``marital``, ``education``, ``job`` and
    ``poutcome`` are always encoded. ``month``, ``day_of_week`` and ``default``
    are encoded only when they are present in ``data``.

    For each column one level is left out as the reference category: the
    'unknown' level ('nonexistent' for ``poutcome``), or simply the first level
    for ``contact``, ``month`` and ``day_of_week``. A reference level that does
    not occur in ``data`` is skipped instead of raising ``KeyError``, so the
    function also works on subsets that contain no 'unknown' rows.

    Args:
        data: Frame holding the raw categorical columns listed above.
        includeDate: Unused; the date columns are detected from ``data.columns``.
            Kept so existing callers keep working.

    Returns:
        A new frame with the untouched columns first (original order), followed
        by the dummy columns in the order contact, housing, loan, marital,
        education, job, poutcome, month, day_of_week, default.
    """
    # (column, reference level to leave out); None means "drop the first level".
    encoding_plan = [
        ('contact', None),
        ('housing', 'unknown'),
        ('loan', 'unknown'),
        ('marital', 'unknown'),
        ('education', 'unknown'),
        ('job', 'unknown'),
        ('poutcome', 'nonexistent'),
    ]
    # Optional columns: only encoded when the cleaning step kept them.
    for optional_col, reference in (('month', None), ('day_of_week', None), ('default', 'unknown')):
        if optional_col in data.columns:
            encoding_plan.append((optional_col, reference))

    raw_columns = [col for col, _ in encoding_plan]
    pieces = [data.drop(columns=raw_columns)]
    for col, reference in encoding_plan:
        dummies = pd.get_dummies(data[col], prefix=col, drop_first=reference is None)
        if reference is not None:
            # errors='ignore': the reference level may be absent from this frame.
            dummies = dummies.drop(columns=f'{col}_{reference}', errors='ignore')
        pieces.append(dummies)

    return pd.concat(pieces, axis=1)


### Data Scaling

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Baseline feature set: clean_data's defaults drop `default` and the date columns.
clean = clean_data(df)
df_encoded = preprocess_data(clean)

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    df_encoded.drop(columns='y'),
    df_encoded['y'],
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then reuse it for the validation rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)


## Data Process

## Random Forest


### Random Forest Metrics

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Scaler + random forest; the forest is seeded so repeated runs give the same trees.
pipeline_rf = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(random_state=42))
])

# 6 x 6 x 4 = 144 candidate settings, each evaluated on 5 folds.
param_grid_rf = {
    'model__n_estimators':[ 50,100,200,300,400,500],
    'model__max_depth': [4, 6, 8, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10, 15]
}

scoring_rf = {
    'roc_auc': 'roc_auc',
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score)
}

grid_rf = GridSearchCV(
    pipeline_rf,
    param_grid_rf,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring=scoring_rf,
    refit="accuracy",
    # 720 fits plus the refit: spread them over all cores. The model and the
    # folds are both seeded, so the selected parameters and scores do not change.
    n_jobs=-1,
)

# Full feature set: keep `default` and the date columns.
clean_rf = clean_data(df, True, True)
df_encoded_rf = preprocess_data(clean_rf)

grid_rf.fit(df_encoded_rf.drop('y', axis=1), df_encoded_rf['y'])

print("Best Parameters:", grid_rf.best_params_)
print("Best Accuracy Score:", grid_rf.best_score_)

# Cross-validated means of every tracked metric for the accuracy-selected candidate.
results = grid_rf.cv_results_
mean_accuracy = results['mean_test_accuracy'][grid_rf.best_index_]
mean_precision = results['mean_test_precision'][grid_rf.best_index_]
mean_recall = results['mean_test_recall'][grid_rf.best_index_]
mean_roc_auc = results['mean_test_roc_auc'][grid_rf.best_index_]


print("Mean Accuracy:", mean_accuracy)
print("Mean Precision:", mean_precision)
# Recall and ROC AUC were already being scored; report them as well.
print("Mean Recall:", mean_recall)
print("Mean ROC AUC:", mean_roc_auc)

  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'model__max_depth': 20, 'model__min_samples_split': 15, 'model__n_estimators': 400}
Best Accuracy Score: 0.9101729405796931
Mean Accuracy: 0.9101729405796931
Mean Precision: 0.6578244631185808


In [21]:
# Scaler + logistic regression. The model is seeded because the liblinear solver
# shuffles the data; without a seed the fit is not repeatable, unlike the seeded folds.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(random_state=42))
])

scoring = {
    'roc_auc': 'roc_auc',
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score)
}

# liblinear is the only solver listed because it supports both the l1 and l2 penalties.
param_grid = {
    'model__C': [0.1, 1, 10, 100],
    'model__penalty': ['l1', 'l2'],
    'model__solver': ['liblinear']
}

grid = GridSearchCV(pipeline, param_grid, cv=KFold(n_splits=5, shuffle=True, random_state=42), scoring=scoring, refit="roc_auc")

# Full feature set: keep `default` and the date columns.
clean = clean_data(df, True, True)
df_encoded = preprocess_data(clean)

grid.fit(df_encoded.drop('y', axis=1), df_encoded['y'])


print("Best Parameters:", grid.best_params_)
print("Best ROC AUC Score:", grid.best_score_)

# Mean cross-validated scores of the candidate selected by ROC AUC. These are not
# the best accuracy/precision over all candidates, hence the "Mean" labels.
results = grid.cv_results_
mean_accuracy = results['mean_test_accuracy'][grid.best_index_]
mean_precision = results['mean_test_precision'][grid.best_index_]

print("Mean Accuracy:", mean_accuracy)
print("Mean Precision:", mean_precision)

Best Parameters: {'model__C': 0.1, 'model__penalty': 'l1', 'model__solver': 'liblinear'}
Best ROC AUC Score: 0.9354564396838077
Best Accuracy: 0.9128428434923144
Best Precision: 0.6634194323969126


In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline

# Load the dataset
def load_data_d(file_path):
    """Read a semicolon-separated CSV into a DataFrame.

    Args:
        file_path: Path (or file-like object) handed straight to ``pd.read_csv``.

    Returns:
        The parsed DataFrame.
    """
    return pd.read_csv(file_path, sep=";")

def clean_data_d(df, include_default=False, include_date=False):
    """Return a cleaned copy of ``df`` with a numeric target.

    ``y`` is mapped from 'yes'/'no' to 1/0. Unless the matching flag is set,
    the ``default`` column and the ``day_of_week``/``month`` columns are
    dropped. The input frame is not modified.

    Args:
        df: Frame with a ``y`` column and the columns that may be dropped.
        include_default: Keep ``default`` when True.
        include_date: Keep ``day_of_week`` and ``month`` when True.

    Returns:
        The cleaned DataFrame.
    """
    dropped = ([] if include_default else ['default']) + (
        [] if include_date else ['day_of_week', 'month']
    )

    result = df.copy()
    result['y'] = result['y'].map({'yes': 1, 'no': 0})
    if dropped:
        result = result.drop(dropped, axis=1)
    return result

def preprocess_data_d(df, include_date=False):
    """One-hot encode the categorical columns, dropping the first level of each.

    ``contact``, ``housing``, ``loan``, ``marital``, ``education``, ``job`` and
    ``poutcome`` are always encoded. ``default`` is encoded as well whenever it
    is present, so a frame produced by ``clean_data_d(include_default=True)``
    does not keep a raw text column.

    Args:
        df: Cleaned frame holding the raw categorical columns.
        include_date: When True, also encode ``day_of_week`` and ``month``
            (both must then be present). When False they are left as they are.

    Returns:
        A new frame: untouched columns first, then the dummy columns.
    """
    # Encode categorical variables
    categorical_cols = ['contact', 'housing', 'loan', 'marital', 'education', 'job', 'poutcome']
    if 'default' in df.columns:
        # Only present when the cleaning step was asked to keep it.
        categorical_cols.append('default')
    df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

    if include_date:
        date_cols = ['day_of_week', 'month']
        df_encoded = pd.get_dummies(df_encoded, columns=date_cols, drop_first=True)

    return df_encoded


def scale_data_d(X):
    """Standardise ``X`` with a freshly fitted ``StandardScaler``.

    Args:
        X: Feature matrix accepted by ``StandardScaler.fit_transform``.

    Returns:
        A ``(X_scaled, scaler)`` tuple: the transformed data and the fitted
        scaler, so the same transformation can be applied to other data.
    """
    fitted_scaler = StandardScaler()
    # The tuple is built left to right, so the scaler is already fitted when returned.
    return fitted_scaler.fit_transform(X), fitted_scaler


def evaluate_model_d(model, X_val, y_val):
    """Print a classification summary for ``model`` on a validation set.

    Prints the confusion matrix, the classification report, and the accuracy,
    precision, recall and F1 score (four decimals each).

    Args:
        model: Fitted estimator exposing ``predict``.
        X_val: Validation features passed to ``model.predict``.
        y_val: True labels for ``X_val``.

    Returns:
        The accuracy score.
    """
    predictions = model.predict(X_val)

    # Compute every headline metric first, in the order they are reported below.
    headline_scores = {
        "Accuracy": accuracy_score(y_val, predictions),
        "Precision": precision_score(y_val, predictions),
        "Recall": recall_score(y_val, predictions),
        "F1 Score": f1_score(y_val, predictions),
    }

    print("Confusion Matrix:\n", confusion_matrix(y_val, predictions))
    print("Classification Report:\n", classification_report(y_val, predictions))
    for label, value in headline_scores.items():
        print(f"{label}: {value:.4f}")

    return headline_scores["Accuracy"]