In [None]:

# Libraries --------------
import pandas as pd
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
    LabelEncoder,
    MinMaxScaler
)
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_curve
)
from lets_plot import *
import matplotlib.pyplot as plt

# Initialise lets-plot's HTML output so figures render inside the notebook.
LetsPlot.setup_html()


# Training Data --------------
# Raw training table, read directly from the course repository.
df_train = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank.csv'
)

In [None]:
def create_unknown_flags_and_return(df, cat_columns):
    """Return a copy of ``df`` with a 0/1 ``<col>_was_unknown`` flag per column.

    For every name in ``cat_columns`` a new integer column called
    ``<name>_was_unknown`` is appended; it is 1 where the source column equals
    the string ``"unknown"`` and 0 everywhere else (including missing values).

    The input frame is NOT modified: a new DataFrame is returned, so the
    function is safe to use inside ``DataFrame.pipe`` chains and to re-run.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``cat_columns``.
    cat_columns : iterable of str
        Names of the columns to flag.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus one flag column per entry of ``cat_columns``, appended in
        the order given.

    Raises
    ------
    KeyError
        If a name in ``cat_columns`` is not a column of ``df``.
    """
    flags = {
        c + "_was_unknown": (df[c] == "unknown").astype(int)
        for c in cat_columns
    }
    # assign() builds a new frame, leaving the caller's object untouched.
    return df.assign(**flags)


# Job labels grouped into coarse pay tiers. Only the "unpaid" and "low_paid"
# lists are consulted by map_job_category; "high_paid" is listed for reference.
job_map = {
    "unpaid": ["unemployed", "retired", "student"],
    "low_paid": ["admin.", "blue-collar", "services", "housemaid"],
    "high_paid": ["entrepreneur", "management", "self-employed", "technician"]
}


def map_job_category(job_str):
    """Map a job label to an ordinal pay tier.

    Returns 0 for labels in ``job_map["unpaid"]``, 1 for labels in
    ``job_map["low_paid"]`` and 2 for everything else. "Everything else"
    includes the ``"high_paid"`` labels but also any label that appears in
    none of the lists.
    """
    for tier_code, tier_name in enumerate(("unpaid", "low_paid")):
        if job_str in job_map[tier_name]:
            return tier_code
    return 2

# Weekday abbreviation -> 1-based position in the week (mon=1 ... sun=7).
day_map = {
    abbr: position
    for position, abbr in enumerate(
        ("mon", "tue", "wed", "thu", "fri", "sat", "sun"),
        start=1
    )
}

# Categorical columns: each gets a "<col>_was_unknown" flag and has its
# 'unknown' entries imputed before encoding.
cat_cols = [
    "job", "marital", "education", "default", "housing",
    "loan", "contact", "month", "day_of_week", "poutcome"
]

# Numeric columns handed to the scaler ("pdays_numeric" is derived from "pdays").
num_cols = [
    "age", "campaign", "emp.var.rate", "cons.price.idx", "cons.conf.idx",
    "euribor3m", "nr.employed", "pdays_numeric", "previous"
]

# pdays == 999 is handled as a sentinel (presumably "no earlier contact" —
# TODO confirm against the data dictionary): the numeric copy is zeroed and a
# separate 0/1 indicator records whether a real value was present.
pdays_is_sentinel = df_train['pdays'] == 999

df_train = df_train.assign(
    pdays_numeric=np.where(pdays_is_sentinel, 0, df_train['pdays']),
    pday_contacted=np.where(pdays_is_sentinel, 0, 1),
    # Target as 1/0; any label other than 'yes'/'no' becomes NaN.
    y=df_train['y'].map({'yes': 1, 'no': 0}),
).drop(columns=['pdays'])

# Record which categorical entries were 'unknown' before they are imputed.
df_train = create_unknown_flags_and_return(df_train, cat_cols)

# Most frequent non-'unknown' value of each categorical column. Kept in
# `train_modes` so the same fill values can be reused on other data.
train_modes = {
    name: df_train.loc[df_train[name] != 'unknown', name].mode()[0]
    for name in cat_cols
}

# Impute: every 'unknown' is replaced by that column's training mode.
df_train = df_train.assign(**{
    name: df_train[name].replace('unknown', fill_value)
    for name, fill_value in train_modes.items()
})

# Encoder for `poutcome`; fitted here and kept so the identical label -> code
# mapping can be applied to other data later.
pout_encoder = LabelEncoder()

# NOTE: 'unknown' was already imputed away above, so only "single" can match
# in the marital mask at this point.
marital_hit = df_train['marital'].isin(["single", "unknown"])
month_hit = df_train['month'].isin(["mar", "sep", "oct", "dec"])

# Replace every categorical column with a numeric encoding.
df_train = df_train.assign(
    # 0 / 1 / 2 pay tier.
    job=df_train['job'].map(map_job_category),
    marital=np.where(marital_hit, 1, 0),
    # 1 for a university degree or professional course, else 0.
    education=(
        df_train['education']
        .isin(['university.degree', 'professional.course'])
        .astype('int64')
    ),
    # 1 when the value is exactly 'no'.
    default=df_train['default'].eq('no').astype('int64'),
    housing=df_train['housing'].eq('yes').astype('int64'),
    loan=df_train['loan'].eq('yes').astype('int64'),
    contact=df_train['contact'].eq('cellular').astype('int64'),
    month=np.where(month_hit, 1, 0),
    # Ordinal weekday; labels missing from day_map become NaN.
    day_of_week=df_train['day_of_week'].map(day_map),
    poutcome=pout_encoder.fit_transform(
        df_train['poutcome'].astype(str)
    ),
)

# Min-max scale the numeric features. The fitted `scaler` is kept so the same
# transformation can be applied to other data later.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(
    df_train[num_cols]
)

scaled_cols = [
    c + "_scaled"
    for c in num_cols
]

# fit_transform returns a bare array. Give the new frame df_train's own index:
# pd.concat(axis=1) aligns on index labels, so a fresh RangeIndex would
# misalign rows (and introduce NaNs) whenever df_train's index is not 0..n-1,
# e.g. after filtering rows.
df_scaled = pd.DataFrame(
    scaled_data,
    columns=scaled_cols,
    index=df_train.index
)

# Swap the raw numeric columns for their "<name>_scaled" counterparts.
df_train = (
    pd.concat([df_train, df_scaled], axis=1)
    .drop(columns=num_cols)
)



# NOTE(review): the rows below look like pasted model output listing features
# with a 0.0 score — confirm their source before relying on them.
# 24     poutcome_was_unknown    0.000000
# 25  day_of_week_was_unknown    0.000000
# 26        month_was_unknown    0.000000
# 27      contact_was_unknown    0.000000
# 28         loan_was_unknown    0.000000
# 29                  default    0.000000


# Drop Unnecessary Columns -------------
# Features removed before modelling.
columns_to_drop = [
    'poutcome_was_unknown', 'day_of_week_was_unknown', 'month_was_unknown',
    'contact_was_unknown', 'loan_was_unknown', 'previous_scaled',
    'education_was_unknown', 'housing', 'default',
]

df_train = df_train.drop(columns_to_drop, axis=1)


NameError: name 'df_train' is not defined

In [None]:

# Model Training -------------

# Target vector and feature matrix.
y = df_train['y']
X = df_train.drop(columns=['y'])

# 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# XGBoost classifier with fixed hyper-parameters; predictions below use the
# classifier's default decision rule (no threshold tuning).
best_xgb_no_thr = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    scale_pos_weight=3.9634256472004816,
    n_estimators=300,
    min_child_weight=2,
    max_depth=3,
    learning_rate=0.1
)
best_xgb_no_thr.fit(X_train, y_train)

y_val_pred = best_xgb_no_thr.predict(X_val)

# Validation metrics, all computed from the same hard predictions.
acc, prec, rec, f1_ = (
    metric(y_val, y_val_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Per-feature importances of the fitted model, largest first.
feature_names = X_train.columns
importances = best_xgb_no_thr.feature_importances_
feature_importances = (
    pd.Series(importances, index=feature_names)
    .sort_values(ascending=False)
)

print(feature_importances)

print(
    "=== Validation Metrics (No Threshold Tuning) ===",
    f"Accuracy:  {acc:.4f}",
    f"Precision: {prec:.4f}",
    f"Recall:    {rec:.4f}",
    f"F1:        {f1_:.4f}",
    sep="\n"
)


In [None]:

# Holdout data: same schema as the training CSV, preprocessed below with the
# objects fitted on the training data (train_modes, pout_encoder, scaler).
df_holdout = pd.read_csv(
    'https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank_holdout_test.csv'
)

#df_holdout.shape

# Predictions in holdout -----------
# Train the final model on ALL labelled rows, but on a clone: refitting
# `best_xgb_no_thr` itself would let it see X_val, and the validation plots
# that call best_xgb_no_thr.predict_proba(X_val) afterwards would then be
# scored on training data.
final_xgb = clone(best_xgb_no_thr)
final_xgb.fit(
    X,
    y
)

# Same pdays handling and unknown-flags as applied to the training frame.
df_holdout = (
    df_holdout
    .assign(
        pdays_numeric=lambda d:
            np.where(
                d['pdays'] == 999,
                0,
                d['pdays']
            ),
        pday_contacted=lambda d:
            np.where(
                d['pdays'] == 999,
                0,
                1
            )
    )
    .drop(columns=['pdays'])
    .pipe(create_unknown_flags_and_return, cat_cols)
)

# Impute 'unknown' with the modes learned on the training data.
for c in cat_cols:
    df_holdout[c] = df_holdout[c].replace(
        'unknown',
        train_modes[c]
    )

# Same categorical encodings as applied to the training frame.
df_holdout = (
    df_holdout
    .assign(
        job=lambda d:
            d['job'].apply(map_job_category),
        marital=lambda d:
            np.where(
                d['marital'].isin(["single", "unknown"]),
                1,
                0
            ),
        education=lambda d:
            d['education'].map(
                lambda x: 1
                if x in ['university.degree', 'professional.course']
                else 0
            ),
        default=lambda d:
            d['default'].map(
                lambda x: 1 if x == 'no' else 0
            ),
        housing=lambda d:
            d['housing'].map(
                lambda x: 1 if x == 'yes' else 0
            ),
        loan=lambda d:
            d['loan'].map(
                lambda x: 1 if x == 'yes' else 0
            ),
        contact=lambda d:
            d['contact'].map(
                lambda x: 1 if x == 'cellular' else 0
            ),
        month=lambda d:
            np.where(
                d['month'].isin(["mar", "sep", "oct", "dec"]),
                1,
                0
            ),
        day_of_week=lambda d:
            d['day_of_week'].map(day_map)
    )
)

# transform (not fit_transform): reuse the label -> code mapping from training.
df_holdout = (
    df_holdout
    .assign(
        poutcome=lambda d:
            pout_encoder.transform(
                d['poutcome'].astype(str)
            )
    )
)

# Scale with the min/max learned on the training data.
scaled_data_holdout = scaler.transform(
    df_holdout[num_cols]
)
# Carry df_holdout's index so the axis=1 concat below aligns row-for-row even
# if the frame's index is not a default 0..n-1 range.
df_scaled_holdout = pd.DataFrame(
    scaled_data_holdout,
    columns=[
        c + "_scaled"
        for c in num_cols
    ],
    index=df_holdout.index
)

df_holdout = (
    pd.concat([df_holdout, df_scaled_holdout], axis=1)
    .drop(columns=num_cols)
)

# Drop Unnecessary Columns -------------
columns_to_drop = [
    'poutcome_was_unknown',
    'day_of_week_was_unknown',
    'month_was_unknown',
    'contact_was_unknown',
    'loan_was_unknown',
    'previous_scaled',
    'education_was_unknown',
    'housing',
    'default'

]

df_holdout = df_holdout.drop(
    columns=columns_to_drop
)


X_holdout = df_holdout.copy()

if 'y' in X_holdout.columns:
    X_holdout = (
        X_holdout
        .drop(columns='y')
    )

# Present exactly the training features, in training order; a column missing
# from the holdout frame fails here with a KeyError naming it.
X_holdout = X_holdout[X.columns]

holdout_preds = final_xgb.predict(
    X_holdout
)

df_holdout = (
    df_holdout
    .assign(
        predictions=holdout_preds
    )
)

#df_holdout = df_holdout['predictions']

#df_holdout.to_csv(
#    "team7-module3-predictions.csv",
#    index=False
#)

#print("Holdout predictions (no threshold tuning) saved.")

#df_holdout.shape


In [None]:

# Visuals ------------- All our visuals were re-done in R for the Executive Summary
# Bar chart: number of rows per value of the target column `y`.
graph_dist = (
    ggplot(df_train, aes(x='y'))
    + geom_bar(fill='#4682B4', color='black')
    + ggsize(500, 300)
    + ggtitle("Distribution of Y (Target)")
)

graph_dist





In [None]:

# Final Checks -----------
# Confusion matrix of the validation predictions.
cm = confusion_matrix(y_val, y_val_pred)

# Label rows/columns, then reshape to long form (one row per cell) so each
# cell can be drawn as a tile.
df_cm = pd.melt(
    pd.DataFrame(
        cm,
        index=['True_0', 'True_1'],
        columns=['Pred_0', 'Pred_1']
    ).reset_index(),
    id_vars='index',
    var_name='Prediction',
    value_name='Count'
)

# Heatmap with the count printed inside every tile.
graph_cm = (
    ggplot(df_cm, aes(x='Prediction', y='index', fill='Count'))
    + geom_tile()
    + geom_text(aes(label='Count'), color='white')
    + scale_fill_gradient(low='#4682B4', high='#B22222')
    + ggtitle("Confusion Matrix - No Threshold Tuning")
    + ggsize(500, 300)
)

graph_cm


In [None]:

from sklearn.metrics import precision_recall_curve, average_precision_score

# Probability of the positive class (column 1) for each validation row.
y_val_proba = best_xgb_no_thr.predict_proba(X_val)[:, 1]

# Precision/recall pairs across thresholds; the thresholds themselves are
# not needed.
precision, recall, _ = precision_recall_curve(y_val, y_val_proba)

# Average precision, shown on the plot as the AUPRC.
auprc = average_precision_score(y_val, y_val_proba)

# Long-form frame for plotting.
df_pr = pd.DataFrame(dict(recall=recall, precision=precision))

# Precision-recall curve with the AUPRC printed at a fixed position.
graph_auprc = (
    ggplot(df_pr, aes(x='recall', y='precision'))
    + geom_line(color='#4682B4')
    + geom_text(
        x=0.6,
        y=0.2,
        label=f'AUPRC = {auprc:.2f}',
        size=12,
        color='black'
    )
    + xlab("Recall")
    + ylab("Precision")
    + ggtitle("Precision-Recall Curve - No Threshold Tuning")
    + ggsize(500, 300)
)

graph_auprc



In [None]:
# Feature importances reported by the fitted classifier.
importances = best_xgb_no_thr.feature_importances_

# One row per feature, most important first, with a clean 0..n-1 index.
df_fi = (
    pd.DataFrame({'feature': X.columns, 'importance': importances})
    .sort_values('importance', ascending=False)
    .reset_index(drop=True)
)

# Horizontal bar chart (bars flipped so feature names sit on the y axis).
graph_fi = (
    ggplot(df_fi, aes(x='feature', y='importance'))
    + geom_bar(stat='identity', fill='#4682B4')
    + coord_flip()
    + ggtitle("Feature Importances - No Threshold Tuning")
    + ggsize(500, 300)
)

graph_fi




In [None]:


# by Age Group: share of each target value (y) among rows with age < 65
# (age = 1) versus age >= 65 (age = 0).

# Read the raw CSV into its own frame. Rebinding `df_train` here would replace
# the engineered training frame the modelling cells depend on, so re-running
# those cells afterwards would fail.
df_age = (
    pd.read_csv(
        'https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank.csv'
    )
    .assign(
        # 1 when under 65, else 0.
        age=lambda d: (d['age'] < 65).astype(int)
    )
)

# Rows per age group. (The dt_month* names and the 'bymonth' column are kept
# unchanged for compatibility even though this cell groups by age.)
dt_month = (
    df_age
    .groupby('age')
    .size()
    .reset_index(
        name='sum'
    )
)

# Rows per (age group, y) pair.
dt_month_y = (
    df_age
    .groupby(
        ['age', 'y']
    )
    .size()
    .reset_index(
        name='bymonth'
    )
)

# Fraction of each age group falling into each y value.
dt_graph = (
    dt_month
    .merge(
        dt_month_y,
        on='age'
    )
    .assign(
        percent=lambda x: x['bymonth'] / x['sum']
    )
)

graph = (
    ggplot(
        dt_graph,
        aes(
            x='age',
            y='percent',
            fill='y'
        )
    )
    + geom_bar(
        position='dodge',
        stat='identity'
    )
    + scale_fill_manual(
        values=['#4682B4', '#B22222']
    )
    + theme_minimal()
    + labs(
        x='Age Group',
        y='Percentage',
        fill='Y'
    )
)

graph



In [None]:


# by Month: share of each target value (y) within every calendar month.

# Read the raw CSV into its own frame instead of rebinding `df_train`, so the
# engineered training frame used by the modelling cells is left intact.
df_month_raw = pd.read_csv(
    'https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank.csv'
)

# Ordered categorical so months sort chronologically instead of
# alphabetically. Month labels missing from this list become NaN and are
# therefore left out of the groupbys below.
df_month = df_month_raw.assign(
    month=pd.Categorical(
        df_month_raw['month'],
        categories=[
            "mar", "apr", "may", "jun",
            "jul", "aug", "sep", "oct", "nov", "dec"
        ],
        ordered=True
    )
)

# observed=False is spelled out: it is what the implicit default did for
# categorical keys (every listed month is reported, even with zero rows), and
# stating it avoids the pandas FutureWarning about that default changing.
dt_month = (
    df_month
    .groupby('month', observed=False)
    .size()
    .reset_index(
        name='sum'
    )
)

dt_month_y = (
    df_month
    .groupby(
        ['month', 'y'],
        observed=False
    )
    .size()
    .reset_index(
        name='bymonth'
    )
)

# Fraction of each month's rows falling into each y value.
dt_graph = (
    dt_month
    .merge(
        dt_month_y,
        on='month'
    )
    .assign(
        percent=lambda x: x['bymonth'] / x['sum']
    )
)

# One line per y value across the months.
graph = (
    ggplot(
        dt_graph,
        aes(
            x='month',
            y='percent',
            color='y',
            group='y'
        )
    )
    + geom_line()
    + scale_color_manual(
        values=['#4682B4', '#B22222']
    )
    + theme_minimal()
    + labs(
        x='Month',
        y='Percentage',
        color='Y'
    )
)

graph
