In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Read the semicolon-separated bank marketing file and list its column names
data = pd.read_csv("bank-full.csv", sep=";")
data.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

# Data Exploration & Preprocessing

**Column Descriptions**
*   **age**
*   **job**: type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')
*   **marital**: marital status (categorical: 'divorced','married','single'; note: 'divorced' means divorced or widowed)
*   **education**: (categorical: 'primary','secondary','tertiary','unknown')
*   **default**: has credit in default?
*   **balance**: average yearly balance
*   **housing**: has housing loan?
*   **loan**: has personal loan?
*   **contact**: contact communication type (categorical: 'cellular','telephone','unknown')
*   **day**: last contact day of the month (numeric, 1–31)
*   **month**: last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')
*   **duration**: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
*   **campaign**: number of contacts performed during this campaign and for this client (numeric, includes last contact)
*   **pdays**: number of days that passed by after the client was last contacted from a previous campaign (numeric; -1 means client was not previously contacted)
*   **previous**: number of contacts performed before this campaign and for this client
*   **poutcome**: outcome of the previous marketing campaign (categorical: 'failure','other','success','unknown')
*   **y**: has the client subscribed a term deposit?

In [None]:
# Preview the first five rows of the raw data
data.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [None]:
# Show each column's dtype and non-null count
print("\nData Info:")
data.info()


Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [None]:
# Count the missing entries in every column
print("\nMissing Values:")
print(data.isna().sum())


Missing Values:
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
print("\nDescriptive Statistics:")
data.describe()


Descriptive Statistics:


Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [None]:
# Separate the label column `y` from the feature columns
target = data["y"]
data_wt_target = data.drop("y", axis=1)
# Optional check for low-cardinality features:
# data_wt_target.nunique().loc[data_wt_target.nunique() < 20]

In [None]:
# Identify continuous random variables and discrete random variables.
# Each continuous column is listed exactly once: a repeated label would make
# `data_wt_target[continuous_cols]` select the same column twice.
continuous_cols = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous'] # From the website annotation
# Every remaining feature column is treated as discrete
discrete_cols = data_wt_target.drop(columns=continuous_cols).columns.to_list()
print(continuous_cols)
print(discrete_cols)

['age', 'balance', 'duration', 'campaign', 'duration', 'pdays', 'previous']
['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'day', 'month', 'poutcome']


In [None]:
# Label column (same assignment as in the earlier target/feature split cell)
target = data['y']

# Check the number of distinct values of continuous random variables
data_wt_target[continuous_cols].nunique()

Unnamed: 0,0
age,77
balance,7168
duration,1573
campaign,48
duration,1573
pdays,559
previous,41


Since the run time of our Naive Bayes implementation grows with the number of distinct values per feature, we want to keep the number of distinct values of every feature under a fixed threshold. For the sake of time, we set this threshold to 600.

In [None]:
# Discretization: quantile-bin every feature that has more than `thres` distinct values
thres = 600
distinct_counts = data_wt_target.nunique()
corresponding_columns = distinct_counts[distinct_counts > thres].index.tolist()
for column in corresponding_columns:
    # labels=False keeps integer bin codes; duplicates='drop' merges repeated quantile edges
    data_wt_target[column] = pd.qcut(
        data_wt_target[column],
        q=thres,
        labels=False,
        duplicates='drop',
    )

In [None]:
# Validate correctness: after binning, no column should have more than `thres` distinct values
print(f"Number of distinct values in each column: {data_wt_target.nunique()}")
# Display the binned feature frame
data_wt_target

Number of distinct values in each column: age           77
job           12
marital        3
education      4
default        2
balance      549
housing        2
loan           2
contact        3
day           31
month         12
duration     451
campaign      48
pdays        559
previous      41
poutcome       4
dtype: int64


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,442,yes,no,unknown,5,may,256,1,-1,0,unknown
1,44,technician,single,secondary,no,73,yes,no,unknown,5,may,145,1,-1,0,unknown
2,33,entrepreneur,married,secondary,no,51,yes,yes,unknown,5,may,69,1,-1,0,unknown
3,47,blue-collar,married,unknown,no,404,yes,no,unknown,5,may,85,1,-1,0,unknown
4,33,unknown,single,unknown,no,50,no,no,unknown,5,may,192,1,-1,0,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,329,no,no,cellular,17,nov,439,3,-1,0,unknown
45207,71,retired,divorced,primary,no,420,no,no,cellular,17,nov,369,2,-1,0,unknown
45208,72,retired,married,secondary,no,518,no,no,cellular,17,nov,444,5,184,3,success
45209,57,blue-collar,married,secondary,no,302,no,no,telephone,17,nov,384,4,-1,0,unknown


Now all random variables behave like discrete random variables that can be enumerated by the Naive Bayes model. We summarize the data preprocessing as follows:

In [None]:
# Summarized data preprocessing
def data_processing(df, thres=600):
    """
    Turn the raw frame into a feature matrix of discrete values.

    The label column 'y' is dropped. Every remaining column with more than
    `thres` distinct values is replaced by integer quantile-bin codes
    (`pd.qcut` with `q=thres`); all other columns are kept as they are.

    Args:
        df: DataFrame containing the feature columns and a 'y' column.
        thres: maximum number of distinct values tolerated per column, also
            used as the number of quantile bins requested.

    Returns:
        The processed features as a 2-D numpy array (the input frame is not modified).
    """
    features = df.drop(columns=['y'])
    distinct_counts = features.nunique()
    too_many_values = distinct_counts[distinct_counts > thres].index
    for name in too_many_values:
        features[name] = pd.qcut(
            features[name],
            q=thres,
            labels=False,
            duplicates='drop',
        )
    return features.to_numpy()

In [None]:
class Naive_Bayes():
    """
    Naive Bayes classifier for discrete features, estimated from frequency counts.

    Attributes:
        model_name: human-readable model name.
        unseen_prob: probability used in place of P(X_j = v | Y = c) when the
            pair (v, c) never occurred for feature j in the training data.
        y_train: training labels as a numpy array (set by `fit`).
        classes_: sorted distinct training labels (set by `fit`).
        prior: dict mapping 'Y = c' to P(Y = c).
        likelihood: dict mapping 'Xj = v | Y = c' to P(X_j = v | Y = c).
    """

    def __init__(self, unseen_prob=1e-6):
        """
            unseen_prob must lie in (0, 1]; a non-positive value would make the
            log-probability of unseen feature values undefined.
        """
        if not 0 < unseen_prob <= 1:
            raise ValueError("unseen_prob must be in the interval (0, 1]")
        self.model_name = 'Naive Bayes'
        self.unseen_prob = unseen_prob


    def fit(self, X_train, y_train):

        """
            The fit function fits the Naive Bayes model based on the training data.
            Here, we assume that all the features are **discrete** features, which should be done by preprocessing the data.

            X_train is a matrix or 2-D numpy array, representing training instances.
            Each training instance is a feature vector.

            y_train contains the corresponding labels. There might be multiple (i.e., > 2) classes.

            Raises ValueError if X_train and y_train have different lengths.
        """
        self.y_train = np.array(y_train)
        # Sorted distinct labels, computed once here instead of once per prediction
        self.classes_ = np.unique(self.y_train)
        n = len(self.y_train)
        if hasattr(X_train, '__len__') and len(X_train) != n:
            raise ValueError("X_train and y_train must contain the same number of instances")

        # Count the instances of every class, then normalise: P(Y = c) = count(c) / n
        class_counts = dict()
        for y in self.y_train:
            class_key = f'Y = {y}'
            class_counts[class_key] = class_counts.get(class_key, 0) + 1
        self.prior = {class_key: count / n for class_key, count in class_counts.items()}

        # Count every (feature index, feature value, class) combination
        joint_counts = dict()
        for x, y in zip(X_train, self.y_train):
            class_key = f'Y = {y}'
            row = np.array(x).flatten()
            for j, value in enumerate(row):
                entry = (f'X{j} = {value} | {class_key}', class_key)
                joint_counts[entry] = joint_counts.get(entry, 0) + 1

        # Likelihood table (CPTs): P(X_j = v | Y = c) = count(v, c) / count(c).
        # Dividing by the class count (not by the prior) keeps every entry in (0, 1].
        self.likelihood = {
            key: count / class_counts[class_key]
            for (key, class_key), count in joint_counts.items()
        }


    def ind_predict(self, x : list):

        """
            Predict the most likely class label of one test instance based on its feature vector x.

            Scores are sums of log-probabilities; ties go to the label that comes first in sorted order.
            Returns None if the model was fitted on no data.
        """

        x = np.array(x).flatten()
        ret, max_log_prob = None, float('-inf')
        for y in self.classes_:
            log_prob = np.log(self.prior[f'Y = {y}'])
            for j in range(len(x)):
                # Feature values never seen with this class fall back to unseen_prob
                log_prob += np.log(self.likelihood.get(f'X{j} = {x[j]} | Y = {y}', self.unseen_prob))
            if log_prob > max_log_prob:
                max_log_prob = log_prob
                ret = y
        return ret


    def predict(self, X):

        """
            X is a matrix or 2-D numpy array, representing testing instances.
            Each testing instance is a feature vector.

            Return the predictions of all instances as a numpy array.
        """

        return np.array([self.ind_predict(x) for x in X])

In [None]:
# Preprocess the data and hold out 33% of the instances for testing.
# (train_test_split is already imported in the notebook's first cell.)
# NOTE(review): the quantile bins are computed on the full dataset before the split,
# so the test rows influence the bin edges — confirm this is acceptable.
X, y = data_processing(data), data["y"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=88)

In [None]:
# Train and predict
# Held-out labels as a NumPy array, so they compare element-wise with the predictions
y_test = np.array(y_test)

clf = Naive_Bayes()
clf.fit(X_train, y_train)
y_hat = clf.predict(X_test)

In [None]:
# Output Accuracy: fraction of test instances whose prediction equals the true label
(y_hat == y_test).sum() / len(y_hat)

0.8810321715817694

In [None]:
# Evaluate the model
from sklearn.metrics import confusion_matrix, classification_report

# Rows of the confusion matrix are true labels, columns are predicted labels
conf_mat = confusion_matrix(y_test, y_hat)
report = classification_report(y_test, y_hat)
print(f"Confusion Matrix of the model: \n{conf_mat}")
print(f"A general report of the model: \n{report}")

Confusion Matrix of the model: 
[[12407   783]
 [  992   738]]
A general report of the model: 
              precision    recall  f1-score   support

          no       0.93      0.94      0.93     13190
         yes       0.49      0.43      0.45      1730

    accuracy                           0.88     14920
   macro avg       0.71      0.68      0.69     14920
weighted avg       0.87      0.88      0.88     14920



In [None]:
# overfitting/underfitting graph
#
# NOTE(review): this whole cell is commented out, so nothing below runs.
# The disabled snippet has been adjusted so that, if re-enabled, it
#   * plots against the real training-set size of each split
#     (multiplying the `test_sizes` list by an int repeats the list instead of scaling it), and
#   * uses its own variable names, so the notebook's X_train / X_test / y_train / y_test
#     are not overwritten.

# import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split

# test_sizes = [0.1, 0.2, 0.3, 0.4]
# train_sizes = []
# train_errors = []
# val_errors = []

# for size in test_sizes:
#     X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=size, random_state=88)
#     y_tr, y_val = np.array(y_tr), np.array(y_val)

#     clf = Naive_Bayes()
#     clf.fit(X_tr, y_tr)
#     train_pred = clf.predict(X_tr)
#     val_pred = clf.predict(X_val)

#     train_sizes.append(len(X_tr))
#     train_errors.append(1 - sum(train_pred == y_tr)/ len(y_tr))
#     val_errors.append(1 - sum(val_pred == y_val)/ len(y_val))

# plt.figure(figsize=(8, 5))
# plt.plot(train_sizes, train_errors, label="Training Error", marker='o')
# plt.plot(train_sizes, val_errors, label="Validation Error", marker='o')
# plt.xlabel("Training Set Size")
# plt.ylabel("Error Rate")
# plt.title("Underfitting vs. Overfitting")
# plt.legend()
# plt.show()