# 01 — Data Preprocessing

This notebook cleans the Bank Marketing dataset and saves `data/bank_cleaned.csv`.

In [13]:
# 01 — Data Preprocessing
# This notebook cleans the Bank Marketing dataset and saves ../data/bank_cleaned.csv

import sys
from pathlib import Path
import pandas as pd
import numpy as np

# Make ../src importable when running from notebooks/.
# The path is resolved to an absolute one so the entry keeps working if the
# working directory changes later, and it is only appended when missing so
# re-running this cell does not pile up duplicate sys.path entries.
SRC_DIR = str(Path("../src").resolve())
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from preprocessing import preprocess_pipeline  # from ../src/preprocessing.py

# Input (raw) and output (cleaned) CSV locations, relative to the current
# working directory; the notebook is expected to be run from notebooks/.
RAW, CLEAN = Path("../data/bank.csv"), Path("../data/bank_cleaned.csv")

# Echo the resolved absolute locations so it is clear which files are used.
print("RAW path:", RAW.resolve())
print("CLEAN path:", CLEAN.resolve())


RAW path: /Users/camille/bank-marketing-analysis/data/bank.csv
CLEAN path: /Users/camille/bank-marketing-analysis/data/bank_cleaned.csv


## Inspect Raw Data

In [14]:
# Fail early with an actionable message if the raw CSV is missing.
# RAW is relative to the current working directory, so the resolved path is
# reported to show exactly which location was searched.
if not RAW.exists():
    raise FileNotFoundError(
        f"Could not find {RAW.resolve()}. "
        "Make sure the raw CSV is at data/bank.csv in the project root "
        "and that this notebook is run from the notebooks/ folder."
    )

# Load the raw data and report its size.
df = pd.read_csv(RAW)
print(f"Loaded raw dataset: {df.shape[0]} rows × {df.shape[1]} columns")

# First rows, per-column dtypes / non-null counts, and summary statistics
# (include='all' also summarises the non-numeric columns).
display(df.head())
df.info()
display(df.describe(include='all'))


Loaded raw dataset: 11162 rows × 17 columns


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        11162 non-null  int64 
 1   job        11162 non-null  object
 2   marital    11162 non-null  object
 3   education  11162 non-null  object
 4   default    11162 non-null  object
 5   balance    11162 non-null  int64 
 6   housing    11162 non-null  object
 7   loan       11162 non-null  object
 8   contact    11162 non-null  object
 9   day        11162 non-null  int64 
 10  month      11162 non-null  object
 11  duration   11162 non-null  int64 
 12  campaign   11162 non-null  int64 
 13  pdays      11162 non-null  int64 
 14  previous   11162 non-null  int64 
 15  poutcome   11162 non-null  object
 16  deposit    11162 non-null  object
dtypes: int64(7), object(10)
memory usage: 1.4+ MB


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
count,11162.0,11162,11162,11162,11162,11162.0,11162,11162,11162,11162.0,11162,11162.0,11162.0,11162.0,11162.0,11162,11162
unique,,12,3,4,2,,2,2,3,,12,,,,,4,2
top,,management,married,secondary,no,,no,no,cellular,,may,,,,,unknown,no
freq,,2566,6351,5476,10994,,5881,9702,8042,,2824,,,,,8326,5873
mean,41.231948,,,,,1528.538524,,,,15.658036,,371.993818,2.508421,51.330407,0.832557,,
std,11.913369,,,,,3225.413326,,,,8.42074,,347.128386,2.722077,108.758282,2.292007,,
min,18.0,,,,,-6847.0,,,,1.0,,2.0,1.0,-1.0,0.0,,
25%,32.0,,,,,122.0,,,,8.0,,138.0,1.0,-1.0,0.0,,
50%,39.0,,,,,550.0,,,,15.0,,255.0,2.0,-1.0,0.0,,
75%,49.0,,,,,1708.0,,,,22.0,,496.0,3.0,20.75,1.0,,


In [15]:
# Run the project's preprocessing pipeline, passing the raw and cleaned CSV
# paths as strings.
# NOTE(review): the return value is assumed to be the cleaned DataFrame —
# confirm against src/preprocessing.py.
df_clean = preprocess_pipeline(
    str(RAW),
    str(CLEAN),
)
print(f"Saved cleaned CSV to {CLEAN}")

 Cleaned data saved to ../data/bank_cleaned.csv
Saved cleaned CSV to ../data/bank_cleaned.csv


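## Run Preprocessing Pipeline

The pipeline itself runs in the cell just above this heading; the next cell reloads the saved CSV to confirm what was written to disk.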

In [16]:
# Read the cleaned CSV back from disk so the preview below reflects the file
# contents rather than an in-memory object.
df_clean = pd.read_csv(CLEAN)

# Report the size of the cleaned frame, then preview its first rows.
n_rows, n_cols = df_clean.shape
print(f"Loaded cleaned dataset: {n_rows} rows × {n_cols} columns")
display(df_clean.head())


Loaded cleaned dataset: 11162 rows × 39 columns


Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success
0,1.491505,0,0.252525,1,0,-1.265746,1.930226,-0.554168,,-0.36326,...,False,False,False,False,True,False,False,False,False,False
1,1.239676,0,-0.459974,0,0,-1.265746,3.154612,-0.554168,,-0.36326,...,False,False,False,False,True,False,False,False,False,False
2,-0.01947,0,-0.08016,1,0,-1.265746,2.929901,-0.554168,,-0.36326,...,False,False,False,False,True,False,False,False,False,False
3,1.155733,0,0.293762,1,0,-1.265746,0.596366,-0.554168,,-0.36326,...,False,False,False,False,True,False,False,False,False,False
4,1.07179,0,-0.416876,0,0,-1.265746,0.867171,-0.186785,,-0.36326,...,False,False,False,False,True,False,False,False,False,False


## Verify Cleaned Data

In [17]:
# Structural overview: dtype and non-null count for every column.
df_clean.info()

# info() only lists non-null counts, which makes gaps easy to overlook in a
# wide frame, so report explicitly which columns still contain missing values.
missing_counts = df_clean.isna().sum()
missing_counts = missing_counts[missing_counts > 0]
if missing_counts.empty:
    print("No missing values in the cleaned dataset.")
else:
    print("Columns with missing values after cleaning:")
    display(missing_counts.rename("n_missing").to_frame())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 39 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  11162 non-null  float64
 1   default              11162 non-null  int64  
 2   balance              11162 non-null  float64
 3   housing              11162 non-null  int64  
 4   loan                 11162 non-null  int64  
 5   day                  11162 non-null  float64
 6   duration             11162 non-null  float64
 7   campaign             11162 non-null  float64
 8   pdays                2838 non-null   float64
 9   previous             11162 non-null  float64
 10  deposit              11162 non-null  int64  
 11  job_blue-collar      11162 non-null  bool   
 12  job_entrepreneur     11162 non-null  bool   
 13  job_housemaid        11162 non-null  bool   
 14  job_management       11162 non-null  bool   
 15  job_retired          11162 non-null 