## Data Preprocessing ##

**Importing Libraries**

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

**Load the dataset**

In [2]:
# Read the semicolon-separated CSV into a DataFrame
df = pd.read_csv('../data/bank-additional.csv', sep=';')

# Summarise columns, non-null counts and dtypes
df.info()

# Preview the first rows of the dataframe
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4119 entries, 0 to 4118
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             4119 non-null   int64  
 1   job             4119 non-null   object 
 2   marital         4119 non-null   object 
 3   education       4119 non-null   object 
 4   default         4119 non-null   object 
 5   housing         4119 non-null   object 
 6   loan            4119 non-null   object 
 7   contact         4119 non-null   object 
 8   month           4119 non-null   object 
 9   day_of_week     4119 non-null   object 
 10  duration        4119 non-null   int64  
 11  campaign        4119 non-null   int64  
 12  pdays           4119 non-null   int64  
 13  previous        4119 non-null   int64  
 14  poutcome        4119 non-null   object 
 15  emp.var.rate    4119 non-null   float64
 16  cons.price.idx  4119 non-null   float64
 17  cons.conf.idx   4119 non-null   f

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,30,blue-collar,married,basic.9y,no,yes,no,cellular,may,fri,...,2,999,0,nonexistent,-1.8,92.893,-46.2,1.313,5099.1,no
1,39,services,single,high.school,no,no,no,telephone,may,fri,...,4,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,no
2,25,services,married,high.school,no,yes,no,telephone,jun,wed,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.962,5228.1,no
3,38,services,married,basic.9y,no,unknown,unknown,telephone,jun,fri,...,3,999,0,nonexistent,1.4,94.465,-41.8,4.959,5228.1,no
4,47,admin.,married,university.degree,no,yes,no,cellular,nov,mon,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.191,5195.8,no


In [9]:
# Create two feature sets - numeric & categorical
numeric_features = ['age', 'duration', 'campaign', 'pdays', 'previous',
                    'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']
categorical_features = ['job', 'marital', 'education', 'default', 'housing',
                        'loan', 'contact', 'month', 'day_of_week', 'poutcome']

# Standardise the numeric features (zero mean, unit variance per column).
# NOTE(review): the scaler is fit on the full dataframe before the train/test
# split, so test-set statistics influence the scaling. Consider fitting on the
# training rows only.
scaler = StandardScaler()
scaled_numeric = pd.DataFrame(scaler.fit_transform(df[numeric_features]),
                              columns=numeric_features, index=df.index)

# Initialize encoders for categorical features.
# drop='if_binary' keeps a single indicator column for two-valued features.
oneHotEncoder = OneHotEncoder(drop='if_binary', sparse_output=False)
labelEncoder = LabelEncoder()

# Divide categorical features into low cardinality (at most 4 distinct values,
# one-hot encoded) and high cardinality (everything else, integer encoded).
low_cardinality_categorical = [col for col in categorical_features if df[col].nunique() <= 4]
# Fixed: this previously referenced the undefined name
# `low_cardinality_categoricals`, raising NameError on a fresh kernel.
high_cardinality_categorical = [col for col in categorical_features if col not in low_cardinality_categorical]

# One-hot encode the low cardinality categorical features
low_cardinality_encoded = pd.DataFrame(oneHotEncoder.fit_transform(df[low_cardinality_categorical]),
                                       columns=oneHotEncoder.get_feature_names_out(low_cardinality_categorical),
                                       index=df.index)

# Integer-encode the high cardinality categorical features.
# The single LabelEncoder is refit for every column, so after this line it only
# retains the classes of the last column processed.
# NOTE(review): integer codes give these nominal features an arbitrary order -
# confirm that is acceptable for the downstream model.
high_cardinality_encoded = df[high_cardinality_categorical].apply(labelEncoder.fit_transform)

# Combine all preprocessed features with the (still unencoded) target variable
preprocessed_data = pd.concat([scaled_numeric, low_cardinality_encoded, high_cardinality_encoded, df['y']], axis=1)
preprocessed_data.head()




Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,loan_yes,contact_telephone,poutcome_failure,poutcome_nonexistent,poutcome_success,job,education,month,day_of_week,y
0,-0.980752,0.903952,-0.209228,0.201031,-0.351356,-1.206054,-1.185448,-1.240939,-1.331707,-0.914779,...,0.0,0.0,0.0,1.0,0.0,1,2,6,0,no
1,-0.107991,0.3503,0.569634,0.201031,-0.351356,0.649441,0.715193,0.892269,0.711698,0.332862,...,0.0,1.0,0.0,1.0,0.0,7,3,6,0,no
2,-1.465619,-0.116966,-0.59866,0.201031,-0.351356,0.841389,1.528273,-0.283172,0.773427,0.836535,...,0.0,1.0,0.0,1.0,0.0,7,3,4,4,no
3,-0.204965,-0.941553,0.180203,0.201031,-0.351356,0.841389,1.528273,-0.283172,0.771697,0.836535,...,0.0,1.0,0.0,1.0,0.0,7,2,4,0,no
4,0.667795,-0.780563,-0.59866,0.201031,-0.351356,-0.11835,-0.655478,-0.326707,0.328632,0.398028,...,0.0,0.0,0.0,1.0,0.0,0,6,7,1,no


In [10]:
# Train and test data: separate the feature matrix from the target column
X = preprocessed_data.drop(columns=['y'])
y = preprocessed_data['y']

# 70/30 split, stratified on the target so both parts keep the same class
# proportions; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Sanity check: the split must not drop or duplicate rows
assert len(X_train) + len(X_test) == len(X)

# Save the training and test data to CSV files
X_train.to_csv('../data/X_train.csv', index=False)
X_test.to_csv('../data/X_test.csv', index=False)
y_train.to_csv('../data/y_train.csv', index=False)
y_test.to_csv('../data/y_test.csv', index=False)

# Display the preprocessed data overview.
# info() prints its summary and returns None, so it is called as its own
# statement instead of being placed in the displayed tuple (which previously
# rendered as `(None, ...)`).
preprocessed_data.info()
X_train.shape, X_test.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4119 entries, 0 to 4118
Data columns (total 32 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   age                   4119 non-null   float64
 1   duration              4119 non-null   float64
 2   campaign              4119 non-null   float64
 3   pdays                 4119 non-null   float64
 4   previous              4119 non-null   float64
 5   emp.var.rate          4119 non-null   float64
 6   cons.price.idx        4119 non-null   float64
 7   cons.conf.idx         4119 non-null   float64
 8   euribor3m             4119 non-null   float64
 9   nr.employed           4119 non-null   float64
 10  marital_divorced      4119 non-null   float64
 11  marital_married       4119 non-null   float64
 12  marital_single        4119 non-null   float64
 13  marital_unknown       4119 non-null   float64
 14  default_no            4119 non-null   float64
 15  default_unknown      

(None, (2883, 31), (1236, 31))