In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Relative path of the bank-marketing CSV read by this notebook
dataset = 'Data/Banking_Marketing.csv'

# Read the CSV into a dataframe; the first row of the file supplies the column names
df = pd.read_csv(filepath_or_buffer=dataset, header=0)

In [2]:
# Discard every row that contains at least one missing value
# (axis=0 / how='any' are the pandas defaults, spelled out for readability)
df = df.dropna(axis=0, how='any')

In [3]:
# Collect the names of all non-numeric (categorical) columns
df_non_numeric = df.select_dtypes(exclude=np.number)
data_column_category = df_non_numeric.columns
data_column_category

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'day_of_week', 'poutcome'],
      dtype='object')

In [4]:
# Preview the first rows, restricted to the categorical columns
df.loc[:, data_column_category].head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome
0,blue-collar,married,basic.4y,unknown,yes,no,cellular,aug,thu,nonexistent
1,technician,married,unknown,no,no,no,cellular,nov,fri,nonexistent
2,management,single,university.degree,no,yes,no,cellular,jun,thu,success
3,services,married,high.school,no,no,no,cellular,apr,fri,nonexistent
4,retired,married,basic.4y,no,yes,no,cellular,aug,fri,success


In [5]:
# Keep an independent copy of the un-encoded dataframe for the alternate
# (get_dummies) method below. `df[:]` only slices the frame and is not
# guaranteed to be independent of `df`; `.copy()` makes a deep copy, so later
# modifications of `df` cannot leak into `df_copy`.
df_copy = df.copy()

# Get numeric columns for later (to combine with onehot encoded columns)
data_column_numbers = df.select_dtypes(include=[np.number]).columns
data_column_numbers

Index(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp_var_rate',
       'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed', 'y'],
      dtype='object')

In [6]:
# Encode labels
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Overwrite each categorical column of df with its encoded values.
# The same encoder object is re-fitted on every column in turn.
for category_column in data_column_category:
    encoded_values = label_encoder.fit_transform(df[category_column])
    df[category_column] = encoded_values

print('Label Encoded Data: ')
df.head()

Label Encoded Data: 


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,44.0,1,1,0,1,2,0,0,1,2,...,1,999,0,1,1.4,93.444,-36.1,4.963,5228.1,0
1,53.0,9,1,7,0,0,0,0,7,0,...,1,999,0,1,-0.1,93.2,-42.0,4.021,5195.8,0
2,28.0,4,2,6,0,2,0,0,4,2,...,3,6,2,2,-1.7,94.055,-39.8,0.729,4991.6,1
3,39.0,7,1,3,0,0,0,0,0,0,...,2,999,0,1,-1.8,93.075,-47.1,1.405,5099.1,0
4,55.0,5,1,0,0,2,0,0,1,0,...,1,3,1,2,-2.9,92.201,-31.4,0.869,5076.2,1


In [7]:
# One-hot encode the categorical columns of df
# (sparse_output=False asks the encoder for a dense array)
onehot_encoder = OneHotEncoder(sparse_output=False)
categorical_values = df[data_column_category]
onehot_encoded = onehot_encoder.fit_transform(categorical_values)

In [8]:
# Create a new dataframe from the one-hot encoded array.
# Reuse df's index instead of the default RangeIndex: rows may have been
# removed by dropna(), and pandas aligns on index labels, so a fresh 0..n-1
# index would misalign rows (and introduce NaNs) if this frame is later
# combined with columns taken from df.
onehot_encoded_frame = pd.DataFrame(
    onehot_encoded,
    columns=onehot_encoder.get_feature_names_out(data_column_category),
    index=df.index,
)
onehot_encoded_frame.head()

Unnamed: 0,job_0,job_1,job_2,job_3,job_4,job_5,job_6,job_7,job_8,job_9,...,month_8,month_9,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,poutcome_0,poutcome_1,poutcome_2
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
# Show the column names produced by the one-hot encoder (<column>_<encoded value>)
onehot_encoded_frame.columns

Index(['job_0', 'job_1', 'job_2', 'job_3', 'job_4', 'job_5', 'job_6', 'job_7',
       'job_8', 'job_9', 'job_10', 'job_11', 'marital_0', 'marital_1',
       'marital_2', 'marital_3', 'education_0', 'education_1', 'education_2',
       'education_3', 'education_4', 'education_5', 'education_6',
       'education_7', 'default_0', 'default_1', 'default_2', 'housing_0',
       'housing_1', 'housing_2', 'loan_0', 'loan_1', 'loan_2', 'contact_0',
       'contact_1', 'month_0', 'month_1', 'month_2', 'month_3', 'month_4',
       'month_5', 'month_6', 'month_7', 'month_8', 'month_9', 'day_of_week_0',
       'day_of_week_1', 'day_of_week_2', 'day_of_week_3', 'day_of_week_4',
       'poutcome_0', 'poutcome_1', 'poutcome_2'],
      dtype='object')

---
***Note: The book is misleading here. You need to start from the dataframe as it was before label encoding, and you need to store the numeric column names so that the one-hot encoded category columns can be combined with the numeric columns afterwards. To support this alternate method, I made a copy of the `df` dataframe and selected the numeric-dtype columns above.***

***Furthermore, the `prefix=` kwarg is no longer needed for the purpose of this exercise; the pandas `get_dummies` function prepends the original column name to each generated column by default.***

In [10]:
# Alternate one-hot encoding method: let pandas build the indicator columns
# from the categorical columns of the preserved copy
categorical_source = df_copy.loc[:, data_column_category]
df_onehot_getdummies = pd.get_dummies(categorical_source)

In [11]:
# Join the get_dummies columns with the numeric columns side by side
# (the same combination is possible with the previous method as well)
frames_to_join = [df_onehot_getdummies, df_copy[data_column_numbers]]
df_onehot_encoded_data = pd.concat(frames_to_join, axis='columns')

In [12]:
# Show the column names of the combined frame (get_dummies columns plus numeric columns)
df_onehot_encoded_data.columns

Index(['job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'default_no', 'default_unknown', 'default_yes',
       'housing_no', 'housing_unknown', 'housing_yes', 'loan_no',
       'loan_unknown', 'loan_yes', 'contact_cellular', 'contact_telephone',
       'month_apr', 'month_aug', 'month_dec', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
       'day_of_week_fri', 'day_of_week_mon', 'day_of_week_thu',
       'day_of_week_tue', 'day_of_week_wed', 'poutcome_failure',

In [13]:
# Preview the first five rows of the combined one-hot + numeric frame
df_onehot_encoded_data.head(n=5)

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,duration,campaign,pdays,previous,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,0,1,0,0,0,0,0,0,0,0,...,210.0,1,999,0,1.4,93.444,-36.1,4.963,5228.1,0
1,0,0,0,0,0,0,0,0,0,1,...,138.0,1,999,0,-0.1,93.2,-42.0,4.021,5195.8,0
2,0,0,0,0,1,0,0,0,0,0,...,339.0,3,6,2,-1.7,94.055,-39.8,0.729,4991.6,1
3,0,0,0,0,0,0,0,1,0,0,...,185.0,2,999,0,-1.8,93.075,-47.1,1.405,5099.1,0
4,0,0,0,0,0,1,0,0,0,0,...,137.0,1,3,1,-2.9,92.201,-31.4,0.869,5076.2,1
