In [36]:
# import pandas, numpy, OneHotEncoder and load the data set
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# path of the CSV file, relative to the notebook's working directory
dataset = 'data/Banking_Marketing.csv'

# read the file into a data frame; row 0 of the file supplies the column names
df = pd.read_csv(filepath_or_buffer = dataset, header = 0)

In [37]:
# keep only the rows that have no missing value in any column
df = df.dropna()

In [38]:
# collect the labels of every non-numeric (categorical) column
data_column_category = df.select_dtypes(exclude = 'number').columns
data_column_category

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'day_of_week', 'poutcome'],
      dtype='object')

In [39]:
# preview the categorical columns: first five rows only
df.loc[:, data_column_category].head(n = 5)

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome
0,blue-collar,married,basic.4y,unknown,yes,no,cellular,aug,thu,nonexistent
1,technician,married,unknown,no,no,no,cellular,nov,fri,nonexistent
2,management,single,university.degree,no,yes,no,cellular,jun,thu,success
3,services,married,high.school,no,no,no,cellular,apr,fri,nonexistent
4,retired,married,basic.4y,no,yes,no,cellular,aug,fri,success


In [32]:
# perform label encoding
from sklearn.preprocessing import LabelEncoder

# Fit a separate encoder per column and keep each one, keyed by column name.
# Refitting a single shared encoder would leave only the last column's
# category-to-integer mapping available, so the other columns could not be
# decoded again with inverse_transform.
label_encoders = {}

for column in data_column_category:
    label_encoder = LabelEncoder()
    # replace the column's category values with their integer codes
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

# after the loop, `label_encoder` is the encoder fitted on the last categorical column

print('Label Encoded Data:')

df[data_column_category].head()

Label Encoded Data:


Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome
0,1,1,0,1,2,0,0,1,2,1
1,9,1,7,0,0,0,0,7,0,1
2,4,2,6,0,2,0,0,4,2,2
3,7,1,3,0,0,0,0,0,0,1
4,5,1,0,0,2,0,0,1,0,2


In [26]:
# perform One-Hot Encoding
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Try the current keyword first and fall back to the old one so
# the cell runs on both older and newer releases. Either way a dense array is
# requested instead of a sparse matrix.
try:
    onehot_encoder = OneHotEncoder(sparse_output = False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse = False)

# fit on the categorical columns and get one indicator column per category
onehot_encoded = onehot_encoder.fit_transform(df[data_column_category])


If you previously used a LabelEncoder before the OneHotEncoder to convert the categories to integers, that step is no longer required: the OneHotEncoder can now be applied directly to the string-valued categorical columns.


In [18]:
# create a new data frame with the new encoded column names
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`; use whichever the installed version offers.
if hasattr(onehot_encoder, 'get_feature_names_out'):
    encoded_column_names = onehot_encoder.get_feature_names_out(data_column_category)
else:
    encoded_column_names = onehot_encoder.get_feature_names(data_column_category)

# NOTE(review): this frame gets a fresh 0..n-1 index, while `df` keeps its
# original row labels after dropna(); align the indexes before joining the two.
onehot_encoded_frame = pd.DataFrame(onehot_encoded, columns = encoded_column_names)
onehot_encoded_frame.head()

Unnamed: 0,job_0.0,job_1.0,job_2.0,job_3.0,job_4.0,job_5.0,job_6.0,job_7.0,job_8.0,job_9.0,...,month_8.0,month_9.0,day_of_week_0.0,day_of_week_1.0,day_of_week_2.0,day_of_week_3.0,day_of_week_4.0,poutcome_0.0,poutcome_1.0,poutcome_2.0
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [19]:
# list every column label of the one-hot encoded data frame
# (the head() preview elides the middle columns with "...")
onehot_encoded_frame.columns


Index(['job_0.0', 'job_1.0', 'job_2.0', 'job_3.0', 'job_4.0', 'job_5.0',
       'job_6.0', 'job_7.0', 'job_8.0', 'job_9.0', 'job_10.0', 'job_11.0',
       'marital_0.0', 'marital_1.0', 'marital_2.0', 'marital_3.0',
       'education_0.0', 'education_1.0', 'education_2.0', 'education_3.0',
       'education_4.0', 'education_5.0', 'education_6.0', 'education_7.0',
       'default_0.0', 'default_1.0', 'default_2.0', 'housing_0.0',
       'housing_1.0', 'housing_2.0', 'loan_0.0', 'loan_1.0', 'loan_2.0',
       'contact_0.0', 'contact_1.0', 'month_0.0', 'month_1.0', 'month_2.0',
       'month_3.0', 'month_4.0', 'month_5.0', 'month_6.0', 'month_7.0',
       'month_8.0', 'month_9.0', 'day_of_week_0.0', 'day_of_week_1.0',
       'day_of_week_2.0', 'day_of_week_3.0', 'day_of_week_4.0', 'poutcome_0.0',
       'poutcome_1.0', 'poutcome_2.0'],
      dtype='object')

In [58]:
# alternative: let pandas build the dummy columns, named <column>_<category>

# start again from the raw file so the categorical columns hold their original values
df = pd.read_csv(dataset, header = 0).dropna()

# labels of the numeric columns
data_column_number = df.select_dtypes(include = 'number').columns

# one-hot encode the categorical columns, using each column's own name as the prefix
df_onehot_getdummies = pd.get_dummies(df[data_column_category], prefix = data_column_category)
# place the dummy columns side by side with the untouched numeric columns
data_onehot_encoded_data = pd.concat([df_onehot_getdummies, df[data_column_number]], axis = 'columns')
# show the resulting column labels
data_onehot_encoded_data.columns


Index(['job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'default_no', 'default_unknown', 'default_yes',
       'housing_no', 'housing_unknown', 'housing_yes', 'loan_no',
       'loan_unknown', 'loan_yes', 'contact_cellular', 'contact_telephone',
       'month_apr', 'month_aug', 'month_dec', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
       'day_of_week_fri', 'day_of_week_mon', 'day_of_week_thu',
       'day_of_week_tue', 'day_of_week_wed', 'poutcome_failure',