## Encoding Techniques

- Encoding converts textual (categorical) data into numeric data.

### 1. Ordinal Encoding
- Ordinal encoding is used when the categorical data has a natural order or ranking.

<img src='https://miro.medium.com/v2/resize:fit:1216/1*4IJdYmFQqfZcgcMEYKkeUw.png'>

In [13]:
import pandas as pd
import numpy as np

In [14]:
# Load only the columns used in this walkthrough; the file is ';'-delimited.
selected_columns = [
    'age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
    'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'y',
]
data = pd.read_csv('bank-full.csv', sep=';', usecols=selected_columns)
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,no


In [15]:
# Print each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  y          45211 non-null  object
dtypes: int64(5), object(9)
memory usage: 4.8+ MB


In [16]:
# List the distinct values of `education` before choosing an encoding order.
data['education'].unique()

array(['tertiary', 'secondary', 'unknown', 'primary'], dtype=object)

In [17]:
# Column that receives ordinal encoding:
# 1. education
from sklearn.preprocessing import OrdinalEncoder

# Explicit rank order: 'unknown' -> 0, 'primary' -> 1, 'secondary' -> 2, 'tertiary' -> 3.
education_order = ['unknown', 'primary', 'secondary', 'tertiary']

oe = OrdinalEncoder(categories=[education_order])
# The encoder expects 2-D input, hence the double brackets.
data['education'] = oe.fit_transform(data[['education']])

In [9]:
# Preview the first four rows after encoding `education`.
data.head(4)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,y
0,58,management,married,3.0,no,2143,yes,no,unknown,5,may,261,1,no
1,44,technician,single,2.0,no,29,yes,no,unknown,5,may,151,1,no
2,33,entrepreneur,married,2.0,no,2,yes,yes,unknown,5,may,76,1,no
3,47,blue-collar,married,0.0,no,1506,yes,no,unknown,5,may,92,1,no


### 2. One-Hot Encoding

- Apply one-hot encoding to the remaining categorical columns, because their values have no natural order or ranking.

In [22]:
# Collect the remaining text (object-dtype) columns for one-hot encoding.
# The target `y` is excluded by name rather than by position, so a change in
# column order cannot silently drop a feature or leave the target in.
categorical_cols = data.select_dtypes('object').drop(columns=['y'])
categorical_cols.head(3)

Unnamed: 0,job,marital,default,housing,loan,contact,month
0,management,married,no,yes,no,unknown,may
1,technician,single,no,yes,no,unknown,may
2,entrepreneur,married,no,yes,yes,unknown,may


In [24]:
from sklearn.preprocessing import OneHotEncoder

# One indicator column per category value; dtype='int' requests integer 0/1 output.
ohe = OneHotEncoder(dtype='int')
encoded_matrix = ohe.fit_transform(categorical_cols)

# Convert the encoder output to a dense array and wrap it in a DataFrame.
# NOTE(review): the resulting columns get default integer labels (0, 1, 2, ...),
# so the category each column stands for is not visible in the frame.
categorical_cols = pd.DataFrame(encoded_matrix.toarray())

In [25]:
# Preview the first three rows of the one-hot encoded frame.
categorical_cols.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [28]:
# Place the one-hot columns to the right of the original frame (column-wise concat).
frames_to_join = [data, categorical_cols]
new_df = pd.concat(frames_to_join, axis='columns')
new_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,26,27,28,29,30,31,32,33,34,35
0,58,management,married,3.0,no,2143,yes,no,unknown,5,...,0,0,0,0,0,0,1,0,0,0
1,44,technician,single,2.0,no,29,yes,no,unknown,5,...,0,0,0,0,0,0,1,0,0,0
2,33,entrepreneur,married,2.0,no,2,yes,yes,unknown,5,...,0,0,0,0,0,0,1,0,0,0
3,47,blue-collar,married,0.0,no,1506,yes,no,unknown,5,...,0,0,0,0,0,0,1,0,0,0
4,33,unknown,single,0.0,no,1,no,no,unknown,5,...,0,0,0,0,0,0,1,0,0,0


### 3. Label Encoding

- Label encoding is used only for the target column.

In [29]:
# Peek at the target column before encoding it.
new_df['y'].head()

0    no
1    no
2    no
3    no
4    no
Name: y, dtype: object

In [30]:
from sklearn.preprocessing import LabelEncoder

# Learn the target's classes first, then replace the text labels with their integer codes.
le = LabelEncoder()
le.fit(new_df['y'])
new_df['y'] = le.transform(new_df['y'])

In [31]:
# Preview the first four rows after label-encoding `y`.
new_df.head(4)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,26,27,28,29,30,31,32,33,34,35
0,58,management,married,3.0,no,2143,yes,no,unknown,5,...,0,0,0,0,0,0,1,0,0,0
1,44,technician,single,2.0,no,29,yes,no,unknown,5,...,0,0,0,0,0,0,1,0,0,0
2,33,entrepreneur,married,2.0,no,2,yes,yes,unknown,5,...,0,0,0,0,0,0,1,0,0,0
3,47,blue-collar,married,0.0,no,1506,yes,no,unknown,5,...,0,0,0,0,0,0,1,0,0,0


In [34]:
# Drop the original text columns: their information now lives in the one-hot
# columns, so the raw categorical versions are no longer needed.
# NOTE(review): 'day' is dropped here as well, although it is numeric and was
# never encoded — confirm this is intended.
columns_to_drop = ['job', 'marital', 'default', 'housing', 'loan', 'contact', 'day', 'month']
original_data = new_df.drop(columns=columns_to_drop)

In [36]:
# Lift the column display limit so every column of the final frame is shown.
pd.options.display.max_columns = None
original_data.head()

Unnamed: 0,age,education,balance,duration,campaign,y,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35
0,58,3.0,2143,261,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
1,44,2.0,29,151,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
2,33,2.0,2,76,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
3,47,0.0,1506,92,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
4,33,0.0,1,198,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
