# Feature Encoding

This will demonstrate how to use One-Hot and Label Encoding.

## Importing and loading data

In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline

# Report the installed pandas and numpy versions, one per line
for library in (pd, np):
    print(library.__version__)

2.1.3
1.26.1


In [2]:
# Read the training set from the CSV file into a DataFrame
train_csv_path = 'datasets/train_bm.csv'
data = pd.read_csv(train_csv_path)

# Preview the first five rows
data.head(5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [3]:
# Gather the names of the object-dtype columns; these are the ones to encode
object_columns = data.select_dtypes(include='object')
cat_cols = object_columns.columns
print(cat_cols)

Index(['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier',
       'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'],
      dtype='object')


## One-Hot Encoding

One hot encoding is a technique that we use to represent categorical variables as numerical values in a machine learning model.

### One-Hot Encoding for a single variable

In [4]:
# One-hot encode only the outlet type column (integer 0/1 indicators) and preview it
outlet_type_dummies = pd.get_dummies(data['Outlet_Type'], dtype=int)
outlet_type_dummies.head()

Unnamed: 0,Grocery Store,Supermarket Type1,Supermarket Type2,Supermarket Type3
0,0,1,0,0
1,0,0,1,0
2,0,1,0,0
3,1,0,0,0
4,0,1,0,0


### One-Hot Encoding for all variables 

In [5]:
# One-hot encode the whole frame: get_dummies expands the categorical (object)
# columns into 0/1 indicator columns and passes the numeric columns through unchanged
data_encoded = pd.get_dummies(data=data, dtype=int)

# Preview the first five rows of the encoded frame
data_encoded.head(5)

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales,Item_Identifier_DRA12,Item_Identifier_DRA24,Item_Identifier_DRA59,Item_Identifier_DRB01,Item_Identifier_DRB13,...,Outlet_Size_High,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,9.3,0.016047,249.8092,1999,3735.138,0,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
1,5.92,0.019278,48.2692,2009,443.4228,0,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0
2,17.5,0.01676,141.618,1999,2097.27,0,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
3,19.2,0.0,182.095,1998,732.38,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
4,8.93,0.0,53.8614,1987,994.7052,0,0,0,0,0,...,1,0,0,0,0,1,0,1,0,0


#### **Problem 1:**
- The newly created variables *'Outlet_Size_High, Outlet_Size_Medium, Outlet_Size_Small'*, the order between these variables is destroyed. 
- As a result we are missing out on some important information.

**Solution:** We can use Label Encoding instead of One-Hot Encoding.

## Label Encoding

- Label Encoding is a technique that is used to convert categorical columns into numerical ones.
- It is an important pre-processing step in a machine-learning project.

In [6]:
# Bring in scikit-learn's LabelEncoder
from sklearn.preprocessing import LabelEncoder

# Encode the three outlet sizes; the result shows that LabelEncoder numbers the
# classes in alphabetical order (High -> 0, Medium -> 1, Small -> 2)
size_labels = ['Small', 'Medium', 'High']
le = LabelEncoder()
le.fit_transform(size_labels)

array([2, 1, 0], dtype=int64)

In [8]:
# LabelEncoder numbers classes alphabetically (High=0, Medium=1, Small=2), which does
# not reflect the real Small < Medium < High order, so the ranks are assigned explicitly.
OUTLET_SIZE_RANK = {'Small': 0, 'Medium': 1, 'High': 2}

# Only map while the column still holds the raw labels: mapping an already-encoded
# column would look up 0/1/2 in the dictionary and turn every value into NaN,
# so this guard keeps the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(data['Outlet_Size']):
    data['Outlet_Size'] = data['Outlet_Size'].map(OUTLET_SIZE_RANK)

# Checking the data (rows with a missing size stay NaN, hence the float64 dtype)
data['Outlet_Size'].head()

0    1.0
1    1.0
2    1.0
3    NaN
4    2.0
Name: Outlet_Size, dtype: float64

#### **Problem 2:** 

The number of features has increased from 12 to 1600, where maximum values are 0.

**Solution:** Combine the sparse classes.

In [9]:
# Compare the dimensions of the original frame and its one-hot-encoded version
original_shape = data.shape
encoded_shape = data_encoded.shape
print(original_shape, encoded_shape)

(8523, 12) (8523, 1605)


In [10]:
# Count the distinct values in every column
unique_counts = data.nunique()
print(unique_counts)

Item_Identifier              1559
Item_Weight                   415
Item_Fat_Content                5
Item_Visibility              7880
Item_Type                      16
Item_MRP                     5938
Outlet_Identifier              10
Outlet_Establishment_Year       9
Outlet_Size                     3
Outlet_Location_Type            3
Outlet_Type                     4
Item_Outlet_Sales            3493
dtype: int64


In [12]:
# Count how often each item identifier occurs and show the most frequent ones
item_frequency = data['Item_Identifier'].value_counts()
item_frequency.head()

Item_Identifier
FDW13    10
FDG33    10
NCY18     9
FDD38     9
DRE49     9
Name: count, dtype: int64

In [14]:
# Keep the identifier frequencies in a Series (indexed by Item_Identifier)
# so that later cells can look up the count of any identifier
identifier_column = data['Item_Identifier']
temp = identifier_column.value_counts()
temp.head(n=5)

Item_Identifier
FDW13    10
FDG33    10
NCY18     9
FDD38     9
DRE49     9
Name: count, dtype: int64

In [15]:
# Attach each row's identifier frequency by looking the identifier up in `temp`
# (the value_counts() Series indexed by Item_Identifier). Series.map performs the
# lookup in one vectorized pass instead of calling a Python lambda for every row.
data['Item_identifier_count'] = data['Item_Identifier'].map(temp)

# Checking the data
data[['Item_Identifier','Item_identifier_count']].head()

Unnamed: 0,Item_Identifier,Item_identifier_count
0,FDA15,8
1,DRC01,6
2,FDN15,7
3,FDX07,6
4,NCD19,6


In [19]:
# Combine the sparse classes: every identifier that occurs fewer than
# MIN_ITEM_COUNT times is replaced by the single label 'other'.
MIN_ITEM_COUNT = 4

# A boolean mask with one .loc assignment replaces the row-by-row loop. The previous
# chained form `data['Item_Identifier'][i] = ...` assigns through an intermediate
# object, which triggers SettingWithCopyWarning and is not guaranteed to write back
# into `data` (it does not under pandas Copy-on-Write).
is_rare_item = data['Item_identifier_count'] < MIN_ITEM_COUNT
data.loc[is_rare_item, 'Item_Identifier'] = 'other'

data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_identifier_count
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,1.0,Tier 1,Supermarket Type1,3735.138,8
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,1.0,Tier 3,Supermarket Type2,443.4228,6
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,1.0,Tier 1,Supermarket Type1,2097.27,7
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38,6
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,2.0,Tier 3,Supermarket Type1,994.7052,6


In [21]:
# Recount the identifiers to confirm the rare ones were merged into 'other'
item_frequency_after = data['Item_Identifier'].value_counts()
item_frequency_after.head()

Item_Identifier
other    418
FDG33     10
FDW13     10
FDW26      9
NCY18      9
Name: count, dtype: int64