# <span style='color:red'> One Hot Encoding</span>

## <span style='color:blue'><a href="https://x.com/abhishekdotai/status/1751546176069922976?s=20">Read about OHE in detail</a></span>

In [1]:
import numpy as np
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the used-car listings. The path is relative, so it is resolved against
# the kernel's current working directory.
df = pd.read_csv('cars.csv')

In [3]:
# Preview the first rows to see which columns hold categorical text values.
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [4]:
# Frequency of each `owner` category: every distinct value listed here becomes
# one indicator column when the feature is one-hot encoded.
df['owner'].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

In [6]:
# Frequency of each `fuel` category: every distinct value listed here becomes
# one indicator column when the feature is one-hot encoded.
df['fuel'].value_counts()

fuel
Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: count, dtype: int64

## 1. One Hot Encoding in Pandas

In [10]:
# Expand each listed categorical column into one 0/1 indicator column per
# category; the remaining columns are passed through unchanged.
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
    dtype=int,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


## 2. K-1 One Hot Encoding

## <span style='color:blue'><a href="https://x.com/abhishekdotai/status/1752023753515815154?s=20">Read about Dummy Variable Trap in OHE in detail</a></span>

In [13]:
# K-1 encoding: drop the first category of each feature so the indicator
# columns are no longer perfectly collinear (the "dummy variable trap").
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
    drop_first=True,
    dtype=int,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


## 3. One Hot Encoding using SkLearn

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Features are the first four columns (brand, km_driven, fuel, owner); the
# target is the last column (selling_price).
# Note: `df.iloc[:, -1]` selects the single target column, whereas
# `df.iloc[:, :-1]` would select every column *except* the target, i.e. the
# feature frame again.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, 0:4],
    df.iloc[:, -1],
    test_size=0.2,
    random_state=2,  # fixed seed so the split is reproducible
)

In [16]:
# Check that the training split contains only the feature columns.
X_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
5571,Hyundai,35000,Diesel,First Owner
2038,Jeep,60000,Diesel,First Owner
2957,Hyundai,25000,Petrol,First Owner
7618,Mahindra,130000,Diesel,Second Owner
6684,Hyundai,155000,Diesel,First Owner


In [17]:
# Check that the test split has the same feature columns as the training split.
X_test.head()

Unnamed: 0,brand,km_driven,fuel,owner
606,Hyundai,80000,Petrol,First Owner
7575,Mahindra,70000,Diesel,Second Owner
7705,Toyota,68089,Petrol,First Owner
4305,Hyundai,70000,Petrol,Second Owner
2685,Mahindra,97000,Diesel,Second Owner


In [18]:
from sklearn.preprocessing import OneHotEncoder

In [21]:
# drop='first'        -> K-1 indicator columns per feature (avoids the dummy variable trap)
# sparse_output=False -> return a dense NumPy array rather than a SciPy sparse matrix
# dtype=np.int32      -> store the indicators as integers
ohe = OneHotEncoder(drop = 'first', sparse_output = False, dtype= np.int32)

In [24]:
# Learn the category vocabulary from the training split only.
X_train_new = ohe.fit_transform(X_train[['fuel', 'owner']])

# Reuse the already-fitted encoder on the test split. Re-fitting here would
# leak test information into preprocessing, and if a rare category were
# missing from the test rows the encoded columns would no longer line up with
# the training columns.
X_test_new = ohe.transform(X_test[['fuel', 'owner']])

In [25]:
# Sanity check: both splits must end up with the same number of encoded columns.
X_train_new.shape, X_test_new.shape

((6502, 7), (1626, 7))

In [29]:
# Put the untouched columns and the encoded indicator columns side by side
# to form the final training matrix.
np.hstack((
    X_train.loc[:, ['brand', 'km_driven']].values,
    X_train_new,
))

array([['Hyundai', 35000, 1, ..., 0, 0, 0],
       ['Jeep', 60000, 1, ..., 0, 0, 0],
       ['Hyundai', 25000, 0, ..., 0, 0, 0],
       ...,
       ['Tata', 15000, 0, ..., 0, 0, 0],
       ['Maruti', 32500, 1, ..., 1, 0, 0],
       ['Isuzu', 121000, 1, ..., 0, 0, 0]], dtype=object)

## 4. OHE with Highly Populated Categories

In [31]:
# Number of rows per brand; used to decide which brands are too rare to get
# their own indicator column.
count = df['brand'].value_counts()

In [32]:
# Number of distinct brands, i.e. how many indicator columns a plain one-hot
# encoding of `brand` would create.
df['brand'].nunique()

32

In [33]:
# Brands with at most this many rows are pooled into a single 'uncommon'
# category in the cells that follow.
threshold = 100

In [36]:
# Labels of the rare brands: those whose row count does not exceed the threshold.
repl = count.loc[count <= threshold].index

In [38]:
# Collapse every rare brand into the single label 'uncommon', one-hot encode
# the result, and show a random sample of 10 rows.
pd.get_dummies(
    df['brand'].replace(to_replace=repl, value='uncommon'),
    dtype=int,
).sample(n=10)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
4937,0,0,0,0,0,0,1,0,0,0,0,0,0
324,0,0,0,0,0,0,1,0,0,0,0,0,0
5845,0,0,0,0,0,0,0,1,0,0,0,0,0
7835,0,0,0,0,0,0,0,0,0,0,0,1,0
2322,0,0,0,0,0,0,0,0,0,0,0,0,1
4529,0,0,0,0,1,0,0,0,0,0,0,0,0
2156,0,0,0,0,1,0,0,0,0,0,0,0,0
2372,0,0,1,0,0,0,0,0,0,0,0,0,0
2489,0,0,0,0,0,0,1,0,0,0,0,0,0
5418,0,0,0,0,0,0,0,1,0,0,0,0,0
