In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the car dataset; 'cars.csv' is resolved relative to the notebook's working directory.
df = pd.read_csv('cars.csv')

In [3]:
# Peek at five randomly chosen rows to get a feel for the columns.
df.sample(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
5366,Hyundai,80000,Diesel,First Owner,720000
6355,Honda,100000,Petrol,First Owner,250000
2432,Maruti,80000,Diesel,Second Owner,640000
4398,Tata,53000,Diesel,First Owner,980000
2569,Renault,15000,Petrol,First Owner,380000


In [4]:
# How many rows fall into each 'owner' category.
df['owner'].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

## 1. One Hot Encoding using Pandas

In [5]:
# One indicator column per category level (K columns for K levels);
# the original 'fuel' and 'owner' columns are replaced by their indicators.
pd.get_dummies(data=df, columns=['fuel', 'owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


## K-1 one hot encoding

In [6]:
# Same encoding, but drop_first=True removes the first level of each encoded
# column, leaving K-1 indicator columns per feature.
pd.get_dummies(data=df, columns=['fuel', 'owner'], drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


## One Hot Encoding using Sklearn

In [7]:
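# pandas.get_dummies does not remember which categories it has seen: it builds its columns
# from whatever values are present in the frame it is given, so train and test data can end
# up with different columns. sklearn's OneHotEncoder learns the categories in fit() and
# reuses them in transform(), which is why we use sklearn here.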

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :4],   # features: the first four columns
    df.iloc[:, -1],   # target: the last column
    test_size=0.2,
    random_state=2,
)

In [9]:
# First five rows of the training features.
X_train.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner
5571,Hyundai,35000,Diesel,First Owner
2038,Jeep,60000,Diesel,First Owner
2957,Hyundai,25000,Petrol,First Owner
7618,Mahindra,130000,Diesel,Second Owner
6684,Hyundai,155000,Diesel,First Owner


In [10]:
# First five rows of the held-out test features.
X_test.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner
606,Hyundai,80000,Petrol,First Owner
7575,Mahindra,70000,Diesel,Second Owner
7705,Toyota,68089,Petrol,First Owner
4305,Hyundai,70000,Petrol,Second Owner
2685,Mahindra,97000,Diesel,Second Owner


In [11]:
from sklearn.preprocessing import OneHotEncoder

In [14]:
# drop='first' omits the first category of every feature (K-1 encoding), and
# dtype=np.int32 emits 0/1 integers instead of the default floats.
# sparse_output=False makes the encoder return a dense NumPy array, so we don't
# need to call .toarray() on X_train_new / X_test_new afterwards.
# NOTE: the older `sparse=` keyword was deprecated in scikit-learn 1.2 and
# removed in 1.4; `sparse_output=` is its replacement.
ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32)

In [15]:
# Learn the category levels from the training split only, and encode it in the same step.
X_train_new = ohe.fit_transform(X_train[['fuel','owner']])



In [16]:
# Reuse the already-fitted encoder (transform, not fit_transform) so the test split
# is encoded with the same columns that were learned from the training split.
X_test_new = ohe.transform(X_test[['fuel','owner']])

In [17]:
# (rows, encoded columns) of the encoded training matrix.
X_train_new.shape

(6502, 7)

In [18]:
# Raw (un-encoded) training features as a NumPy array, for comparison with the
# encoded matrix shown further down.
X_train.to_numpy()

array([['Hyundai', 35000, 'Diesel', 'First Owner'],
       ['Jeep', 60000, 'Diesel', 'First Owner'],
       ['Hyundai', 25000, 'Petrol', 'First Owner'],
       ...,
       ['Tata', 15000, 'Petrol', 'First Owner'],
       ['Maruti', 32500, 'Diesel', 'Second Owner'],
       ['Isuzu', 121000, 'Diesel', 'First Owner']], dtype=object)

In [20]:
# Encoded 'fuel'/'owner' indicators for the training split.
X_train_new

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [22]:
# Place the untouched columns (brand, km_driven) side by side with the encoded
# indicator columns. Note that the result is a NumPy array, not a DataFrame.
np.hstack([
    X_train[['brand', 'km_driven']].to_numpy(),
    X_train_new,
])

array([['Hyundai', 35000, 1, ..., 0, 0, 0],
       ['Jeep', 60000, 1, ..., 0, 0, 0],
       ['Hyundai', 25000, 0, ..., 0, 0, 0],
       ...,
       ['Tata', 15000, 0, ..., 0, 0, 0],
       ['Maruti', 32500, 1, ..., 1, 0, 0],
       ['Isuzu', 121000, 1, ..., 0, 0, 0]], dtype=object)

## One Hot Encoding with top Categories

In [24]:
# Number of rows per brand (value_counts orders them from most to least frequent by default).
counts = df['brand'].value_counts()

In [25]:
# Number of distinct brands in the dataset.
df['brand'].nunique()

32

In [26]:
# Cut-off on rows per brand: the cells below select brands with at most this
# many rows and group them into a single category before encoding.
threshold = 100

In [30]:
# Labels of the brands whose row count does not exceed the threshold.
repl = counts.loc[counts.le(threshold)].index

In [31]:
# Row counts of those infrequent brands.
counts.loc[counts.le(threshold)].to_numpy()

array([81, 71, 67, 65, 54, 47, 40, 34, 31, 14,  6,  6,  5,  4,  4,  3,  3,
        1,  1,  1], dtype=int64)

In [33]:
# Collapse every infrequent brand into one shared label, one-hot encode the
# result, and show five random rows of the encoded frame.
(
    df['brand']
    .replace(repl, 'uncomman')
    .pipe(pd.get_dummies)
    .sample(5)
)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncomman
6855,False,False,False,False,False,True,False,False,False,False,False,False,False
1618,False,False,False,False,False,False,True,False,False,False,False,False,False
4063,False,False,False,False,True,False,False,False,False,False,False,False,False
1766,False,False,False,False,True,False,False,False,False,False,False,False,False
1136,False,False,False,False,False,False,False,False,False,False,False,False,True
