# Encoding categorical data

In [76]:
import pandas as pd
import numpy as np


In [77]:
# Load the customer CSV used by the ordinal- and label-encoding sections below.
url = "https://raw.githubusercontent.com/campusx-official/100-days-of-machine-learning/main/day26-ordinal-encoding/customer.csv"
da = pd.read_csv(filepath_or_buffer=url)

In [78]:
# Keep everything from the third column onward, then preview five random rows.
new = da.iloc[:, 2:]
new.sample(n=5)

Unnamed: 0,review,education,purchased
26,Poor,PG,No
24,Average,PG,Yes
11,Good,UG,Yes
12,Poor,School,No
17,Poor,UG,Yes


In [79]:
# Ordinal encoding will be applied to the `review` and `education` inputs,
# and label encoding to the `purchased` target.

from sklearn.model_selection import train_test_split

# The first two columns are the inputs and the last column is the target.
# random_state pins the shuffle so the split, and every output derived from
# it, is the same on each Restart & Run All.
xtrain,xtest,ytrain,ytest=train_test_split(
    new.iloc[:,0:2],new.iloc[:,-1],test_size=0.2,random_state=0
)
xtrain.head(5)

Unnamed: 0,review,education
47,Good,PG
33,Good,PG
18,Good,School
14,Poor,PG
12,Poor,School


## Ordinal Encoding (input data)

In [80]:

from sklearn.preprocessing import OrdinalEncoder

In [81]:
# Explicit low-to-high order for each input column, listed in column order.
review_levels = ['Poor', 'Average', 'Good']
education_levels = ['School', 'UG', 'PG']

oe = OrdinalEncoder(categories=[review_levels, education_levels])

# Fit on the training inputs only.
oe.fit(xtrain)

OrdinalEncoder(categories=[['Poor', 'Average', 'Good'], ['School', 'UG', 'PG']])

In [82]:
# Encode both splits with the encoder that was fitted on the training inputs.
xtr, xte = oe.transform(xtrain), oe.transform(xtest)

In [83]:
# First six encoded training rows.
xtr[:6]

array([[2., 2.],
       [2., 2.],
       [2., 0.],
       [0., 2.],
       [0., 0.],
       [1., 0.]])

In [84]:
# Category order the fitted encoder uses for each column.
oe.categories_

[array(['Poor', 'Average', 'Good'], dtype=object),
 array(['School', 'UG', 'PG'], dtype=object)]

In [85]:
# Wrap the encoded training array in a DataFrame. The array was produced from
# xtrain, so the column labels come from xtrain rather than from xtest.
xtr_df=pd.DataFrame(xtr, columns=xtrain.columns)
xtr_df.head(5)

Unnamed: 0,review,education
0,2.0,2.0
1,2.0,2.0
2,2.0,0.0
3,0.0,2.0
4,0.0,0.0


## Label Encoding (applied to the target / output variable y)

In [86]:
from sklearn.preprocessing import LabelEncoder

In [87]:
# Fit the label encoder on the training target, then encode both splits.
le = LabelEncoder().fit(ytrain)
ytr, yte = le.transform(ytrain), le.transform(ytest)

# Classes learned during fit.
le.classes_

array(['No', 'Yes'], dtype=object)

In [88]:
# Encoded training labels.
ytr

array([1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1])

# One hot encoding - for Nominal data

In [89]:
# Load the cars CSV used by the one-hot encoding examples.
cars = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/campusx-official/100-days-of-machine-learning/main/day27-one-hot-encoding/cars.csv'
)

In [90]:
# Display the loaded frame.
cars

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000
...,...,...,...,...,...
8123,Hyundai,110000,Petrol,First Owner,320000
8124,Hyundai,119000,Diesel,Fourth & Above Owner,135000
8125,Maruti,120000,Diesel,First Owner,382000
8126,Tata,25000,Diesel,First Owner,290000


In [91]:
# Number of rows per brand.
cars['brand'].value_counts()

Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: brand, dtype: int64

In [92]:
# Distinct values in the `owner` column.
cars['owner'].unique()

array(['First Owner', 'Second Owner', 'Third Owner',
       'Fourth & Above Owner', 'Test Drive Car'], dtype=object)

In [93]:
# Distinct values in the `fuel` column.
cars['fuel'].unique()

array(['Diesel', 'Petrol', 'LPG', 'CNG'], dtype=object)

### 1. One-hot encoding using pandas

In [94]:
# One-hot encode `fuel` and `owner`; the other columns are left as they are.
pd.get_dummies(data=cars, columns=['fuel', 'owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


### dummy encoding (k-1 encoding)

In [95]:
# Same encoding, but the first category of each encoded column is dropped (k-1 indicators).
pd.get_dummies(data=cars, columns=['fuel', 'owner'], drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


### 2. One-hot encoding using sklearn (this should be used instead of encoding with pandas)

In [96]:
from sklearn.model_selection import train_test_split

In [97]:
# Preview five random rows.
cars.sample(5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
2828,Audi,26000,Diesel,First Owner,2800000
728,Nissan,25000,Petrol,Second Owner,350000
4031,Hyundai,90000,Petrol,Second Owner,434999
1524,Maruti,50000,Petrol,Second Owner,300000
6739,Mahindra,15000,Diesel,First Owner,1200000


In [98]:
# The first four columns are the inputs; the last column is the target.
# The seed is fixed so the split is repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    cars.iloc[:, 0:4],
    cars.iloc[:, -1],
    test_size=0.2,
    random_state=0,
)
xtrain.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner
3042,Hyundai,60000,LPG,First Owner
1520,Tata,150000,Diesel,Third Owner
2611,Hyundai,110000,Diesel,Second Owner
3544,Mahindra,28000,Diesel,Second Owner
4138,Maruti,15000,Petrol,First Owner


In [99]:
from sklearn.preprocessing import OneHotEncoder

In [100]:
ohe = OneHotEncoder()

# `fuel` and `owner` are encoded together; the result still has to be joined
# back with the remaining columns of the table.
nominal_cols = ['fuel', 'owner']
ohe.fit_transform(xtrain[nominal_cols])

<6502x9 sparse matrix of type '<class 'numpy.float64'>'
	with 13004 stored elements in Compressed Sparse Row format>

In [101]:
# The previous cell returns a sparse matrix; .toarray() converts it to a dense
# NumPy array so the encoded values can be inspected.
ohe.fit_transform(xtrain[['fuel','owner']]).toarray()

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [102]:
# Fit the encoder on the training split only, then reuse the fitted encoder
# for the test split. Re-fitting on xtest would learn the categories from the
# test rows, so the test columns could differ in number or order from the
# training columns.
# NOTE: with the default handle_unknown='error', transform raises if xtest has
# a category that was not present in xtrain.
xtrain_new=ohe.fit_transform(xtrain[['fuel','owner']]).toarray()
xtest_new=ohe.transform(xtest[['fuel','owner']]).toarray()

In [103]:
# Put the untouched columns (brand, km_driven) in front of the one-hot columns.
passthrough = xtrain[['brand', 'km_driven']].values
xtrain_en = np.hstack([passthrough, xtrain_new])

In [104]:
# View the stacked array as a DataFrame (columns get default integer labels).
pd.DataFrame(xtrain_en)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Hyundai,60000,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,Tata,150000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,Hyundai,110000,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,Mahindra,28000,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,Maruti,15000,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
6497,Tata,70000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6498,Ford,100000,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6499,Hyundai,90000,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
6500,Volkswagen,90000,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


### dummy encoding (k-1 encoding)

In [105]:
# Encoder for dummy (k-1) encoding: drop='first' drops the first category of each feature.
den=OneHotEncoder(drop='first')

In [112]:
# Fit on the training split's `fuel` and `owner` columns and convert the result to a dense array.
xtrain_n=den.fit_transform(xtrain[['fuel','owner']]).toarray()

In [113]:
# Stack brand and km_driven in front of the k-1 encoded columns.
# NOTE: this rebinds xtrain_en, replacing the array built from the full one-hot encoding.
xtrain_en = np.hstack([xtrain[['brand', 'km_driven']].values, xtrain_n])

In [115]:
# Shape of the stacked training array.
xtrain_en.shape

(6502, 9)

In [131]:
# Display the stacked training array.
xtrain_en

array([['Hyundai', 60000, 0.0, ..., 0.0, 0.0, 0.0],
       ['Tata', 150000, 1.0, ..., 0.0, 0.0, 1.0],
       ['Hyundai', 110000, 1.0, ..., 1.0, 0.0, 0.0],
       ...,
       ['Hyundai', 90000, 0.0, ..., 1.0, 0.0, 0.0],
       ['Volkswagen', 90000, 1.0, ..., 0.0, 0.0, 0.0],
       ['Hyundai', 110000, 0.0, ..., 0.0, 0.0, 0.0]], dtype=object)

## OneHot Encoding with top categories

In [116]:
# Number of rows per brand, used to decide which brands to keep as their own category.
cars['brand'].value_counts()

Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: brand, dtype: int64

In [120]:
# Only brands with more than `threshold` rows will be encoded individually.
threshold = 100

counts = cars['brand'].value_counts()

In [130]:
# Brands whose row count is at or below the threshold.
rep = counts.loc[counts.le(threshold)].index
rep

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object')

In [129]:
# Collapse the infrequent brands into a single 'others' label, then one-hot encode.
grouped_brand = cars['brand'].replace(to_replace=rep, value='others')
pd.get_dummies(grouped_brand)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,others
0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,0,0,0,0,1,0,0,0,0,0,0,0,0
8124,0,0,0,0,1,0,0,0,0,0,0,0,0
8125,0,0,0,0,0,0,1,0,0,0,0,0,0
8126,0,0,0,0,0,0,0,0,0,1,0,0,0
