#### ONE HOT ENCODING

In [None]:
import numpy as np
import pandas as pd


In [None]:
# Load the cars dataset; 'cars.csv' is a relative path, resolved against the working directory.
df = pd.read_csv('cars.csv')

In [None]:
# Preview the first five rows to see which columns are categorical.
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [None]:
# Frequency of each brand (value_counts lists the most frequent first).
df['brand'].value_counts()

Unnamed: 0_level_0,count
brand,Unnamed: 1_level_1
Maruti,2448
Hyundai,1415
Mahindra,772
Tata,734
Toyota,488
Honda,467
Ford,397
Chevrolet,230
Renault,228
Volkswagen,186




> `Series.nunique(dropna=True)`
> Return number of unique elements in the object.



In [None]:
# Number of distinct brands (NaN is excluded because dropna defaults to True).
df['brand'].nunique()

32

In [None]:
# Frequency of each owner category.
df['owner'].value_counts()

Unnamed: 0_level_0,count
owner,Unnamed: 1_level_1
First Owner,5289
Second Owner,2105
Third Owner,555
Fourth & Above Owner,174
Test Drive Car,5


In [None]:
# Frequency of each fuel category.
df['fuel'].value_counts()

Unnamed: 0_level_0,count
fuel,Unnamed: 1_level_1
Diesel,4402
Petrol,3631
CNG,57
LPG,38


##### One hot encoding using pandas

> pandas.get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None)

In [None]:
# Only the columns named in the list are one-hot encoded; every other
# column of df is carried over unchanged.
categorical_cols = ['fuel', 'owner']
pd.get_dummies(df, columns=categorical_cols)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


##### K-1 one hot encoding

> The first dummy column of each encoded feature is dropped (k categories -> k-1 columns) to avoid multicollinearity between the dummy variables.

In [None]:
# drop_first=True removes the first dummy of each encoded column,
# leaving k-1 dummy columns for a feature with k categories.
categorical_cols = ['fuel', 'owner']
pd.get_dummies(df, columns=categorical_cols, drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


Limitation of pandas `get_dummies`: it doesn't remember the dummy columns (and their positions) that it created, so the same encoding cannot be reliably re-applied to new data.

That is why we use the sklearn class `OneHotEncoder`.

##### One hot encoding using sklearn

In [None]:
# Reminder of the raw columns before splitting into features and target.
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [None]:
from sklearn.model_selection import train_test_split

# First four columns are the features; the last column is the target.
features = df.iloc[:, 0:4]
target = df.iloc[:, -1]

# Hold out 20% of the rows for testing; fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=13,
)

> class sklearn.preprocessing.OneHotEncoder(*, categories='auto', drop=None, sparse_output=True, dtype=<class 'numpy.float64'>, handle_unknown='error', min_frequency=None, max_categories=None, feature_name_combiner='concat')

In [None]:
from sklearn.preprocessing import OneHotEncoder

# drop='first'        -> the first category of every encoded column is dropped
# sparse_output=False -> a dense numpy array is returned, so no .toarray() call is needed
# dtype=np.int32      -> dummies are stored as integers instead of the default float64
ohe = OneHotEncoder(
    drop='first',
    sparse_output=False,
    dtype=np.int32,
)


In [None]:
# Preview the training features; the index shows the rows were shuffled by the split.
X_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
3612,Maruti,50000,Petrol,First Owner
2186,Mahindra,65000,Diesel,Second Owner
2943,Toyota,25000,Diesel,Second Owner
1896,Maruti,60000,Diesel,Second Owner
4846,Renault,45000,Petrol,Second Owner


In [None]:
# The two categorical columns that will be handed to the encoder.
X_train.loc[:, ['fuel', 'owner']]

Unnamed: 0,fuel,owner
3612,Petrol,First Owner
2186,Diesel,Second Owner
2943,Diesel,Second Owner
1896,Diesel,Second Owner
4846,Petrol,Second Owner
...,...,...
2790,Diesel,Second Owner
7696,Petrol,First Owner
74,Diesel,First Owner
6320,Petrol,Second Owner


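`OneHotEncoder` produces a sparse matrix by default (`sparse_output=True`).

`fuel` has 4 categories and `owner` has 5, i.e. 4 + 5 = 9 dummy columns in total.

With `drop='first'`, the first category of each column is dropped, leaving 3 + 4 = 7 dummy columns.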

In [None]:
# Learn the categories from the training split and encode it in one step.
train_categoricals = X_train[['fuel', 'owner']]
ohe.fit_transform(train_categoricals)

array([[0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [None]:
# Check what container the encoder returns with the current settings.
encoded_preview = ohe.fit_transform(X_train[['fuel', 'owner']])
type(encoded_preview)

numpy.ndarray

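With the default `sparse_output=True`, `fit_transform` returns an object of class `csr_matrix`, which has a method `toarray()`; here we set `sparse_output=False`, so a dense `numpy.ndarray` is returned directly.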

> `scipy.sparse.csr_matrix.toarray(order=None, out=None)`
> Return a dense ndarray representation of this sparse array/matrix.

In [None]:
encode_cols = ['fuel', 'owner']

# Fit on the training split only ...
X_train_new = ohe.fit_transform(X_train[encode_cols])
# ... and reuse the categories learned there to encode the test split.
X_test_new = ohe.transform(X_test[encode_cols])

In [None]:
# Encoded training categoricals: a dense int32 array (see the encoder settings).
X_train_new

array([[0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [None]:
# The columns that were not encoded, as a plain numpy array.
X_train[['brand', 'km_driven']].to_numpy()

array([['Maruti', 50000],
       ['Mahindra', 65000],
       ['Toyota', 25000],
       ...,
       ['Maruti', 70000],
       ['Maruti', 70000],
       ['Maruti', 90000]], dtype=object)

horizontally stack them

In [None]:
# np.hstack takes one tuple of arrays and joins them column-wise:
# the untouched columns first, then the one-hot encoded columns.
untouched_cols = X_train[['brand', 'km_driven']].values
np.hstack((untouched_cols, X_train_new))

array([['Maruti', 50000, 0, ..., 0, 0, 0],
       ['Mahindra', 65000, 1, ..., 1, 0, 0],
       ['Toyota', 25000, 1, ..., 1, 0, 0],
       ...,
       ['Maruti', 70000, 1, ..., 0, 0, 0],
       ['Maruti', 70000, 0, ..., 1, 0, 0],
       ['Maruti', 90000, 1, ..., 0, 0, 0]], dtype=object)

In [None]:
# Rows x (2 untouched columns + encoded dummy columns).
stacked_train = np.hstack((X_train[['brand', 'km_driven']], X_train_new))
stacked_train.shape

(6502, 9)

##### **One hot encoding with top categories**

In [None]:
# NOTE(review): pandas and numpy are already imported in the first cell; they are
# repeated here, presumably so this section can be run on its own - confirm.
import pandas as pd
import numpy as np
# Re-read cars.csv into df for this section.
df = pd.read_csv('cars.csv')

In [None]:
# Preview the reloaded frame.
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [None]:
# 'brand' has many categories, so count how often each one occurs;
# the counts are used to decide which brands are frequent enough to keep.
counts = df['brand'].value_counts()

In [None]:
# Brands occurring at most `threshold` times are flagged True.
threshold = 100
counts.le(threshold)


Unnamed: 0_level_0,count
brand,Unnamed: 1_level_1
Maruti,False
Hyundai,False
Mahindra,False
Tata,False
Toyota,False
Honda,False
Ford,False
Chevrolet,False
Renault,False
Volkswagen,False


In [None]:
# Names of the brands whose count does not exceed the threshold;
# these are the ones that get replaced by a single bucket.
is_rare = counts <= threshold
repl = counts[is_rare].index
repl

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Land', 'Force', 'Isuzu', 'Ambassador',
       'Kia', 'MG', 'Daewoo', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object', name='brand')

In [None]:
# Collapse every brand listed in `repl` into a single 'uncommon' bucket,
# then one-hot encode the reduced set of categories as int32 columns.
brand_grouped = df['brand'].replace(repl, 'uncommon')

# Fixed seed so the five previewed rows are the same on every run
# (13 is the seed already used for the train/test split in this notebook).
pd.get_dummies(brand_grouped, dtype=np.int32).sample(5, random_state=13)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
5012,0,0,0,0,1,0,0,0,0,0,0,0,0
7124,0,0,0,0,0,0,0,0,0,0,0,0,1
7384,0,0,0,0,0,0,1,0,0,0,0,0,0
5477,0,0,1,0,0,0,0,0,0,0,0,0,0
4791,0,0,0,0,0,0,0,0,0,0,1,0,0
