In [49]:
import pandas as pd
import numpy as np

In [50]:
df = pd.read_csv('cars.csv')

In [51]:
# Peek at 5 randomly chosen rows; no random_state is passed, so a different set of rows is shown on every run.
df.sample(5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
6202,Maruti,20000,Petrol,First Owner,550000
4777,Mahindra,11500,Diesel,First Owner,819999
4679,Mahindra,60000,Diesel,Second Owner,540000
2651,Maruti,17000,Petrol,First Owner,350000
1827,Ford,93468,Diesel,Second Owner,975000


In [52]:
# Frequency of every category in each of the three categorical columns
# (brand, fuel, owner), printed one after another under its own heading.
for heading, column in (
    ("total number of cars in each brand:\n", 'brand'),
    ("\n\nfuel types: \n", 'fuel'),
    ("\n\n total owners list: \n", 'owner'),
):
    print(heading, df[column].value_counts())

total number of cars in each brand:
 Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: brand, dtype: int64


fuel types: 
 Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: fuel, dtype: int64


 total owners list: 
 First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner 

In [53]:
# Number of distinct categories in each categorical column.

for label, column in (
    ("total number of brands: ", 'brand'),
    ("total number of fuel types: ", 'fuel'),
    ("total number of owners: ", 'owner'),
):
    print(label, df[column].nunique())

total number of brands:  32
total number of fuel types:  4
total number of owners:  5


## **One hot encoding using Pandas - get_dummies()**

In [54]:
# One-hot encode only 'fuel' and 'owner'; the other columns are kept as they are.
# The result is only displayed here - df itself is not modified.
pd.get_dummies(df, columns =['fuel','owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


### **K-1 One hot encoding - Removing the first dummy column of each encoded variable**

In [55]:
# drop_first=True removes the first dummy of each encoded column
# (here fuel_CNG and owner_First Owner), leaving k-1 dummies per variable.
pd.get_dummies(df, columns =['fuel','owner'], drop_first = True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


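We don't use pandas `get_dummies()` here because it does not remember which categories (and therefore which columns, in which order) it produced, so encoding new data later can yield a different set or sequence of columns. Hence we use scikit-learn's `OneHotEncoder` class, which learns the categories once with `fit` and then applies the same encoding with `transform`.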

## **One hot encoding using Sklearn**

In [56]:
from sklearn.model_selection import train_test_split

# Features are the first four columns; the target is the last column.
features = df.iloc[:, 0:4]
target = df.iloc[:, -1]

# Hold out 20% of the rows for testing; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=2,
)

In [57]:
X_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
5571,Hyundai,35000,Diesel,First Owner
2038,Jeep,60000,Diesel,First Owner
2957,Hyundai,25000,Petrol,First Owner
7618,Mahindra,130000,Diesel,Second Owner
6684,Hyundai,155000,Diesel,First Owner


In [58]:
X_test.head()

Unnamed: 0,brand,km_driven,fuel,owner
606,Hyundai,80000,Petrol,First Owner
7575,Mahindra,70000,Diesel,Second Owner
7705,Toyota,68089,Petrol,First Owner
4305,Hyundai,70000,Petrol,Second Owner
2685,Mahindra,97000,Diesel,Second Owner


In [59]:
from sklearn.preprocessing import OneHotEncoder

In [67]:
# drop='first' drops the first dummy variable of every encoded column,
# which avoids the dummy variable trap.
ohe = OneHotEncoder(drop='first')

Here we first apply the encoding to fuel and owner, and then join the encoded columns back with brand and km_driven.

Applying encoding on fuel and owner

In [68]:
# Fit the encoder on the training data and encode 'fuel' and 'owner'.
# fit_transform() yields a sparse matrix; .toarray() converts it, so X_train_new is a dense NumPy array.

X_train_new = ohe.fit_transform(X_train[['fuel','owner']]).toarray()
X_train_new

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [69]:
# Encode the test data with the already-fitted encoder: transform() only (no re-fitting),
# so the test columns match the training columns. .toarray() makes X_test_new a dense NumPy array.

X_test_new = ohe.transform(X_test[['fuel','owner']]).toarray()
X_test_new

array([[0., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

Joining Fuel and Owner back to Brand and Km_driven

In [70]:
# Convert the un-encoded columns (brand, km_driven) to a NumPy array first,
# so they can be horizontally stacked with the encoded fuel/owner array (done in the next cell).

X_train[['brand','km_driven']].values

array([['Hyundai', 35000],
       ['Jeep', 60000],
       ['Hyundai', 25000],
       ...,
       ['Tata', 15000],
       ['Maruti', 32500],
       ['Isuzu', 121000]], dtype=object)

In [71]:
# Horizontally stack brand, km_driven and the newly encoded fuel/owner columns.
# The stacked array is only displayed here; it is not assigned to a variable.

np.hstack((X_train[['brand','km_driven']].values,X_train_new))

array([['Hyundai', 35000, 1.0, ..., 0.0, 0.0, 0.0],
       ['Jeep', 60000, 1.0, ..., 0.0, 0.0, 0.0],
       ['Hyundai', 25000, 0.0, ..., 0.0, 0.0, 0.0],
       ...,
       ['Tata', 15000, 0.0, ..., 0.0, 0.0, 0.0],
       ['Maruti', 32500, 1.0, ..., 1.0, 0.0, 0.0],
       ['Isuzu', 121000, 1.0, ..., 0.0, 0.0, 0.0]], dtype=object)

**Now performing final step - OHE the brand category with top categories**

In [76]:
# Number of cars per brand, sorted from most to least frequent.
df['brand'].value_counts()

Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: brand, dtype: int64

Here, let's say we take a threshold of 100, such that each brand having more than 100 cars forms its own dummy variable, and all the brands having 100 or fewer cars are grouped together into a single dummy variable (say - '**uncommon**').

In [77]:
# Cut-off used below: brands with at most this many cars are grouped as 'uncommon'.
threshold = 100

# Number of distinct brands. The method must be *called* (with parentheses) to
# get the count; it is the last expression so the notebook displays the value.
df['brand'].nunique()

In [80]:
# Cars per brand; every brand with at most `threshold` cars is collected in
# `repl` so it can later be replaced by a single 'uncommon' label.
counts = df['brand'].value_counts()
rare_mask = counts.le(threshold)
repl = counts.loc[rare_mask].index
repl

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object')

In [83]:
# Collapse every infrequent brand listed in `repl` into the single label
# 'uncommon', then one-hot encode the resulting brand column.
brand_grouped = df['brand'].replace(repl, 'uncommon')
pd.get_dummies(brand_grouped)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,0,0,0,0,1,0,0,0,0,0,0,0,0
8124,0,0,0,0,1,0,0,0,0,0,0,0,0
8125,0,0,0,0,0,0,1,0,0,0,0,0,0
8126,0,0,0,0,0,0,0,0,0,1,0,0,0


### **We can use column transformer to avoid such break and append steps and with single line of code we can perform this encoding on all categorical data**
We have done this in the next notebook - the Day 28 ipynb file.