In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the used-car dataset from the working directory; later cells show the
# columns brand, km_driven, fuel, owner and selling_price.
df = pd.read_csv('cars.csv')

In [13]:
# Peek at one randomly chosen row (sample() defaults to a single row and is
# unseeded here, so the row shown changes on every run).
print(df.sample())

       brand  km_driven    fuel        owner  selling_price
6374  Maruti      15000  Petrol  First Owner         400000


In [5]:
# Frequency of each ownership category; 'owner' is one of the columns encoded below.
print(df['owner'].value_counts())

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64


## 1. OneHotEncoding using Pandas


In [16]:
# One-hot encode both categorical columns, keeping every level and giving the
# new indicator columns short prefixes (F_ for fuel, O_ for owner).
print(
    pd.get_dummies(
        df,
        columns=['fuel', 'owner'],
        prefix={'fuel': 'F', 'owner': 'O'},
    )
)

        brand  km_driven  selling_price  F_CNG  F_Diesel  F_LPG  F_Petrol  \
0      Maruti     145500         450000  False      True  False     False   
1       Skoda     120000         370000  False      True  False     False   
2       Honda     140000         158000  False     False  False      True   
3     Hyundai     127000         225000  False      True  False     False   
4      Maruti     120000         130000  False     False  False      True   
...       ...        ...            ...    ...       ...    ...       ...   
8123  Hyundai     110000         320000  False     False  False      True   
8124  Hyundai     119000         135000  False      True  False     False   
8125   Maruti     120000         382000  False      True  False     False   
8126     Tata      25000         290000  False      True  False     False   
8127     Tata      25000         290000  False      True  False     False   

      O_First Owner  O_Fourth & Above Owner  O_Second Owner  O_Test Drive C

## 2. K-1 OneHotEncoding


In [27]:
# k-1 encoding: drop the first level of each encoded column. Keeping all k
# indicator columns would make them perfectly collinear (they always sum to 1),
# which is the multicollinearity problem known as the dummy-variable trap.
print(
    pd.get_dummies(
        df,
        drop_first=True,
        columns=['fuel', 'owner'],
    )
)

        brand  km_driven  selling_price  fuel_Diesel  fuel_LPG  fuel_Petrol  \
0      Maruti     145500         450000         True     False        False   
1       Skoda     120000         370000         True     False        False   
2       Honda     140000         158000        False     False         True   
3     Hyundai     127000         225000         True     False        False   
4      Maruti     120000         130000        False     False         True   
...       ...        ...            ...          ...       ...          ...   
8123  Hyundai     110000         320000        False     False         True   
8124  Hyundai     119000         135000         True     False        False   
8125   Maruti     120000         382000         True     False        False   
8126     Tata      25000         290000         True     False        False   
8127     Tata      25000         290000         True     False        False   

      owner_Fourth & Above Owner  owner_Second Owne

## 3. OneHotEncoding using Sklearn


In [17]:
from sklearn.model_selection import train_test_split

# Features are the four descriptive columns; the target is the selling price.
# Hold out 20% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    df[['brand', 'km_driven', 'fuel', 'owner']],
    df['selling_price'],
    random_state=2,
    test_size=0.2,
)

In [18]:
# Inspect the first rows of the training features (the target column is not included).
print(X_train.head())

         brand  km_driven    fuel         owner
5571   Hyundai      35000  Diesel   First Owner
2038      Jeep      60000  Diesel   First Owner
2957   Hyundai      25000  Petrol   First Owner
7618  Mahindra     130000  Diesel  Second Owner
6684   Hyundai     155000  Diesel   First Owner


In [19]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Encoder configured for k-1 encoding (first level of each feature dropped),
# emitting 32-bit integer indicators in a SciPy sparse matrix.
ohe = OneHotEncoder(
    sparse_output=True,
    dtype=np.int32,
    drop='first',
)

In [32]:
# Learn the category levels from the training split only, and encode that split.
X_train_new = ohe.fit_transform(X_train.loc[:, ['fuel', 'owner']])

In [33]:
# Encode the test split with the levels already learned from the training data
# (transform only, no refitting).
X_test_new = ohe.transform(X_test.loc[:, ['fuel', 'owner']])

In [34]:
# Rows x encoded columns. With drop='first', the 4 fuel levels yield 3 columns
# and the 5 owner levels yield 4, giving 7 in total.
print(X_train_new.shape)

(6502, 7)


In [36]:
# Recombine the untouched columns with the encoded ones into a single feature
# array. X_train_new is a SciPy sparse matrix (the encoder was created with
# sparse_output=True) and np.hstack cannot concatenate a sparse matrix with a
# dense array, so it is converted to a dense array first.
print(np.hstack((X_train[['brand', 'km_driven']].values, X_train_new.toarray())))

## 4. OneHotEncoding with Top Categories


In [37]:
# Number of listings per brand, most frequent first; reused below to find rare brands.
counts = df['brand'].value_counts()
print(counts)

brand
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Land                6
Force               6
Isuzu               5
Ambassador          4
Kia                 4
MG                  3
Daewoo              3
Ashok               1
Opel                1
Peugeot             1
Name: count, dtype: int64


In [38]:
# Show how many distinct brands exist. Printed explicitly because a notebook
# only auto-displays the last expression of a cell, and this is not the last line.
print(df['brand'].nunique())

# Brands with at most this many listings are treated as rare and grouped together.
threshold = 100

In [40]:
# Brands whose listing count does not exceed the threshold; these get merged
# into a single bucket before encoding.
repl = counts.loc[counts.le(threshold)].index
repl

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Land', 'Force', 'Isuzu', 'Ambassador',
       'Kia', 'MG', 'Daewoo', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object', name='brand')

In [41]:
# Collapse every rare brand into a single 'uncommon' category, one-hot encode
# the result, and preview five randomly chosen rows of the indicator columns.
print(
    df['brand']
    .replace(repl, 'uncommon')
    .pipe(pd.get_dummies)
    .sample(5)
)

        BMW  Chevrolet   Ford  Honda  Hyundai  Mahindra  Maruti  Renault  \
6004  False      False  False  False     True     False   False    False   
6989  False      False  False  False    False     False   False    False   
6550  False      False  False   True    False     False   False    False   
3641  False      False  False  False    False     False    True    False   
6824  False      False  False  False     True     False   False    False   

      Skoda   Tata  Toyota  Volkswagen  uncommon  
6004  False  False   False       False     False  
6989  False  False    True       False     False  
6550  False  False   False       False     False  
3641  False  False   False       False     False  
6824  False  False   False       False     False  
