In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Load the car listings from the CSV file (path is relative to the notebook's working directory).
df = pd.read_csv('cars.csv')
# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [24]:
# Number of rows per brand, ordered from the most frequent brand to the least frequent.
counts = df['brand'].value_counts()

In [21]:
# Boolean mask indexed by brand: True where the brand has fewer than 100 rows.
less_than_hun = counts.lt(100)

brand
Maruti           False
Hyundai          False
Mahindra         False
Tata             False
Toyota           False
Honda            False
Ford             False
Chevrolet        False
Renault          False
Volkswagen       False
BMW              False
Skoda            False
Nissan            True
Jaguar            True
Volvo             True
Datsun            True
Mercedes-Benz     True
Fiat              True
Audi              True
Lexus             True
Jeep              True
Mitsubishi        True
Force             True
Land              True
Isuzu             True
Kia               True
Ambassador        True
Daewoo            True
MG                True
Ashok             True
Opel              True
Peugeot           True
Name: count, dtype: bool

In [34]:
# Keep only the brands flagged by the mask and take their names;
# these are the brands that will be merged into a single bucket.
other_cars = counts.loc[less_than_hun].index
other_cars

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object', name='brand')

In [46]:
# Relabel the rare brands as 'others' on a copy of the column (df itself is not
# modified here), then one-hot encode the result into 0/1 integer columns.
(
    df['brand']
    .replace(other_cars, 'others')
    .pipe(pd.get_dummies, dtype=int)
)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,others
0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,0,0,0,0,1,0,0,0,0,0,0,0,0
8124,0,0,0,0,1,0,0,0,0,0,0,0,0
8125,0,0,0,0,0,0,1,0,0,0,0,0,0
8126,0,0,0,0,0,0,0,0,0,1,0,0,0


# Using scikit-learn

### First step: group all the brands with fewer than 100 cars into a new category named 'others'.

In [49]:
# Recompute the number of rows per brand for the scikit-learn section.
counts = df['brand'].value_counts()

In [55]:
# Names of the brands that occur in fewer than 100 rows.
# Note: unlike the boolean mask built earlier under the same name, this is an Index of labels.
less_than_hun = counts.loc[counts.lt(100)].index
less_than_hun

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object', name='brand')

In [66]:
# Inspect the brand column as it is before the rare brands are merged.
df['brand']

0        Maruti
1         Skoda
2         Honda
3       Hyundai
4        Maruti
         ...   
8123    Hyundai
8124    Hyundai
8125     Maruti
8126       Tata
8127       Tata
Name: brand, Length: 8128, dtype: object

In [81]:
# Merge every brand listed in `less_than_hun` into a single 'others' category.
# The result is assigned back to the column rather than calling
# `.replace(..., inplace=True)` on `df['brand']`: that form is a chained in-place
# modification of a column selection, which pandas warns about and which leaves
# `df` unchanged when Copy-on-Write is enabled.
# Re-running this cell is harmless: once replaced, none of the rare labels remain.
df['brand'] = df['brand'].replace(list(less_than_hun), 'others')
df['brand'].value_counts()

# All brands with fewer than 100 rows have now been replaced by 'others'.

brand
Maruti        2448
Hyundai       1415
Mahindra       772
Tata           734
others         538
Toyota         488
Honda          467
Ford           397
Chevrolet      230
Renault        228
Volkswagen     186
BMW            120
Skoda          105
Name: count, dtype: int64

### Now we use scikit-learn for one-hot encoding

In [82]:
from sklearn.model_selection import train_test_split

# Features: the first four columns (brand, km_driven, fuel, owner).
# Target: the last column (selling_price).
# 20% of the rows are held out for testing. `random_state` fixes the shuffle so
# the split, and every output derived from it, is the same on each run.
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, 0:4],
    df.iloc[:, -1],
    test_size=0.2,
    random_state=42,
)

In [83]:
# Show five randomly chosen training rows (no seed is passed, so the rows differ between runs).
x_train.sample(5)

Unnamed: 0,brand,km_driven,fuel,owner
2787,Ford,150000,Diesel,First Owner
1602,Mahindra,56000,Diesel,First Owner
281,Honda,33033,Petrol,First Owner
7807,others,61000,Diesel,Second Owner
3900,Hyundai,29029,Petrol,First Owner


In [89]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encoder configuration:
#   drop='first'        -> drop the first category of each feature
#   dtype=int           -> emit integer 0/1 values
#   sparse_output=False -> return a dense NumPy array instead of a sparse matrix
ohe = OneHotEncoder(
    sparse_output=False,
    dtype=int,
    drop='first',
)

# Learn the brand categories from the training rows and encode them in one step.
x_train_new = ohe.fit_transform(x_train.loc[:, ['brand']])

In [90]:
# Encode the test brands with the encoder already fitted on the training rows
# (transform only, so no categories are learned from the test set).
x_test_new = ohe.transform(x_test[['brand']])

In [91]:
# Inspect the encoded training brands (a dense integer array).
x_train_new

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]])

In [96]:
# Every training column except the first one (brand), as a NumPy array.
# The columns mix numbers and strings, so the array has object dtype.
x_train_remaining = x_train.iloc[:, 1:].to_numpy()
x_train_remaining

array([[120000, 'Diesel', 'Third Owner'],
       [100000, 'Petrol', 'Second Owner'],
       [110000, 'Diesel', 'Second Owner'],
       ...,
       [70000, 'Petrol', 'Second Owner'],
       [41000, 'Diesel', 'First Owner'],
       [150000, 'Diesel', 'First Owner']], dtype=object)

In [98]:
# Place the one-hot brand columns to the left of the remaining feature columns.
# Both inputs are 2-D, so joining along axis 1 stacks them column-wise.
np.concatenate((x_train_new, x_train_remaining), axis=1)

array([[0, 0, 0, ..., 120000, 'Diesel', 'Third Owner'],
       [0, 0, 0, ..., 100000, 'Petrol', 'Second Owner'],
       [0, 0, 0, ..., 110000, 'Diesel', 'Second Owner'],
       ...,
       [0, 0, 0, ..., 70000, 'Petrol', 'Second Owner'],
       [0, 0, 0, ..., 41000, 'Diesel', 'First Owner'],
       [0, 0, 0, ..., 150000, 'Diesel', 'First Owner']], dtype=object)

### Now applying one-hot encoding to all the categorical columns (i.e. brand, fuel, owner)

In [104]:
# Encode the three categorical features together.
# fit_transform refits `ohe`, so from here on it holds the categories of all
# three columns rather than the brand-only fit from the earlier cell.
categorical_cols = ['brand', 'fuel', 'owner']
x_train_encoded = ohe.fit_transform(x_train[categorical_cols])
# The test rows are only transformed, using the categories learned from the training rows.
x_test_encoded = ohe.transform(x_test[categorical_cols])

In [105]:
# Inspect the encoded training features for brand, fuel and owner.
x_train_encoded

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [106]:
# (number of training rows, number of one-hot columns produced by the encoder)
x_train_encoded.shape

(6502, 19)

In [107]:
# Inspect the encoded test features for brand, fuel and owner.
x_test_encoded

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])