In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/campusx-official/100-days-of-machine-learning/main/day27-one-hot-encoding/cars.csv")

In [3]:
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


### Finding no of unique values for each column

In [5]:
for ele in df.columns:
    print("No of unique values for", ele, df[ele].nunique())
    print(df[ele].value_counts())
    print()
    print()

No of unique values for brand 32
Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: brand, dtype: int64


No of unique values for km_driven 921
120000    536
70000     456
80000     448
60000     425
50000     391
         ... 
59635       1
123219      1
65300       1
39395       1
191000      1
Name: km_driven, Length: 921, dtype: int64


## OneHotEncoding using pandas

In [6]:
pd.get_dummies(df, columns = ["fuel","owner"])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


## K-1 OneHotEncoding

### In order to mitigate the problem of multicollinearity we have to remove the first columns of each feature after one hot encoding

In [7]:
pd.get_dummies(df, columns = ["fuel","owner"], drop_first= True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


### Pandas doesnt remember the sequence of the columns so we cant implement this method in a machine learning project


## OneHotEncoding using ScikitLearn

In [8]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df.drop("selling_price", axis = 1), df["selling_price"], test_size = 0.3, random_state = 0)

In [9]:
x_train

Unnamed: 0,brand,km_driven,fuel,owner
5224,Tata,20000,Petrol,First Owner
520,Maruti,30000,Petrol,First Owner
36,Maruti,15000,Petrol,First Owner
5782,Ford,53000,Diesel,First Owner
6522,Chevrolet,120000,Diesel,First Owner
...,...,...,...,...
4931,Tata,70000,Diesel,Third Owner
3264,Ford,100000,Diesel,Second Owner
1653,Hyundai,90000,Petrol,Second Owner
2607,Volkswagen,90000,Diesel,First Owner


In [23]:
from sklearn.preprocessing import OneHotEncoder
# drop argument is used in order to remove the problem of multicollinearity

ohe = OneHotEncoder(drop = "first" , sparse = False)  ## If we mention sparse = False then the obtained matrix is a normal matrix not sparse matrix

ohe.fit(x_train[["fuel", "owner"]])
x_train_transformed = ohe.transform(x_train[["fuel", "owner"]])
x_test_transformed = ohe.transform(x_test[["fuel", "owner"]])

In [24]:
x_train_transformed.shape

(5689, 7)

### The resulting matrix obtained from onehotencoding is a sparse matrix ,   There may be a situation in which a matrix contains more number of ZERO values than NON-ZERO values. Such matrix is known as sparse matrix.

## Concatenate brand and km_driven with the transformed arrays

In [25]:
x_train_transformed = np.hstack((x_train_transformed, x_train[["brand","km_driven"]].values))

x_test_transformed = np.hstack((x_test_transformed, x_test[["brand","km_driven"]].values))
x_train_transformed.shape

(5689, 9)

## OneHotEncoding of the feature brand(Interesting)

### Over here we are gonna put all those brands having less than 100 cars into a single category called others

In [37]:
counts = df["brand"].value_counts()
type(counts)
counts

Maruti           2448
Hyundai          1415
Mahindra          772
Tata              734
Toyota            488
Honda             467
Ford              397
Chevrolet         230
Renault           228
Volkswagen        186
BMW               120
Skoda             105
Nissan             81
Jaguar             71
Volvo              67
Datsun             65
Mercedes-Benz      54
Fiat               47
Audi               40
Lexus              34
Jeep               31
Mitsubishi         14
Force               6
Land                6
Isuzu               5
Kia                 4
Ambassador          4
Daewoo              3
MG                  3
Ashok               1
Opel                1
Peugeot             1
Name: brand, dtype: int64

In [64]:
counts = pd.DataFrame(counts)
list = counts[counts <= 100].index
list.size
list

Index(['Maruti', 'Hyundai', 'Mahindra', 'Tata', 'Toyota', 'Honda', 'Ford',
       'Chevrolet', 'Renault', 'Volkswagen', 'BMW', 'Skoda', 'Nissan',
       'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi', 'Lexus',
       'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia', 'Ambassador',
       'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object')

In [49]:
pd.get_dummies(df["brand"].replace(list, "others"))

Unnamed: 0,others
0,1
1,1
2,1
3,1
4,1
...,...
8123,1
8124,1
8125,1
8126,1


In [66]:
counts = df['brand'].value_counts()
repl = counts[counts <= 100].index
pd.get_dummies(df['brand'].replace(repl, 'others'))


Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,others
0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,0,0,0,0,1,0,0,0,0,0,0,0,0
8124,0,0,0,0,1,0,0,0,0,0,0,0,0
8125,0,0,0,0,0,0,1,0,0,0,0,0,0
8126,0,0,0,0,0,0,0,0,0,1,0,0,0


In [65]:
repl

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object')

In [62]:
counts.index.size

32