# Data Preprocessing

*Importing the libraries*

In [13]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

*Importing the dataset*

In [14]:
# Load the car specification table from a CSV file addressed by a relative path.
dataset = pd.read_csv('cars_engage_2022.csv')

*Printing dataset*

In [15]:
# Print the plain-text representation of the loaded frame.
print(dataset)

      Unnamed: 0        Make      Model        Variant Ex-Showroom_Price  \
0              0        Tata  Nano Genx             Xt      Rs. 2,92,667   
1              1        Tata  Nano Genx             Xe      Rs. 2,36,447   
2              2        Tata  Nano Genx        Emax Xm      Rs. 2,96,661   
3              3        Tata  Nano Genx            Xta      Rs. 3,34,768   
4              4        Tata  Nano Genx             Xm      Rs. 2,72,223   
...          ...         ...        ...            ...               ...   
1271        1271       Honda       City   Vx Mt Diesel     Rs. 13,02,000   
1272        1272       Honda       City   Zx Mt Diesel     Rs. 14,21,000   
1273        1273       Honda       City  Zx Cvt Petrol     Rs. 14,31,000   
1274        1274       Honda       City   V Cvt Petrol     Rs. 12,01,000   
1275        1275  Mitsubishi    Montero         3.2 At     Rs. 68,62,560   

     Displacement  Cylinders  Valves_Per_Cylinder               Drivetrain  \
0        

In [16]:
# Show the column labels of the loaded frame.
dataset.columns

Index(['Unnamed: 0', 'Make', 'Model', 'Variant', 'Ex-Showroom_Price',
       'Displacement', 'Cylinders', 'Valves_Per_Cylinder', 'Drivetrain',
       'Cylinder_Configuration',
       ...
       'Leather_Wrapped_Steering', 'Automatic_Headlamps', 'Engine_Type',
       'ASR_/_Traction_Control', 'Cruise_Control', 'USB_Ports',
       'Heads-Up_Display', 'Welcome_Lights', 'Battery', 'Electric_Range'],
      dtype='object', length=141)

*Filling out the unknown string values*

In [17]:
# Replace missing entries in the text columns below with the literal
# string 'NaN' so that no true null values remain in them.

text_columns = (
    'Emission_Norm',
    'Power',
    'Torque',
    'Start_/_Stop_Button',
    '12v_Power_Outlet',
    'Basic_Warranty',
)

for text_column in text_columns:
    dataset[text_column] = dataset[text_column].fillna('NaN')

*Removing the unwanted units*

In [18]:
# Strip currency markers, thousands separators and unit suffixes so the
# remaining text can be parsed as numbers in the next step.
#
# Every pattern is passed with regex=False so it is matched as a literal
# substring. Without it the result depends on the pandas version: older
# releases interpret the pattern as a regular expression, where the "." in
# "Rs." matches any character and "?" is a regex metacharacter.

unit_tokens = {
    'Ex-Showroom_Price': ("Rs.", ","),
    'Displacement': ("cc",),
    'City_Mileage': ("km/litre", "?"),
    'Highway_Mileage': ("km/litre",),
    'ARAI_Certified_Mileage': ("km/litre",),
    'Kerb_Weight': ("kg",),
    'Boot_Space': ("litres",),
}

for unit_column, tokens in unit_tokens.items():
    stripped = dataset[unit_column]
    # Remove the tokens one after another, in the order listed above.
    for token in tokens:
        stripped = stripped.str.replace(token, "", regex=False)
    dataset[unit_column] = stripped

  This is separate from the ipykernel package so we can avoid doing imports until
  """


*Converting strings to numeric values*

In [19]:
# Parse the cleaned text columns into numeric dtype. With errors='coerce'
# any entry that cannot be parsed becomes NaN instead of raising.
columns_to_parse = [
    'Ex-Showroom_Price',
    'Displacement',
    'City_Mileage',
    'Highway_Mileage',
    'ARAI_Certified_Mileage',
    'Kerb_Weight',
    'Boot_Space',
]

for parse_column in columns_to_parse:
    dataset[parse_column] = pd.to_numeric(dataset[parse_column], errors='coerce')

*Filling numeric column blanks*

In [20]:
# Impute missing numeric values: the columns treated as integers get their
# median, the columns treated as floats get their mean.

median_filled = [
    'Ex-Showroom_Price',
    'Displacement',
    'Cylinders',
    'Valves_Per_Cylinder',
]
mean_filled = [
    'City_Mileage',
    'Highway_Mileage',
    'ARAI_Certified_Mileage',
    'Kerb_Weight',
    'Boot_Space',
]

for median_column in median_filled:
    values = dataset[median_column]
    dataset[median_column] = values.fillna(values.median())

for mean_column in mean_filled:
    values = dataset[mean_column]
    dataset[mean_column] = values.fillna(values.mean())

*Label encoding*

In [21]:
# Encode the true/false style feature columns as integer class labels.

from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()

# The single encoder is refit for each column in turn, so after the loop
# LE holds only the classes learned from the last column listed.
# NOTE(review): 'Cruise_Control' is not among the columns filled with 'NaN'
# earlier in the notebook - confirm it contains no missing values.
for flag_column in ('Start_/_Stop_Button', '12v_Power_Outlet', 'Cruise_Control'):
    dataset[flag_column] = LE.fit_transform(dataset[flag_column])

*Encoding*

In [22]:
# One-hot encode the three identifying text columns.
X = dataset[['Make', 'Model', 'Variant']]

encoder = OneHotEncoder(handle_unknown='ignore')

# Fit and transform exactly once; the sparse result is densified for the frame.
encoded = encoder.fit_transform(X)

# Reuse X's index so the join below (and any later join) aligns row by row
# even if the frame's index is not the default 0..n-1 range.
Y = pd.DataFrame(encoded.toarray(), index=X.index)

# Displayed as the cell output only; the joined frame is not stored.
X.join(Y)

Unnamed: 0,Make,Model,Variant,0,1,2,3,4,5,6,...,1357,1358,1359,1360,1361,1362,1363,1364,1365,1366
0,Tata,Nano Genx,Xt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Tata,Nano Genx,Xe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Tata,Nano Genx,Emax Xm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Tata,Nano Genx,Xta,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Tata,Nano Genx,Xm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1271,Honda,City,Vx Mt Diesel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1272,Honda,City,Zx Mt Diesel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1273,Honda,City,Zx Cvt Petrol,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1274,Honda,City,V Cvt Petrol,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


*Making final list*

In [23]:
# Collect the cleaned feature columns, in this order, into one frame.
final_columns = [
    'Ex-Showroom_Price',
    'Displacement',
    'Cylinders',
    'Valves_Per_Cylinder',
    'Emission_Norm',
    'City_Mileage',
    'Highway_Mileage',
    'ARAI_Certified_Mileage',
    'Kerb_Weight',
    'Boot_Space',
    'Basic_Warranty',
    'Power',
    'Torque',
    'Start_/_Stop_Button',
    '12v_Power_Outlet',
    'Cruise_Control',
]
final = [dataset[column_name] for column_name in final_columns]

listF = pd.concat(final, axis=1)

# Displayed as the cell output only: the joined frame is not assigned, so
# listF itself does not contain the one-hot columns from Y.
# NOTE(review): confirm whether the joined frame was meant to be kept.
listF.join(Y)

Unnamed: 0,Ex-Showroom_Price,Displacement,Cylinders,Valves_Per_Cylinder,Emission_Norm,City_Mileage,Highway_Mileage,ARAI_Certified_Mileage,Kerb_Weight,Boot_Space,...,1357,1358,1359,1360,1361,1362,1363,1364,1365,1366
0,292667,624.0,2.0,2.0,BS IV,23.600000,16.901326,23.600000,660.0,110.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,236447,624.0,2.0,2.0,BS IV,23.600000,16.901326,23.600000,725.0,110.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,296661,624.0,2.0,2.0,BS IV,53.607176,16.901326,19.917351,710.0,110.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,334768,624.0,2.0,2.0,BS IV,23.600000,16.901326,21.900000,725.0,94.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,272223,624.0,2.0,2.0,BS IV,23.600000,16.901326,23.600000,725.0,110.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1271,1302000,1498.0,4.0,4.0,BS IV,22.600000,25.100000,25.100000,1170.0,510.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1272,1421000,1498.0,4.0,4.0,BS IV,22.600000,25.100000,25.100000,1175.0,510.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1273,1431000,1497.0,4.0,4.0,BS 6,18.000000,18.000000,22.600000,1107.0,510.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1274,1201000,1497.0,4.0,4.0,BS 6,14.300000,16.400000,17.800000,1080.0,510.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


***RESULT***

In [24]:
# Print the assembled feature frame. The earlier listF.join(Y) result was
# not assigned, so the one-hot columns are not part of listF here.
print(listF)

      Ex-Showroom_Price  Displacement  Cylinders  Valves_Per_Cylinder  \
0                292667         624.0        2.0                  2.0   
1                236447         624.0        2.0                  2.0   
2                296661         624.0        2.0                  2.0   
3                334768         624.0        2.0                  2.0   
4                272223         624.0        2.0                  2.0   
...                 ...           ...        ...                  ...   
1271            1302000        1498.0        4.0                  4.0   
1272            1421000        1498.0        4.0                  4.0   
1273            1431000        1497.0        4.0                  4.0   
1274            1201000        1497.0        4.0                  4.0   
1275            6862560        3200.0        4.0                  4.0   

     Emission_Norm  City_Mileage  Highway_Mileage  ARAI_Certified_Mileage  \
0            BS IV     23.600000        16.901