In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVR
from sklearn.impute import KNNImputer

In [2]:
column_names = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders',  'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price'    ]

In [3]:
data = pd.read_csv('data1.data')
data.columns = column_names

In [4]:
data

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
1,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
2,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950
3,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450
4,2,?,audi,gas,std,two,sedan,fwd,front,99.8,...,136,mpfi,3.19,3.40,8.5,110,5500,19,25,15250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845
200,-1,95,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045
201,-1,95,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485
202,-1,95,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470


replacing all the ? to NaN

In [5]:
data = data.replace('?', np.NaN)

In [6]:
data.isnull().sum()

symboling             0
normalized-losses    40
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

Checking the data description

In [7]:
data.describe()

Unnamed: 0,symboling,wheel-base,length,width,height,curb-weight,engine-size,compression-ratio,city-mpg,highway-mpg
count,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0
mean,0.823529,98.806373,174.075,65.916667,53.74902,2555.602941,126.892157,10.148137,25.240196,30.769608
std,1.239035,5.994144,12.362123,2.146716,2.424901,521.96082,41.744569,3.981,6.551513,6.898337
min,-2.0,86.6,141.1,60.3,47.8,1488.0,61.0,7.0,13.0,16.0
25%,0.0,94.5,166.3,64.075,52.0,2145.0,97.0,8.575,19.0,25.0
50%,1.0,97.0,173.2,65.5,54.1,2414.0,119.5,9.0,24.0,30.0
75%,2.0,102.4,183.2,66.9,55.5,2939.25,142.0,9.4,30.0,34.5
max,3.0,120.9,208.1,72.3,59.8,4066.0,326.0,23.0,49.0,54.0


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204 entries, 0 to 203
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          204 non-null    int64  
 1   normalized-losses  164 non-null    object 
 2   make               204 non-null    object 
 3   fuel-type          204 non-null    object 
 4   aspiration         204 non-null    object 
 5   num-of-doors       202 non-null    object 
 6   body-style         204 non-null    object 
 7   drive-wheels       204 non-null    object 
 8   engine-location    204 non-null    object 
 9   wheel-base         204 non-null    float64
 10  length             204 non-null    float64
 11  width              204 non-null    float64
 12  height             204 non-null    float64
 13  curb-weight        204 non-null    int64  
 14  engine-type        204 non-null    object 
 15  num-of-cylinders   204 non-null    object 
 16  engine-size        204 non

Checking out the make feature

In [9]:
data['make'].nunique()

22

The make features has 22 unique values and they are categorical in nature, it should be converted into numerical format but curse of dimensionality should also be avoided so not all the 22 values will be in the dataset

In [10]:
data['make'].value_counts()

toyota           32
nissan           18
mazda            17
honda            13
mitsubishi       13
subaru           12
volkswagen       12
peugot           11
volvo            11
dodge             9
mercedes-benz     8
bmw               8
audi              7
plymouth          7
saab              6
porsche           5
isuzu             4
chevrolet         3
jaguar            3
alfa-romero       2
renault           2
mercury           1
Name: make, dtype: int64

In [11]:
#Selection the 10 most frequent car make.
make_10 = data['make'].value_counts().head(10)
make_10 = list(make_10.index)

In [12]:
for sample in data['make']: 
    for value in make_10: 
        data[value] = np.where(sample == value, 1, 0)
        


In [13]:
#Dropping the make column

data = data.drop(columns = ['make'])

In [14]:
data['normalized-losses'] = pd.to_numeric(data['normalized-losses'])

The fuel type columns has only two values, gas and diesel, trying to convert it them to numeric values

In [15]:
data['fuel-type'] = data['fuel-type'].map({'gas': 0, 'diesel': 1})

Checking out the aspiration column, the aspiration column has two values which are 'std' and 'turbo'

In [16]:
data['aspiration'] = data['aspiration'].map({'std': 0, 'turbo':1})

In [17]:
data['aspiration'].value_counts()

0    167
1     37
Name: aspiration, dtype: int64

Checking the num-of-doors column

In [18]:
data['num-of-doors'] = data['num-of-doors'].map({'two': 0, 'four': 1})

Checking out the body style of the vehicle

In [19]:
#Encoding the unique values of the body-style feature
body_style_map = {'convertible': 0, 'hatchback': 1, 'sedan': 2, 'wagon':3, 'hardtop':4}
data['body-style'] = data['body-style'].map(body_style_map)

Checking the drive-wheels feature.

In [20]:
#Encoding the drive-wheels feature
mapping = {'rwd' : 0, 'fwd': 1, '4wd' : 2}
data['drive-wheels'] = data['drive-wheels'].map(mapping) 

Checking out the engine-location feature

In [21]:
mapping = {'front' : 0, 'rear' : 1}
data['engine-location'] = data['engine-location'].map(mapping)

Checking out the fuel-system feature

In [22]:
#Encoding the fuel-system feature.
mapping = {'mpfi': 0, '2bbl': 1, 'mfi': 2, '1bbl':3, 'spfi':4, '4bbl' :5, 'idi':6, 'spdi':7}
data['fuel-system'] = data['fuel-system'].map(mapping)

Converting the following features to numeric data form

In [23]:
#Converting the bore, stroke,  numeric
data['bore'] = pd.to_numeric(data['bore'])
data['stroke'] = pd.to_numeric(data['stroke'])
data['price'] = pd.to_numeric(data['price'])
data['horsepower'] = pd.to_numeric(data['horsepower'])
data['peak-rpm'] = pd.to_numeric(data['peak-rpm'])

Converting the number-of-cylinders  column  to numeric

In [24]:
mapping = {'four':4, 'six':6, 'five':5, 'three':3, 'twelve':12, 'two':2, 'eight':8}
data['num-of-cylinders'] = data['num-of-cylinders'].map(mapping)

Converting the engine-type column into a numeric form

In [25]:
#Encoding the engine-type column
mapping = {'dohc':0, 'ohcv':1, 'ohc':2, 'l':3, 'rotor':4, 'ohcf':5, 'dohcv':6}
data['engine-type'] = data['engine-type'].map(mapping)

Splitting the data into training and testing test.

In [27]:
X.shape, y.shape

((204, 34), (204,))

In [28]:
X_train,X_test, y_train, y_test = train_test_split(X,y, test_size = .3, random_state = 10, stratify  = y)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

Handling all the missing values using KNNImputer

In [None]:
imputer = KNNImputer()

In [None]:
columns = data.columns
data = imputer.fit_transform(data)

In [None]:
data.shape, len(columns)

In [None]:
data = pd.DataFrame(data)
data.columns = columns
data.head(5)

Confirming that there are no null values present in the data

In [None]:
data.isna().sum()

Splitting the data into dependent and independent variables.

In [29]:
X = data.drop(columns = ['price'])
y = data['price']

In [None]:
data['engine-location'].value_counts()

In [None]:
data.info()