In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Problem statement
# To predict price of cars by using various features

## Data Gathering 

In [3]:
df = pd.read_csv(r'C:\Users\Kishor\OneDrive\Desktop\CSV\autos_dataset.csv')
df.head().T

Unnamed: 0,0,1,2,3,4
symboling,3,3,1,2,2
normalized-losses,?,?,?,164,164
make,alfa-romero,alfa-romero,alfa-romero,audi,audi
fuel-type,gas,gas,gas,gas,gas
aspiration,std,std,std,std,std
num-of-doors,two,two,two,four,four
body-style,convertible,convertible,hatchback,sedan,sedan
drive-wheels,rwd,rwd,rwd,fwd,4wd
engine-location,front,front,front,front,front
wheel-base,88.6,88.6,94.5,99.8,99.4


In [4]:
# Exploratory Data Analysis

In [5]:
df.isna().sum()

symboling            0
normalized-losses    0
make                 0
fuel-type            0
aspiration           0
num-of-doors         0
body-style           0
drive-wheels         0
engine-location      0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-type          0
num-of-cylinders     0
engine-size          0
fuel-system          0
bore                 0
stroke               0
compression-ratio    0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

In [7]:
df['symboling']

0      3
1      3
2      1
3      2
4      2
      ..
200   -1
201   -1
202   -1
203   -1
204   -1
Name: symboling, Length: 205, dtype: int64

In [8]:
df['normalized-losses']

0        ?
1        ?
2        ?
3      164
4      164
      ... 
200     95
201     95
202     95
203     95
204     95
Name: normalized-losses, Length: 205, dtype: object

In [9]:
df['normalized-losses'].replace({'?':np.nan},inplace = True)

In [10]:
df.replace({'?':np.nan},inplace = True)

In [11]:
df.isna().sum()

symboling             0
normalized-losses    41
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engine-size           0
fuel-system           0
bore                  4
stroke                4
compression-ratio     0
horsepower            2
peak-rpm              2
city-mpg              0
highway-mpg           0
price                 4
dtype: int64

In [12]:
df.isna().mean() * 100

symboling             0.00000
normalized-losses    20.00000
make                  0.00000
fuel-type             0.00000
aspiration            0.00000
num-of-doors          0.97561
body-style            0.00000
drive-wheels          0.00000
engine-location       0.00000
wheel-base            0.00000
length                0.00000
width                 0.00000
height                0.00000
curb-weight           0.00000
engine-type           0.00000
num-of-cylinders      0.00000
engine-size           0.00000
fuel-system           0.00000
bore                  1.95122
stroke                1.95122
compression-ratio     0.00000
horsepower            0.97561
peak-rpm              0.97561
city-mpg              0.00000
highway-mpg           0.00000
price                 1.95122
dtype: float64

In [13]:
df['normalized-losses'] = df['normalized-losses'].astype(float)
df['normalized-losses'].mean()

122.0

In [14]:
df['normalized-losses'].median()

115.0

In [15]:
df['normalized-losses'] = df['normalized-losses'].fillna(df['normalized-losses'].median()).astype(int)
df['normalized-losses']

0      115
1      115
2      115
3      164
4      164
      ... 
200     95
201     95
202     95
203     95
204     95
Name: normalized-losses, Length: 205, dtype: int32

In [16]:
df['make'].value_counts()

toyota           32
nissan           18
mazda            17
mitsubishi       13
honda            13
volkswagen       12
subaru           12
peugot           11
volvo            11
dodge             9
mercedes-benz     8
bmw               8
audi              7
plymouth          7
saab              6
porsche           5
isuzu             4
alfa-romero       3
chevrolet         3
jaguar            3
renault           2
mercury           1
Name: make, dtype: int64

In [17]:
df['fuel-type'].replace({'gas':1,'diesel':0},inplace = True)
df['fuel-type'].value_counts()

1    185
0     20
Name: fuel-type, dtype: int64

In [18]:
fuel_type_values = {'gas':1,'diesel':0}

In [19]:
df['fuel-type']

0      1
1      1
2      1
3      1
4      1
      ..
200    1
201    1
202    1
203    0
204    1
Name: fuel-type, Length: 205, dtype: int64

In [20]:
df['aspiration'].value_counts().to_dict()

{'std': 168, 'turbo': 37}

In [21]:
df['aspiration'].replace({'std': 0, 'turbo': 1},inplace = True)

In [22]:
df['num-of-doors'].value_counts().to_dict()

{'four': 114, 'two': 89}

In [23]:
df['num-of-doors'].replace({'four': 4, 'two': 2},inplace= True)

In [24]:
df['num-of-doors'].fillna(df['num-of-doors'].mode()[0],inplace = True)
df['num-of-doors'].unique()

array([2., 4.])

In [25]:
num_of_doors_values = {'four': 4, 'two': 2}

In [26]:
df['body-style'].value_counts().to_dict()

{'sedan': 96, 'hatchback': 70, 'wagon': 25, 'hardtop': 8, 'convertible': 6}

In [27]:
df['body-style'].replace({'sedan':0,'hatchback':1,'wagon':2,'hardtop':3,'convertible': 4},inplace= True)

In [28]:
df['drive-wheels'].value_counts().to_dict()

{'fwd': 120, 'rwd': 76, '4wd': 9}

In [29]:
df['drive-wheels'].replace({'fwd': 0, 'rwd': 1, '4wd': 2},inplace = True)
df['drive-wheels']

0      1
1      1
2      1
3      0
4      2
      ..
200    1
201    1
202    1
203    1
204    1
Name: drive-wheels, Length: 205, dtype: int64

In [30]:
df['engine-location'].value_counts().to_dict()

{'front': 202, 'rear': 3}

In [31]:
df['engine-location'].replace({'front' :1,'rear':0},inplace = True)

In [32]:
print(df['num-of-cylinders'].value_counts().to_dict())

{'four': 159, 'six': 24, 'five': 11, 'eight': 5, 'two': 4, 'twelve': 1, 'three': 1}


In [33]:
df['num-of-cylinders'].replace({'four': 4, 'six': 6, 'five': 5, 'eight': 8, 
                                'two': 2, 'three': 3, 'twelve': 12},inplace = True)

In [34]:
num_of_cylinders_values = {'four': 4, 'six': 6, 'five': 5, 'eight': 8, 
                                'two': 2, 'three': 3, 'twelve': 12}

In [35]:
df = pd.get_dummies(df,columns=['fuel-system'])
df

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,highway-mpg,price,fuel-system_1bbl,fuel-system_2bbl,fuel-system_4bbl,fuel-system_idi,fuel-system_mfi,fuel-system_mpfi,fuel-system_spdi,fuel-system_spfi
0,3,115,alfa-romero,1,0,2.0,4,1,1,88.6,...,27,13495,0,0,0,0,0,1,0,0
1,3,115,alfa-romero,1,0,2.0,4,1,1,88.6,...,27,16500,0,0,0,0,0,1,0,0
2,1,115,alfa-romero,1,0,2.0,1,1,1,94.5,...,26,16500,0,0,0,0,0,1,0,0
3,2,164,audi,1,0,4.0,0,0,1,99.8,...,30,13950,0,0,0,0,0,1,0,0
4,2,164,audi,1,0,4.0,0,2,1,99.4,...,22,17450,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95,volvo,1,0,4.0,0,1,1,109.1,...,28,16845,0,0,0,0,0,1,0,0
201,-1,95,volvo,1,1,4.0,0,1,1,109.1,...,25,19045,0,0,0,0,0,1,0,0
202,-1,95,volvo,1,0,4.0,0,1,1,109.1,...,23,21485,0,0,0,0,0,1,0,0
203,-1,95,volvo,0,1,4.0,0,1,1,109.1,...,27,22470,0,0,0,1,0,0,0,0


In [36]:

df['bore'] = df['bore'].fillna(df['bore'].median()).astype(float)
df['stroke'] = df['stroke'].fillna(df['stroke'].median()).astype(float)
df['horsepower'] = df['horsepower'].fillna(df['horsepower'].median()).astype(float)
df['peak-rpm'] = df['peak-rpm'].fillna(df['peak-rpm'].median()).astype(float)
df['price'] = df['price'].fillna(df['price'].median()).astype(float)

## Train Test Split

In [37]:
df = df.select_dtypes(exclude=object) 
x = df.drop('price',axis = 1)
y = df['price']

In [38]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=34)

## Model Training

In [39]:
model = LinearRegression()
model.fit(x_train, y_train)

LinearRegression()

In [40]:
model.intercept_

17160.865198541575

In [41]:
# Testing Data Evaluation
y_pred = model.predict(x_test)
y_pred[20:25]

array([10698.94779183, 19647.23945554, 11338.31284649,  7705.53728027,
        7562.17180095])

In [42]:
y_test[20:25]

123     8921.0
178    16558.0
3      13950.0
76      5389.0
79      7689.0
Name: price, dtype: float64

In [43]:
# Testing Data Evaluation
y_pred = model.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
print("MSE :",mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y_test, y_pred)
print("MAE :",mae)

r2 = r2_score(y_test, y_pred)
print('R-Squared :',r2)

MSE : 18362342.536621656
RMSE : 4285.130399021908
MAE : 2707.849253354722
R-Squared : 0.6499375798612054


In [44]:
# Training Data Evaluation

y_pred_train = model.predict(x_train)
mse = mean_squared_error(y_train, y_pred_train)
print("MSE :",mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

mae = mean_absolute_error(y_train, y_pred_train)
print("MAE :",mae)

r2 = r2_score(y_train, y_pred_train)
print('R-Squared :',r2)

MSE : 7439192.365098531
RMSE : 2727.488288718859
MAE : 1905.8033155826952
R-Squared : 0.8838914278897249


## Testing on single row

In [45]:
model.n_features_in_

30

In [46]:
x.columns

Index(['symboling', 'normalized-losses', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight',
       'num-of-cylinders', 'engine-size', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'fuel-system_1bbl', 'fuel-system_2bbl',
       'fuel-system_4bbl', 'fuel-system_idi', 'fuel-system_mfi',
       'fuel-system_mpfi', 'fuel-system_spdi', 'fuel-system_spfi'],
      dtype='object')

In [47]:
fuel_type_values = {'gas':1,'diesel':0}
num_of_doors_values = {'four': 4, 'two': 2}
num_of_cylinders_values = {'four': 4, 'six': 6, 'five': 5, 'eight': 8, 
                                'two': 2, 'three': 3, 'twelve': 12}

In [48]:
df

Unnamed: 0,symboling,normalized-losses,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,highway-mpg,price,fuel-system_1bbl,fuel-system_2bbl,fuel-system_4bbl,fuel-system_idi,fuel-system_mfi,fuel-system_mpfi,fuel-system_spdi,fuel-system_spfi
0,3,115,1,0,2.0,4,1,1,88.6,168.8,...,27,13495.0,0,0,0,0,0,1,0,0
1,3,115,1,0,2.0,4,1,1,88.6,168.8,...,27,16500.0,0,0,0,0,0,1,0,0
2,1,115,1,0,2.0,1,1,1,94.5,171.2,...,26,16500.0,0,0,0,0,0,1,0,0
3,2,164,1,0,4.0,0,0,1,99.8,176.6,...,30,13950.0,0,0,0,0,0,1,0,0
4,2,164,1,0,4.0,0,2,1,99.4,176.6,...,22,17450.0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95,1,0,4.0,0,1,1,109.1,188.8,...,28,16845.0,0,0,0,0,0,1,0,0
201,-1,95,1,1,4.0,0,1,1,109.1,188.8,...,25,19045.0,0,0,0,0,0,1,0,0
202,-1,95,1,0,4.0,0,1,1,109.1,188.8,...,23,21485.0,0,0,0,0,0,1,0,0
203,-1,95,0,1,4.0,0,1,1,109.1,188.8,...,27,22470.0,0,0,0,1,0,0,0,0


In [49]:
symboling = 3.00
normalized_losses = 134.00
fuel_type = 'gas'
aspiration = 0.00
num_of_doors = 'two'
drive_wheels = 1.00
engine_location = 1.00
wheel_base = 88.60
length = 168.80
width = 64.10
height = 48.80
curb_weight = 2458.00
num_of_cylinders = 'four'
engine_size = 150.00
bore = 4.47
stroke = 2.78
compression_ratio = 9.00
horsepower = 121.00
peak_rpm = 5000.00
city_mpg = 20.00
highway_mpg = 26.00
body_style = "sedan"
engine_type= "ohc"
fuel_system = "mfi"

In [50]:
label_encoded_columns = {"Num_of_Doors" :num_of_doors_values,
                       "num_of_cylinders_values" :num_of_cylinders_values,
                        "fuel_type_values":fuel_type_values,
                        "Columns" : list(x.columns)}
label_encoded_columns

{'Num_of_Doors': {'four': 4, 'two': 2},
 'num_of_cylinders_values': {'four': 4,
  'six': 6,
  'five': 5,
  'eight': 8,
  'two': 2,
  'three': 3,
  'twelve': 12},
 'fuel_type_values': {'gas': 1, 'diesel': 0},
 'Columns': ['symboling',
  'normalized-losses',
  'fuel-type',
  'aspiration',
  'num-of-doors',
  'body-style',
  'drive-wheels',
  'engine-location',
  'wheel-base',
  'length',
  'width',
  'height',
  'curb-weight',
  'num-of-cylinders',
  'engine-size',
  'bore',
  'stroke',
  'compression-ratio',
  'horsepower',
  'peak-rpm',
  'city-mpg',
  'highway-mpg',
  'fuel-system_1bbl',
  'fuel-system_2bbl',
  'fuel-system_4bbl',
  'fuel-system_idi',
  'fuel-system_mfi',
  'fuel-system_mpfi',
  'fuel-system_spdi',
  'fuel-system_spfi']}

In [56]:
column_names = x.columns
no_of_doors = num_of_doors_values[num_of_doors]
no_of_cylinders = num_of_cylinders_values[num_of_cylinders]
fuel_type_new =fuel_type_values[fuel_type]
array = np.zeros(len(x.columns),dtype = int)

array[0] = symboling
array[1] = normalized_losses
array[2] = fuel_type_new
array[3] = aspiration
array[4] = no_of_doors
array[5] = drive_wheels 
array[6] = engine_location 
array[7] = wheel_base
array[8] = length 
array[9] = width 
array[10] = height 
array[11] = curb_weight 
array[12] = no_of_cylinders
array[13] = engine_size 
array[14] = bore 
array[15] = stroke
array[16] = compression_ratio 
array[17] = horsepower
array[18] = peak_rpm 
array[19] = city_mpg
array[20] = highway_mpg 

body_style = "body-style_"+ body_style
# engine_type= "engine-type_"+"ohc"
fuel_system = "fuel-system_"+ fuel_system

body_style_index = np.where(column_names == body_style)[0][0]
# engine_type_index = np.where(column_names == engine_type)[0][0]
fuel_system_index = np.where(column_names == fuel_system)[0][0]

array[body_style_index] = 1 
# array[engine_type_index] = 1
array[fuel_system_index] = 1 

predicted_price = np.around(model.predict([array])[0],2)
print("predicted_price of car is :",predicted_price)

IndexError: index 0 is out of bounds for axis 0 with size 0

In [52]:
import pickle

with open("Linear_Model.pkl",'wb') as f:
    pickle.dump(model,f)

In [53]:
import json

with open("Label_Encoded_Columns.json",'w') as f:
    json.dump(label_encoded_columns,f)