**Author**: Riadul Islam Rabbi

<h1><center> Hyundai Car Price Prediction Using ML</center></h1>

### Hyundai Car Dataset From Kaggle
-------------------------------------------------------------------------------------------------------------------------------

## Importing Libraries

In [46]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

#Load the dataset

In [2]:
df_hyundai_data = pd.read_csv("https://raw.githubusercontent.com/SKawsar/Data_Visualization_with_Python/main/hyundai.csv")

##Views Data

In [3]:
df_hyundai_data.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize
0,I20,2017,7999,Manual,17307,Petrol,58.9,1.2
1,Tucson,2016,14499,Automatic,25233,Diesel,43.5,2.0
2,Tucson,2016,11399,Manual,37877,Diesel,61.7,1.7
3,I10,2016,6499,Manual,23789,Petrol,60.1,1.0
4,IX35,2015,10199,Manual,33177,Diesel,51.4,2.0


In [4]:
df_hyundai_data.tail()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize
4855,I30,2016,8680,Manual,25906,Diesel,78.4,1.6
4856,I40,2015,7830,Manual,59508,Diesel,65.7,1.7
4857,I10,2017,6830,Manual,13810,Petrol,60.1,1.0
4858,Tucson,2018,13994,Manual,23313,Petrol,44.8,1.6
4859,Tucson,2016,15999,Automatic,11472,Diesel,57.6,1.7


In [5]:
df_hyundai_data.shape

(4860, 8)

There are no missing values in hyundai dataset.

In [6]:
df_hyundai_data.isnull().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
mpg             0
engineSize      0
dtype: int64

We see from this data 4860 instances and 8 attributes. 2 columns `float64`, 3 columns `int64` 3 columns `object`

In [7]:
df_hyundai_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4860 entries, 0 to 4859
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         4860 non-null   object 
 1   year          4860 non-null   int64  
 2   price         4860 non-null   int64  
 3   transmission  4860 non-null   object 
 4   mileage       4860 non-null   int64  
 5   fuelType      4860 non-null   object 
 6   mpg           4860 non-null   float64
 7   engineSize    4860 non-null   float64
dtypes: float64(2), int64(3), object(3)
memory usage: 303.9+ KB


##Separating the numeric features and target variable,

In [36]:
X = df_hyundai_data[['mileage']] # feature data
y = df_hyundai_data["price"] # target data

##Split the original dataset into the train set (80%) and the test set (20%), 

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.80, test_size = 0.20, random_state = 102)
print("X_train: ", X_train.shape)
print("X_test : ", X_test.shape)
print("y_train: ", y_train.shape)
print("y_test: ", y_test.shape)

X_train:  (3888, 1)
X_test :  (972, 1)
y_train:  (3888,)
y_test:  (972,)


##Perform Simple Linear Regression and Predict the 'Price' from the test set, 

### Feed the model

In [38]:
LR = LinearRegression()
LR.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

###Predict the price

In [39]:
y_pred = LR.predict(X_test)

In [40]:
y_pred[0:10]

array([14926.38486669, 14984.2597343 , 14675.29234212, 15534.37240819,
       13612.5952288 , 15963.91244123, 15985.01265338, 14919.90408725,
       13988.93258406, 13105.43655821])

In [41]:
y_test[0:10]

825     13485
4211     7795
1782    19894
3450    25000
196     15498
2862    23995
2175     9989
4117    10495
261     14698
1616     6290
Name: price, dtype: int64

## Difference between actual and predicted price

In [44]:
hyundai_dif_price_data = pd.DataFrame({'Actual Data': y_test, 'Predicted Price' : y_pred, 'Difference Price': (y_pred - y_test)})
hyundai_dif_price_data.head()

Unnamed: 0,Actual Data,Predicted Price,Difference Price
825,13485,14926.384867,1441.384867
4211,7795,14984.259734,7189.259734
1782,19894,14675.292342,-5218.707658
3450,25000,15534.372408,-9465.627592
196,15498,13612.595229,-1885.404771


##Find the RMSE value from the actual test data and the predicted data.

In [47]:
RMSE = mean_squared_error(y_test, y_pred, squared= True)
print("RMSE : ", RMSE)

RMSE :  27049432.75552336


#Perform Multiple Linear Regression and Predict the 'Price' from the test set

In [50]:
X = df_hyundai_data[['mileage', 'mpg', 'engineSize']]# feature data
y = df_hyundai_data["price"] # target data

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.80, test_size = 0.20, random_state = 102)
print("X_train: ", X_train.shape)
print("X_test : ", X_test.shape)
print("y_train: ", y_train.shape)
print("y_test: ", y_test.shape)

X_train:  (3888, 3)
X_test :  (972, 3)
y_train:  (3888,)
y_test:  (972,)


In [52]:
multiple_LR = LinearRegression()
multiple_LR.fit(X_train, y_train)

y_pred_mlr = multiple_LR.predict(X_test)


In [53]:
hyundai_dif_price_data_mlr = pd.DataFrame({'Actual Data': y_test, 'Predicted Price' : y_pred_mlr, 'Difference Price': (y_pred_mlr - y_test)})
hyundai_dif_price_data_mlr.head()

Unnamed: 0,Actual Data,Predicted Price,Difference Price
825,13485,11327.650872,-2157.349128
4211,7795,11000.639393,3205.639393
1782,19894,16643.227003,-3250.772997
3450,25000,21900.376108,-3099.623892
196,15498,15995.166974,497.166974


In [55]:
RMSE = mean_squared_error(y_test, y_pred_mlr, squared= True)
print("RMSE : ", RMSE)

RMSE :  14439067.295353228
