### Import Dependencies/ Packages here

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error

### Load Dataset

In [2]:
df=pd.read_csv(r"C:\Users\DURGA\Downloads\BIKE DETAILS.csv")

##  Data Preprocessing

### Find the shape, samples, information and description of the dataset.

In [3]:
df.shape

(1061, 7)

In [4]:
df.head()

Unnamed: 0,name,selling_price,year,seller_type,owner,km_driven,ex_showroom_price
0,Royal Enfield Classic 350,175000,2019,Individual,1st owner,350,
1,Honda Dio,45000,2017,Individual,1st owner,5650,
2,Royal Enfield Classic Gunmetal Grey,150000,2018,Individual,1st owner,12000,148114.0
3,Yamaha Fazer FI V 2.0 [2016-2018],65000,2015,Individual,1st owner,23000,89643.0
4,Yamaha SZ [2013-2014],20000,2011,Individual,2nd owner,21000,


In [5]:
df.sample(5)

Unnamed: 0,name,selling_price,year,seller_type,owner,km_driven,ex_showroom_price
275,Honda CB Unicorn 150,40000,2014,Individual,1st owner,77592,74295.0
511,Hero Splendor Plus,12000,2007,Individual,1st owner,100000,
556,Honda CBR150 R,110000,2017,Individual,1st owner,11800,129662.0
9,Bajaj Discover 125,50000,2016,Individual,1st owner,42000,60122.0
156,Bajaj Discover 125,28000,2013,Individual,1st owner,45000,60122.0


In [6]:
df.describe()

Unnamed: 0,selling_price,year,km_driven,ex_showroom_price
count,1061.0,1061.0,1061.0,626.0
mean,59638.151744,2013.867107,34359.833176,87958.71
std,56304.291973,4.301191,51623.152702,77496.59
min,5000.0,1988.0,350.0,30490.0
25%,28000.0,2011.0,13500.0,54852.0
50%,45000.0,2015.0,25000.0,72752.5
75%,70000.0,2017.0,43000.0,87031.5
max,760000.0,2020.0,880000.0,1278000.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1061 entries, 0 to 1060
Data columns (total 7 columns):
name                 1061 non-null object
selling_price        1061 non-null int64
year                 1061 non-null int64
seller_type          1061 non-null object
owner                1061 non-null object
km_driven            1061 non-null int64
ex_showroom_price    626 non-null float64
dtypes: float64(1), int64(3), object(3)
memory usage: 58.1+ KB


In [8]:
df.columns

Index(['name', 'selling_price', 'year', 'seller_type', 'owner', 'km_driven',
       'ex_showroom_price'],
      dtype='object')

### Check for null values and handle them accordingly

In [9]:
df.isnull().sum()

name                   0
selling_price          0
year                   0
seller_type            0
owner                  0
km_driven              0
ex_showroom_price    435
dtype: int64

In [10]:
df["ex_showroom_price"].fillna(df["ex_showroom_price"].mean(),inplace=True)


In [11]:
df.isnull().sum()

name                 0
selling_price        0
year                 0
seller_type          0
owner                0
km_driven            0
ex_showroom_price    0
dtype: int64

### Encode Categorical Values

In [12]:
df.replace({'seller_type':{'Individual':0,'Dealer':1,}},inplace=True)

In [13]:
df['seller_type'].unique()

array([0, 1], dtype=int64)

In [14]:
df.replace({'owner':{'1st owner':0,'2nd owner':1,'3rd owner':2,'4th owner':3}},inplace=True)

In [15]:
df['owner'].unique()

array([0, 1, 2, 3], dtype=int64)

# Seperate the features from the target

In [16]:
x=df[['year','seller_type','owner','km_driven','ex_showroom_price']]

In [17]:
y=df['selling_price']

### Split data into train and test

In [18]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=5)

### Transform the data using appropraite data transformation technique

In [19]:
lr = LinearRegression()

In [20]:
lr.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [21]:
y_pred1=lr.predict(x_test)

In [22]:
from sklearn.metrics import r2_score,mean_squared_error,accuracy_score

In [23]:
r2_score(y_test,y_pred1)

0.7879866823425977

In [24]:
input=lr.predict([[2007,0,2,43000,89643.0]])
print(input)

[36184.04620765]


In [25]:
from sklearn.ensemble import RandomForestRegressor

In [26]:
rf=RandomForestRegressor()
rf.fit(x_train,y_train)




RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [27]:
y_pred2=rf.predict(x_test)

In [28]:
r2_score(y_test,y_pred2)

0.6831981662354203

In [29]:
from sklearn.svm import SVR

In [30]:
svm=SVR()
svm.fit(x_train,y_train)



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [31]:
y_pred3=svm.predict(x_test)

In [32]:
r2_score(y_test,y_pred3)

-0.033109970164939995

In [33]:
mean_squared_error(y_test,y_pred1)

882939715.2575926

In [34]:
mean_squared_error(y_test,y_pred2)

1319336558.607997

In [35]:
mean_squared_error(y_test,y_pred2)

1319336558.607997