# Predict Car Price 
## Linear Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Read the data using the pandas library


In [2]:
# Load the car-price dataset into a DataFrame.
# NOTE(review): the original path literal was garbled (its opening quote was
# missing, which is a syntax error). The relative location below is a
# reconstruction -- confirm that it points at the CSV in your environment.
# Forward slashes are used so the path works on both Windows and POSIX.
DATA_PATH = "../../input/car data.csv"
dataframe = pd.read_csv(DATA_PATH)


# Explore the dataset

In [3]:
# Preview the first ten records to get a feel for the columns and their values.
dataframe.head(n=10)

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0
5,vitara brezza,2018,9.25,9.83,2071,Diesel,Dealer,Manual,0
6,ciaz,2015,6.75,8.12,18796,Petrol,Dealer,Manual,0
7,s cross,2015,6.5,8.61,33429,Diesel,Dealer,Manual,0
8,ciaz,2016,8.75,8.89,20273,Diesel,Dealer,Manual,0
9,ciaz,2015,7.45,8.92,42367,Diesel,Dealer,Manual,0


In [4]:
dataframe.info() # print a summary of the frame: column names, dtypes, non-null counts, row count, etc.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [5]:
# Share of missing entries in each column. Despite the "perc" in the name the
# values are fractions in the range 0.0-1.0, not percentages: the mean of the
# boolean NaN mask equals (number of NaNs) / (number of rows).
missing_perc = dataframe.isna().mean()
missing_perc

Car_Name         0.0
Year             0.0
Selling_Price    0.0
Present_Price    0.0
Kms_Driven       0.0
Fuel_Type        0.0
Seller_Type      0.0
Transmission     0.0
Owner            0.0
dtype: float64

In [6]:
# Number of rows per distinct value of the Transmission column.
dataframe["Transmission"].value_counts()

Manual       261
Automatic     40
Name: Transmission, dtype: int64

In [7]:
# Number of rows per distinct value of the Seller_Type column.
dataframe["Seller_Type"].value_counts()

Dealer        195
Individual    106
Name: Seller_Type, dtype: int64

In [8]:
# Number of rows per distinct value of the Car_Name column.
dataframe["Car_Name"].value_counts()

city                        26
corolla altis               16
verna                       14
fortuner                    11
brio                        10
                            ..
Hero Splender Plus           1
camry                        1
KTM 390 Duke                 1
Bajaj Avenger Street 220     1
Bajaj Pulsar 135 LS          1
Name: Car_Name, Length: 98, dtype: int64

In [9]:
# Number of rows per distinct value of the Fuel_Type column.
dataframe["Fuel_Type"].value_counts()

Petrol    239
Diesel     60
CNG         2
Name: Fuel_Type, dtype: int64

# Dimensions of the data

In [10]:
dataframe.shape  # (number of rows, number of columns)

(301, 9)

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
dataframe.describe()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Owner
count,301.0,301.0,301.0,301.0,301.0
mean,2013.627907,4.661296,7.628472,36947.20598,0.043189
std,2.891554,5.082812,8.644115,38886.883882,0.247915
min,2003.0,0.1,0.32,500.0,0.0
25%,2012.0,0.9,1.2,15000.0,0.0
50%,2014.0,3.6,6.4,32000.0,0.0
75%,2016.0,6.0,9.9,48767.0,0.0
max,2018.0,35.0,92.6,500000.0,3.0


# Split target and features

In [12]:
# Feature matrix: every column of the frame except the target column.
trainData = dataframe.drop(columns=["Selling_Price"])
trainData.shape

(301, 8)

In [13]:
# Target vector: the `Selling_Price` column on its own, as a Series.
targetData = dataframe.loc[:, "Selling_Price"]
targetData.shape

(301,)

# Handling numeric and categorical variables

In [14]:
# Partition the columns of the full frame by dtype: numeric vs. everything else.
# Because this starts from `dataframe` (not `trainData`), the numeric part
# still contains the target column.
numerical_data = dataframe.select_dtypes(include="number")
categorical_data = dataframe.select_dtypes(exclude="number")

numerical_data.shape, categorical_data.shape

((301, 5), (301, 4))

In [15]:
# Show which column names landed in each dtype group.
numeric_cols = numerical_data.columns
categoric_cols = categorical_data.columns
print(f'numeric_columns :{numeric_cols}  categorical_columns : {categoric_cols}')

numeric_columns :Index(['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven', 'Owner'], dtype='object')  categorical_columns : Index(['Car_Name', 'Fuel_Type', 'Seller_Type', 'Transmission'], dtype='object')


# Pipeline for the model (scale data)

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression


def _scaling_pipeline():
    """Return a fresh one-step pipeline holding ``StandardScaler(with_mean=False)``."""
    return Pipeline(steps=[('scaler', StandardScaler(with_mean=False))])


def _columns_with_dtype(frame, dtype_name):
    """List the columns of ``frame`` whose dtype compares equal to ``dtype_name``."""
    return [column for column, dtype in frame.dtypes.items() if dtype == dtype_name]


# Integer and float columns get the same treatment, but each group receives
# its own pipeline object.
integer_transformer = _scaling_pipeline()
continuous_transformer = _scaling_pipeline()

# Categorical columns are one-hot encoded and then scaled without centering.
# Categories not seen during fitting are ignored rather than raising.
categorical_transformer = Pipeline(steps=[
    ('lab_enc', OneHotEncoder(handle_unknown='ignore')),
    ('scaler', StandardScaler(with_mean=False)),
])

# Column groups (by dtype) that the ColumnTransformer will route to the
# transformers defined above.
integer_features = _columns_with_dtype(trainData, 'int64')
continuous_features = _columns_with_dtype(trainData, 'float64')
categorical_features = _columns_with_dtype(trainData, 'object')

# Build the linear model

In [17]:
from sklearn.compose import ColumnTransformer

# Route each group of columns through its matching transformer.
column_routes = [
    ('ints', integer_transformer, integer_features),
    ('cont', continuous_transformer, continuous_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Chain the preprocessing with the estimator. The final step keeps the name
# 'classifier' even though it holds a LinearRegression, i.e. a regressor.
base_pipeLine = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LinearRegression()),
])

In [18]:
base_pipeLine.steps # the (name, estimator) pairs that make up the pipeline, in execution order

[('preprocessor',
  ColumnTransformer(transformers=[('ints',
                                   Pipeline(steps=[('scaler',
                                                    StandardScaler(with_mean=False))]),
                                   ['Year', 'Kms_Driven', 'Owner']),
                                  ('cont',
                                   Pipeline(steps=[('scaler',
                                                    StandardScaler(with_mean=False))]),
                                   ['Present_Price']),
                                  ('cat',
                                   Pipeline(steps=[('lab_enc',
                                                    OneHotEncoder(handle_unknown='ignore')),
                                                   ('scaler',
                                                    StandardScaler(with_mean=False))]),
                                   ['Car_Name', 'Fuel_Type', 'Seller_Type',
                                    'Transmissio

# Train and test split

In [19]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    trainData,
    targetData,
    test_size=0.33,
    random_state=0,
)

# Train the model

In [20]:
# Fit the preprocessing steps and the regressor on the training split.
# `fit` returns the pipeline itself, so `model` is simply another name for
# the (now fitted) `base_pipeLine`.
base_pipeLine.fit(x_train, y_train)
model = base_pipeLine
model


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('ints',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler(with_mean=False))]),
                                                  ['Year', 'Kms_Driven',
                                                   'Owner']),
                                                 ('cont',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler(with_mean=False))]),
                                                  ['Present_Price']),
                                                 ('cat',
                                                  Pipeline(steps=[('lab_enc',
                                                                   OneHotEncoder(handle_unknown='ignore')),
                                           

# Prediction

In [21]:
# Convert the held-out targets to a plain NumPy array so they can be indexed
# by position, the same way the prediction array is indexed in the next cell.
real_values = y_test.to_numpy()

In [25]:
# Predict the held-out rows, then spot-check two of them (positions 4 and 3)
# against the true values.
prediction = base_pipeLine.predict(x_test)

pred_at_4, real_at_4 = prediction[4], real_values[4]
pred_at_3, real_at_3 = prediction[3], real_values[3]

print(f'predict value  by the model  : {pred_at_4}, real value ,the model try to predict : {real_at_4}')
print(f'predict value by the model  : {pred_at_3}, real value ,the model try to predict : {real_at_3}')

predict by the model value : 17.424799836710463, real value the model try to predict : 18.0
predict by the model value : 9.283646086427552, real value the model try to predict : 9.5


# Evaluate 

In [26]:
# Score of the fitted pipeline on the data it was trained on (for a
# LinearRegression final step, scikit-learn documents this as R^2).
# NOTE: despite its name, `predicted` holds a score, not predictions.
predicted = base_pipeLine.score(X=x_train, y=y_train)
predicted

0.9516873813400109

In [27]:
# Score of the fitted pipeline on the held-out split; compare it with the
# training score from the previous cell to gauge over-fitting.
test = base_pipeLine.score(X=x_test, y=y_test)
test

0.8813391623315759