# Car selling-price prediction using a scikit-learn Pipeline

In [1]:
# Import the required libraries
import numpy as np # used to perform mathematical operation on array
import pandas as pd # used for data manipulation
import matplotlib.pyplot as plt;
import seaborn as sns
%matplotlib inline

In [2]:
# Read the car dataset from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer='car data1.csv')

In [3]:
# Display the loaded frame to check its columns and a sample of its rows
data

Unnamed: 0,Car_Name,Year,Car_age,Selling_Price,Present_Price,Kms_Driven,Owner,Fuel_Type,Seller_type,Transmission
0,ritz,2014,7,3.35,5.59,27000,0,Petrol,Dealer,Manual
1,sx4,2013,8,4.75,9.54,43000,0,Diesel,Dealer,Manual
2,ciaz,2017,4,7.25,9.85,6900,0,Petrol,Dealer,Manual
3,wagon r,2011,10,2.85,4.15,5200,0,Petrol,Dealer,Manual
4,swift,2014,7,4.60,6.87,42450,0,Diesel,Dealer,Manual
...,...,...,...,...,...,...,...,...,...,...
296,city,2016,5,9.50,11.60,33988,0,Diesel,Dealer,Manual
297,brio,2015,6,4.00,5.90,60000,0,Petrol,Dealer,Manual
298,city,2009,12,3.35,11.00,87934,0,Petrol,Dealer,Manual
299,city,2017,4,11.50,12.50,9000,0,Diesel,Dealer,Manual


In [4]:
# Remove the 'Car_Name' column; it is not used as a model feature.
# Rebinding (instead of inplace=True) leaves the previously displayed frame
# untouched, and errors='ignore' lets this cell be re-run without a KeyError
# once the column is already gone.
data = data.drop(columns=['Car_Name'], errors='ignore')

In [5]:
# Remove the 'Year' column (the frame also carries a 'Car_age' column).
# Rebinding (instead of inplace=True) leaves the previously displayed frame
# untouched, and errors='ignore' lets this cell be re-run without a KeyError
# once the column is already gone.
data = data.drop(columns=['Year'], errors='ignore')

## Outlier treatment

In [6]:
# First and third quartiles of 'Selling_Price' and the interquartile range
Q1, Q3 = (data['Selling_Price'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Rows whose 'Selling_Price' lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR
outliers = data.loc[
    data['Selling_Price'].lt(Q1 - 1.5 * IQR) | data['Selling_Price'].gt(Q3 + 1.5 * IQR)
]

In [7]:
# Keep only the rows whose 'Selling_Price' lies inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
# (both ends inclusive). Q1, Q3 and IQR must still hold the 'Selling_Price'
# values computed in the previous cell.
data = data.loc[data['Selling_Price'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)]

In [8]:
# First and third quartiles of 'Present_Price' and the interquartile range
Q1, Q3 = (data['Present_Price'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Rows whose 'Present_Price' lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR
outliers = data.loc[
    data['Present_Price'].lt(Q1 - 1.5 * IQR) | data['Present_Price'].gt(Q3 + 1.5 * IQR)
]

In [9]:
# Keep only the rows whose 'Present_Price' lies inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
# (both ends inclusive). Q1, Q3 and IQR must still hold the 'Present_Price'
# values computed in the previous cell.
data = data.loc[data['Present_Price'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)]

In [10]:
# First and third quartiles of 'Kms_Driven' and the interquartile range
Q1, Q3 = (data['Kms_Driven'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Rows whose 'Kms_Driven' lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR
outliers = data.loc[
    data['Kms_Driven'].lt(Q1 - 1.5 * IQR) | data['Kms_Driven'].gt(Q3 + 1.5 * IQR)
]

In [11]:
# Keep only the rows whose 'Kms_Driven' lies inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
# (both ends inclusive). Q1, Q3 and IQR must still hold the 'Kms_Driven'
# values computed in the previous cell.
data = data.loc[data['Kms_Driven'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)]

In [12]:
# Display the frame again after the column drops and the three outlier filters
data

Unnamed: 0,Car_age,Selling_Price,Present_Price,Kms_Driven,Owner,Fuel_Type,Seller_type,Transmission
0,7,3.35,5.59,27000,0,Petrol,Dealer,Manual
1,8,4.75,9.54,43000,0,Diesel,Dealer,Manual
2,4,7.25,9.85,6900,0,Petrol,Dealer,Manual
3,10,2.85,4.15,5200,0,Petrol,Dealer,Manual
4,7,4.60,6.87,42450,0,Diesel,Dealer,Manual
...,...,...,...,...,...,...,...,...
296,5,9.50,11.60,33988,0,Diesel,Dealer,Manual
297,6,4.00,5.90,60000,0,Petrol,Dealer,Manual
298,12,3.35,11.00,87934,0,Petrol,Dealer,Manual
299,4,11.50,12.50,9000,0,Diesel,Dealer,Manual


In [14]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score



# Separate the predictors from the target ('Selling_Price') and hold out 20%
# of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('Selling_Price', axis=1), data['Selling_Price'], test_size=0.2, random_state=42)

# Preprocess by column type: standardize the numeric columns and one-hot
# encode the categorical ones. Columns not listed here are dropped
# (ColumnTransformer's default is remainder='drop').
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), ['Car_age', 'Present_Price', 'Kms_Driven', 'Owner']),
    # handle_unknown='ignore': a category that appears only in the test split
    # is encoded as all zeros instead of raising an error at predict time.
    # Results are unchanged whenever no unseen category occurs.
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['Fuel_Type', 'Seller_type', 'Transmission'])
])

# Random forest regressor; the seed makes the fitted model reproducible
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Chain preprocessing and model so the scaler and encoder are fitted on the
# training rows only and then re-applied unchanged to the test rows
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', rf)
])

# Fit the preprocessing steps and the regressor on the training data
pipeline.fit(X_train, y_train)

# Predict selling prices for the held-out rows
y_pred = pipeline.predict(X_test)

# Evaluate on the held-out rows using mean squared error and R-squared
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


Mean Squared Error: 0.564366477142857
R-squared: 0.9347417331082855
