Step 1 : Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score


In [None]:
Step 2 : Load the dataset

In [None]:
from google.colab import files
upload =files.upload()

Saving CarPrice_Assignment.csv to CarPrice_Assignment.csv


In [None]:
df = pd.read_csv('CarPrice_Assignment.csv')

Step 3 : Check the data

In [None]:
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

In [None]:
print(df.describe())

           car_ID   symboling   wheelbase   carlength    carwidth   carheight  \
count  205.000000  205.000000  205.000000  205.000000  205.000000  205.000000   
mean   103.000000    0.834146   98.756585  174.049268   65.907805   53.724878   
std     59.322565    1.245307    6.021776   12.337289    2.145204    2.443522   
min      1.000000   -2.000000   86.600000  141.100000   60.300000   47.800000   
25%     52.000000    0.000000   94.500000  166.300000   64.100000   52.000000   
50%    103.000000    1.000000   97.000000  173.200000   65.500000   54.100000   
75%    154.000000    2.000000  102.400000  183.100000   66.900000   55.500000   
max    205.000000    3.000000  120.900000  208.100000   72.300000   59.800000   

        curbweight  enginesize   boreratio      stroke  compressionratio  \
count   205.000000  205.000000  205.000000  205.000000        205.000000   
mean   2555.565854  126.907317    3.329756    3.255415         10.142537   
std     520.680204   41.642693    0.270844

In [None]:
print(df.columns)

Index(['car_ID', 'symboling', 'CarName', 'fueltype', 'aspiration',
       'doornumber', 'carbody', 'drivewheel', 'enginelocation', 'wheelbase',
       'carlength', 'carwidth', 'carheight', 'curbweight', 'enginetype',
       'cylindernumber', 'enginesize', 'fuelsystem', 'boreratio', 'stroke',
       'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg',
       'price'],
      dtype='object')


In [None]:
print(df.nunique())

car_ID              205
symboling             6
CarName             147
fueltype              2
aspiration            2
doornumber            2
carbody               5
drivewheel            3
enginelocation        2
wheelbase            53
carlength            75
carwidth             44
carheight            49
curbweight          171
enginetype            7
cylindernumber        7
enginesize           44
fuelsystem            8
boreratio            38
stroke               37
compressionratio     32
horsepower           59
peakrpm              23
citympg              29
highwaympg           30
price               189
dtype: int64


In [None]:
print(df.duplicated().sum())

0


In [None]:
print(df.isnull().sum())

car_ID              0
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64


Step 4 : Data Preproccessing

In [None]:
df = df.drop(columns=['car_ID'])

In [None]:
df['company_name'] = df['CarName'].apply(lambda x: x.split()[0])

In [None]:
df['company_name'] = df['company_name'].replace({
    'Toyta': 'Toyota',
    'Ford': 'Ford', })

Step 5 : Label encoding

In [None]:
label_encoder = LabelEncoder()
for column in df.select_dtypes(include=['object']).columns:
    df[column] = label_encoder.fit_transform(df[column])

Step 6 : Outliers detection and removal

In [None]:
df = df[(np.abs(stats.zscore(df.select_dtypes(include=[np.number]))) < 3).all(axis=1)]

Step 7 : Feature Selection

In [None]:
corr_matrix = df.corr()
print(corr_matrix['price'].sort_values(ascending=False))

price               1.000000
curbweight          0.843535
enginesize          0.817810
horsepower          0.807691
carwidth            0.788366
carlength           0.749698
wheelbase           0.712624
fuelsystem          0.630889
drivewheel          0.514262
boreratio           0.490906
carheight           0.257095
aspiration          0.254595
carbody             0.150727
cylindernumber      0.133664
stroke              0.062299
peakrpm             0.015671
enginetype         -0.017430
company_name       -0.121287
symboling          -0.126074
CarName            -0.132787
doornumber         -0.177057
compressionratio   -0.301310
highwaympg         -0.777664
citympg            -0.779588
fueltype                 NaN
enginelocation           NaN
Name: price, dtype: float64


Step 8 : Remove multicolinearity

In [None]:
corr_matrix = df.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.85)]
df = df.drop(columns=to_drop)

Step 9 : Data splitting

In [None]:
X = df.drop(columns=['price'])
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Step 10 : Apply Algorithms

Linear Regression

In [None]:
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_pred = linear_reg.predict(X_test)
print('Linear Regression RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('Linear Regression R2:', r2_score(y_test, y_pred))


Linear Regression RMSE: 2438.4713622646327
Linear Regression R2: 0.7296995833457192


Decision Tree Regressor

In [None]:
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, y_train)
y_pred = tree_reg.predict(X_test)
print('Decision Tree RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('Decision Tree R2:', r2_score(y_test, y_pred))

Decision Tree RMSE: 4431.6908922061
Decision Tree R2: 0.107207942448179


Random Forest Regressor

In [None]:
forest_reg = RandomForestRegressor()
forest_reg.fit(X_train, y_train)
y_pred = forest_reg.predict(X_test)
print('Random Forest RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('Random Forest R2:', r2_score(y_test, y_pred))

Random Forest RMSE: 2542.94056637014
Random Forest R2: 0.7060429935925909


Gradient Boosting Regressor

In [None]:
gb_reg = GradientBoostingRegressor()
gb_reg.fit(X_train, y_train)
y_pred = gb_reg.predict(X_test)
print('Gradient Boosting RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('Gradient Boosting R2:', r2_score(y_test, y_pred))

Gradient Boosting RMSE: 2443.7099522174735
Gradient Boosting R2: 0.7285369582157568


Support Vector Regressor

In [None]:
svr_reg = SVR()
svr_reg.fit(X_train, y_train)
y_pred = svr_reg.predict(X_test)
print('SVR RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('SVR R2:', r2_score(y_test, y_pred))

SVR RMSE: 4969.291554713319
SVR R2: -0.1225361924088364
