In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
import os
from six.moves import urllib


warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
# Load the raw listings and discard the 'torque' column in one chained step.
df = pd.read_csv('cardekho.csv').drop(columns=['torque'])

In [3]:
# Strip the textual unit suffix from each measurement column and convert the
# remainder to a numeric dtype; anything that still fails to parse becomes NaN.
# NOTE(review): 'mileage' mixes values suffixed ' kmpl' and ' km/kg'; both are
# stripped into the same column without any unit conversion -- confirm that
# treating them as comparable is intended.
unit_suffixes = {
    'max_power': [' bhp'],
    'engine': [' CC'],
    'mileage': [' kmpl', ' km/kg'],
}
for column, suffixes in unit_suffixes.items():
    # astype(str) keeps this cell safe to re-run after the column is already numeric.
    cleaned = df[column].astype(str)
    for suffix in suffixes:
        # regex=False: the suffixes are literal text, independent of the pandas default.
        cleaned = cleaned.str.replace(suffix, '', regex=False)
    df[column] = pd.to_numeric(cleaned, errors='coerce')

In [4]:
# Keep only rows with no missing value in any column, then preview the first five.
df = df.dropna(axis=0, how='any')
df.head(5)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14,1498.0,103.52,5.0
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7,1497.0,78.0,5.0
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0,1396.0,90.0,5.0
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1,1298.0,88.2,5.0


In [5]:
# Partition the columns of df by dtype. Testing for "numeric" instead of
# comparing against the object dtype keeps text columns in the categorical
# list even when pandas stores them with a dedicated string dtype.
numerical_columns = [feature for feature in df.columns if pd.api.types.is_numeric_dtype(df[feature])]
categorial_columns = [feature for feature in df.columns if feature not in numerical_columns]
numerical_columns

['year',
 'selling_price',
 'km_driven',
 'mileage',
 'engine',
 'max_power',
 'seats']

In [6]:
# Category values for each categorical column, in the order later handed to
# OrdinalEncoder (position in the list becomes the encoded integer).
fuel = [
    'Diesel',
    'Petrol',
    'LPG',
    'CNG',
]
seller_type = [
    'Individual',
    'Dealer',
    'Trustmark Dealer',
]
transmission = [
    'Manual',
    'Automatic',
]
owner = [
    'First Owner',
    'Second Owner',
    'Third Owner',
    'Fourth & Above Owner',
    'Test Drive Car',
]

In [7]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [8]:
# Show a single row as a reminder of the available columns.
df.head(n=1)

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4,1248.0,74.0,5.0


In [9]:
# Feature matrix: every column except the free-text name and the target.
X = df.drop(['name', 'selling_price'], axis=1)

In [10]:
# Target, kept as a one-column DataFrame (two-dimensional).
Y = df.loc[:, ['selling_price']]

In [11]:
# Recompute the column groups on the feature matrix only. Testing for
# "numeric" instead of comparing against the object dtype keeps text columns
# in the categorical list even when pandas stores them with a string dtype.
numerical_columns = [feature for feature in X.columns if pd.api.types.is_numeric_dtype(X[feature])]
categorial_columns = [feature for feature in X.columns if feature not in numerical_columns]



In [12]:
# Sanity check: report any listed column that is not present in X.
missing_numerical_columns = set(numerical_columns).difference(X.columns)
missing_categorial_columns = set(categorial_columns).difference(X.columns)

print("Missing Numerical Columns:", missing_numerical_columns)
print("Missing Categorical Columns:", missing_categorial_columns)

Missing Numerical Columns: set()
Missing Categorical Columns: set()


In [13]:
# Numeric features: fill gaps with the column median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Category order per column, keyed by column name so that each list is matched
# to its own column even if the order of categorial_columns changes.
category_orders = {
    'fuel': fuel,
    'seller_type': seller_type,
    'transmission': transmission,
    'owner': owner,
}
columns_without_order = [column for column in categorial_columns if column not in category_orders]
if columns_without_order:
    # Fail here with a clear message instead of a shape mismatch at fit time.
    raise ValueError(f"No category order defined for columns: {columns_without_order}")

# Categorical features: fill gaps with the most frequent value, encode each
# category as its position in the configured list, then standardise the codes.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder', OrdinalEncoder(
            categories=[category_orders[column] for column in categorial_columns]
        )),
        ('scaler', StandardScaler()),
    ]
)

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_columns),
    ('cat_pipeline', cat_pipeline, categorial_columns),
])

print(preprocessor)

ColumnTransformer(transformers=[('num_pipeline',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 ['year', 'km_driven', 'mileage', 'engine',
                                  'max_power', 'seats']),
                                ('cat_pipeline',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('ordinalencoder',
                                                  OrdinalEncoder(categories=[['Diesel',
                                                                              'Petrol',
                                                                              'LPG',
                                                           

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=30,
)

In [15]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [16]:
# Fit the preprocessor on the training split and rebuild a labelled DataFrame.
# The original row index is carried over so the features stay aligned with
# y_train in any later index-based pandas operation.
X_train = pd.DataFrame(
    preprocessor.fit_transform(X_train),
    columns=preprocessor.get_feature_names_out(),
    index=X_train.index,
)

In [17]:
# Apply the already-fitted preprocessor to the test split (no re-fitting).
# The original row index is carried over so the features stay aligned with
# y_test in any later index-based pandas operation.
X_test = pd.DataFrame(
    preprocessor.transform(X_test),
    columns=preprocessor.get_feature_names_out(),
    index=X_test.index,
)


In [18]:
# Baseline: ordinary least squares fitted on the preprocessed training split.
regression = LinearRegression()
regression.fit(X=X_train, y=y_train)

In [20]:
import numpy as np
def evaluate_model(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions for the same samples.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r2_square)``: mean absolute error, mean squared
        error, square root of the mean squared error, and the R^2 score.
    """
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    # Reuse the MSE computed above instead of scoring the predictions twice.
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, mse, rmse, r2_square

In [22]:
# Candidate regressors, keyed by the label printed during evaluation.
models = {
    "LinearRegression": LinearRegression(n_jobs=-1),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    # Seeded so the forest's scores are reproducible from one run to the next.
    "RandomForest": RandomForestRegressor(random_state=30)
}
def train_model(model, model_name, x_train= X_train, x_test = X_test, y_train = y_train, y_test = y_test):
    """Fit ``model`` on the training data and print its test-set metrics.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    model_name : str
        Label printed above the metrics.
    x_train, x_test, y_train, y_test : optional
        Train/test features and targets. The defaults are the notebook-level
        splits as they existed when this function was defined; re-run this
        cell after re-creating the splits to pick up new data.

    Prints
    ------
    ``r2_score`` is the R^2 score, ``Score`` is the mean squared error and
    ``Mean score`` is the root mean squared error, all on the test split.
    """
    # A one-column 2-D target is flattened to 1-D so estimators that expect a
    # vector do not emit a column-vector conversion warning; any other shape
    # is passed through untouched.
    y_fit = np.asarray(y_train)
    if y_fit.ndim == 2 and y_fit.shape[1] == 1:
        y_fit = y_fit.ravel()

    print("#"*50)
    model.fit(x_train, y_fit)
    y_pred = model.predict(x_test)

    # Reuse the shared metric helper rather than recomputing each score inline.
    _, mse, rmse, r2 = evaluate_model(y_test, y_pred)
    print(model_name,":")
    print("r2_score", r2)
    print("Score", ":", mse)
    print("Mean score", ":", rmse)
# Train and report every candidate in the order they were registered.
for model_name in models:
    model = models[model_name]
    train_model(model, model_name)

##################################################
LinearRegression :
r2_score 0.6521899683424051
Score : 210932141704.27002
Mean score : 459273.493361276
##################################################
Ridge :
r2_score 0.6522042763380349
Score : 210923464507.27457
Mean score : 459264.04660856543
##################################################
Lasso :
r2_score 0.6521901500373137
Score : 210932031513.9571
Mean score : 459273.3733997183
##################################################
ElasticNet :
r2_score 0.6339466459801422
Score : 221995948689.17557
Mean score : 471164.4603417957
##################################################
RandomForest :
r2_score 0.9627840230483043
Score : 22569923261.34509
Mean score : 150232.89673485328
