In [6]:
!pip install scikit-learn

Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/cb/be/dec2a8d31d133034a8ec51ae68ac564ec9bde1c78a64551f1438c3690b9e/scikit_learn-1.5.1-cp312-cp312-win_amd64.whl.metadata
  Downloading scikit_learn-1.5.1-cp312-cp312-win_amd64.whl.metadata (12 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Obtaining dependency information for scipy>=1.6.0 from https://files.pythonhosted.org/packages/3f/72/305686527c68f33f1dd3ebdd28f53340d372b2f9e44dccaf6f92e17739d3/scipy-1.14.0-cp312-cp312-win_amd64.whl.metadata
  Downloading scipy-1.14.0-cp312-cp312-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.8 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.8 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.8 kB ? eta -:--:--
     ------------------- ------------------ 30.7/60.8 kB 187.9 kB/s eta 0:00:01
     ------------------- ------------------ 30.7/60.8 kB 1


[notice] A new release of pip is available: 23.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,r2_score

from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv(r'C:\Users\pintu\OneDrive\Desktop\Projects\car_price_predictor\cleaned.csv')

In [3]:
df.sample()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
4849,Hyundai i20 Active 1.4 SX,Hyderabad,2016,62679,Diesel,Manual,First,21.19,1396.0,88.73,5.0,6.95


In [7]:
X = df.drop(columns=['Price','Location'])
y = df['Price']

# Ordinal Encoding

In [8]:
column_to_encode = ['Name','Fuel_Type','Transmission']

In [10]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Kilometers_Driven','Engine','Power','Seats','Year']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), column_to_encode),
        ('cat1',OrdinalEncoder(),['Owner_Type'])
    ], 
    remainder='passthrough'
)

In [11]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [12]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, cv=kfold, scoring='r2')

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [14]:
pipeline.fit(X_train,y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [15]:
y_pred = pipeline.predict(X_test)

In [16]:
mean_absolute_error(y_test,y_pred)

2.7665336189213234

In [17]:
r2_score(y_test,y_pred)

0.7551114041997995

In [16]:
y_pred

array([33.99141795,  0.37027443,  1.93653423, ..., 25.71469776,
       27.15586654,  4.51659835])

In [17]:
y_test

501     37.56
3163     2.45
3954     2.55
4275     3.50
4898    31.00
        ...  
2080     5.49
2915    14.23
4775    33.64
607     29.50
3176     3.30
Name: Price, Length: 1162, dtype: float64

In [18]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(round(mean_absolute_error(np.expm1(y_test),y_pred),2))
    
    return output

In [30]:
!pip install xgboost




[notice] A new release of pip is available: 23.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
from sklearn.linear_model import LinearRegression,Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

In [20]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [21]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [24]:
output = pd.DataFrame(model_output,columns=['model_name','r2_score','Mean_absolute_error'])

In [34]:
output

Unnamed: 0,model_name,r2_score,Mean_absolute_error
0,linear_reg,0.752793,1.209827e+37
1,svr,0.592472,4.122725e+37
2,ridge,0.817467,4.122725e+37
3,LASSO,0.665961,4.122725e+37
4,decision tree,0.837154,4.122725e+37
5,random forest,0.901513,4.122725e+37
6,extra trees,0.903736,4.1226519999999998e+37
7,gradient boosting,0.873313,4.122725e+37
8,adaboost,0.655294,4.122725e+37
9,mlp,0.917352,3.528095e+37


In [33]:
output['Mean_absolute_error']

0     1.209827e+37
1     4.122725e+37
2     4.122725e+37
3     4.122725e+37
4     4.122725e+37
5     4.122725e+37
6     4.122652e+37
7     4.122725e+37
8     4.122725e+37
9     3.528095e+37
10    4.122725e+37
Name: Mean_absolute_error, dtype: float64

In [35]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', MLPRegressor())
])

In [37]:
pipeline.fit(X,y)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [None]:
import pickle
pickle.dump(pipeline,open('pipeline.pkl','wb'))