<a href="https://colab.research.google.com/github/christina3099/Resale-car-price-prediction/blob/main/Resale_Price_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("msnbehdani/mock-dataset-of-second-hand-car-sales")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'mock-dataset-of-second-hand-car-sales' dataset.
Path to dataset files: /kaggle/input/mock-dataset-of-second-hand-car-sales


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [76]:
cars = pd.read_csv("/kaggle/input/mock-dataset-of-second-hand-car-sales/car_sales_data.csv")

In [77]:
cars.head()

Unnamed: 0,Manufacturer,Model,Engine size,Fuel type,Year of manufacture,Mileage,Price
0,Ford,Fiesta,1.0,Petrol,2002,127300,3074
1,Porsche,718 Cayman,4.0,Petrol,2016,57850,49704
2,Ford,Mondeo,1.6,Diesel,2014,39190,24072
3,Toyota,RAV4,1.8,Hybrid,1988,210814,1705
4,VW,Polo,1.0,Petrol,2006,127869,4101


In [78]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Manufacturer         50000 non-null  object 
 1   Model                50000 non-null  object 
 2   Engine size          50000 non-null  float64
 3   Fuel type            50000 non-null  object 
 4   Year of manufacture  50000 non-null  int64  
 5   Mileage              50000 non-null  int64  
 6   Price                50000 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 2.7+ MB


### Data is pretty clean already. Jumping to Model Building.


In [79]:
cars['Age']=2022-cars['Year of manufacture']

In [58]:
cars.head()

Unnamed: 0,Manufacturer,Model,Engine size,Fuel type,Year of manufacture,Mileage,Price,Age
0,Ford,Fiesta,1.0,Petrol,2002,127300,3074,20
1,Porsche,718 Cayman,4.0,Petrol,2016,57850,49704,6
2,Ford,Mondeo,1.6,Diesel,2014,39190,24072,8
3,Toyota,RAV4,1.8,Hybrid,1988,210814,1705,34
4,VW,Polo,1.0,Petrol,2006,127869,4101,16


In [59]:
cars['Fuel type'].unique()

array(['Petrol', 'Diesel', 'Hybrid'], dtype=object)

In [80]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Manufacturer         50000 non-null  object 
 1   Model                50000 non-null  object 
 2   Engine size          50000 non-null  float64
 3   Fuel type            50000 non-null  object 
 4   Year of manufacture  50000 non-null  int64  
 5   Mileage              50000 non-null  int64  
 6   Price                50000 non-null  int64  
 7   Age                  50000 non-null  int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 3.1+ MB


In [81]:
from sklearn.model_selection import train_test_split

In [82]:
X_train, X_test, y_train, y_test = train_test_split(cars.drop('Price',axis=1), cars['Price'], test_size=0.3, random_state=101)

In [83]:
X_train.head()

Unnamed: 0,Manufacturer,Model,Engine size,Fuel type,Year of manufacture,Mileage,Age
29671,Ford,Fiesta,1.0,Petrol,1997,74300,25
48133,VW,Golf,2.0,Petrol,1999,174827,23
2872,Toyota,Prius,1.4,Hybrid,2006,85522,16
23899,VW,Golf,2.0,Petrol,1993,246389,29
40469,VW,Golf,1.8,Petrol,1993,245993,29


In [66]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def evaluate_model(y_true, y_pred, n_samples, n_features):

    # Error metrics
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    # Adjusted R²
    adj_r2 = 1 - (1-r2) * (n_samples-1)/(n_samples-n_features-1)

    return {"R2": r2, "Adjusted_R2": adj_r2, "MAE": mae, "MSE": mse, "RMSE": rmse}


In [67]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb

In [68]:
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.01),
    "Decision Tree": DecisionTreeRegressor(max_depth=10, random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=200, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=200, random_state=42),
    "XGBoost": xgb.XGBRegressor(n_estimators=200, learning_rate=0.1, random_state=42)
}

In [69]:
results = {}

In [70]:
for name, model in models.items():
    # Train model
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)

    # Evaluate
    results[name] = evaluate_model(y_test, y_pred, n_samples=X_test.shape[0], n_features=X_test.shape[1])


results_df = pd.DataFrame(results).T.sort_values(by="R2", ascending=False)
print(results_df)

                         R2  Adjusted_R2       MAE       MSE      RMSE
XGBoost            0.999215     0.999215  0.016736  0.000751  0.027400
Random Forest      0.998037     0.998036  0.021585  0.001878  0.043336
Gradient Boosting  0.991795     0.991791  0.056083  0.007850  0.088599
Decision Tree      0.983990     0.983982  0.077477  0.015317  0.123763
Ridge Regression   0.681440     0.681291  0.368343  0.304776  0.552065
Linear Regression  0.681439     0.681291  0.368344  0.304776  0.552065
Lasso Regression   0.681178     0.681029  0.367694  0.305026  0.552292


In [71]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Manufacturer         50000 non-null  int64  
 1   Model                50000 non-null  int64  
 2   Engine size          50000 non-null  float64
 3   Fuel type            50000 non-null  int64  
 4   Year of manufacture  50000 non-null  int64  
 5   Mileage              50000 non-null  float64
 6   Price                50000 non-null  float64
 7   Age                  50000 non-null  float64
dtypes: float64(4), int64(4)
memory usage: 3.1 MB


In [87]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb


# --- Columns ---
num_features = ["Mileage", "Engine size", "Age"]
cat_features = [ "Manufacturer", "Model","Fuel type"]

preprocessor = ColumnTransformer([
    ("encoding", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), cat_features)],
    remainder="passthrough")


# --- Pipeline ---

pipeline = Pipeline(steps=[
    ("Preprocessing",preprocessor),
    ("model", xgb.XGBRegressor(n_estimators=200, learning_rate=0.1, random_state=42))
])

# --- Training ---
pipeline.fit(X_train, y_train)




The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [89]:
# New unseen data (same feature structure as X_train)

new_car = pd.DataFrame([{
    "Year of manufacture": 2019,
    "Mileage": 25000,
    "Engine size": 1.6,
    "Fuel type": "Petrol",
    "Age":2022- 2019,
    "Manufacturer": "Hyundai",
    "Model": "i20"
}])

# Predict resale price
predicted_price = pipeline.predict(new_car)
print("Predicted Resale Price:", predicted_price[0])

Predicted Resale Price: 39453.9


In [91]:
from sklearn.preprocessing import OrdinalEncoder

Encoder = OrdinalEncoder()
df= cars.copy()

for col in ['Manufacturer','Fuel type','Model']:
    df[col]= Encoder.fit_transform(df[[col]])