In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [3]:
df = pd.read_csv("car_price_prediction.csv")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19237 entries, 0 to 19236
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                19237 non-null  int64  
 1   Price             19237 non-null  int64  
 2   Levy              19237 non-null  object 
 3   Manufacturer      19237 non-null  object 
 4   Model             19237 non-null  object 
 5   Prod. year        19237 non-null  int64  
 6   Category          19237 non-null  object 
 7   Leather interior  19237 non-null  object 
 8   Fuel type         19237 non-null  object 
 9   Engine volume     19237 non-null  object 
 10  Mileage           19237 non-null  object 
 11  Cylinders         19237 non-null  float64
 12  Gear box type     19237 non-null  object 
 13  Drive wheels      19237 non-null  object 
 14  Doors             19237 non-null  object 
 15  Wheel             19237 non-null  object 
 16  Color             19237 non-null  object

In [6]:
df.sample(5)

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
5551,45730384,32311,640,SSANGYONG,REXTON,2013,Jeep,Yes,Diesel,2.0,123700 km,4.0,Automatic,Front,04-May,Left wheel,Black,4
15949,45791597,11917,-,MERCEDES-BENZ,E 320,2003,Sedan,Yes,CNG,3.2,0 km,6.0,Automatic,Rear,04-May,Left wheel,Silver,8
15182,45812916,12000,-,HONDA,FIT,2010,Hatchback,No,Hybrid,1.3,155000 km,4.0,Variator,Front,04-May,Right-hand drive,White,4
17820,45773284,862,259,CHEVROLET,Volt,2014,Jeep,Yes,Hybrid,1.4,133195 km,4.0,Automatic,Front,04-May,Left wheel,Black,0
10094,45809444,2901,749,HYUNDAI,Sonata,2014,Sedan,Yes,Petrol,2.4,158714 km,4.0,Automatic,Front,04-May,Left wheel,White,12


In [7]:
df.columns = df.columns.str.strip().str.replace(" ", "_")

In [8]:
df.sample(3)

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod._year,Category,Leather_interior,Fuel_type,Engine_volume,Mileage,Cylinders,Gear_box_type,Drive_wheels,Doors,Wheel,Color,Airbags
14612,45648498,29479,1058,TOYOTA,Highlander,2012,Jeep,Yes,Hybrid,3.5,205250 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1989,45811585,9565,1356,MERCEDES-BENZ,GL 550,2014,Jeep,Yes,Petrol,4.6,98466 km,8.0,Automatic,4x4,04-May,Left wheel,White,12
17370,45619314,627,1604,MERCEDES-BENZ,E 350,2014,Sedan,Yes,Diesel,5.5,101424 km,8.0,Automatic,4x4,04-May,Left wheel,Grey,12


In [9]:
df.dropna()

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod._year,Category,Leather_interior,Fuel_type,Engine_volume,Mileage,Cylinders,Gear_box_type,Drive_wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19232,45798355,8467,-,MERCEDES-BENZ,CLK 200,1999,Coupe,Yes,CNG,2.0 Turbo,300000 km,4.0,Manual,Rear,02-Mar,Left wheel,Silver,5
19233,45778856,15681,831,HYUNDAI,Sonata,2011,Sedan,Yes,Petrol,2.4,161600 km,4.0,Tiptronic,Front,04-May,Left wheel,Red,8
19234,45804997,26108,836,HYUNDAI,Tucson,2010,Jeep,Yes,Diesel,2,116365 km,4.0,Automatic,Front,04-May,Left wheel,Grey,4
19235,45793526,5331,1288,CHEVROLET,Captiva,2007,Jeep,Yes,Diesel,2,51258 km,4.0,Automatic,Front,04-May,Left wheel,Black,4


In [10]:
if 'Prod._year' in df.columns:
    df['Car_Age'] = 2025 - df['Prod._year']

In [11]:
df.sample(5)

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod._year,Category,Leather_interior,Fuel_type,Engine_volume,Mileage,Cylinders,Gear_box_type,Drive_wheels,Doors,Wheel,Color,Airbags,Car_Age
4194,45416629,6899,-,HONDA,Civic,2002,Hatchback,No,Petrol,1.5,157000 km,4.0,Variator,Front,04-May,Right-hand drive,Carnelian red,4,23
13785,45523825,549,1079,TOYOTA,CHR,2018,Jeep,Yes,Petrol,2.0,46733 km,4.0,Automatic,Front,04-May,Left wheel,Silver,12,7
13364,45732162,40569,891,HYUNDAI,Santa FE,2016,Jeep,Yes,Diesel,2.0,194067 km,4.0,Automatic,Front,04-May,Left wheel,Black,4,9
5715,45487601,23521,-,TOYOTA,Prius,2013,Sedan,No,Hybrid,1.8,115000 km,4.0,Variator,Front,04-May,Left wheel,Sky blue,12,12
9282,45781147,2500,-,NISSAN,Micra,1992,Coupe,No,Petrol,1.0,262000 km,4.0,Manual,Front,02-Mar,Left wheel,Black,0,33


In [12]:
label_encoders = {}
for column in df.select_dtypes(include='object').columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

In [13]:
df.sample(3)

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod._year,Category,Leather_interior,Fuel_type,Engine_volume,Mileage,Cylinders,Gear_box_type,Drive_wheels,Doors,Wheel,Color,Airbags,Car_Age
9676,45778865,12544,375,8,537,2014,9,0,5,24,192,4.0,0,1,1,0,12,0,11
15101,45650116,5018,489,58,435,2013,9,1,2,46,2933,4.0,0,1,1,0,1,12,12
4327,45812672,8467,0,36,385,2000,1,0,5,36,4067,4.0,1,2,0,0,12,4,25


In [14]:
X = df.drop("Price", axis=1)
y = df["Price"]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [17]:
y_pred = model.predict(X_test)
print("R² Score:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

R² Score: -170.78994979464073
RMSE: 231363.71105346203


In [18]:
joblib.dump(model, "car_price_model.pkl")
joblib.dump(list(X.columns), "model_features.pkl")
joblib.dump(label_encoders, "label_encoders.pkl")

['label_encoders.pkl']