In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the raw vehicle listings from a CSV file given by a relative path.
df = pd.read_csv('dataset.csv')

In [3]:
# Preview the first three rows of the freshly loaded frame.
df.head(3)

Unnamed: 0,name,description,make,model,year,price,engine,cylinders,fuel,mileage,transmission,trim,body,doors,exterior_color,interior_color,drivetrain
0,2024 Jeep Wagoneer Series II,"\n \n Heated Leather Seats, Nav Sy...",Jeep,Wagoneer,2024,74600.0,24V GDI DOHC Twin Turbo,6.0,Gasoline,10.0,8-Speed Automatic,Series II,SUV,4.0,White,Global Black,Four-wheel Drive
1,2024 Jeep Grand Cherokee Laredo,Al West is committed to offering every custome...,Jeep,Grand Cherokee,2024,50170.0,OHV,6.0,Gasoline,1.0,8-Speed Automatic,Laredo,SUV,4.0,Metallic,Global Black,Four-wheel Drive
2,2024 GMC Yukon XL Denali,,GMC,Yukon XL,2024,96410.0,"6.2L V-8 gasoline direct injection, variable v...",8.0,Gasoline,0.0,Automatic,Denali,SUV,4.0,Summit White,Teak/Light Shale,Four-wheel Drive


In [4]:
# Discard the descriptive/cosmetic columns so only the candidate model features remain.
columns_to_drop = ['name', 'description', 'trim', 'exterior_color', 'interior_color']
df = df.drop(columns=columns_to_drop)

In [5]:
# Confirm the column drop by previewing the first three rows again.
df.head(3)

Unnamed: 0,make,model,year,price,engine,cylinders,fuel,mileage,transmission,body,doors,drivetrain
0,Jeep,Wagoneer,2024,74600.0,24V GDI DOHC Twin Turbo,6.0,Gasoline,10.0,8-Speed Automatic,SUV,4.0,Four-wheel Drive
1,Jeep,Grand Cherokee,2024,50170.0,OHV,6.0,Gasoline,1.0,8-Speed Automatic,SUV,4.0,Four-wheel Drive
2,GMC,Yukon XL,2024,96410.0,"6.2L V-8 gasoline direct injection, variable v...",8.0,Gasoline,0.0,Automatic,SUV,4.0,Four-wheel Drive


In [6]:
# Count the missing values in every column.
df.isna().sum()

make              0
model             0
year              0
price            23
engine            2
cylinders       105
fuel              7
mileage          34
transmission      2
body              3
doors             7
drivetrain        0
dtype: int64

In [7]:
# (rows, columns) of the frame before any rows are removed.
df.shape

(1002, 12)

In [8]:
# Number of distinct fuel values, counting NaN as its own value.
df['fuel'].nunique(dropna=False)

8

In [9]:
# List the distinct fuel values (NaN included) in order of first appearance.
df['fuel'].unique()

array(['Gasoline', 'Diesel', 'Hybrid', 'Electric', 'E85 Flex Fuel',
       'PHEV Hybrid Fuel', nan, 'Diesel (B20 capable)'], dtype=object)

In [10]:
# Rows without a price cannot be used as training targets, so remove them.
df = df.dropna(axis="index", how="any", subset=["price"])

In [11]:
numerical_cols = ["cylinders", "mileage", "doors"]
# Fill the gaps in each numeric column with that column's own median.
# NOTE(review): the medians are taken over every remaining row, i.e. before the
# train/test split performed later in the notebook.
column_medians = df[numerical_cols].median()
df = df.fillna(column_medians.to_dict())

In [12]:
categorical_cols = ["engine", "fuel", "transmission", "body", "drivetrain"]
# Replace missing categories with an explicit "Unknown" label instead of dropping the rows.
df = df.fillna({column: "Unknown" for column in categorical_cols})

In [13]:
# Checking for outliers:

# A price is flagged when it is exactly zero or above the 99th percentile;
# a mileage is flagged when it is above its own 99th percentile.
price_cap = df["price"].quantile(0.99)
mileage_cap = df["mileage"].quantile(0.99)

is_price_outlier = df["price"].eq(0) | df["price"].gt(price_cap)
is_mileage_outlier = df["mileage"].gt(mileage_cap)

price_outliers = df.loc[is_price_outlier]
mileage_outliers = df.loc[is_mileage_outlier]


In [14]:
# Removing extreme outliers

# Drop every row flagged as a price or mileage outlier in one pass.
flagged_index = price_outliers.index.union(mileage_outliers.index)
df = df.loc[~df.index.isin(flagged_index)]

In [15]:
# Summarise dtypes and non-null counts to verify that no missing values remain.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 958 entries, 0 to 1001
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   make          958 non-null    object 
 1   model         958 non-null    object 
 2   year          958 non-null    int64  
 3   price         958 non-null    float64
 4   engine        958 non-null    object 
 5   cylinders     958 non-null    float64
 6   fuel          958 non-null    object 
 7   mileage       958 non-null    float64
 8   transmission  958 non-null    object 
 9   body          958 non-null    object 
 10  doors         958 non-null    float64
 11  drivetrain    958 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 97.3+ KB


#### Data cleaning is complete up to this point.

In [16]:
# Work on an independent deep copy so the cleaned frame `df` stays untouched by encoding.
df1 = df.copy(deep=True)

In [17]:
# Preview the first three rows of the working copy.
df1.head(3)

Unnamed: 0,make,model,year,price,engine,cylinders,fuel,mileage,transmission,body,doors,drivetrain
0,Jeep,Wagoneer,2024,74600.0,24V GDI DOHC Twin Turbo,6.0,Gasoline,10.0,8-Speed Automatic,SUV,4.0,Four-wheel Drive
1,Jeep,Grand Cherokee,2024,50170.0,OHV,6.0,Gasoline,1.0,8-Speed Automatic,SUV,4.0,Four-wheel Drive
2,GMC,Yukon XL,2024,96410.0,"6.2L V-8 gasoline direct injection, variable v...",8.0,Gasoline,0.0,Automatic,SUV,4.0,Four-wheel Drive


In [18]:
# Report how many distinct values each categorical feature takes.
features_count = ['make', 'model', 'engine', 'fuel', 'transmission', 'body', 'drivetrain']
distinct_counts = {name: df1[name].nunique(dropna=False) for name in features_count}
for name, n_distinct in distinct_counts.items():
    print(f"Total length of {name}: {n_distinct}")

Total length of make: 28
Total length of model: 148
Total length of engine: 98
Total length of fuel: 8
Total length of transmission: 38
Total length of body: 9
Total length of drivetrain: 4


In [19]:
from sklearn.preprocessing import LabelEncoder

In [20]:
# Text-valued columns that must be converted to integer codes before modelling.
categorical_cols = [
    "make",
    "model",
    "engine",
    "fuel",
    "transmission",
    "body",
    "drivetrain",
]

In [21]:
# Encode every categorical column as integer codes and keep the fitted encoder
# for each column so codes can later be mapped back to the original labels.
#
# The encoders are fit on the cleaned frame `df` (original labels) rather than
# on `df1` itself. `df1` is overwritten in place here, so fitting on `df1`
# would make a second run of this cell refit on already-encoded integers and
# silently lose the label -> code mapping. Reading from `df` keeps the cell
# idempotent; `df1` was created as a copy of `df`, so both frames have the
# same rows in the same order and the first-run result is unchanged.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df1[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store encoders for future inverse transformation if needed

In [22]:
# Inspect the first rows after encoding; the categorical columns now hold integer codes.
df1.head()

Unnamed: 0,make,model,year,price,engine,cylinders,fuel,mileage,transmission,body,doors,drivetrain
0,15,135,2024,74600.0,23,6.0,4,10.0,19,6,4.0,1
1,15,54,2024,50170.0,65,6.0,4,1.0,19,6,4.0,1
2,9,144,2024,96410.0,47,8.0,4,0.0,31,6,4.0,1
3,7,28,2023,46835.0,13,8.0,4,32.0,19,6,4.0,0
4,23,2,2024,81663.0,19,6.0,0,10.0,9,5,4.0,1


In [23]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [24]:
# Separate the regression target from the predictor columns.
target_column = "price"
X = df1.drop(columns=[target_column])  # every column except the target
y = df1[target_column]  # the value the model learns to predict

In [25]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [26]:
# Candidate hyper-parameter values from which the search draws its combinations.
param_grid = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

In [27]:
# Apply RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
# Randomized search: 20 sampled parameter combinations, each scored with
# 5-fold cross-validation, using all available cores.
search_settings = {
    "param_distributions": param_grid,
    "n_iter": 20,
    "cv": 5,
    "verbose": 2,
    "n_jobs": -1,
    "random_state": 42,
}
rf_random = RandomizedSearchCV(RandomForestRegressor(random_state=42), **search_settings)

In [28]:
# Run the hyper-parameter search on the training split only; the test split stays unseen.
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [29]:
# Parameter combination with the best cross-validated score in the search.
best_params = rf_random.best_params_
print(f"Best Parameters: {best_params}")

Best Parameters: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 20, 'bootstrap': True}


In [30]:
# Final model: reuse the estimator the search has already trained.
# RandomizedSearchCV refits the best parameter combination on the whole
# training split by default (refit=True), starting from the same
# RandomForestRegressor(random_state=42) template. Fitting a second forest
# with **best_params would only repeat that work and give an identical model.
rf_model = rf_random.best_estimator_
rf_model





In [31]:
# Score the tuned forest on the held-out test split.
y_pred = rf_model.predict(X_test)
r2 = r2_score(y_test, y_pred)  # coefficient of determination
mse = mean_squared_error(y_test, y_pred)  # in squared price units

In [32]:
# Test-set mean squared error computed in the previous cell.
mse

49736103.93667293

In [33]:
# Test-set R^2 score computed in the evaluation cell.
r2

0.801687181196471

In [36]:
def get_vehicle_price(vehicle_data=None):
    """Predict a vehicle price with the trained ``rf_model`` and format it.

    Parameters
    ----------
    vehicle_data : array-like, optional
        One row of already-encoded feature values in the training column
        order: make, model, year, engine, cylinders, fuel, mileage,
        transmission, body, doors, drivetrain. A flat sequence is treated as
        a single row. If several rows are given, only the first prediction is
        reported. When omitted, a predefined example vehicle is used, so
        calling the function with no arguments behaves as before.

    Returns
    -------
    str
        The estimated price formatted as a dollar amount.

    Raises
    ------
    ValueError
        If the number of supplied feature values does not match the number
        of features the model was fitted on.
    """
    if vehicle_data is None:
        # Predefined vehicle features
        vehicle_data = [[15, 54, 2024, 65, 6.0, 4, 1.0, 19, 6, 4.0, 1]]
    features = np.atleast_2d(np.asarray(vehicle_data, dtype=float))

    # The model was fitted on a DataFrame, so hand it named columns again.
    # Predicting from a bare ndarray makes scikit-learn warn about missing
    # feature names and hides column-order mistakes.
    feature_names = getattr(rf_model, "feature_names_in_", None)
    if feature_names is not None:
        if features.shape[1] != len(feature_names):
            raise ValueError(
                f"Expected {len(feature_names)} feature values, got {features.shape[1]}"
            )
        features = pd.DataFrame(features, columns=feature_names)

    # Predict price using the trained model
    predicted_price = rf_model.predict(features)[0]
    print(f"Model type: {type(rf_model)}")

    return f"🔹 Estimated Vehicle Price: ${predicted_price:,.2f}"

In [37]:
# Run the predictor on its built-in example vehicle and show the formatted estimate.
print(get_vehicle_price())

Model type: <class 'sklearn.ensemble._forest.RandomForestRegressor'>
🔹 Estimated Vehicle Price: $49,452.72




In [None]:
import joblib

# Save the trained model
joblib.dump(rf_model, "vehicle_price_model.pkl")

# Also save the fitted label encoders. The model was trained on integer codes,
# so anyone loading the model needs these encoders to turn raw categories
# (make, model, engine, ...) into the codes it expects.
joblib.dump(label_encoders, "vehicle_price_label_encoders.pkl")