In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [15]:
# Load the dataset
# Colab-only step: `google.colab` is importable only inside a Google Colab
# runtime, so this cell will fail elsewhere.
# The captured output of this cell ("Saving dataset.csv to dataset.csv") shows
# the chosen file being written into the current working directory.
from google.colab import files
# `uploaded` keeps whatever files.upload() returns; the rest of the notebook
# does not use it and reads the saved file from disk instead.
uploaded = files.upload()









Saving dataset.csv to dataset.csv


In [17]:
# dataset.csv is expected in the current Colab working directory at this point
import numpy as np
import pandas as pd

# Read the CSV into a DataFrame and report its dimensions
df = pd.read_csv("dataset.csv")
row_count, column_count = df.shape
print("✅ Dataset loaded. Rows:", row_count, "| Columns:", column_count)

✅ Dataset loaded. Rows: 1002 | Columns: 17


In [18]:
# STEP 2: Clean and preprocess the data
#
# This cell is written to be safely re-runnable: every step either builds a
# new frame or is guarded, so executing it a second time on the already
# cleaned `df` is a no-op instead of a KeyError.
#
# NOTE(review): the medians/modes below are computed on the full dataset,
# i.e. before the train/test split performed in STEP 3, so held-out rows
# influence the imputation values.

# Year against which vehicle age is measured. Pinned (rather than read from
# the clock) so that re-running the notebook later yields the same features.
REFERENCE_YEAR = 2025

# Columns that are not used as model features. 'year' is superseded by the
# derived 'vehicle_age' column.
UNUSED_COLUMNS = ['name', 'description', 'year', 'engine']

# Drop rows with missing price (the target is never imputed).
# .copy() gives an independent frame so the column assignments below do not
# write into a view of the raw data.
df = df.dropna(subset=['price']).copy()

# Create vehicle age feature. On a re-run 'year' is already gone and
# 'vehicle_age' already exists, so the step is skipped; if neither column is
# present the input really is malformed and we fail loudly.
if 'year' in df.columns:
    df['vehicle_age'] = REFERENCE_YEAR - df['year']
elif 'vehicle_age' not in df.columns:
    raise KeyError("expected a 'year' column to derive 'vehicle_age'")

# Drop less useful columns; errors='ignore' tolerates columns that were
# already removed by a previous run of this cell.
df = df.drop(columns=UNUSED_COLUMNS, errors='ignore')

# Fill numeric columns with median
num_cols = df.select_dtypes(include=[np.number]).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Fill categorical columns with mode.
# mode() returns an empty frame when there are no object columns (or no
# rows), in which case .iloc[0] would raise IndexError — so guard it.
cat_cols = df.select_dtypes(include=['object']).columns
cat_modes = df[cat_cols].mode()
if not cat_modes.empty:
    df[cat_cols] = df[cat_cols].fillna(cat_modes.iloc[0])

print("✅ Data cleaned. Now ready for modeling.")


✅ Data cleaned. Now ready for modeling.


In [19]:
# STEP 3: Prepare for ML modeling

from sklearn.model_selection import train_test_split

# Separate the target column (price) from the predictor columns
y = df["price"]
X = df.drop(columns=["price"])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [20]:
# STEP 4: Build and train the model
#
# Imputation is part of the pipeline itself, so the fitted (and later
# pickled) model can score rows with missing values on its own, using
# statistics learned from the training split only. On data that has already
# been imputed the imputers are a no-op, so they do not change the fit.

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor

# Identify feature types
numerical_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Numeric columns: median-impute, then standardize.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill with the most frequent value, then one-hot encode.
# handle_unknown="ignore" encodes categories unseen during fit as all zeros
# instead of raising at predict time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numerical_features),
    ("cat", categorical_transformer, categorical_features)
])

# Modeling pipeline (fixed random_state keeps the forest reproducible)
model = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train the model
model.fit(X_train, y_train)
print("✅ Model training complete.")


✅ Model training complete.


In [21]:
# STEP 5: Evaluate the model

from sklearn.metrics import mean_squared_error, r2_score

# Score the held-out rows with the fitted pipeline
y_pred = model.predict(X_test)

# R² plus RMSE (the square root of the mean squared error)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"📊 RMSE: ${rmse:.2f}")
print(f"📈 R² Score: {r2:.2f}")


📊 RMSE: $7781.24
📈 R² Score: 0.80


In [22]:
# STEP 6: Persist the trained model
# `model` is the whole fitted Pipeline (preprocessor + regressor), so the
# saved file carries the preprocessing together with the random forest.
import joblib
# Relative path: the file is written to the current working directory.
joblib.dump(model, "vehicle_price_model.pkl")
print("✅ Model saved as vehicle_price_model.pkl")


✅ Model saved as vehicle_price_model.pkl
