In [1]:
!pip install --upgrade catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [2]:
# Imports required for the ML workflow
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as n
import kagglehub
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only; prompts for authorization).
# NOTE(review): the recorded run of this cell stopped with
# "MessageError: credential propagation was unsuccessful", and none of the
# cells visible here read from /content/drive — confirm the mount is still needed.
drive.mount('/content/drive')

# Download the vehicle sales dataset through kagglehub; `path` is the value
# returned by dataset_download and is printed below.
path = kagglehub.dataset_download("syedanwarafridi/vehicle-sales-data")

print("Path to dataset files:", path)

MessageError: Error: credential propagation was unsuccessful

In [None]:
# Build the CSV location from the directory kagglehub reported instead of a
# hard-coded Kaggle-only absolute path, so the load works wherever the
# dataset was actually downloaded.
file_path = f"{path}/car_prices.csv"
car_prices = pd.read_csv(file_path)

# Preview the first ten rows.
car_prices.head(10)

In [None]:
# Print column names, non-null counts and dtypes of the loaded frame.
car_prices.info()

In [None]:
# Impute missing values on a separate frame (`data`), read fresh from the
# downloaded dataset directory so `car_prices` is left untouched.
data = pd.read_csv(f"{path}/car_prices.csv")
data.info()

# Text columns are filled with their most frequent value (mode);
# numeric columns are filled with their mean.
mode_filled_columns = ['make', 'model', 'body', 'trim', 'color', 'interior', 'transmission']
mean_filled_columns = ['condition', 'odometer', 'mmr']

fill_values = {column: data[column].mode()[0] for column in mode_filled_columns}
fill_values.update({column: data[column].mean() for column in mean_filled_columns})

# A single dict-based fillna replaces the per-column .loc assignments.
data = data.fillna(value=fill_values)

# The target is never imputed: rows without a selling price are dropped.
data = data.dropna(subset=['sellingprice'], axis='rows')

### Viewing Data

In [None]:
# Summary statistics from describe(), rounded to whole numbers and transposed
# so each described column becomes a row.
car_prices.describe().round().T  #transposed data

In [None]:
# (rows, columns) of the frame.
car_prices.shape

In [None]:
# Number of missing values in each column.
car_prices.isna().sum()

In [None]:
# Number of non-null entries in each column.
car_prices.count()

### Data cleaning

In [None]:
# Remove, in place, every row of car_prices that has at least one missing value.
# NOTE(review): this discards rows instead of using the imputed `data` frame
# built earlier — confirm which of the two is meant to feed the models.
car_prices.dropna(axis=0, how="any", inplace=True)

In [None]:
# Re-count missing values per column; every count should now be zero.
car_prices.isna().sum()

### Visualization

In [None]:
# Scatter plots of selling price against odometer reading and against MMR.
# Each entry: (x column, point colour, x-axis label, plot title).
scatter_specs = [
    ("odometer", "#358D0C", "Odometer (km)", "Selling Price vs. Odometer Reading"),
    ("mmr", "#0B8FB9", "MMR", "Selling Price vs. MMR"),
]

for x_column, point_color, x_label, plot_title in scatter_specs:
    # sns.scatterplot returns the Axes it drew on; label it directly.
    ax = sns.scatterplot(data=car_prices, x=x_column, y="sellingprice", color=point_color, s=20)
    ax.set_xlabel(x_label)
    ax.set_ylabel("Selling Price")
    ax.set_title(plot_title)
    ax.grid(True)

    # Showing inside the loop gives each plot its own figure.
    plt.show()

In [None]:
# Line plot of selling price against the car's condition rating.
condition_grid = sns.relplot(
    data=car_prices,
    kind="line",
    x="condition",
    y="sellingprice",
    color="#0000FF",
    alpha=0.7,
    linewidth=2,
    dashes=False,
    legend="full",
    marker="o",
    markersize=8,
    markerfacecolor="#FF0000",
)

# relplot returns a FacetGrid; label its single Axes through the grid.
condition_grid.set_axis_labels("Condition", "Selling Price")
condition_grid.ax.set_title("Selling Price vs. Condition")

plt.show()

In [None]:
# One selling price per model year: groupby(...).first() takes the first
# non-null sellingprice found in each year group, not an aggregate.
# NOTE(review): if a typical price per year is intended, an aggregate such as
# mean() may be what is wanted here — confirm.
year_df = car_prices.groupby(by="year", as_index=False)["sellingprice"].first()
# sns.barplot returns a matplotlib Axes, so despite its name `fig` holds an Axes.
fig = sns.barplot(year_df, x="year", y="sellingprice", palette="Blues")
plt.xticks(rotation=90)  # rotate the year labels so they do not overlap
plt.show()

In [None]:
# Collapse rows that share the same descriptive attributes, keeping the first
# selling price found for each combination.
new_df = car_prices.groupby(
    by=["year", "make", "transmission", "condition", "color", "odometer", "mmr"],
    as_index=False,
)["sellingprice"].first()

# Show the two most expensive entries. display() is required here: a bare
# expression in the middle of a cell is evaluated and then silently discarded.
display(new_df.sort_values(by="sellingprice", ascending=False).head(2))

# Mean selling price per model year, rounded to two decimals.
yearly_mean_price = (
    new_df.groupby("year", as_index=False)["sellingprice"].mean()
).round(2)
yearly_mean_price

In [None]:
# Bar chart of the mean selling price for each model year.
mean_price_ax = sns.barplot(
    data=yearly_mean_price,
    x="year",
    y="sellingprice",
    palette="rocket_r",
)
mean_price_ax.set_title("Mean Selling Price by Year")
mean_price_ax.set_xlabel("Year")
mean_price_ax.set_ylabel("Mean Selling Price")

# Rotate the year labels so they stay readable.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Exclude rows whose colour is the "—" placeholder before plotting.
has_named_color = car_prices["color"].ne("—")
filtered_df = car_prices.loc[has_named_color]

# NOTE(review): `palette` is assigned here but the plot below passes
# 'rocket_r' directly, so this value is not used by this cell.
palette = "Set3"

# Bar chart of selling price for each colour category.
plt.figure(figsize=(9, 5))
color_ax = sns.barplot(
    data=filtered_df,
    x="color",
    y="sellingprice",
    palette='rocket_r',
)
color_ax.set_xlabel("Color Category")
color_ax.set_ylabel("Selling Price")
color_ax.set_title("Selling Price by Color")

# Colour names are long; rotate them to avoid overlap.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Decision tree regressor on six hand-picked features.
y = car_prices['sellingprice']
feature_columns = ['year', 'make', 'model', 'odometer', 'condition', 'body']
X = car_prices[feature_columns]

numerical_features = ['year', 'odometer', 'condition']
categorical_features = ['make', 'model', 'body']

# Numeric columns pass through unchanged; categorical columns are one-hot
# encoded, and categories unseen during fit are ignored at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

tree_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42))
])

# Hold out 20% of the rows for evaluation. The split is done once; the
# original cell repeated this identical call twice.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tree_pipeline.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = tree_pipeline.predict(X_test)
print(y_pred)
print(f'Decision Tree MSE: {mean_squared_error(y_test, y_pred)}')
print(f'Decision Tree R^2 score: {r2_score(y_test, y_pred)}')

In [None]:
# Predicted vs. actual prices for the decision tree; points lying on the
# dashed diagonal are perfect predictions.
price_bounds = [y_test.min(), y_test.max()]

plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot(price_bounds, price_bounds, 'k--', lw=2)  # reference diagonal
plt.title("Actual vs. Predicted Selling Prices (Decision Tree Regressor)")
plt.xlabel("Actual Selling Price")
plt.ylabel("Predicted Selling Price")
plt.grid(True)
plt.show()

In [None]:
# Linear regression on six hand-picked features.
feature_columns = ['year', 'make', 'model', 'odometer', 'condition', 'body']
numerical_features = ['year', 'odometer', 'condition']
categorical_features = ['make', 'model', 'body']

X = car_prices[feature_columns]
y = car_prices['sellingprice']

# Hold out 10% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=100000
)

# Numeric columns pass through unchanged; categorical columns are one-hot
# encoded, and categories unseen during fit are ignored at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])
model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)
linreg_mse = mean_squared_error(y_test, y_pred)
linreg_r2 = r2_score(y_test, y_pred)
print(y_pred)
print(f'Linear Regression MSE: {linreg_mse}')
print(f'Linear Regression R^2 score: {linreg_r2}')

In [None]:
# Predicted vs. actual prices for the linear regression; points lying on the
# dashed diagonal are perfect predictions.
actual_bounds = [y_test.min(), y_test.max()]

plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.plot(actual_bounds, actual_bounds, 'k--', lw=2)  # reference diagonal
plt.title("Actual vs. Predicted Selling Prices (Linear Regression)")
plt.xlabel("Actual Selling Price")
plt.ylabel("Predicted Selling Price")
plt.grid(True)
plt.show()

In [None]:
# Residuals (actual minus predicted) of the most recently fitted model.
# Run in order, y_test and y_pred here come from the Linear Regression cell,
# so the title names that model rather than the decision tree.
residuals = y_test - y_pred

plt.figure(figsize=(10, 6))
plt.scatter(y_pred, residuals, alpha=0.5)
plt.xlabel("Predicted Selling Price")
plt.ylabel("Residuals")
plt.title("Residual Plot (Linear Regression)")
# Dashed zero line: points above it were under-predicted, points below over-predicted.
plt.hlines(0, y_pred.min(), y_pred.max(), colors='k', linestyles='dashed')
plt.grid(True)
plt.show()

In [None]:
# Random forest on every column except the target, the VIN and the sale date.
y = car_prices['sellingprice']
X = car_prices.drop(columns=['sellingprice', 'vin', 'saledate'])

# Split the remaining columns by dtype into numeric and categorical groups.
numerical_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Numeric columns are standardized; categorical columns are one-hot encoded,
# and categories unseen during fit are ignored at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)
forest = RandomForestRegressor(n_estimators=50, max_depth=10, random_state=42)
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', forest),
])
model_pipeline.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model_pipeline.predict(X_test)
forest_mse = mean_squared_error(y_test, y_pred)
forest_r2 = r2_score(y_test, y_pred)
print(y_pred)
print(f'Random Forest MSE: {forest_mse}')
print(f'Random Forest R^2 score: {forest_r2}')

In [None]:
# Residuals (actual minus predicted) of the random forest predictions.
residuals_rf = y_test - y_pred
predicted_low, predicted_high = y_pred.min(), y_pred.max()

plt.figure(figsize=(10, 6))
plt.scatter(y_pred, residuals_rf, alpha=0.5)
# Dashed zero line spanning the range of predicted prices.
plt.hlines(0, predicted_low, predicted_high, colors='k', linestyles='dashed')
plt.title("Residual Plot (Random Forest Regressor)")
plt.xlabel("Predicted Selling Price")
plt.ylabel("Residuals")
plt.grid(True)
plt.show()