# **IMPORTS AND PACKAGES**

In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, f1_score, r2_score, mean_squared_error
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import joblib, json


# **PATHS AND SETTINGS**

In [2]:
DATA_PATH = "dataset/cardekho_dataset.csv"

CLASS_PKL_PATH =  "models/best_classifier.pkl"
CLASS_JSON_PATH = "models/best_classifier_params.json"

REG_PKL_PATH = "models/best_regressor.pkl"
REG_JSON_PATH = "models/best_regressor_params.json"

cat_cols = ["brand", "model", "seller_type", "fuel_type", "transmission_type"]
num_cols = ["vehicle_age", "km_driven", "mileage", "engine", "max_power", "seats"]

tree_param_grid = {
    "clf__criterion": ["gini", "entropy"],
    "clf__max_depth": [3, 5, 8, None],
    "clf__min_samples_split": [2, 10],
    "clf__min_samples_leaf": [1],
    "clf__max_features": [None]
}
knn_param_grid = {
    "clf__n_neighbors": [3, 5, 7, 9, 11, 15, 20, 25],
    "clf__weights": ["uniform", "distance"],
    "clf__metric": ["minkowski"],
    "clf__p": [1, 2]
}
logReg_param_grid = [
    {
        "clf__penalty": ["l2"],
        "clf__solver": ["lbfgs", "newton-cg", "sag", "saga", "liblinear"],
        "clf__C": [0.01, 0.1, 1, 10],
    },
    {
        "clf__penalty": ["l1"],
        "clf__solver": ["saga", "liblinear"],
        "clf__C": [0.01, 0.1, 1, 10],
    },
    {
        "clf__penalty": ["elasticnet"],
        "clf__solver": ["saga"],
        "clf__l1_ratio": [0.0, 0.5, 1.0],
        "clf__C": [0.01, 0.1, 1, 10],
    }
]
ridge_param_grid = {
    "reg__alpha": [0.01, 0.1, 1, 5, 10, 50, 100]
}
lasso_param_grid = {
    "reg__alpha": [0.0001, 0.001, 0.01, 0.1, 1]
}
knn_reg_param_grid = {
    "reg__n_neighbors": [3, 5, 7, 9, 11, 15, 20],
    "reg__weights": ["uniform", "distance"],
    "reg__metric": ["minkowski"],
    "reg__p": [1, 2]
}
tree_reg_param_grid = {
    "reg__criterion": ["squared_error", "friedman_mse"],
    "reg__max_depth": [3, 5, 8, None],
    "reg__min_samples_split": [2, 10],
    "reg__min_samples_leaf": [1, 5],
    "reg__max_features": [None, "sqrt"]
}


# **PREPROCESSING DATA**

In [3]:
Data = pd.read_csv(DATA_PATH)
Data.drop(columns=["car_name", "Unnamed: 0"], inplace=True)
Data["km_driven"] = Data["km_driven"].replace(",", "").astype(int)

Data = Data.drop_duplicates()

# Lowercase
for col in Data.select_dtypes(include="object").columns:
    Data[col] = Data[col].str.lower().str.strip()

# Normalize model
Data["model"] = Data["model"].str.replace(" ", "_")

# Outliers
Data = Data[Data["km_driven"] < 500000]
Data = Data[(Data["mileage"] > 8) & (Data["mileage"] < 40)]
Data = Data[(Data["engine"] > 500) & (Data["engine"] < 6000)]
Data = Data[(Data["max_power"] > 20) & (Data["max_power"] < 400)]
Data = Data[(Data["selling_price"] > 20000) & (Data["selling_price"] < 5000000)]
Data = Data.reset_index(drop=True)

# Merge rare categories
Data["fuel_type"] = Data["fuel_type"].replace(["lpg", "electric"], "other")

brand_counts = Data["brand"].value_counts()
rare_brands = brand_counts[brand_counts < 100].index
Data["brand"] = Data["brand"].replace(rare_brands, "other_brand")

model_counts = Data["model"].value_counts()
rare_models = model_counts[model_counts < 50].index
Data["model"] = Data["model"].replace(rare_models, "other_model")

# Update brand-model map
brand_model_map = Data.groupby("brand")["model"].unique().apply(list).to_dict()
with open("models/brand_model_map.json", "w") as f:
    json.dump(brand_model_map, f, indent=4)

Data["resale_score"] = Data["selling_price"] / (Data["vehicle_age"] + 1)
low_th = Data["resale_score"].quantile(0.33)
high_th = Data["resale_score"].quantile(0.66)
def resale_class(x):
    if x < low_th:
        return "low"
    elif x < high_th:
        return "medium"
    else:
        return "high"
Data["resale_value_class"] = Data["resale_score"].apply(resale_class)
print("Resale value class counts:\n", Data["resale_value_class"].value_counts())


# preprocess pipeline
preprocess = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), cat_cols)
])
preprocess_tree = ColumnTransformer([
    ("num", "passthrough", num_cols),
    ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), cat_cols)
])

lin_pipe = Pipeline([
    ("prep", preprocess),
    ("reg", LinearRegression())
])
ridge_pipe = Pipeline([
    ("prep", preprocess),
    ("reg", Ridge())
])
lasso_pipe = Pipeline([
    ("prep", preprocess),
    ("reg", Lasso(max_iter=5000))
])
knn_reg_pipe = Pipeline([
    ("prep", preprocess),
    ("reg", KNeighborsRegressor())
])
tree_reg_pipe = Pipeline([
    ("prep", preprocess_tree),
    ("reg", DecisionTreeRegressor(random_state=42))
])
tree_pipe = Pipeline([
    ("prep", preprocess_tree),
    ("clf", DecisionTreeClassifier(random_state=42))
])
knn_pipe = Pipeline([
    ("prep", preprocess),
    ("clf", KNeighborsClassifier())
])
logReg_pipe = Pipeline([
    ("prep", preprocess),
    ("clf", LogisticRegression(max_iter=5000))
])

# Features, target variables
X_reg = Data.drop(columns=["selling_price", "resale_value_class", "resale_score"])
y_reg = Data["selling_price"]

X_clf = Data.drop(columns=["selling_price", "resale_value_class", "resale_score"])
y_clf = Data["resale_value_class"]

print("Final cleaned dataset shape:", Data.shape)


Resale value class counts:
 resale_value_class
high      5147
low       4981
medium    4973
Name: count, dtype: int64
Final cleaned dataset shape: (15101, 14)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X_clf, y_clf, test_size=0.25, stratify=y_clf, random_state=42)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

tree_search = GridSearchCV(tree_pipe, tree_param_grid, cv=cv, scoring="accuracy", n_jobs=-1, error_score=0)
tree_search.fit(X_train, y_train)
pred_tree = tree_search.predict(X_test)
print("Best Decision Tree Params:", tree_search.best_params_)
print("Decision Tree Accuracy:", accuracy_score(y_test, pred_tree))
print("Decision Tree F1 (macro):", f1_score(y_test, pred_tree, average='macro'))

knn_search = GridSearchCV(knn_pipe, knn_param_grid, cv=cv, scoring="accuracy", n_jobs=-1, error_score=0)
knn_search.fit(X_train, y_train)
pred_knn = knn_search.predict(X_test)
print("Best KNN Params:", knn_search.best_params_)
print("KNN Accuracy:", accuracy_score(y_test, pred_knn))
print("KNN F1 (macro):", f1_score(y_test, pred_knn, average='macro'))

logRegsearch = GridSearchCV(logReg_pipe, logReg_param_grid, cv=cv, scoring='accuracy', n_jobs=-1, error_score=0)
logRegsearch.fit(X_train, y_train)
pred_log = logRegsearch.predict(X_test)
print("Accuracy:", accuracy_score(y_test, pred_log))
print("F1 (macro):", f1_score(y_test, pred_log, average='macro'))

results_clf = {
    "Decision Tree": {
        "model": tree_search.best_estimator_,
        "params": tree_search.best_params_,
        "score": accuracy_score(y_test, pred_tree)
    },
    "KNN": {
        "model": knn_search.best_estimator_,
        "params": knn_search.best_params_,
        "score": accuracy_score(y_test, pred_knn)
    },
    "Logistic Regression": {
        "model": logRegsearch.best_estimator_,
        "params": logRegsearch.best_params_,
        "score": accuracy_score(y_test, pred_log)
    }
}
best_clf_name = max(results_clf, key=lambda m: results_clf[m]["score"])
best_classifier = results_clf[best_clf_name]["model"]
best_classifier_params = results_clf[best_clf_name]["params"]
best_classifier_score = results_clf[best_clf_name]["score"]


joblib.dump(best_classifier, CLASS_PKL_PATH)
with open(CLASS_JSON_PATH, "w") as f:
    json.dump(best_classifier_params, f, indent=4)


Best Decision Tree Params: {'clf__criterion': 'entropy', 'clf__max_depth': 8, 'clf__max_features': None, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 10}
Decision Tree Accuracy: 0.8919491525423728
Decision Tree F1 (macro): 0.8922862717397053
Best KNN Params: {'clf__metric': 'minkowski', 'clf__n_neighbors': 5, 'clf__p': 1, 'clf__weights': 'distance'}
KNN Accuracy: 0.878177966101695
KNN F1 (macro): 0.87791337086986
Accuracy: 0.909957627118644
F1 (macro): 0.9099960904703499


In [5]:
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.25, random_state=42)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

lin_pipe.fit(X_train_reg, y_train_reg)
pred_lin = lin_pipe.predict(X_test_reg)
print("\nLinear Regression")
print("R2:", r2_score(y_test_reg, pred_lin))
print("RMSE:", mean_squared_error(y_test_reg, pred_lin))

ridge_search = GridSearchCV(ridge_pipe, ridge_param_grid, cv=kf, scoring="r2", n_jobs=-1, error_score=0)
ridge_search.fit(X_train_reg, y_train_reg)
pred_ridge = ridge_search.predict(X_test_reg)
print("\nBest Ridge Params:", ridge_search.best_params_)
print("Ridge R2:", r2_score(y_test_reg, pred_ridge))
print("Ridge RMSE:", mean_squared_error(y_test_reg, pred_ridge))

lasso_search = GridSearchCV(lasso_pipe, lasso_param_grid, cv=kf, scoring="r2", n_jobs=-1, error_score=0)
lasso_search.fit(X_train_reg, y_train_reg)
pred_lasso = lasso_search.predict(X_test_reg)
print("\nBest Lasso Params:", lasso_search.best_params_)
print("Lasso R2:", r2_score(y_test_reg, pred_lasso))
print("Lasso RMSE:", mean_squared_error(y_test_reg, pred_lasso))

knn_reg_search = GridSearchCV(knn_reg_pipe, knn_reg_param_grid, cv=kf, scoring="r2", n_jobs=-1, error_score=0)
knn_reg_search.fit(X_train_reg, y_train_reg)
pred_knn_reg = knn_reg_search.predict(X_test_reg)
print("\nBest KNN Regressor Params:", knn_reg_search.best_params_)
print("KNN R2:", r2_score(y_test_reg, pred_knn_reg))
print("KNN RMSE:", mean_squared_error(y_test_reg, pred_knn_reg))

tree_reg_search = GridSearchCV(tree_reg_pipe, tree_reg_param_grid, cv=kf, scoring="r2", n_jobs=-1, error_score=0)
tree_reg_search.fit(X_train_reg, y_train_reg)
pred_tree_reg = tree_reg_search.predict(X_test_reg)
print("\nBest Decision Tree Params:", tree_reg_search.best_params_)
print("Decision Tree R2:", r2_score(y_test_reg, pred_tree_reg))
print("Decision Tree RMSE:", mean_squared_error(y_test_reg, pred_tree_reg))

results_reg = {
    "Linear": {
        "model": lin_pipe,
        "params": {"none": True},
        "score": r2_score(y_test_reg, pred_lin)
    },
    "Ridge": {
        "model": ridge_search.best_estimator_,
        "params": ridge_search.best_params_,
        "score": r2_score(y_test_reg, pred_ridge)
    },
    "Lasso": {
        "model": lasso_search.best_estimator_,
        "params": lasso_search.best_params_,
        "score": r2_score(y_test_reg, pred_lasso)
    },
    "KNN Regressor": {
        "model": knn_reg_search.best_estimator_,
        "params": knn_reg_search.best_params_,
        "score": r2_score(y_test_reg, pred_knn_reg)
    },
    "Decision Tree Regressor": {
        "model": tree_reg_search.best_estimator_,
        "params": tree_reg_search.best_params_,
        "score": r2_score(y_test_reg, pred_tree_reg)
    }
}

best_reg_name = max(results_reg, key=lambda m: results_reg[m]["score"])
best_regressor = results_reg[best_reg_name]["model"]
best_regressor_params = results_reg[best_reg_name]["params"]
best_regressor_score = results_reg[best_reg_name]["score"]

# SAVE BEST REGRESSOR
joblib.dump(best_regressor,REG_PKL_PATH)
with open(REG_JSON_PATH, "w") as f:
    json.dump(best_regressor_params, f, indent=4)


Linear Regression
R2: 0.8065196174507278
RMSE: 71602098733.33218

Best Ridge Params: {'reg__alpha': 0.1}
Ridge R2: 0.8063413273509921
Ridge RMSE: 71668079300.23126

Best Lasso Params: {'reg__alpha': 1}
Lasso R2: 0.8064735527785436
Lasso RMSE: 71619146080.26414

Best KNN Regressor Params: {'reg__metric': 'minkowski', 'reg__n_neighbors': 7, 'reg__p': 2, 'reg__weights': 'distance'}
KNN R2: 0.9308325746469874
KNN RMSE: 25597079941.66056

Best Decision Tree Params: {'reg__criterion': 'squared_error', 'reg__max_depth': None, 'reg__max_features': None, 'reg__min_samples_leaf': 1, 'reg__min_samples_split': 10}
Decision Tree R2: 0.9157551421607943
Decision Tree RMSE: 31176848780.73973
