Car Price Prediction: Machine Learning Models

Cyrus Kolahi

Run proj3_data_preprocess.ipynb first to preprocess the data and create the train and test sets.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
import sklearn.utils, sklearn.preprocessing, sklearn.decomposition, sklearn.svm
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torchvision
import torchvision.transforms as transforms

from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
#from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import Dense
# Linear Regression



Load the train and test sets from the data folder.

In [3]:
# Read the pre-split feature matrices, then the matching targets.
# Paths are relative to the notebook's working directory.
X_train, X_test = pd.read_csv("data/X_train.csv"), pd.read_csv("data/X_test.csv")
y_train, y_test = pd.read_csv("data/y_train.csv"), pd.read_csv("data/y_test.csv")


Modeling

In [4]:
# Linear Regression: fit on the training split, then score on the held-out test split.
LinReg = LinearRegression().fit(X_train, y_train)
lr_pred = LinReg.predict(X_test)
print(
    "Linear Regression Results:",
    f"MSE: {mean_squared_error(y_test, lr_pred):.2f}",
    f"R2 Score: {r2_score(y_test, lr_pred):.2f}\n",
    sep="\n",
)

Linear Regression Results:
MSE: 4171.73
R2 Score: 1.00



In [None]:
# Kernel Ridge Regression: one model per kernel, each scored on the test split.
kernels = ['linear', 'rbf', 'poly']
for kernel in kernels:
    kr = KernelRidge(kernel=kernel).fit(X_train, y_train)
    kr_pred = kr.predict(X_test)
    print(
        f"Kernel Ridge Regression ({kernel} kernel) Results:",
        f"MSE: {mean_squared_error(y_test, kr_pred):.2f}",
        f"R2 Score: {r2_score(y_test, kr_pred):.2f}\n",
        sep="\n",
    )




In [None]:
# Support Vector Regression with different kernels
# y_train is loaded with pd.read_csv and is therefore a DataFrame. SVR expects a
# 1-D target, so a single-column frame is squeezed to a Series here; this avoids
# a DataConversionWarning on every fit. Frames with more than one column are
# passed through unchanged by squeeze("columns").
svr_target = y_train.squeeze("columns")
for kernel in ['linear', 'rbf', 'poly']:
    svr = SVR(kernel=kernel)
    svr.fit(X_train, svr_target)
    svr_pred = svr.predict(X_test)
    print(f"Support Vector Regression ({kernel} kernel) Results:")
    print(f"MSE: {mean_squared_error(y_test, svr_pred):.2f}")
    print(f"R2 Score: {r2_score(y_test, svr_pred):.2f}\n")


In [None]:
# Decision Tree: a single regression tree; random_state is fixed so reruns match.
dt = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
dt_pred = dt.predict(X_test)
print(
    "Decision Tree Results:",
    f"MSE: {mean_squared_error(y_test, dt_pred):.2f}",
    f"R2 Score: {r2_score(y_test, dt_pred):.2f}\n",
    sep="\n",
)

In [None]:
# Random Forest
# y_train is loaded with pd.read_csv and is therefore a DataFrame. A
# single-column frame is squeezed to a 1-D Series so the forest does not emit a
# DataConversionWarning about a column-vector y. squeeze("columns") leaves
# frames with more than one column unchanged.
rf_target = y_train.squeeze("columns")
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, rf_target)
rf_pred = rf.predict(X_test)
print("Random Forest Results:")
print(f"MSE: {mean_squared_error(y_test, rf_pred):.2f}")
print(f"R2 Score: {r2_score(y_test, rf_pred):.2f}\n")


In [None]:
# Neural Network with multiple random seeds
#
# Implemented with PyTorch (torch / nn / optim / data are imported at the top of
# the notebook). The earlier Keras version referenced Sequential and Dense,
# which are not imported, so the cell failed with a NameError.
def create_model(n_features=None):
    """Build an untrained feed-forward regressor.

    Architecture: n_features -> 64 -> 32 -> 16 -> 1 with ReLU between layers.

    Args:
        n_features: Number of input columns. When None, uses X_train.shape[1].

    Returns:
        A torch.nn.Sequential that maps a (batch, n_features) float tensor to a
        (batch, 1) tensor of predictions.
    """
    if n_features is None:
        n_features = X_train.shape[1]
    return nn.Sequential(
        nn.Linear(n_features, 64),
        nn.ReLU(),
        nn.Linear(64, 32),
        nn.ReLU(),
        nn.Linear(32, 16),
        nn.ReLU(),
        nn.Linear(16, 1),
    )


def _to_tensor(frame, column_vector=False):
    """Convert a DataFrame/array to a float32 tensor, optionally shaped (n, 1)."""
    values = np.asarray(frame, dtype=np.float32)
    if column_vector:
        values = values.reshape(-1, 1)
    # torch.tensor copies, so the tensor never aliases the DataFrame's buffer.
    return torch.tensor(values)


def _fit_model(model, features, targets, epochs=50, batch_size=32):
    """Train model in place with Adam on MSE loss using shuffled mini-batches."""
    loader = data.DataLoader(
        data.TensorDataset(features, targets),
        batch_size=batch_size,
        shuffle=True,
    )
    optimizer = optim.Adam(model.parameters())
    loss_fn = nn.MSELoss()
    model.train()
    for _ in range(epochs):
        for batch_features, batch_targets in loader:
            optimizer.zero_grad()
            loss_fn(model(batch_features), batch_targets).backward()
            optimizer.step()
    return model


random_seeds = range(42, 52)  # 10 different random seeds
nn_results = []

# Convert once; the same tensors are reused for every seed.
# Assumes all feature columns are numeric and y_train holds a single target
# column -- TODO confirm against the preprocessing notebook.
X_train_t = _to_tensor(X_train)
y_train_t = _to_tensor(y_train, column_vector=True)
X_test_t = _to_tensor(X_test)

for seed in random_seeds:
    # Seed torch as well as numpy: weight initialisation and DataLoader
    # shuffling draw from torch's RNG, not numpy's.
    np.random.seed(seed)
    torch.manual_seed(seed)
    model = _fit_model(create_model(), X_train_t, y_train_t)
    model.eval()
    with torch.no_grad():
        nn_pred = model(X_test_t).numpy()
    nn_results.append({
        'mse': mean_squared_error(y_test, nn_pred),
        'r2': r2_score(y_test, nn_pred)
    })

avg_mse = np.mean([result['mse'] for result in nn_results])
avg_r2 = np.mean([result['r2'] for result in nn_results])
print("Neural Network Results (averaged over 10 random seeds):")
print(f"Average MSE: {avg_mse:.2f}")
print(f"Average R2 Score: {avg_r2:.2f}")
