Car Price Prediction: Machine Leaning Models

Cyrus Kolahi

run proj3_data_preprocess.ipynb to preprocess and create train and test set

In [182]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
import sklearn.utils, sklearn.preprocessing, sklearn.decomposition, sklearn.svm
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader, TensorDataset

from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

from tqdm import tqdm
import pickle




load data from data folder

In [183]:
X_train = pd.read_csv("data/X_train_scaled.csv")
X_test = pd.read_csv("data/X_test_scaled.csv")
X_val = pd.read_csv("data/X_val_scaled.csv")
y_train = pd.read_csv("data/y_train_scaled.csv")
y_test = pd.read_csv("data/y_test_scaled.csv")
y_val = pd.read_csv("data/y_val_scaled.csv")


Modeling

Basic Linear Regression:

In [184]:
LinReg = LinearRegression()
LinReg.fit(X_train, y_train)
lr_val_pred = LinReg.predict(X_val)
lr_pred = LinReg.predict(X_test)
print("Linear Regression Results:")
print(f"Test MSE: {mean_squared_error(y_test, lr_pred):.2f}")
print(f"Val MSE: {mean_squared_error(y_val, lr_val_pred):.2f}")
print(f"Test R2 Score: {r2_score(y_test, lr_pred):.2f}")
print(f"Val R2 Score: {r2_score(y_val, lr_val_pred):.2f}")

Linear Regression Results:
Test MSE: 0.00
Val MSE: 0.00
Test R2 Score: 1.00
Val R2 Score: 1.00


Regression with Kernels:

In [185]:
kernels = ['linear', 'rbf', 'poly']
for kernel in kernels:
    kr = KernelRidge(kernel=kernel)
    kr.fit(X_train, y_train)
    test_pred = kr.predict(X_test)
    val_pred = kr.predict(X_val)
    print(f"Regression with {kernel} kernel Results:")
    print(f"Test MSE: {mean_squared_error(y_test, test_pred):.2f}")
    print(f"Val MSE: {mean_squared_error(y_val, val_pred):.2f}")
    print(f"Test R2 Score: {r2_score(y_test, test_pred):.2f}")
    print(f"Val R2 Score: {r2_score(y_val, val_pred):.2f}\n")


Regression with linear kernel Results:
Test MSE: 0.00
Val MSE: 0.00
Test R2 Score: 1.00
Val R2 Score: 1.00

Regression with rbf kernel Results:
Test MSE: 0.00
Val MSE: 0.00
Test R2 Score: 1.00
Val R2 Score: 1.00

Regression with poly kernel Results:
Test MSE: 0.00
Val MSE: 0.00
Test R2 Score: 1.00
Val R2 Score: 1.00



Support Vector Regression with different kernels:

In [186]:
for kernel in ['linear', 'rbf', 'poly']:
    svr = SVR(kernel=kernel)
    svr.fit(X_train, y_train)
    svr_pred = svr.predict(X_test)
    svr_val_pred = svr.predict(X_val)
    print(f"Support Vector Regression ({kernel} kernel) Results:")
    print(f"Test MSE: {mean_squared_error(y_test, svr_pred):.2f}")
    print(f"Val MSE: {mean_squared_error(y_val, svr_val_pred):.2f}")
    print(f"Test R2 Score: {r2_score(y_test, svr_pred):.2f}")
    print(f"Val R2 Score: {r2_score(y_val, svr_val_pred):.2f}\n")

Support Vector Regression (linear kernel) Results:
Test MSE: 0.00
Val MSE: 0.00
Test R2 Score: 1.00
Val R2 Score: 1.00



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Support Vector Regression (rbf kernel) Results:
Test MSE: 0.00
Val MSE: 0.00
Test R2 Score: 1.00
Val R2 Score: 1.00



  y = column_or_1d(y, warn=True)


Support Vector Regression (poly kernel) Results:
Test MSE: 0.05
Val MSE: 0.05
Test R2 Score: 0.95
Val R2 Score: 0.95



Decision Tree

In [187]:
dt = DecisionTreeRegressor(random_state=9)
dt.fit(X_train, y_train)
test_pred = dt.predict(X_test)
val_pred = dt.predict(X_val)
print("Decision Tree Results:")
print(f"Test MSE: {mean_squared_error(y_test, test_pred):.2f}")
print(f"Val MSE: {mean_squared_error(y_val, val_pred):.2f}")
print(f"Test R2 Score: {r2_score(y_test, test_pred):.2f}")
print(f"Val R2 Score: {r2_score(y_val, val_pred):.2f}\n")

Decision Tree Results:
Test MSE: 0.07
Val MSE: 0.08
Test R2 Score: 0.93
Val R2 Score: 0.92



Random Forest

In [188]:
rf = RandomForestRegressor(n_estimators=100, random_state=20)
rf.fit(X_train, y_train)
test_pred = rf.predict(X_test)
val_pred = rf.predict(X_val)
print("Random Forest Results:")
print(f"Test MSE: {mean_squared_error(y_test, test_pred):.2f}")
print(f"Val MSE: {mean_squared_error(y_val, val_pred):.2f}")
print(f"Test R2 Score: {r2_score(y_test, test_pred):.2f}")
print(f"Val R2 Score: {r2_score(y_val, val_pred):.2f}\n")


  return fit_method(estimator, *args, **kwargs)


Random Forest Results:
Test MSE: 0.03
Val MSE: 0.03
Test R2 Score: 0.97
Val R2 Score: 0.97



Neural Network:

In [189]:
class CarPriceNN(nn.Module):
    def __init__(self, input_dim):
        super(CarPriceNN, self).__init__()
        
        # First block with wider layers
        self.layer1 = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        
        # Second block with residual connection
        self.layer2 = nn.Sequential(
            nn.Linear(64, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        
        # Third block decreasing dimensions
        self.layer3 = nn.Sequential(
            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        
        # Final prediction layers
        self.output_layers = nn.Sequential(
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
        )
        
        # Initialize weights
        #self.apply(self._init_weights)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.kaiming_normal_(module.weight, nonlinearity='relu')
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)
                
    def forward(self, x):
        # Forward pass with residual connection
        x1 = self.layer1(x)
        x2 = self.layer2(x1)
        x3 = self.layer3(x2)
        out = self.output_layers(x3)
        return out

def calculate_r2(y_true, y_pred):
    # Ensure inputs are the right shape and scale
    y_true = y_true.squeeze()  # Remove extra dimensions
    y_pred = y_pred.squeeze()
    
    # Convert to numpy if they're torch tensors
    if torch.is_tensor(y_true):
        y_true = y_true.detach().cpu().numpy()
    if torch.is_tensor(y_pred):
        y_pred = y_pred.detach().cpu().numpy()
    
    # Calculate R2
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    r2 = 1 - (ss_res / ss_tot)
    return r2

def train_model(model, train_loader, val_loader, test_loader, epochs=50):
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.01)

    
    best_val_loss = float('inf')
    patience_counter = 0
    patience = 10  # Early stopping patience
    
    for epoch in tqdm(range(epochs)):
        # Training
        model.train()
        train_loss = 0
        for X_batch, y_batch in train_loader:
            y_batch = y_batch
            #print(y_batch.shape, X_batch.shape)

            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            
        # Validation/Test
        model.eval()
        val_preds =[]
        val_true=[]
        test_preds=[]
        test_true=[]
        val_loss = 0
        test_loss = 0

        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                #y_batch = y_batch
                #print(y_batch.shape, X_batch.shape)
                y_pred = model(X_batch)
                val_loss += criterion(y_pred, y_batch).item()
                val_preds.append(y_pred)
                val_true.append(y_batch)

            for X_batch, y_batch in test_loader:
                #y_batch = y_batch
                #print(y_batch.shape, X_batch.shape)
                y_pred = model(X_batch)
                test_loss += criterion(y_pred, y_batch).item()
                test_preds.append(y_pred)
                test_true.append(y_batch)
            
        # Average losses
        avg_train_loss = train_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)
        avg_test_loss = test_loss / len(test_loader)
        
         # Concatenate all predictions and true values
        val_true = torch.cat(val_true)
        val_pred = torch.cat(val_preds)
        test_true = torch.cat(test_true)
        test_pred = torch.cat(test_preds)

    
        # Calculate R2 score
        val_r2 = calculate_r2(val_true, val_pred)
        test_r2 = calculate_r2(test_true, test_pred)
    
       
        
        # Early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            
        if patience_counter >= patience:
            print(f'Early stopping at epoch {epoch}')
            break
        if epoch % 10 == 0:
            print(f'Epoch {epoch}: \n Train Loss: {avg_train_loss:.4f}, \n Val Loss: {avg_val_loss:.4f} \n Test Loss: {avg_test_loss:.4f}')
            print(f'Val R2: {np.mean(val_r2):.4f}, \n'
                  f'Test R2: {np.mean(test_r2):.4f}')
            
    return avg_test_loss, avg_val_loss, avg_train_loss, val_r2, test_r2
            
       

In [190]:
X_train = pd.read_csv("data/X_train_scaled.csv")
y_train = pd.read_csv("data/y_train_scaled.csv")
X_test = pd.read_csv("data/X_test_scaled.csv")
y_test = pd.read_csv("data/y_test_scaled.csv")
X_val = pd.read_csv("data/X_val_scaled.csv")
y_val = pd.read_csv("data/y_val_scaled.csv")

scaler_y = pickle.load(open("data/scaler_y.pkl", "rb"))
scaler_X = pickle.load(open("data/scaler_X.pkl", "rb"))
scalers = pickle.load(open("data/scalers.pkl", "rb"))


X_train_tensor = torch.FloatTensor(X_train.values) 
y_train_tensor = torch.FloatTensor(y_train.values)

train_dataset=TensorDataset(X_train_tensor,y_train_tensor)

X_test_tensor = torch.FloatTensor(X_test.values) 
y_test_tensor = torch.FloatTensor(y_test.values)

test_dataset=TensorDataset(X_test_tensor,y_test_tensor)

X_val_tensor = torch.FloatTensor(X_val.values) 
y_val_tensor = torch.FloatTensor(y_val.values)

val_dataset=TensorDataset(X_val_tensor,y_val_tensor)


batch_size = 32
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_data_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [191]:
#model = CarPriceNN(10)
#avg_test_loss, avg_val_loss, avg_train_loss, val_r2, test_r2 = train_model(model,train_data_loader,val_data_loader,test_data_loader)

In [174]:
X_train = pd.read_csv("data/X_train_scaled.csv")
y_train = pd.read_csv("data/y_train_scaled.csv")
X_test = pd.read_csv("data/X_test_scaled.csv")
y_test = pd.read_csv("data/y_test_scaled.csv")
X_val = pd.read_csv("data/X_val_scaled.csv")
y_val = pd.read_csv("data/y_val_scaled.csv")


In [192]:

def set_seed(seed):
    """Set all random seeds for reproducibility"""
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

def run_trial(seed, X_train, y_train, X_val, y_val, X_test, y_test):
    """Run a single trial with given random seed"""
    # Set the seed
    set_seed(seed)
    
    # Create datasets
    X_train_tensor = torch.FloatTensor(X_train.values) 
    y_train_tensor = torch.FloatTensor(y_train.values)

    train_dataset=TensorDataset(X_train_tensor,y_train_tensor)

    X_test_tensor = torch.FloatTensor(X_test.values) 
    y_test_tensor = torch.FloatTensor(y_test.values)

    test_dataset=TensorDataset(X_test_tensor,y_test_tensor)

    X_val_tensor = torch.FloatTensor(X_val.values) 
    y_val_tensor = torch.FloatTensor(y_val.values)

    val_dataset=TensorDataset(X_val_tensor,y_val_tensor)


    batch_size = 32
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_data_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    

    
    # Initialize model
    input_dim = X_train.shape[1]
    model = CarPriceNN(input_dim)
    
    # Train model
    avg_test_loss, avg_val_loss, avg_train_loss, val_r2, test_r2 = train_model(model, train_data_loader, val_data_loader, test_data_loader)

    return avg_test_loss, avg_val_loss, avg_train_loss, val_r2, test_r2



    

# Run multiple trials with different seeds
seeds = [42, 43, 44, 45, 46, 100, 200, 400, 333, 73] 
results = []

for seed in seeds:
    print(f"\nRunning trial with seed {seed}")
    avg_test_loss, avg_val_loss, avg_train_loss, val_r2, test_r2 = run_trial(seed, X_train, y_train, X_val, y_val, X_test, y_test)
    results.append({
        'seed': seed,
        'val loss': avg_val_loss,   
        'test loss': avg_test_loss,
        'train loss': avg_train_loss,
        'val r2': val_r2,
        'test r2': test_r2
    })
    print(f"MSE: {avg_test_loss:.4f}")
    print(f"R2 Score: {test_r2:.4f}")

# Calculate average performance
avg_mse_test = np.mean([r['test loss'] for r in results])
avg_r2_test = np.mean([r['test r2'] for r in results])
avg_mse_val = np.mean([r['val loss'] for r in results])
avg_r2_val = np.mean([r['val r2'] for r in results])

print("\nOverall Results:")
print(f"Average Test Loss: {avg_mse_test:.4f}")
print(f"Average Test r2 Score: {avg_r2_test:.4f}")

# Print individual results



Running trial with seed 42


  2%|▏         | 1/50 [00:00<00:39,  1.26it/s]

Epoch 0: 
 Train Loss: 0.2502, 
 Val Loss: 0.0304 
 Test Loss: 0.0309
Val R2: 0.9679, 
Test R2: 0.9681


 22%|██▏       | 11/50 [00:10<00:35,  1.09it/s]

Epoch 10: 
 Train Loss: 0.0727, 
 Val Loss: 0.0099 
 Test Loss: 0.0099
Val R2: 0.9895, 
Test R2: 0.9898


 42%|████▏     | 21/50 [00:19<00:26,  1.07it/s]

Epoch 20: 
 Train Loss: 0.0787, 
 Val Loss: 0.0206 
 Test Loss: 0.0198
Val R2: 0.9783, 
Test R2: 0.9795


 54%|█████▍    | 27/50 [00:26<00:22,  1.01it/s]


Early stopping at epoch 27
MSE: 0.0114
R2 Score: 0.9882

Running trial with seed 43


  2%|▏         | 1/50 [00:00<00:40,  1.20it/s]

Epoch 0: 
 Train Loss: 0.3263, 
 Val Loss: 0.0254 
 Test Loss: 0.0261
Val R2: 0.9732, 
Test R2: 0.9731


 22%|██▏       | 11/50 [00:09<00:34,  1.12it/s]

Epoch 10: 
 Train Loss: 0.0747, 
 Val Loss: 0.0214 
 Test Loss: 0.0209
Val R2: 0.9774, 
Test R2: 0.9785


 36%|███▌      | 18/50 [00:17<00:31,  1.00it/s]


Early stopping at epoch 18
MSE: 0.0061
R2 Score: 0.9937

Running trial with seed 44


  2%|▏         | 1/50 [00:00<00:42,  1.14it/s]

Epoch 0: 
 Train Loss: 0.2965, 
 Val Loss: 0.0158 
 Test Loss: 0.0158
Val R2: 0.9833, 
Test R2: 0.9837


 22%|██▏       | 11/50 [00:10<00:37,  1.05it/s]

Epoch 10: 
 Train Loss: 0.0769, 
 Val Loss: 0.0163 
 Test Loss: 0.0154
Val R2: 0.9827, 
Test R2: 0.9841


 42%|████▏     | 21/50 [00:19<00:28,  1.02it/s]

Epoch 20: 
 Train Loss: 0.0743, 
 Val Loss: 0.0180 
 Test Loss: 0.0178
Val R2: 0.9809, 
Test R2: 0.9817


 62%|██████▏   | 31/50 [00:29<00:20,  1.06s/it]

Epoch 30: 
 Train Loss: 0.0729, 
 Val Loss: 0.0134 
 Test Loss: 0.0130
Val R2: 0.9859, 
Test R2: 0.9866


 66%|██████▌   | 33/50 [00:32<00:16,  1.01it/s]


Early stopping at epoch 33
MSE: 0.0157
R2 Score: 0.9839

Running trial with seed 45


  2%|▏         | 1/50 [00:00<00:40,  1.22it/s]

Epoch 0: 
 Train Loss: 0.2569, 
 Val Loss: 0.0235 
 Test Loss: 0.0239
Val R2: 0.9752, 
Test R2: 0.9753


 22%|██▏       | 11/50 [00:10<00:40,  1.03s/it]

Epoch 10: 
 Train Loss: 0.0697, 
 Val Loss: 0.0269 
 Test Loss: 0.0284
Val R2: 0.9716, 
Test R2: 0.9707


 42%|████▏     | 21/50 [00:21<00:31,  1.09s/it]

Epoch 20: 
 Train Loss: 0.0698, 
 Val Loss: 0.0072 
 Test Loss: 0.0070
Val R2: 0.9923, 
Test R2: 0.9928


 54%|█████▍    | 27/50 [00:28<00:24,  1.06s/it]


Early stopping at epoch 27
MSE: 0.0083
R2 Score: 0.9915

Running trial with seed 46


  2%|▏         | 1/50 [00:00<00:46,  1.06it/s]

Epoch 0: 
 Train Loss: 0.2265, 
 Val Loss: 0.0309 
 Test Loss: 0.0304
Val R2: 0.9674, 
Test R2: 0.9686


 22%|██▏       | 11/50 [00:10<00:36,  1.07it/s]

Epoch 10: 
 Train Loss: 0.0730, 
 Val Loss: 0.0146 
 Test Loss: 0.0143
Val R2: 0.9846, 
Test R2: 0.9853


 32%|███▏      | 16/50 [00:16<00:34,  1.01s/it]


Early stopping at epoch 16
MSE: 0.0096
R2 Score: 0.9901

Running trial with seed 100


  2%|▏         | 1/50 [00:01<01:02,  1.27s/it]

Epoch 0: 
 Train Loss: 0.2720, 
 Val Loss: 0.0306 
 Test Loss: 0.0301
Val R2: 0.9677, 
Test R2: 0.9690


 22%|██▏       | 11/50 [00:10<00:34,  1.13it/s]

Epoch 10: 
 Train Loss: 0.0716, 
 Val Loss: 0.0165 
 Test Loss: 0.0165
Val R2: 0.9825, 
Test R2: 0.9830


 42%|████▏     | 21/50 [00:20<00:31,  1.08s/it]

Epoch 20: 
 Train Loss: 0.0721, 
 Val Loss: 0.0182 
 Test Loss: 0.0179
Val R2: 0.9808, 
Test R2: 0.9815


 62%|██████▏   | 31/50 [00:29<00:16,  1.15it/s]

Epoch 30: 
 Train Loss: 0.0751, 
 Val Loss: 0.0219 
 Test Loss: 0.0213
Val R2: 0.9769, 
Test R2: 0.9781


 66%|██████▌   | 33/50 [00:32<00:16,  1.01it/s]


Early stopping at epoch 33
MSE: 0.0118
R2 Score: 0.9879

Running trial with seed 200


  2%|▏         | 1/50 [00:00<00:39,  1.23it/s]

Epoch 0: 
 Train Loss: 0.2575, 
 Val Loss: 0.0296 
 Test Loss: 0.0307
Val R2: 0.9687, 
Test R2: 0.9683


 22%|██▏       | 11/50 [00:10<00:40,  1.04s/it]

Epoch 10: 
 Train Loss: 0.0743, 
 Val Loss: 0.0229 
 Test Loss: 0.0233
Val R2: 0.9759, 
Test R2: 0.9760


 42%|████▏     | 21/50 [00:20<00:29,  1.03s/it]

Epoch 20: 
 Train Loss: 0.0757, 
 Val Loss: 0.0115 
 Test Loss: 0.0113
Val R2: 0.9879, 
Test R2: 0.9884


 48%|████▊     | 24/50 [00:24<00:26,  1.03s/it]


Early stopping at epoch 24
MSE: 0.0107
R2 Score: 0.9889

Running trial with seed 400


  2%|▏         | 1/50 [00:01<00:50,  1.04s/it]

Epoch 0: 
 Train Loss: 0.2519, 
 Val Loss: 0.0404 
 Test Loss: 0.0422
Val R2: 0.9574, 
Test R2: 0.9565


 22%|██▏       | 11/50 [00:10<00:35,  1.09it/s]

Epoch 10: 
 Train Loss: 0.0769, 
 Val Loss: 0.0148 
 Test Loss: 0.0145
Val R2: 0.9844, 
Test R2: 0.9850


 36%|███▌      | 18/50 [00:18<00:32,  1.03s/it]


Early stopping at epoch 18
MSE: 0.0261
R2 Score: 0.9731

Running trial with seed 333


  2%|▏         | 1/50 [00:00<00:39,  1.23it/s]

Epoch 0: 
 Train Loss: 0.3051, 
 Val Loss: 0.0260 
 Test Loss: 0.0262
Val R2: 0.9726, 
Test R2: 0.9730


 22%|██▏       | 11/50 [00:10<00:39,  1.00s/it]

Epoch 10: 
 Train Loss: 0.0809, 
 Val Loss: 0.0169 
 Test Loss: 0.0156
Val R2: 0.9822, 
Test R2: 0.9840


 42%|████▏     | 21/50 [00:20<00:26,  1.09it/s]

Epoch 20: 
 Train Loss: 0.0627, 
 Val Loss: 0.0099 
 Test Loss: 0.0112
Val R2: 0.9896, 
Test R2: 0.9885


 56%|█████▌    | 28/50 [00:28<00:22,  1.01s/it]


KeyboardInterrupt: 