In [37]:
import sys

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, mean_absolute_error, root_mean_squared_error, \
    mean_absolute_percentage_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from torch.utils.data import TensorDataset, DataLoader

from src.models.used_car_quote_nn import UsedCarQuoteNN

sys.path.append('../src')
from src.models.pipeline import CarsPipeline

In [12]:
data = pd.read_csv('../datasets/Car details v3.csv')

data["selling_price_log"] = np.log1p(data["selling_price"])

X = data.drop(columns=['selling_price', 'selling_price_log'])
y = data['selling_price']
y_log = data['selling_price_log']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(X, y_log, test_size=0.3, random_state=42)
final_pipeline = CarsPipeline()

X_train_processed = final_pipeline.fit_transform_df(X_train)
X_test_processed = final_pipeline.transform_df(X_test)

X_train_tensor = torch.tensor(X_train_processed.to_numpy(), dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_processed.to_numpy(), dtype=torch.float32)
y_train_tensor_log = torch.tensor(y_train_log.to_numpy(), dtype=torch.float32).view(-1, 1)
y_test_tensor_log = torch.tensor(y_test_log.to_numpy(), dtype=torch.float32).view(-1, 1)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor_log)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor_log)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)



In [26]:
model = UsedCarQuoteNN(X_train_processed.shape[1])

criterion = torch.nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [29]:
def train_model_batch(model, train_loader, X_val, y_val, epochs=100):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for X_batch, y_batch in train_loader:
            y_pred_train = model(X_batch)
            loss = criterion(y_pred_train, y_batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            y_pred_val = model(X_val)
            mae = criterion(y_pred_val, y_val).item()

        if (epoch + 1) % 10 == 0:
            # print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss:.4f}, Val Loss: {mae.item():.4f}')
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}, MAE: {mae:.4f}')


train_model_batch(model, train_loader, X_test_tensor, y_test_tensor_log, epochs=1000)

Epoch [10/1000], Loss: 0.4027, MAE: 0.8940
Epoch [20/1000], Loss: 0.4168, MAE: 0.7905
Epoch [30/1000], Loss: 0.3811, MAE: 0.7700
Epoch [40/1000], Loss: 0.4889, MAE: 1.0600
Epoch [50/1000], Loss: 0.4116, MAE: 0.9899
Epoch [60/1000], Loss: 0.4138, MAE: 0.7483
Epoch [70/1000], Loss: 0.4413, MAE: 0.7005
Epoch [80/1000], Loss: 0.6319, MAE: 0.5917
Epoch [90/1000], Loss: 0.4556, MAE: 1.6547
Epoch [100/1000], Loss: 0.6996, MAE: 0.7337
Epoch [110/1000], Loss: 0.3588, MAE: 1.9598
Epoch [120/1000], Loss: 0.6309, MAE: 0.5850
Epoch [130/1000], Loss: 0.4181, MAE: 0.6972
Epoch [140/1000], Loss: 0.5705, MAE: 1.1463
Epoch [150/1000], Loss: 0.4473, MAE: 0.5674
Epoch [160/1000], Loss: 0.5571, MAE: 1.4464
Epoch [170/1000], Loss: 0.2996, MAE: 0.5784
Epoch [180/1000], Loss: 0.4773, MAE: 0.6573
Epoch [190/1000], Loss: 0.4529, MAE: 0.7618
Epoch [200/1000], Loss: 0.3944, MAE: 2.4196
Epoch [210/1000], Loss: 0.6088, MAE: 4.1434
Epoch [220/1000], Loss: 0.5203, MAE: 0.5663
Epoch [230/1000], Loss: 0.3527, MAE: 1.47

In [38]:
y_pred_ridge_log = model(X_test_tensor)
y_pred_ridge = np.expm1(y_pred_ridge_log.detach().numpy())

metrics_ridge = {
    "name": "NN",
    "MAE_training": mean_absolute_error(y_train_log, model(X_test_tensor).detach().numpy()),
    "MAE": mean_absolute_error(y_test, y_pred_ridge),
    "RMSE": root_mean_squared_error(y_test, y_pred_ridge),
    "MAPE": mean_absolute_percentage_error(y_test, y_pred_ridge),
    "R2": r2_score(y_test, y_pred_ridge)
}

  y_pred_ridge = np.expm1(y_pred_ridge_log.detach().numpy())


ValueError: Found input variables with inconsistent numbers of samples: [5689, 2439]