In [18]:
import sys
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, \
    mean_absolute_percentage_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import os
src_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'src'))
sys.path.append(src_path)

# Verificar si la ruta ha sido añadida correctamente
print(f"Ruta añadida al sys.path: {src_path}")

# Ahora puedes intentar importar los módulos
from models.pipeline import CarsPipeline
from models.used_car_quote_nn import UsedCarQuoteNN

Ruta añadida al sys.path: c:\Users\Iñaki\Desktop\Master inteligencia artificial\Machine Learning 1\trabajo final\tp_amq1_17co2024\src


In [19]:
data = pd.read_csv('../datasets/Car details v3.csv')

data["selling_price_log"] = np.log1p(data["selling_price"])

X = data.drop(columns=['selling_price', 'selling_price_log'])
y = data['selling_price']
y_log = data['selling_price_log']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(X, y_log, test_size=0.3, random_state=42)
final_pipeline = CarsPipeline()

X_train_processed = final_pipeline.fit_transform_df(X_train)
X_test_processed = final_pipeline.transform_df(X_test)

X_train_tensor = torch.tensor(X_train_processed.to_numpy(), dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_processed.to_numpy(), dtype=torch.float32)
y_train_tensor_log = torch.tensor(y_train_log.to_numpy(), dtype=torch.float32).view(-1, 1)
y_test_tensor_log = torch.tensor(y_test_log.to_numpy(), dtype=torch.float32).view(-1, 1)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor_log)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor_log)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)



In [31]:
class ImprovedCarQuoteNN(nn.Module):
    def __init__(self, input_dim):
        super(ImprovedCarQuoteNN, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),  # Más neuronas
            nn.ReLU(),  # Activación no lineal
            nn.Dropout(0.3),  # Dropout para evitar sobreajuste
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)  # Regresión a un solo valor
        )
    
    def forward(self, x):
        return self.model(x)

# Redefinir el modelo con la nueva arquitectura
input_dim = X_train_processed.shape[1]
model = ImprovedCarQuoteNN(input_dim)


In [32]:
loss_fn = nn.HuberLoss()

In [33]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Probar con Adam y una tasa de aprendizaje más baja


In [34]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Early stopping puede ser implementado manualmente o usando ReduceLROnPlateau
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10, verbose=True)

n_epochs = 100
best_loss = float('inf')
patience, trials = 20, 0

for epoch in range(n_epochs):
    model.train()
    running_loss = 0.0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = loss_fn(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Validación
    model.eval()
    with torch.no_grad():
        val_loss = 0.0
        for X_val, y_val in test_loader:
            y_val_pred = model(X_val)
            val_loss += loss_fn(y_val_pred, y_val).item()

    val_loss /= len(test_loader)
    print(f"Epoch {epoch+1}, Validation Loss: {val_loss:.4f}")
    
    # Early stopping
    scheduler.step(val_loss)
    if val_loss < best_loss:
        best_loss = val_loss
        trials = 0
    else:
        trials += 1
        if trials >= patience:
            print("Early stopping triggered")
            break



Epoch 1, Validation Loss: 0.8356
Epoch 2, Validation Loss: 0.2899
Epoch 3, Validation Loss: 0.2753
Epoch 4, Validation Loss: 0.1492
Epoch 5, Validation Loss: 0.1962
Epoch 6, Validation Loss: 0.1155
Epoch 7, Validation Loss: 0.1022
Epoch 8, Validation Loss: 0.0832
Epoch 9, Validation Loss: 0.1270
Epoch 10, Validation Loss: 0.0960
Epoch 11, Validation Loss: 0.1618
Epoch 12, Validation Loss: 0.1533
Epoch 13, Validation Loss: 0.0919
Epoch 14, Validation Loss: 0.1439
Epoch 15, Validation Loss: 0.0792
Epoch 16, Validation Loss: 0.1208
Epoch 17, Validation Loss: 0.0440
Epoch 18, Validation Loss: 0.0538
Epoch 19, Validation Loss: 0.0414
Epoch 20, Validation Loss: 0.0411
Epoch 21, Validation Loss: 0.0631
Epoch 22, Validation Loss: 0.0870
Epoch 23, Validation Loss: 0.0361
Epoch 24, Validation Loss: 0.0640
Epoch 25, Validation Loss: 0.0353
Epoch 26, Validation Loss: 0.0679
Epoch 27, Validation Loss: 0.0365
Epoch 28, Validation Loss: 0.0406
Epoch 29, Validation Loss: 0.0324
Epoch 30, Validation Lo

In [35]:
model.eval()

y_pred_log_list = []
y_test_log_list = []

# Desactivar el gradiente para la evaluación
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        y_pred = model(X_batch)
        y_pred_log_list.append(y_pred.numpy())
        y_test_log_list.append(y_batch.numpy())

In [39]:
# Convertir a arrays numpy
y_pred_log_np = np.vstack(y_pred_log_list)
y_test_log_np = np.vstack(y_test_log_list)

# Revertir la transformación logarítmica para obtener los valores originales
y_pred_np = np.expm1(y_pred_log_np)
y_test_np = np.expm1(y_test_log_np)

# Definir función para RMSE
def root_mean_squared_error(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

# Calcular métricas
metrics_nn = {
    "name": "NN",
    "MAE_log": mean_absolute_error(y_test_log_np, y_pred_log_np),  # MAE en escala log
    "MAE": mean_absolute_error(y_test_np, y_pred_np),  # MAE en escala original
    "RMSE": root_mean_squared_error(y_test_np, y_pred_np),
    "MAPE": mean_absolute_percentage_error(y_test_np, y_pred_np),
    "R2": r2_score(y_test_np, y_pred_np)
}

# Imprimir las métricas
for metric, value in metrics_nn.items():
    if isinstance(value, (int, float)):  # Solo formatear si es un número
        print(f"{metric}: {value:.4f}")
    else:
        print(f"{metric}: {value}")

name: NN
MAE_log: 0.15879565477371216
MAE: 88356.421875
RMSE: 193866.421875
MAPE: 0.16696685552597046
R2: 0.9454


In [38]:
y_pred_np

array([[132235.58],
       [394876.4 ],
       [623562.25],
       ...,
       [583029.25],
       [328244.2 ],
       [390925.6 ]], dtype=float32)

In [27]:
y_test

1971    198000
4664    500000
5448    425000
3333    150000
2316    525000
         ...  
462     600000
1956    400000
3782    500000
799     400000
2402    425000
Name: selling_price, Length: 2439, dtype: int64

In [3]:
model = UsedCarQuoteNN(X_train_processed.shape[1])

criterion = torch.nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

In [16]:
def train_model_batch(model, train_loader, X_val, y_val, epochs=100):
    for epoch in range(epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            y_pred_train = model(X_batch)
            loss = criterion(y_pred_train, y_batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
        # TODO: add validation batch
        # model.eval()
        # with torch.no_grad():
        #     y_pred_val = model(X_val)
        #     criterion(y_pred_val, y_val).item()


train_model_batch(model, train_loader, X_test_tensor, y_test_tensor_log, epochs=1000)

In [15]:

y_pred_list = []
y_pred_list_log = []
y_test_list_log = []

model.eval()

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        y_test_pred_log = model(X_batch)

        y_test_pred = torch.expm1(y_test_pred_log)

        y_pred_list.append(y_test_pred)  
        y_pred_list_log.append(y_test_pred_log)
        y_test_list_log.append(y_batch)

y_pred_tensor = torch.cat(y_pred_list).squeeze()
y_pred_log_tensor = torch.cat(y_pred_list_log).squeeze()
y_test_log_tensor = torch.cat(y_test_list_log).squeeze()

y_pred_np = y_pred_tensor.numpy()
y_pred_log_np = y_pred_log_tensor.numpy()
y_test_log_np = y_test_log_tensor.numpy()

y_test_np = np.expm1(y_test_log_np)

metrics_nn = {
    "name": "NN",
    "MAE_training": mean_absolute_error(y_test_log_np, y_pred_log_np),
    "MAE": mean_absolute_error(y_test_np, y_pred_np),
    "RMSE": root_mean_squared_error(y_test_np, y_pred_np),
    "MAPE": mean_absolute_percentage_error(y_test_np, y_pred_np),
    "R2": r2_score(y_test_np, y_pred_np)
}

ValueError: Input contains infinity or a value too large for dtype('float32').

In [None]:
print(metrics_nn)