In [None]:
# Mount Google Drive at /content/drive so later cells can read the project
# files stored there. Colab-only: relies on the `google.colab` package.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd /content/drive/MyDrive #editar esta ruta

In [None]:
!pip install -q --upgrade transformers scikit-learn pandas

In [None]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch import nn
from torch.cuda.amp import GradScaler, autocast
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import BertTokenizer

from data.dataset import PeptideDataset
from model.network import BertRegressor

In [None]:
df = pd.read_csv("data/data_rfu.csv")

# Split on the raw labels first: 81% train / 9% val / 10% test
# (10% held out for test, then 10% of the remaining 90% for validation).
train_val_texts, test_texts, train_val_labels, test_labels = train_test_split(
    df['sequence'], df['label'], test_size=0.10, random_state=42
)

train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_val_texts, train_val_labels, test_size=0.10, random_state=42
)

# Fit the label scaler on the training labels only. Fitting on the whole
# column would leak the validation/test mean and variance into training.
scaler = StandardScaler()
scaler.fit(train_labels.to_frame())

# Persist the scaler so predictions can be mapped back to the original units.
os.makedirs("./saved_model", exist_ok=True)
joblib.dump(scaler, "./saved_model/scaler.pkl")

# Apply the train-fitted scaling to every row, then re-select each split's
# labels by index so all splits share the same transformation.
# Assumes the DataFrame index is unique (true for the default RangeIndex
# produced by read_csv).
df['label'] = scaler.transform(df[['label']])
train_val_labels = df.loc[train_val_labels.index, 'label']
train_labels = df.loc[train_labels.index, 'label']
val_labels = df.loc[val_labels.index, 'label']
test_labels = df.loc[test_labels.index, 'label']

# Tokenizer configuration (do_lower_case=False keeps the sequence casing as-is)
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
train_dataset = PeptideDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
val_dataset = PeptideDataset(val_texts.tolist(), val_labels.tolist(), tokenizer)
test_dataset = PeptideDataset(test_texts.tolist(), test_labels.tolist(), tokenizer)

# DataLoader configuration: only the training set is shuffled
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

In [None]:
# Seed PyTorch's global RNG before building the model so that random weight
# initialisation and DataLoader shuffling are repeatable across
# "Restart & Run All". (GPU kernels may still introduce small nondeterminism.)
torch.manual_seed(42)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertRegressor().to(device)
optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
loss_fn = nn.MSELoss()
# Cut the learning rate by 10x after 5 epochs without improvement of the
# monitored metric. `verbose=True` is intentionally not passed: the argument
# has been deprecated since PyTorch 2.2; read the current learning rate from
# `optimizer.param_groups[0]['lr']` instead.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)

In [None]:
def train(model, train_loader, val_loader, optimizer, loss_fn, device, epochs, patience=5,
          scheduler=None):
    """Fine-tune `model` with mixed precision, LR scheduling and early stopping.

    After every epoch the model is evaluated on `val_loader`; the weights with
    the lowest validation RMSE are written to ./saved_model/best_model.pth and
    reloaded into `model` before returning.

    Args:
        model: network called as ``model(input_ids, attention_mask)``.
        train_loader: iterable of batches with the keys 'input_ids',
            'attention_mask' and 'label'.
        val_loader: loader with the same batch layout, used for validation.
        optimizer: optimizer that updates `model`'s parameters.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        device: ``torch.device`` (or device string) the batches are moved to.
        epochs: maximum number of epochs to run.
        patience: number of consecutive epochs without a new best validation
            RMSE after which training stops early.
        scheduler: optional scheduler stepped with the validation RMSE once per
            epoch. When omitted, a module-level variable named ``scheduler`` is
            used if one exists, which keeps existing seven-argument calls
            working as before.

    Returns:
        float: mean training loss of the last epoch that ran (NaN when
        `epochs` is 0).
    """
    # Backward compatibility: this function used to read the notebook-level
    # `scheduler` implicitly. Prefer passing it explicitly.
    if scheduler is None:
        scheduler = globals().get("scheduler")

    # Mixed precision is only enabled on CUDA; elsewhere both helpers are
    # constructed disabled and act as no-ops. `torch.amp.GradScaler` and
    # `torch.autocast` replace the deprecated `torch.cuda.amp` equivalents,
    # which emitted a FutureWarning on every training step.
    use_amp = torch.device(device).type == "cuda"
    amp_device = "cuda" if use_amp else "cpu"
    grad_scaler = torch.amp.GradScaler(amp_device, enabled=use_amp)

    best_val_rmse = float('inf')
    patience_counter = 0
    best_model_path = "./saved_model/best_model.pth"
    checkpoint_saved = False        # True once this run has written a checkpoint
    avg_train_loss = float('nan')   # defined even when the epoch loop never runs

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f"Época {epoch+1}", leave=False)

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            with torch.autocast(device_type=amp_device, enabled=use_amp):
                outputs = model(input_ids, attention_mask)
                loss = loss_fn(outputs, labels)

            grad_scaler.scale(loss).backward()
            # Unscale before clipping so the norm threshold applies to the
            # true gradients rather than the loss-scaled ones.
            grad_scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            grad_scaler.step(optimizer)
            grad_scaler.update()

            batch_loss = loss.item()
            total_loss += batch_loss
            progress_bar.set_postfix({'pérdida_lote': batch_loss})

        avg_train_loss = total_loss / len(train_loader)

        # Validation pass. Artifacts are not written here, so the test-set CSV
        # and plot are not overwritten with validation data on every epoch.
        val_rmse, val_mae, val_r2, _, _ = evaluate(model, val_loader, device, save_results=False)
        print(f"📘 Época {epoch+1} | Pérdida de entrenamiento: {avg_train_loss:.4f} | Val RMSE: {val_rmse:.4f} | Val MAE: {val_mae:.4f} | Val R²: {val_r2:.4f}")

        # Step the scheduler on the validation metric and report LR changes.
        if scheduler is not None:
            lr_before = optimizer.param_groups[0]['lr']
            scheduler.step(val_rmse)
            lr_after = optimizer.param_groups[0]['lr']
            if lr_after != lr_before:
                print(f"Tasa de aprendizaje reducida: {lr_before:.2e} → {lr_after:.2e}")

        # Early stopping to limit overfitting: keep the best weights and stop
        # after `patience` epochs without improvement.
        if val_rmse < best_val_rmse:
            best_val_rmse = val_rmse
            patience_counter = 0
            torch.save(model.state_dict(), best_model_path)
            checkpoint_saved = True
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Detención temprana en la época {epoch+1}")
                break

    # Restore the best weights, but only if this run actually saved some
    # (e.g. not when `epochs` is 0 or the validation RMSE was never finite);
    # otherwise a stale checkpoint from a previous run could be loaded.
    if checkpoint_saved:
        model.load_state_dict(torch.load(best_model_path, map_location=device))
    return avg_train_loss

def evaluate(model, test_loader, device, save_results=True):
    """Run `model` over `test_loader` and compute regression metrics.

    Args:
        model: network called as ``model(input_ids, attention_mask)``.
        test_loader: iterable of batches with the keys 'input_ids',
            'attention_mask' and 'label'.
        device: device the batches are moved to.
        save_results: when True (the default), write the real/predicted pairs
            to ./saved_model/test_results.csv and a scatter plot to
            ./saved_model/predictions_plot.png. Pass False for intermediate
            evaluations that should not overwrite those files.

    Returns:
        tuple: ``(rmse, mae, r2, predictions, actuals)`` where `predictions`
        and `actuals` are 1-D NumPy arrays in the same (scaled) units as the
        labels served by the loader.

    Raises:
        ValueError: if `test_loader` yields no batches.
    """
    model.eval()
    predictions, actuals = [], []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluando", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask)
            # Flatten each batch so a 0-d output (single-sample batch after a
            # squeeze) or a (batch, 1) output both end up as 1-D arrays.
            predictions.append(outputs.cpu().numpy().reshape(-1))
            actuals.append(labels.cpu().numpy().reshape(-1))

    if not predictions:
        raise ValueError("El DataLoader no produjo ningún lote; no hay nada que evaluar.")

    predictions = np.concatenate(predictions)
    actuals = np.concatenate(actuals)

    rmse = np.sqrt(mean_squared_error(actuals, predictions))
    mae = mean_absolute_error(actuals, predictions)
    r2 = r2_score(actuals, predictions)

    if save_results:
        # Save the real/predicted pairs
        df_results = pd.DataFrame({
            "real": actuals,
            "predicho": predictions
        })
        df_results.to_csv("./saved_model/test_results.csv", index=False)

        # Scatter of predictions against real values with the identity line
        # (perfect predictions) drawn as a dashed red reference.
        lowest, highest = actuals.min(), actuals.max()
        plt.figure(figsize=(8, 6))
        plt.scatter(actuals, predictions, alpha=0.5)
        plt.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
        plt.xlabel("Valores reales (escalados)")
        plt.ylabel("Predicciones (escalados)")
        plt.title("Predicciones vs. Valores reales")
        plt.savefig("./saved_model/predictions_plot.png")
        plt.close()

    return rmse, mae, r2, predictions, actuals

In [None]:
# Upper bound on the number of training epochs; train() can stop earlier
# through its early-stopping patience.
EPOCHS = 50
# Left as the cell's last expression, so the notebook displays the value
# returned by train() (the mean training loss of the final epoch).
train(model, train_loader, val_loader, optimizer, loss_fn, device, EPOCHS)

  scaler = GradScaler()
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():


📘 Época 1 | Pérdida de entrenamiento: 0.8873 | Val RMSE: 0.8526 | Val MAE: 0.7141 | Val R²: 0.2077


  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():


📘 Época 2 | Pérdida de entrenamiento: 0.7019 | Val RMSE: 0.7249 | Val MAE: 0.5588 | Val R²: 0.4273


  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():


📘 Época 3 | Pérdida de entrenamiento: 0.4718 | Val RMSE: 0.6367 | Val MAE: 0.4883 | Val R²: 0.5581


  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():


📘 Época 4 | Pérdida de entrenamiento: 0.4004 | Val RMSE: 0.8427 | Val MAE: 0.6295 | Val R²: 0.2260


  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():


📘 Época 5 | Pérdida de entrenamiento: 0.3819 | Val RMSE: 0.7036 | Val MAE: 0.5211 | Val R²: 0.4604


  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():


📘 Época 6 | Pérdida de entrenamiento: 0.3159 | Val RMSE: 0.6626 | Val MAE: 0.5140 | Val R²: 0.5215


  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():


📘 Época 7 | Pérdida de entrenamiento: 0.2862 | Val RMSE: 0.5959 | Val MAE: 0.4504 | Val R²: 0.6129


  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():


📘 Época 8 | Pérdida de entrenamiento: 0.2476 | Val RMSE: 0.6142 | Val MAE: 0.4766 | Val R²: 0.5888


  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():


📘 Época 9 | Pérdida de entrenamiento: 0.2440 | Val RMSE: 0.5486 | Val MAE: 0.4119 | Val R²: 0.6720


  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():


📘 Época 10 | Pérdida de entrenamiento: 0.2041 | Val RMSE: 0.6510 | Val MAE: 0.4834 | Val R²: 0.5380


  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():


📘 Época 11 | Pérdida de entrenamiento: 0.1849 | Val RMSE: 0.5706 | Val MAE: 0.4393 | Val R²: 0.6451


  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():


📘 Época 12 | Pérdida de entrenamiento: 0.1576 | Val RMSE: 0.6346 | Val MAE: 0.4586 | Val R²: 0.5610


  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():


📘 Época 13 | Pérdida de entrenamiento: 0.1404 | Val RMSE: 0.5947 | Val MAE: 0.4862 | Val R²: 0.6145


  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():


📘 Época 14 | Pérdida de entrenamiento: 0.1353 | Val RMSE: 0.5449 | Val MAE: 0.4642 | Val R²: 0.6764


  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():


📘 Época 15 | Pérdida de entrenamiento: 0.0986 | Val RMSE: 0.6366 | Val MAE: 0.4503 | Val R²: 0.5583


  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():


📘 Época 16 | Pérdida de entrenamiento: 0.1101 | Val RMSE: 0.6123 | Val MAE: 0.4740 | Val R²: 0.5913


  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():


📘 Época 17 | Pérdida de entrenamiento: 0.0962 | Val RMSE: 0.6448 | Val MAE: 0.4878 | Val R²: 0.5468


  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():


📘 Época 18 | Pérdida de entrenamiento: 0.0820 | Val RMSE: 0.6009 | Val MAE: 0.4624 | Val R²: 0.6064


  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():
  with autocast():


📘 Época 19 | Pérdida de entrenamiento: 0.0845 | Val RMSE: 0.5907 | Val MAE: 0.4540 | Val R²: 0.6197
Detención temprana en la época 19


0.08445967696995164

In [None]:
# Final evaluation on the held-out test set (metrics are in scaled label units).
rmse, mae, r2, preds, actuals = evaluate(model, test_loader, device)
print("\n📌 Evaluación en conjunto de prueba:")
print(f"RMSE: {rmse:.4f}")
print(f"MAE : {mae:.4f}")
print(f"R²  : {r2:.4f}")

# Undo the label scaling so predictions and targets can be read in the
# original units. The scaler file is the one written by this notebook's
# data-preparation cell; do not point this at an untrusted pickle.
scaler = joblib.load("./saved_model/scaler.pkl")
actuals_unscaled = scaler.inverse_transform(actuals.reshape(-1, 1)).flatten()
preds_unscaled = scaler.inverse_transform(preds.reshape(-1, 1)).flatten()

# Save the unscaled results. Previously the frame was built but never
# written, so the unscaled values were lost when the session ended.
df_results_unscaled = pd.DataFrame({
    "real": actuals_unscaled,
    "predicho": preds_unscaled
})
df_results_unscaled.to_csv("./saved_model/test_results_unscaled.csv", index=False)




📌 Evaluación en conjunto de prueba:
RMSE: 0.8251
MAE : 0.5573
R²  : 0.5808
