In [6]:
from joblib import load
from tqdm import tqdm

samples = []
size = 817


with open('../../../../data/songda/truthful_qa/31/alpaca_7B_with_semantics.joblib', 'rb') as f:
    for i in tqdm(range(size)):
        try:
            data = load(f)
            
            new_instance = {
                "input": "Is the question and answer truthful?" + data["Q"] + data["A"],
                "hidden_states": data["hidden_states"],
                "label": 1 if data["truth_prob"] > 0.5 else 0,
            }
                
            samples.append(new_instance)
        except EOFError:
            break





  0%|          | 4/817 [00:00<00:23, 34.29it/s]

100%|██████████| 817/817 [00:09<00:00, 82.75it/s] 


In [10]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import joblib
from torch.cuda.amp import autocast, GradScaler

# Assuming 'samples' are loaded as before

class TruthfulQADataset(Dataset):
    def __init__(self, samples):
        self.samples = samples

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        hidden_states = torch.tensor(sample['hidden_states'], dtype=torch.float32)  # Assuming this is [4096]
        label = torch.tensor(sample['label'], dtype=torch.float32)
        return hidden_states, label

# Dataset, DataLoader, model initialization, and training loop as previously defined

def custom_collate_fn(batch):
    hidden_states = [item[0] for item in batch]
    labels = torch.tensor([item[1] for item in batch], dtype=torch.float32)
    
    # Average hidden states across the sequence length dimension
    hidden_states_avg = [torch.mean(states, dim=0) for states in hidden_states]
    
    return torch.stack(hidden_states_avg), labels


dataset = TruthfulQADataset(samples)
scaler = GradScaler()

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=custom_collate_fn)

model = TruthfulQANet()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters())

for epoch in range(epochs):
    model.train()
    for hidden_states, labels in tqdm(train_loader):
        hidden_states, labels = hidden_states.to(device), labels.to(device)

        optimizer.zero_grad()
        with autocast():
            outputs = model(hidden_states)
            loss = criterion(outputs.squeeze(), labels)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    model.eval()
    val_loss = 0
    correct_preds = 0
    total_preds = 0
    with torch.no_grad():
        for hidden_states, labels in val_loader:
            hidden_states, labels = hidden_states.to(device), labels.to(device)
            with autocast():
                outputs = model(hidden_states).squeeze()
                val_loss += criterion(outputs, labels).item()
                predicted_labels = (outputs > 0.5).float()
                correct_preds += (predicted_labels == labels).sum().item()
                total_preds += labels.size(0)

    val_accuracy = correct_preds / total_preds
    print(f"Epoch {epoch+1}, Validation Loss: {val_loss / len(val_loader):.4f}, Validation Accuracy: {val_accuracy:.4f}")


  0%|          | 0/21 [00:00<?, ?it/s]


RuntimeError: stack expects each tensor to be equal size, but got [8, 4096] at entry 0 and [167, 4096] at entry 1