In [None]:
import os

import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaForTokenClassification, get_linear_schedule_with_warmup

# Load tokenized data
# NOTE: torch.load unpickles the file contents; only load files from a trusted source.
data_path = "../data/tokenized_data.pt"
input_ids, attention_masks = torch.load(data_path)
labels = torch.load("../data/labels.pt")

# Split data into train and validation sets.
# A single call splits all three arrays with the same shuffled indices, so inputs,
# masks and labels are guaranteed to stay aligned (instead of relying on two separate
# calls happening to share the same random_state / test_size).
(
    train_inputs,
    validation_inputs,
    train_masks,
    validation_masks,
    train_labels,
    validation_labels,
) = train_test_split(input_ids, attention_masks, labels, random_state=42, test_size=0.1)

# Convert data to tensors by concatenating the per-example tensors along dim 0.
# NOTE(review): this assumes each split is a sequence of tensors (e.g. a list of
# (1, seq_len) tensors) — confirm against how tokenized_data.pt was written.
train_inputs = torch.cat(train_inputs, dim=0)
validation_inputs = torch.cat(validation_inputs, dim=0)
train_labels = torch.cat(train_labels, dim=0)
validation_labels = torch.cat(validation_labels, dim=0)
train_masks = torch.cat(train_masks, dim=0)
validation_masks = torch.cat(validation_masks, dim=0)

# Define data loaders for training and validation sets
batch_size = 8

# Training batches are drawn in random order each epoch.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation batches are read in a fixed, sequential order.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# Initialize RoBERTaForTokenClassification model
# NOTE(review): `labels_dict` is not defined in this cell; it must already exist in
# the session (presumably a label -> id mapping — confirm).
model = RobertaForTokenClassification.from_pretrained("roberta-base", num_labels=len(labels_dict))

# Pick the compute device and move the model onto it so that the model's weights and
# the batches (moved with .to(device) below) live on the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Train model
epochs = 5
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
total_steps = len(train_dataloader) * epochs
# Learning rate decays linearly to 0 over all training steps, with no warmup.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

for epoch in range(epochs):
    # from_pretrained returns the model in eval mode, and the validation pass may
    # switch it back to eval mode, so re-enable training mode (dropout) every epoch.
    model.train()
    total_train_loss = 0.0

    for batch in train_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        total_train_loss += loss.item()

        # Backpropagate: without this call no gradients exist and optimizer.step()
        # would leave the weights unchanged.
        loss.backward()
        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    # Mean loss over the epoch's batches; max(..., 1) avoids dividing by zero when
    # the training loader is empty.
    avg_train_loss = total_train_loss / max(len(train_dataloader), 1)

    # Evaluate model on validation set
    # NOTE(review): `evaluate` is not defined in this cell; it is expected to return
    # a (loss, accuracy) pair for the given model and dataloader.
    val_loss, val_accuracy = evaluate(model, validation_dataloader)

    print(f"Epoch: {epoch + 1}/{epochs}")
    print(f"Training Loss: {avg_train_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print("-------------------------------")

# Save fine-tuned model
output_dir = '../models/fine_tuned_roberta'
# exist_ok=True creates the directory (and any missing parents) only when needed,
# without a separate, race-prone existence check.
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
