<a href="https://colab.research.google.com/github/claudelepere/ML_GitHub/blob/main/Longformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook setup: install the third-party packages used below
# (-q keeps pip's output quiet). These are IPython shell magics and only
# work when run inside a notebook/Colab cell.
!pip -q install accelerate
!pip -q install transformers datasets

import torch
# Ask PyTorch to release cached GPU memory before the model is loaded
# (presumably to recover memory from an earlier run in the same session).
torch.cuda.empty_cache()


In [None]:
from datasets import DatasetDict, Dataset, Features, Sequence, Value
from transformers import LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score

# Step 1: Prepare DatasetDict with train, validation, and test splits.
# Each split is a column-oriented mapping: one list of texts and one list of
# multi-hot label vectors (3 classes in this toy example).
data = dict(
    train=dict(
        text=["This is a training example.", "Another training sample here."],
        labels=[[1, 0, 0], [0, 1, 1]],
    ),
    validation=dict(
        text=["This is a validation example.", "Another validation sample."],
        labels=[[1, 0, 1], [0, 1, 0]],
    ),
    test=dict(
        text=["This is a test example.", "Another test sample."],
        labels=[[1, 0, 0], [0, 1, 1]],
    ),
)


def _build_split(columns):
    """Turn one split's column mapping into a `Dataset` with an explicit schema."""
    schema = Features({
        "text": Value(dtype="string"),  # raw text is kept alongside the labels
        "labels": Sequence(feature=Value(dtype="float32")),  # labels stored as float32
    })
    return Dataset.from_dict(columns, features=schema)


# Convert to DatasetDict
dataset = DatasetDict({name: _build_split(columns) for name, columns in data.items()})

# Step 2: Load the fast Longformer tokenizer for the checkpoint used in this notebook
LONGFORMER_CHECKPOINT = "allenai/longformer-base-4096"
tokenizer = LongformerTokenizerFast.from_pretrained(LONGFORMER_CHECKPOINT)

# Tokenize datasets
def tokenize_function(examples):
    """Tokenize the ``"text"`` column of a batch with the module-level tokenizer.

    Every text is truncated and padded to exactly 1024 tokens (2048 and 4096
    are the alternatives noted by the original author).

    Args:
        examples: A batch mapping that contains a ``"text"`` entry.

    Returns:
        Whatever the tokenizer call returns for the batch.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=1024,
    )
    return encoded

def preprocess_labels(examples):
    """Cast every label in a batch to ``float``.

    The nesting is preserved: ``examples["labels"]`` is expected to hold one
    label list per example, and each inner list keeps its length and order.

    Args:
        examples: A batch mapping with a ``"labels"`` entry holding a list of
            per-example label lists.

    Returns:
        The same ``examples`` object, with ``"labels"`` replaced in place by
        the float-converted lists.
    """
    as_floats = [list(map(float, row)) for row in examples["labels"]]
    examples["labels"] = as_floats
    return examples


def _show_first_train_row(splits):
    """Print the first row of the train split so its columns can be inspected."""
    print(f"encoded_dataset['train'][0]: {splits['train'][0]}")  # Inspect a sample


# Tokenize every split in batches, then look at one encoded example.
encoded_dataset = dataset.map(tokenize_function, batched=True)
_show_first_train_row(encoded_dataset)

# Convert the labels to floats (batched as well) and inspect the same example again.
encoded_dataset = encoded_dataset.map(preprocess_labels, batched=True)
_show_first_train_row(encoded_dataset)


# Step 3: Load Longformer model for sequence classification
# Derive the size of the classification head from the data instead of
# hard-coding it, so the model cannot silently disagree with the width of the
# label vectors when the dataset is swapped for a real one.
num_labels = len(dataset["train"][0]["labels"])

model = LongformerForSequenceClassification.from_pretrained(
    "allenai/longformer-base-4096",
    num_labels=num_labels,  # Number of labels in your multi-label problem
    problem_type="multi_label_classification",
)


# Step 4: Define Metrics
def compute_metrics(eval_pred):
    """Compute macro-averaged multi-label metrics for the Trainer.

    Args:
        eval_pred: A ``(logits, labels)`` pair. Both are used as NumPy arrays;
            ``labels`` is treated as a multi-hot indicator matrix with one
            column per label.

    Returns:
        dict with ``"precision"``, ``"recall"``, ``"f1"`` and ``"roc_auc"``.
        ``"roc_auc"`` is ``nan`` when scikit-learn cannot compute it for the
        given labels.
    """
    logits, labels = eval_pred

    # A logit > 0 is the same decision as sigmoid(logit) > 0.5.
    predictions = (logits > 0).astype(int)

    # zero_division=0 yields the same values as the default ("warn") but
    # without a warning for every label that has no predicted/true positives.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="macro", zero_division=0
    )

    # `multi_class` only applies to multiclass targets, so it is not passed
    # for this multilabel indicator input.
    try:
        roc_auc = roc_auc_score(labels, logits, average="macro")
    except ValueError:
        # ROC AUC is undefined when a label column contains a single class,
        # which is common on small evaluation splits. Report nan rather than
        # aborting evaluation (and with it the whole training run).
        roc_auc = float("nan")

    return {"precision": precision, "recall": recall, "f1": f1, "roc_auc": roc_auc}

# Step 5: Set up Trainer
import inspect

import torch

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old name is no longer accepted by recent ones. This
# notebook installs an unpinned version, so pick whichever name the installed
# TrainingArguments actually exposes.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",  # must match the evaluation strategy for load_best_model_at_end
    logging_dir="./logs",
    learning_rate=5e-5,
    per_device_train_batch_size=1,  # Reduce batch size
    per_device_eval_batch_size=1,  # Reduce batch size
    gradient_accumulation_steps=4,  # Simulate larger batch size
    fp16=torch.cuda.is_available(),  # Mixed precision only when a CUDA GPU is present
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    **{_eval_strategy_key: "epoch"},
)

!nvidia-smi

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Step 6: Train the Model
trainer.train()




In [None]:
# Step 7: Evaluate the Model on the held-out test split and print the metrics
test_split = encoded_dataset["test"]
results = trainer.evaluate(eval_dataset=test_split)
print("Test Results:", results)