<a href="https://colab.research.google.com/github/claudelepere/ML_GitHub/blob/main/Longformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install accelerate
!pip -q install transformers datasets

import torch
torch.cuda.empty_cache()


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m471.0/480.6 kB[0m [31m23.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from datasets import DatasetDict, Dataset, Features, Sequence, Value
from transformers import LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score

# Step 1: Prepare DatasetDict with train, validation, and test splits
data = {
    "train": {
        "text": ["This is a training example.", "Another training sample here."],
        "labels": [[1, 0, 0], [0, 1, 1]],  # Multi-label (3 classes in this example)
    },
    "validation": {
        "text": ["This is a validation example.", "Another validation sample."],
        "labels": [[1, 0, 1], [0, 1, 0]],
    },
    "test": {
        "text": ["This is a test example.", "Another test sample."],
        "labels": [[1, 0, 0], [0, 1, 1]],
    },
}

# Convert to DatasetDict
dataset = DatasetDict({
    split: Dataset.from_dict(data_split, features=Features({
        'text': Value(dtype='string'),  # Keep the 'text' column
        'labels': Sequence(feature=Value(dtype='float32'))
    }))
    for split, data_split in data.items()
})

# Step 2: Load Longformer tokenizer
tokenizer = LongformerTokenizerFast.from_pretrained("allenai/longformer-base-4096")

# Tokenize datasets
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        max_length=1024, #2048, # 4096,  # Adjust max length for Longformer
        padding="max_length",
        truncation=True,
        #return_tensors="pt",
    )

def preprocess_labels(examples):
    """
    print(f"examples['labels']: {len(examples['labels'])} {examples['labels']}")
    print(f"examples['labels'][0]: {len(examples['labels'][0])} {examples['labels'][0]}")
    # Ensure labels are flattened and converted to float
    if isinstance(examples["labels"][0], list):  # Nested lists
        print("Nested list")
        flattened = [float(label) for sublist in examples["labels"] for label in sublist]
    else:  # Single list
        print("Single list")
        flattened = [float(label) for label in examples["labels"]]

    # Validate the length of labels
    #expected_length = 2  # Replace with the correct number of labels for your task
    #if len(flattened) != expected_length:
    #    raise ValueError(f"Labels length mismatch: Expected {expected_length}, got {len(flattened)}")

    examples["labels"] = flattened
    print(f"examples['labels']: {len(examples['labels'])} {examples['labels']}")
    return examples
    """
    # Convert each label list to a list of floats
    examples["labels"] = [[float(label_item) for label_item in label_list] for label_list in examples["labels"]]
    return examples


encoded_dataset = dataset.map(tokenize_function, batched=True)

print(f"encoded_dataset['train'][0]: {encoded_dataset['train'][0]}")  # Inspect a sample
#encoded_dataset = encoded_dataset.map(preprocess_labels, batched=True)
#print(f"encoded_dataset['train'][0]: {encoded_dataset['train'][0]}")  # Inspect a sample


# Step 3: Load Longformer model for sequence classification
model = LongformerForSequenceClassification.from_pretrained(
    "allenai/longformer-base-4096",
    num_labels=3,  # Number of labels in your multi-label problem
    problem_type="multi_label_classification",
)


# Step 4: Define Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = (logits > 0).astype(int)  # Convert logits to binary predictions
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="macro")
    roc_auc = roc_auc_score(labels, logits, average="macro", multi_class="ovr")
    return {"precision": precision, "recall": recall, "f1": f1, "roc_auc": roc_auc}

# Step 5: Set up Trainer
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    learning_rate=5e-5,
    per_device_train_batch_size=1,  # Reduce batch size
    per_device_eval_batch_size=1,  # Reduce batch size
    gradient_accumulation_steps=4,  # Simulate larger batch size
    fp16=True,  # Enable mixed precision
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

!nvidia-smi

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Step 6: Train the Model
trainer.train()




vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

encoded_dataset['train'][0]: {'text': 'This is a training example.', 'labels': [1.0, 0.0, 0.0], 'input_ids': [0, 713, 16, 10, 1058, 1246, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fri Dec 20 09:40:26 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P8               9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Initializing global attention on CLS token...


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Roc Auc
1,No log,0.693693,0.166667,0.333333,0.222222,0.666667
2,No log,0.693122,0.166667,0.333333,0.222222,0.666667
3,No log,0.692784,0.166667,0.333333,0.222222,0.666667


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=3, training_loss=0.3558661142985026, metrics={'train_runtime': 65.1146, 'train_samples_per_second': 0.092, 'train_steps_per_second': 0.046, 'total_flos': 3941138018304.0, 'train_loss': 0.3558661142985026, 'epoch': 3.0})

In [3]:
# Step 7: Evaluate the Model
results = trainer.evaluate(encoded_dataset["test"])
print("Test Results:", results)

Test Results: {'eval_loss': 0.6923288106918335, 'eval_precision': 0.16666666666666666, 'eval_recall': 0.3333333333333333, 'eval_f1': 0.2222222222222222, 'eval_roc_auc': 1.0, 'eval_runtime': 0.2165, 'eval_samples_per_second': 9.237, 'eval_steps_per_second': 9.237, 'epoch': 3.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
