In [1]:
import os 
DATA_PATH="NLP_CS/data/"
train_data=os.path.join(DATA_PATH,"traindata.csv")
test_data=os.path.join(DATA_PATH,"testdata.csv")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
from NLP_CS.src.clearml import safe_init_clearml
task=safe_init_clearml(
    project_name="NLP_CS",
    task_name="Fine-tune GPT2",
)

ClearML Task: created new task id=b5def593185942e4a8d630055e944dcf


 83%|████████▎ | 196/235 [03:37<00:43,  1.11s/it]

2025-04-07 10:46:54,053 - clearml.Task - INFO - Storing jupyter notebook directly as code





ClearML results page: https://app.clear.ml/projects/459d62b15dfc436e81a0bf48eacd0c8f/experiments/b5def593185942e4a8d630055e944dcf/output/log


Running inside MIG, Nvidia driver cannot export utilization, pushing fixed value 100


In [2]:
# load the csv  by hand 
def load_data(file):
    polarity=[]
    Aspect_Category=[]
    Target_term=[]
    Character_offset=[]
    Sentence=[]
    polarity_to_label={
        "positive":0,
        "negative":1,
        "neutral":2,
    }
    labels=[]
    with open(train_data) as f:
        for line in f:
            line=line.strip()

            # split by space and remove the \t 
            tokens=line.split("\t") 
            polarity.append(tokens[0])
            Aspect_Category.append(tokens[1])
            Target_term.append(tokens[2])
            Character_offset.append(tokens[3])
            assert len(tokens[4:])==1,"sentence should be one token,got "+str(len(tokens[4:]))
            Sentence.append(str(tokens[4:][0]))
            labels.append(polarity_to_label[tokens[0]])
    ds_train=pd.DataFrame({"polarity":polarity,
                        "Aspect_Category":Aspect_Category,
                        "Target_term":Target_term,
                        "Character_offset":Character_offset,

                        "labels":labels,
                        "Sentence":Sentence})
    ds_train
    from datasets import Dataset
    ds_train = Dataset.from_pandas(ds_train)
    ds_train
    return ds_train 
ds_train=load_data(train_data)
ds_test=load_data(test_data)

# Reprise du TD de NLP

In [12]:
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModel, pipeline
from transformers import logging as hflogging

plm_name="facebook/opt-125m"
# plm_name="google-bert/bert-base-cased"
# Load the config, the tokenizer and the model itself:
lmconfig = AutoConfig.from_pretrained(plm_name)
lmtokenizer = AutoTokenizer.from_pretrained(plm_name)
lm = AutoModel.from_pretrained(plm_name, output_attentions=False)


Basic top Model

In [5]:
from transformers import TrainingArguments, Trainer
import numpy as np


class TransformerBinaryClassifier(torch.nn.Module):

    def __init__(self, plm_name: str):
        super(TransformerBinaryClassifier, self).__init__()
        self.lmconfig = AutoConfig.from_pretrained(plm_name)
        self.lmtokenizer = AutoTokenizer.from_pretrained(plm_name)
        self.lm = AutoModel.from_pretrained(plm_name, output_attentions=False)
        self.emb_dim = self.lmconfig.hidden_size
        self.output_size = 1
        self.classifier = torch.nn.Sequential(
            torch.nn.Dropout(0.2),
            torch.nn.Linear(self.emb_dim, self.output_size),
            torch.nn.Sigmoid()
        )
        self.loss_fn = torch.nn.CrossEntropyLoss(reduction='mean')


    def forward(self, x):
        x : torch.Tensor = self.lm(x['input_ids'], x['attention_mask']).last_hidden_state
        global_vects = x.mean(dim=1)
        x = self.classifier(global_vects)
        return x.squeeze(-1)

    def compute_loss(self, predictions, target):
        return self.loss_fn(predictions, target)


model = TransformerBinaryClassifier(plm_name)

In [6]:
X_train_encoded = model.lmtokenizer(ds_train["Sentence"],
                            truncation=True,
                            padding=False,
                            add_special_tokens=False,
                            return_tensors=None,
                            return_offsets_mapping=False,
                        )
X_val_encoded = model.lmtokenizer(ds_test["Sentence"],
                            truncation=True,
                            padding=False,
                            add_special_tokens=False,
                            return_tensors=None,
                            return_offsets_mapping=False,
                        )

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [7]:
def tokenize_function(examples):
    return model.lmtokenizer(examples["Sentence"], truncation=True)
def tokenize_function2(examples):
    # Concatenate fields into a single input string
    combined_input = [f"{a} [SEP] {t} [SEP] {s}" for a, t, s in zip(examples["Aspect_Category"], examples["Target_term"], examples["Sentence"])]
    return model.lmtokenizer(combined_input, truncation=True)
def get_tok_ds(ds):
    tok_ds = ds.map(tokenize_function, batched=True)
    tok_ds = tok_ds.remove_columns(["polarity", "Aspect_Category", "Target_term", "Character_offset", "Sentence"])
    return tok_ds
tok_ds_train = ds_train.map(tokenize_function2, batched=True)

tok_ds_train = tok_ds_train.remove_columns(["polarity", "Aspect_Category", "Target_term", "Character_offset", "Sentence"])

# tok_ds_train = tok_ds_train.rename_column("label", "labels")
tok_ds_test=get_tok_ds(ds_test)

Map: 100%|██████████| 1503/1503 [00:00<00:00, 21597.53 examples/s]
Map: 100%|██████████| 1503/1503 [00:00<00:00, 33684.42 examples/s]


In [8]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader



# just for testing


In [None]:
from torch.optim import AdamW
from transformers import get_scheduler

optimizer = AdamW(model.parameters(), lr=5e-5)
from tqdm.auto import tqdm



def train_model():
    data_collator = DataCollatorWithPadding(tokenizer=model.lmtokenizer, padding=True, return_tensors='pt')

    train_dataloader = DataLoader(tok_ds_train, shuffle=True, batch_size=32, collate_fn=data_collator)# couldn't increase number of workers
    val_dataloader = DataLoader(tok_ds_test, shuffle=False, batch_size=32, collate_fn=data_collator)# couldn't increase number of workers

    num_epochs = 5
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
    )

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)

    progress_bar = tqdm(range(num_training_steps))
    model.train()

    for epoch in range(num_epochs):
        correct = 0
        total = 0
        train_loss = 0

        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            predictions = model(batch)
            loss = model.loss_fn(predictions, batch['labels'].float())
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

            preds = (predictions > 0.5).float()
            correct += (preds == batch['labels']).sum().item()
            total += batch['labels'].size(0)
            train_loss += loss.item()

        train_accuracy = correct / total
        avg_train_loss = train_loss / len(train_dataloader)

        # Validation phase
        model.eval()
        val_loss = 0
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for batch in val_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                predictions = model(batch)
                loss = model.loss_fn(predictions, batch['labels'].float())

                preds = (predictions > 0.5).float()
                val_correct += (preds == batch['labels']).sum().item()
                val_total += batch['labels'].size(0)
                val_loss += loss.item()

        val_accuracy = val_correct / val_total
        avg_val_loss = val_loss / len(val_dataloader)

        print(f"Epoch {epoch + 1}/{num_epochs} - Train Acc: {train_accuracy:.4f}, Train Loss: {avg_train_loss:.4f} - Val Acc: {val_accuracy:.4f}, Val Loss: {avg_val_loss:.4f}")

        model.train()


In [10]:
train_model()

  0%|          | 0/235 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
 20%|██        | 47/235 [00:27<01:39,  1.89it/s]TOKENIZERS_PARALLELISM=(true | false)


Epoch 1/5 - Accuracy: 0.7159


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
 40%|████      | 94/235 [00:54<01:17,  1.82it/s]

Epoch 2/5 - Accuracy: 0.8064


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
 60%|██████    | 141/235 [01:21<00:53,  1.77it/s]

Epoch 3/5 - Accuracy: 0.8576


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
 80%|████████  | 188/235 [01:48<00:28,  1.67it/s]

Epoch 4/5 - Accuracy: 0.8756


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
 83%|████████▎ | 196/235 [01:53<00:21,  1.80it/s]

KeyboardInterrupt: 

  3%|▎         | 25/940 [00:04<02:57,  5.14it/s]
 20%|██        | 190/940 [00:30<02:09,  5.81it/s]

Epoch 1/5 - Accuracy: 0.7186


 40%|████      | 378/940 [01:00<01:23,  6.72it/s]

Epoch 2/5 - Accuracy: 0.8277


 60%|██████    | 566/940 [01:31<01:00,  6.16it/s]

Epoch 3/5 - Accuracy: 0.8516


 80%|████████  | 754/940 [02:01<00:30,  6.04it/s]

Epoch 4/5 - Accuracy: 0.8596


100%|██████████| 940/940 [02:31<00:00,  6.55it/s]

Epoch 5/5 - Accuracy: 0.8570
