In [1]:
import os 
DATA_PATH="NLP_CS/data/"
train_data=os.path.join(DATA_PATH,"traindata.csv")
test_data=os.path.join(DATA_PATH,"testdata.csv")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# load the csv  by hand 
polarity=[]
Aspect_Category=[]
Target_term=[]
Character_offset=[]
Sentence=[]
polarity_to_label={
    "positive":0,
    "negative":1,
    "neutral":2,
}
labels=[]
with open(train_data) as f:
    for line in f:
        line=line.strip()

        # split by space and remove the \t 
        tokens=line.split("\t") 
        polarity.append(tokens[0])
        Aspect_Category.append(tokens[1])
        Target_term.append(tokens[2])
        Character_offset.append(tokens[3])
        assert len(tokens[4:])==1,"sentence should be one token,got "+str(len(tokens[4:]))
        Sentence.append(str(tokens[4:][0]))
        labels.append(polarity_to_label[tokens[0]])

In [3]:
Sentence[0]


"short and sweet – seating is great:it's romantic,cozy and private."

In [4]:
ds_train=pd.DataFrame({"polarity":polarity,
                        "Aspect_Category":Aspect_Category,
                        "Target_term":Target_term,
                        "Character_offset":Character_offset,

                        "labels":labels,
                        "Sentence":Sentence})
ds_train
from datasets import Dataset
ds_train = Dataset.from_pandas(ds_train)
ds_train

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['polarity', 'Aspect_Category', 'Target_term', 'Character_offset', 'labels', 'Sentence'],
    num_rows: 1503
})

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

In [6]:
# Encode input
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
inputs = tokenizer("what is your name?", return_tensors="pt")

# Generate output
outputs = model.generate(**inputs, max_new_tokens=50)

# Decode and print
print("yes")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


yes
what is your name?
I'm not sure, I'm just a guy who likes to play with people. I'm a pretty good player, but I'm not a good player. I'm not a good player either.
I'm a guy who likes to play


# Reprise du TD de NLP

In [7]:
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModel, pipeline
from transformers import logging as hflogging

plm_name="facebook/opt-125m"
# Load the config, the tokenizer and the model itself:
lmconfig = AutoConfig.from_pretrained(plm_name)
lmtokenizer = AutoTokenizer.from_pretrained(plm_name)
lm = AutoModel.from_pretrained(plm_name, output_attentions=False)


Basic top Model

In [8]:
from transformers import TrainingArguments, Trainer
import numpy as np


class TransformerBinaryClassifier(torch.nn.Module):

    def __init__(self, plm_name: str):
        super(TransformerBinaryClassifier, self).__init__()
        self.lmconfig = AutoConfig.from_pretrained(plm_name)
        self.lmtokenizer = AutoTokenizer.from_pretrained(plm_name)
        self.lm = AutoModel.from_pretrained(plm_name, output_attentions=False)
        self.emb_dim = self.lmconfig.hidden_size
        self.output_size = 1
        self.classifier = torch.nn.Sequential(
            torch.nn.Dropout(0.2),
            torch.nn.Linear(self.emb_dim, self.output_size),
            torch.nn.Sigmoid()
        )
        self.loss_fn = torch.nn.CrossEntropyLoss(reduction='mean')


    def forward(self, x):
        x : torch.Tensor = self.lm(x['input_ids'], x['attention_mask']).last_hidden_state
        global_vects = x.mean(dim=1)
        x = self.classifier(global_vects)
        return x.squeeze(-1)

    def compute_loss(self, predictions, target):
        return self.loss_fn(predictions, target)


model = TransformerBinaryClassifier(plm_name)

In [9]:
ds_train

Dataset({
    features: ['polarity', 'Aspect_Category', 'Target_term', 'Character_offset', 'labels', 'Sentence'],
    num_rows: 1503
})

In [10]:
X_train_encoded = model.lmtokenizer(ds_train["Sentence"],
                            truncation=True,
                            padding=False,
                            add_special_tokens=False,
                            return_tensors=None,
                            return_offsets_mapping=False,
                        )

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [27]:
def tokenize_function(examples):
    return model.lmtokenizer(examples["Sentence"], truncation=True)
def tokenize_function2(examples):
    # Concatenate fields into a single input string
    combined_input = [f"{a} [SEP] {t} [SEP] {s}" for a, t, s in zip(examples["Aspect_Category"], examples["Target_term"], examples["Sentence"])]
    return model.lmtokenizer(combined_input, truncation=True)
tok_ds_train = ds_train.map(tokenize_function2, batched=True)

tok_ds_train = tok_ds_train.remove_columns(["polarity", "Aspect_Category", "Target_term", "Character_offset", "Sentence"])

# tok_ds_train = tok_ds_train.rename_column("label", "labels")


Map: 100%|██████████| 1503/1503 [00:00<00:00, 21881.73 examples/s]


In [None]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader



# just for testing


In [None]:
from torch.optim import AdamW
from transformers import get_scheduler

optimizer = AdamW(model.parameters(), lr=5e-5)
from tqdm.auto import tqdm



def train_model():
    data_collator = DataCollatorWithPadding(tokenizer=model.lmtokenizer, padding=True, return_tensors='pt')

    train_dataloader = DataLoader(tok_ds_train, shuffle=True, batch_size=32, collate_fn=data_collator,num_workers=2)
    num_epochs = 5
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
    )

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # device = 'cpu'

    model.to(device)


    progress_bar = tqdm(range(num_training_steps))

    model.train()
    for epoch in range(num_epochs):
        correct = 0
        total = 0
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            predictions = model(batch)
            loss = model.loss_fn(predictions, batch['labels'].float())
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)


            # Compute binary predictions: threshold at 0.5
            preds = (predictions > 0.5).float()
            correct += (preds == batch['labels']).sum().item()
            total += batch['labels'].size(0)
            
            
        accuracy = correct / total
        print(f"Epoch {epoch + 1}/{num_epochs} - Accuracy: {accuracy:.4f}")

In [None]:
train_model()


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

Epoch 1/5 - Accuracy: 0.8410



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

Epoch 2/5 - Accuracy: 0.8649



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

Epoch 3/5 - Accuracy: 0.8869



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

Epoch 4/5 - Accuracy: 0.8949



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

  3%|▎         | 25/940 [00:04<02:57,  5.14it/s]
 20%|██        | 190/940 [00:30<02:09,  5.81it/s]

Epoch 1/5 - Accuracy: 0.7186


 40%|████      | 378/940 [01:00<01:23,  6.72it/s]

Epoch 2/5 - Accuracy: 0.8277


 60%|██████    | 566/940 [01:31<01:00,  6.16it/s]

Epoch 3/5 - Accuracy: 0.8516


 80%|████████  | 754/940 [02:01<00:30,  6.04it/s]

Epoch 4/5 - Accuracy: 0.8596


100%|██████████| 940/940 [02:31<00:00,  6.55it/s]

Epoch 5/5 - Accuracy: 0.8570
