In [None]:
from google.colab import userdata

# Identifier of the checkpoint loaded further down in the notebook.
lm_model_inst = 'meta-llama/Llama-3.2-1B-Instruct'

# Access token read from the Colab secrets store under the key 'HF_TOKEN'.
hf_auth = userdata.get('HF_TOKEN')

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

# Load the tokenizer and the causal-LM weights for the checkpoint named in
# `lm_model_inst`, authenticating with `hf_auth`.
tokenizer = AutoTokenizer.from_pretrained(lm_model_inst, token=hf_auth)
model = AutoModelForCausalLM.from_pretrained(lm_model_inst, token=hf_auth)

# Use the GPU when the runtime has one; otherwise fall back to the CPU instead
# of failing on a hard-coded 'cuda' device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

In [None]:
# https://huggingface.co/docs/transformers/v4.46.3/en/main_classes/text_generation#transformers.GenerationConfig

def ask(question, model, device):
    """Generate a completion for ``question`` and decode the newly produced tokens.

    Uses the module-level ``tokenizer`` for encoding and decoding.

    Args:
        question: Prompt string. It may be plain text or a prompt that was
            already rendered with the chat template.
        model: Object exposing ``generate(input_ids=..., attention_mask=...,
            generation_config=...)``.
        device: Device the encoded inputs are moved to before generation.

    Returns:
        A ``(response, answer)`` tuple: ``response`` is the raw output of
        ``model.generate`` (prompt ids followed by generated ids) and
        ``answer`` is the decoded text of the generated part only, for the
        first (and only) sequence in the batch.
    """
    # A prompt rendered with `apply_chat_template(..., tokenize=False)` may
    # already start with the BOS token; letting the tokenizer add its special
    # tokens again would then duplicate it. Only add them for plain prompts.
    bos = tokenizer.bos_token
    already_has_bos = bool(bos) and question.startswith(bos)

    tk = tokenizer(question, return_tensors='pt', add_special_tokens=not already_has_bos)
    input_ids = tk['input_ids'].to(device)
    attention_mask = tk['attention_mask'].to(device)

    # Sampling with a near-zero temperature: the distribution collapses onto
    # the most likely token, so the output is effectively greedy.
    gen_config = GenerationConfig(
        do_sample=True,
        max_new_tokens=256,
        temperature=0.0000001)

    response = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        generation_config=gen_config)

    # Drop the prompt tokens so that only the continuation is decoded.
    prompt_len = len(input_ids[0])
    answer = tokenizer.batch_decode(response[:, prompt_len:], skip_special_tokens=True)[0]

    return response, answer

In [4]:
from sklearn.model_selection import train_test_split
from google.colab import drive
import pandas as pd
import nltk
import torch
from tqdm import tqdm
import numpy as np

# Mount Google Drive and read the BBC news CSV stored under the notebook folder.
drive.mount('/content/gdrive')
root = "/content/gdrive/MyDrive/Colab Notebooks/torch/"
df = pd.read_csv(root+"data/BBC-text/bbc-text.csv")

# Note: the textual labels are used directly, so the dataset creation changes
# accordingly.
# The unique labels are sorted because iterating a bare set of strings gives an
# order that can change between kernel restarts, and later cells depend on the
# order (first-match label normalisation, constraint token lists).
labels_list = sorted(set(df['category']))

# Hold out 20% of the rows for testing, then 10% of the remainder for validation.
(x_train, x_test, y_train, y_test) = train_test_split(df['text'], df['category'], test_size=0.2, random_state=17)
(x_train, x_val, y_train, y_val) = train_test_split(x_train, y_train, test_size=0.1, random_state=17)

# NLTK resources fetched before the Dataset class tokenizes the texts.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

class Dataset(torch.utils.data.Dataset):
    """In-memory text dataset yielding ``(cleaned_text, label)`` pairs.

    Each text is tokenized with NLTK, lower-cased and re-joined with single
    spaces; English stop words are optionally removed.
    """

    def __init__(self, x, y, stopwords):
        """Tokenize and clean every text in ``x`` and store the labels in ``y``.

        Args:
            x: Iterable of raw texts (the notebook passes a pandas Series).
            y: Iterable of labels aligned with ``x`` (a pandas Series in the notebook).
            stopwords: When truthy, tokens found in NLTK's English stop-word
                list are dropped.
        """
        tokenized = [nltk.word_tokenize(text, language='english')
                     for text in list(x)]

        # Load the stop-word list once and keep it in a set: evaluating
        # `stopwords.words("english")` for every token repeats the same lookup
        # and turns the filter into a linear scan per token.
        stop_set = set(nltk.corpus.stopwords.words("english")) if stopwords else None

        text_clean = []
        for sentence in tqdm(tokenized, desc='Tokenizing ... '):
            lowered = [w.lower() for w in sentence]
            if stop_set is not None:
                lowered = [w for w in lowered if w not in stop_set]
            # Tokens are separated from each other by a single space.
            text_clean.append(' '.join(lowered))

        self.texts = text_clean
        self.labels = list(y)

    def classes(self):
        """Return the list of labels, one per example."""
        return self.labels

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair stored at position ``idx``."""
        return self.texts[idx], self.labels[idx]

# Experiment settings. Only the "stopwords" entry is read in this cell; it
# decides whether the Dataset objects strip English stop words.
hyperparameters = dict(
    epochs=5,
    learning_rate=1e-3,
    batch_size=64,
    dropout=0.1,
    stopwords=False,
    layers=1,
    h_dim=300,
    bilstm=True,
    patience=5,
    min_delta=0.01,
)

# Build the three splits in the same order as before: train, validation, test.
train_dataset, val_dataset, test_dataset = [
    Dataset(split_texts, split_labels, hyperparameters["stopwords"])
    for split_texts, split_labels in (
        (x_train, y_train),
        (x_val, y_val),
        (x_test, y_test),
    )
]

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Tokenizing ... : 100%|██████████| 1602/1602 [00:00<00:00, 14431.84it/s]
Tokenizing ... : 100%|██████████| 178/178 [00:00<00:00, 12662.59it/s]
Tokenizing ... : 100%|██████████| 445/445 [00:00<00:00, 15206.91it/s]


In [5]:
# Show the first test example: the cleaned text on one line, its label on the next.
print(*test_dataset[0], sep='\n')

butler strikes gold in spain britain s kathy butler continued her impressive year with victory in sunday s 25th cross internacional de venta de banos in spain . the scot who led gb to world cross country bronze earlier this year moved away from the field with ines monteiro halfway into the 6.6km race . she then shrugged off her portuguese rival to win in 20 minutes 38 seconds . meanwhile briton karl keska battled bravely to finish seventh in the men s 10.6km race in a time of 31:41. kenenisa bekele of ethiopia - the reigning world long and short course champion - was never troubled by any of the opposition winning leisurely in 30.26. butler said of her success : i felt great throughout the race and hope this is a good beginning for a marvellous 2005 season for me . elsewhere abebe dinkessa of ethiopia won the brussels iaaf cross-country race on sunday completing the 10 500m course in 33.22. gelete burka then crowned a great day for ethiopia by claiming victory in the women s race .
spo

In [None]:
# Two-turn conversation: the system turn carries the task instruction, the user
# turn carries the first test article.
messages = [
    dict(role='system', content='Classify the following article in one of the folllowing categories: business, politics, tech, sport or entertainment'),
    dict(role='user', content=test_dataset[0][0]),
]

# Render the conversation as plain text (no tokenization) to inspect the prompt format.
rendered = tokenizer.apply_chat_template(messages, tokenize=False)
print(rendered)

In [None]:
# Classify the first test article with the instruction given as system message.
messages = [
    {'role': 'system', 'content': 'Classify the following article in one of the folllowing categories: business, politics, tech, sport or entertainment'},
    {'role': 'user', 'content': test_dataset[0][0]}
]

# add_generation_prompt=True appends the assistant header to the rendered
# prompt, so the model starts writing its reply instead of first having to
# open the assistant turn (or continuing the user turn).
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

response, answer = ask(prompt, model, device)
print(answer)

In [None]:
# Iterate over the entire test set and collect (model answer, gold label) pairs.

results = []

for texts, labels in tqdm(test_dataset, desc='test set'):
    messages = [
        {'role': 'system', 'content': 'Classify the following article in one of the folllowing categories: business, politics, tech, sport or entertainment'},
        {'role': 'user', 'content': texts}
    ]

    # add_generation_prompt=True appends the assistant header so the model
    # answers as the assistant rather than continuing the user turn.
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    response, answer = ask(prompt, model, device)
    results.append((answer, labels))

In [None]:
# Print the first four (answer, gold label) pairs.
for position in range(4):
    print(results[position])

In [None]:
# Clean up the output: normalise each prediction according to which label it
# contains.

goldens = []
pred_fixed = []

for pred, golden in results:
    goldens.append(golden)
    lowered = pred.lower()
    # First label (in `labels_list` order) that occurs as a substring of the
    # lower-cased answer; the empty string when none does.
    matched = next((label for label in labels_list if label in lowered), '')
    pred_fixed.append(matched)

from sklearn.metrics import classification_report
print(classification_report(goldens, pred_fixed))

In [None]:
# constrain decoding, forzo il modello a restituire delle parole specifiche, le nostre label

labels_tok = [[tokenizer(' '+l, add_special_tokens=False)['input_ids']] for l in labels_list]
print(labels_tok)

In [None]:
def ask_constrain(question, model, device, force_words_ids):
    """Generate a short completion constrained to contain the given token sequences.

    Uses the module-level ``tokenizer`` for encoding and decoding.

    Args:
        question: Prompt string, plain or already rendered with the chat template.
        model: Object exposing ``generate(input_ids=..., attention_mask=...,
            generation_config=...)``.
        device: Device the encoded inputs are moved to before generation.
        force_words_ids: Token-id constraints forwarded unchanged to
            ``GenerationConfig(force_words_ids=...)``.

    Returns:
        A ``(response, answer)`` tuple: the raw output of ``model.generate``
        and the decoded text of the generated part of the first sequence.
    """
    # Avoid a duplicated BOS token when the prompt was rendered with the chat
    # template and already starts with it.
    bos = tokenizer.bos_token
    already_has_bos = bool(bos) and question.startswith(bos)

    tk = tokenizer(question, return_tensors='pt', add_special_tokens=not already_has_bos)
    input_ids = tk['input_ids'].to(device)
    attention_mask = tk['attention_mask'].to(device)

    # The original argument list was missing a comma after the temperature
    # entry (a SyntaxError). Temperature is a sampling setting and sampling is
    # not enabled here, so the entry is dropped rather than repaired.
    # Forced words need beam search, hence num_beams > 1.
    gen_config = GenerationConfig(
        force_words_ids=force_words_ids,
        max_new_tokens=10,
        num_beams=2,
    )

    response = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        generation_config=gen_config)

    # Drop the prompt tokens so that only the continuation is decoded.
    prompt_len = len(input_ids[0])
    answer = tokenizer.batch_decode(response[:, prompt_len:], skip_special_tokens=True)[0]

    return response, answer

In [None]:
# The input is redefined as well: part of the LLM's answer is written in
# advance, following the chat template.
# More information on the prompt format of each model is on the model card or
# in the published reference paper.
# See here for Llama 3:
    # https://www.llama.com/docs/how-to-guides/prompting#prompting
    # https://www.llama.com/docs/model-cards-and-prompt-formats/llama3_1#prompt-template

# The rendered template already closes the last turn, so appending another
# '<|eot_id|>' by hand would duplicate the end-of-turn marker.
# add_generation_prompt=True lets the template open the assistant turn itself;
# only the answer prefix is appended.
tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)+'I would classify the article as:'

In [None]:
# Run constrained decoding over the whole test set, pre-filling the beginning
# of the assistant's answer, and collect (model answer, gold label) pairs.
results_2 = []

for texts, labels in tqdm(test_dataset, desc='test set'):
    messages = [
        {'role': 'system', 'content': 'Classify the following article in one of the folllowing categories: business, politics, tech, sport or entertainment'},
        {'role': 'user', 'content': texts}
    ]

    # The template opens the assistant turn (add_generation_prompt=True).
    # Adding '<|eot_id|>' and the assistant header by hand would duplicate the
    # end-of-turn marker the template already emits after the user message.
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + 'I would classify the article as:'

    response, answer = ask_constrain(prompt, model, device, labels_tok)
    results_2.append((answer, labels))


In [None]:
# Print the first four (answer, gold label) pairs of the constrained run.
for position in range(4):
    print(results_2[position])

In [None]:
# Normalise the constrained-decoding answers and report the metrics.
# This cell must read `results_2` and fill `goldens_2` / `pred_fixed_2`; the
# earlier version looped over `results` and appended to the first run's lists,
# which left these two empty and polluted the earlier evaluation.
goldens_2 = []
pred_fixed_2 = []

for pred, golden in results_2:
    goldens_2.append(golden)
    preds_lower = pred.lower()

    # First label (in `labels_list` order) contained in the lower-cased
    # answer; the empty string when none matches.
    matched = next((l for l in labels_list if l in preds_lower), '')
    pred_fixed_2.append(matched)

print(classification_report(goldens_2, pred_fixed_2))