# Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
## Important libs ##
import os
from pathlib import Path
import huggingface_hub
from datasets import load_dataset
import random
import re
from sklearn.metrics import classification_report

os.chdir(Path.cwd().parent)

from src.utils import load_env_file

load_env_file()
api_key = os.getenv("HF_TOKEN")

huggingface_hub.login(api_key)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


# Generating Synthetic (annotated) Datasets 

In [3]:
dataset = load_dataset("financial_phrasebank", "sentences_allagree", split='train', trust_remote_code=True).shuffle(seed=42)

#dataset = load_dataset("financial_phrasebank", "sentences_allagree", trust_remote_code=True)

# create a new column with the numeric label verbalised as label_text (e.g. "positive" instead of "0")
label_map = {
    i: label_text 
    for i, label_text in enumerate(dataset.features["label"].names)
}

def add_label_text(example):
    example["label_text"] = label_map[example["label"]]
    return example

dataset = dataset.map(add_label_text)

print(dataset)
# Dataset({
#    features: ['sentence', 'label', 'label_text'],
#    num_rows: 2264
#})

Dataset({
    features: ['sentence', 'label', 'label_text'],
    num_rows: 2264
})


In [9]:
dataset['label_text'][:20]

['neutral',
 'neutral',
 'neutral',
 'positive',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'positive',
 'neutral',
 'negative',
 'positive',
 'neutral',
 'positive',
 'neutral',
 'neutral',
 'neutral',
 'positive']

## Prompt Engineering

In [23]:
#"You are a highly qualified expert trained to annotate machine learning training data."
prompt_financial_sentiment = """\
Your task is to analyze the sentiment in the TEXT below from an investor perspective and label it with only one the three labels:
positive, negative, or neutral.

Base your label decision only on the TEXT and do not speculate e.g. based on prior knowledge about a company. 

Do not provide any explanations and ONLY respond with one of the labels as one word: negative, positive, or neutral

Examples:
Text: Operating profit increased, from EUR 7m to 9m compared to the previous reporting period.
Label: positive
Text: The company generated net sales of 11.3 million euro this year.
Label: neutral
Text: Profit before taxes decreased to EUR 14m, compared to EUR 19m in the previous period.	
Label: negative

Your TEXT to analyse:
TEXT: {text}
Label: """


In [12]:
prompt_financial_sentiment = """\
Your task is to analyze the sentiment in the TEXT below from an investor perspective and label it with only one the three labels:
positive, negative, or neutral.

ONLY respond with one of the labels as one word: negative, positive, or neutral. DONT BRING ANYTHING ELSE IN THE ANSWER, JUST THE LABEL

If there is no comparison (comparing previous year with current year, or comparing two companies results), we can assume that the comment is neutral.

Examples:
Text: Operating profit increased, from EUR 7m to 9m compared to the previous reporting period.
Answer: positive
Text: The company generated net sales of 11.3 million euro this year.
Answer: neutral
Text: There are 100 companies listed in the stock exchange.
Answer: neutral
Text: Profit before taxes decreased to EUR 14m, compared to EUR 19m in the previous period.	
Answer: negative

Your TEXT to analyse:
TEXT: {text}
Answer: """

In [None]:
# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")

# chat_financial_sentiment = [{"role": "user", "content": prompt_financial_sentiment}]

# prompt_financial_sentiment = tokenizer.apply_chat_template(chat_financial_sentiment, tokenize=False)

# The prompt now includes special tokens: '<s>[INST] You are a highly qualified expert ...  [/INST]'


In [None]:
import torch 
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline 

model_id = "microsoft/Phi-3-mini-4k-instruct" #"microsoft/Phi-3-mini-4k-instruct"
torch.random.manual_seed(0) 
model = AutoModelForCausalLM.from_pretrained( 
    model_id,  
    device_map="cuda",  
    torch_dtype="auto",  
    trust_remote_code=True,  
) 

tokenizer = AutoTokenizer.from_pretrained(model_id) 

pipe = pipeline( 
    "text-generation", 
    model=model, 
    tokenizer=tokenizer, 
)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [39]:
pipe = pipeline( 
    "text-generation", 
    model=model, 
    tokenizer=tokenizer, 
)

In [22]:
labels = ["positive", "negative", "neutral"]

# ---- Cleaning function (maps to labels) ----
def clean_output(outputs, labels=("positive", "negative", "neutral")):
    results = []
    for out in outputs:
        text = out[0]["generated_text"].strip()  # each out is a list of dicts
        found = None
        for label in labels:
            if label.lower() in text.lower():
                found = label
                break
        if not found:
            found = "FAIL"
        results.append(found)
    return results

In [90]:
messages = [
    {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"}, 
] 

generation_args = { 
    "max_new_tokens": 500, 
    "return_full_text": False, 
    "temperature": 0.9, 
    "do_sample": False, 
} 

output = pipe(messages, **generation_args) 
print(output[0]['generated_text'])



 Certainly! Bananas and dragonfruits can be combined in various delicious ways. Here are some creative ideas for incorporating both fruits into your meals or snacks:

1. Smoothie: Blend together a ripe banana, a few slices of dragon fruit, a handful of spinach or kale, a splash of almond milk, and a tablespoon of honey or agave syrup for sweetness. Add a scoop of your favorite protein powder or a handful of ice for a refreshing and nutritious smoothie.

2. Fruit Salad: Slice a ripe banana and a few pieces of dragon fruit, and combine them with other fruits like strawberries, blueberries, and kiwi. Toss the fruits with a drizzle of honey and a squeeze of lime juice for a colorful and flavorful fruit salad.

3. Tropical Salsa: Dice a ripe banana and a few pieces of dragon fruit, and combine them with diced mango, pineapple, and red bell pepper. Add a squeeze of lime juice, a drizzle of honey, and a sprinkle of chopped cilantro for a sweet and tangy tropical salsa. Serve with tortilla chi

In [18]:
one_shot_text = dataset["sentence"][15]

generation_params = dict(
    top_p=0.95,
    temperature=0.4,
    max_new_tokens=128,
    return_full_text=False,
    use_cache=False
)

prompt_formatted = prompt_financial_sentiment.format(text=one_shot_text)

messages = [
    {"role": "system", "content": "You are a highly qualified expert trained to annotate machine learning training data."},
    {"role": "user", "content": prompt_formatted},
]

output = pipe(messages, **generation_params)


#output = pipe(prompt_formatted, **generation_params)
label = clean_output([output])
print(f"Input: {one_shot_text}")
print(f"Prediction: {output[0]['generated_text'].strip()}")
print("-" * 40)
# print(f"Text: {one_shot_text}\nLabel: {label}\n")



Input: Foundries division reports its sales increased by 9.7 % to EUR 63.1 mn from EUR 57.5 mn in the corresponding period in 2006 , and sales of the Machine Shop division increased by 16.4 % to EUR 41.2 mn from EUR 35.4 mn in the corresponding period in 2006 .
Prediction: positive
----------------------------------------


## Batch inference

In [40]:
N_samples = 300

### 100 samples took 3min to run

In [41]:
# ---- Build batch prompts (messages style) ----
batch_messages = [
    [
        {"role": "system", "content": "You are a highly qualified expert trained to annotate machine learning training data."},
        {"role": "user", "content": prompt_financial_sentiment.format(text=text)},
    ]
    for text in dataset["sentence"][:N_samples]
]

# ---- Run batch inference ----
raw_outputs = pipe(batch_messages, **generation_params)

# ---- Extract labels ----
predicted_labels = clean_output(raw_outputs)



In [27]:
negative_pred = [i for i, k in enumerate(predicted_labels) if k == "negative"]
negative_pred

[7, 12, 16]

In [30]:
label_negative = [i for i, k in enumerate(label_experts) if k == "negative"]
label_negative

[12]

In [54]:
def compute_metrics(label_experts, label_pred):
    # classification report gives us both aggregate and per-class metrics 
    metrics_report = classification_report(
        label_experts, label_pred, digits=2, output_dict=True, zero_division='warn'
    )
    return metrics_report

In [None]:


label_experts = dataset["label_text"][:N_samples]
label_pred = predicted_labels

metrics = compute_metrics(label_experts, label_pred)
metrics


{'negative': {'precision': 0.75,
  'recall': 1.0,
  'f1-score': 0.8571428571428571,
  'support': 15.0},
 'neutral': {'precision': 0.9545454545454546,
  'recall': 0.7924528301886793,
  'f1-score': 0.865979381443299,
  'support': 53.0},
 'positive': {'precision': 0.8055555555555556,
  'recall': 0.90625,
  'f1-score': 0.8529411764705882,
  'support': 32.0},
 'accuracy': 0.86,
 'macro avg': {'precision': 0.8367003367003368,
  'recall': 0.8995676100628932,
  'f1-score': 0.8586878050189148,
  'support': 100.0},
 'weighted avg': {'precision': 0.8761868686868687,
  'recall': 0.86,
  'f1-score': 0.8604816772069652,
  'support': 100.0}}

In [42]:
def compute_metrics(label_experts, label_pred):
    # classification report gives us both aggregate and per-class metrics 
    metrics_report = classification_report(
        label_experts, label_pred, digits=2, output_dict=True, zero_division='warn'
    )
    return metrics_report

label_experts = dataset["label_text"][:N_samples]
label_pred = predicted_labels

metrics = compute_metrics(label_experts, label_pred)
metrics

{'negative': {'precision': 0.6923076923076923,
  'recall': 1.0,
  'f1-score': 0.8181818181818182,
  'support': 54.0},
 'neutral': {'precision': 0.9705882352941176,
  'recall': 0.7764705882352941,
  'f1-score': 0.8627450980392157,
  'support': 170.0},
 'positive': {'precision': 0.7790697674418605,
  'recall': 0.881578947368421,
  'f1-score': 0.8271604938271605,
  'support': 76.0},
 'accuracy': 0.8433333333333334,
 'macro avg': {'precision': 0.8139885650145567,
  'recall': 0.8860165118679051,
  'f1-score': 0.8360291366827316,
  'support': 300.0},
 'weighted avg': {'precision': 0.8719797257006558,
  'recall': 0.8433333333333334,
  'f1-score': 0.8457089412644969,
  'support': 300.0}}

## Notes

- Accuracy of 86% achieved with the Phi-3-4k model
- Latency is still a problem, it can take 30s to classify a text, for batch inference for thousands of articles, that can be a problem

# Fine-tuning a Language Model

We will use the annotated dataset from the previous step (pretending that we don't have the ground truth labels)

In [4]:
import wandb

project_name = "llm_annotation_ft"
group = "news_classification"
# This will open a window so you can login to W&B.
# If that doesn't work, set your W&B API key below
# If you do, remove your key before publishing to GitHub.

# %env WANDB_API_KEY=YOUR_WANDB_API_KEY
#wandb.login()
run = wandb.init(project=project_name, group=group, mode="online")

[34m[1mwandb[0m: Currently logged in as: [33mgabrieldiasmp[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
from datasets import Dataset, DatasetDict

full_dataset = DatasetDict(
    {
        "train": dataset.select(range(300, 500)),
        "validation": dataset.select(range(500, 600))
    }
)

print(full_dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'label_text'],
        num_rows: 200
    })
    validation: Dataset({
        features: ['sentence', 'label', 'label_text'],
        num_rows: 100
    })
})


In [6]:
from src.model.dataset_configs import tokenize_hugging_face

full_dataset_tokenized = tokenize_hugging_face(full_dataset, text_column_name="sentence", model_str="distilbert-base-uncased")

In [7]:
full_dataset_tokenized.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"]
)


In [8]:
from src.model.dataset_configs import HFTextDataset
from torch.utils.data import DataLoader

In [9]:
train_ds = HFTextDataset(full_dataset_tokenized["train"])

train_loader = DataLoader(
    dataset=train_ds,
    batch_size=32,
    shuffle=True,
)

In [10]:
val_ds = HFTextDataset(full_dataset_tokenized["validation"])

val_loader = DataLoader(
    dataset=val_ds,
    batch_size=32,
    shuffle=False,
)

In [11]:
from src.model.transformer_models import get_huggingface_model

In [12]:
model = get_huggingface_model(num_classes=3,
                              model_str="distilbert-base-uncased", 
                              train_last_layers_only=False)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from src.model.training import HFLightningModel, train_model_lightning
from lightning.pytorch.loggers import WandbLogger

lightning_model = HFLightningModel(model=model, label_name="label", learning_rate=0.05, num_classes=3)

wandb_logger = WandbLogger(log_model="best")

trainer = train_model_lightning(
    lightning_model=lightning_model,
    train_loader=train_loader,
    val_loader=val_loader,
    logger=wandb_logger,
    max_epochs=20,
    project_name=project_name,
    group=group
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(
    model=lightning_model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)



In [None]:
wandb.finish()

## Test inference

In [59]:
test_dataset = DatasetDict(
    {
        "test": dataset.select(range(0, 300))
    }
)

In [60]:
test_dataset_tokenized = tokenize_hugging_face(test_dataset, model_str="distilbert-base-uncased")

test_dataset_tokenized.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"]
)

#test_ds = HFTextDataset(test_dataset_tokenized)

test_loader = DataLoader(
    dataset=test_dataset_tokenized['test'],
    batch_size=32,
    shuffle=False,
)

In [61]:
f"{wandb.run.entity}/{project_name}/model-{wandb.run.id}:best"

'gabrieldiasmp/llm_annotation_ft/model-l0zsmva2:best'

In [37]:
# Define checkpoint reference.
checkpoint_reference = f"{wandb.run.entity}/{project_name}/model-{wandb.run.id}:best"

# Download checkpoint locally (if not already cached).
artifact = run.use_artifact(checkpoint_reference, type="model")
artifact_dir = artifact.download()

[34m[1mwandb[0m: Downloading large artifact model-l0zsmva2:best, 255.52MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:1.1 (239.5MB/s)


In [38]:
# Load checkpoint.
model = HFLightningModel.load_from_checkpoint(str(artifact_dir) + "/model.ckpt")

/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/lightning/pytorch/utilities/parsing.py:209: Attribute 'model' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['model'])`.


In [47]:
import torch

predicted_labels = batch_outputs = trainer.predict(model=model, dataloaders=test_loader)
logits = torch.cat([batch_output["logits"] for batch_output in batch_outputs])
predicted_labels = torch.argmax(logits, dim=1)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

In [55]:
compute_metrics(test_dataset['test']['label'], predicted_labels)

{'0': {'precision': 0.7666666666666667,
  'recall': 0.42592592592592593,
  'f1-score': 0.5476190476190477,
  'support': 54.0},
 '1': {'precision': 0.84375,
  'recall': 0.9529411764705882,
  'f1-score': 0.8950276243093923,
  'support': 170.0},
 '2': {'precision': 0.5769230769230769,
  'recall': 0.5921052631578947,
  'f1-score': 0.5844155844155844,
  'support': 76.0},
 'accuracy': 0.7666666666666667,
 'macro avg': {'precision': 0.7291132478632477,
  'recall': 0.6569907885181362,
  'f1-score': 0.6756874187813414,
  'support': 300.0},
 'weighted avg': {'precision': 0.7622788461538461,
  'recall': 0.7666666666666667,
  'f1-score': 0.7538056970653656,
  'support': 300.0}}

In [49]:
len(predicted_labels)

300

In [52]:
test_dataset['test']['label']

[1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 0,
 2,
 1,
 2,
 1,
 1,
 1,
 2,
 2,
 2,
 0,
 0,
 1,
 1,
 1,
 2,
 2,
 2,
 2,
 1,
 1,
 2,
 1,
 0,
 2,
 2,
 1,
 2,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 0,
 2,
 0,
 1,
 1,
 2,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 2,
 2,
 1,
 2,
 2,
 0,
 1,
 2,
 1,
 2,
 1,
 0,
 1,
 1,
 2,
 0,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 0,
 2,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 2,
 2,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 2,
 0,
 2,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 2,
 1,
 2,
 2,
 1,
 1,
 2,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 1,
 0,
 2,
 2,
 1,
 1,
 0,
 2,
 1,
 1,
 2,
 2,
 2,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 2,
 2,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 2,
 1,
 0,
 1,
 0,
 2,
 1,
 1,
 0,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 0,
 2,
 1,
 1,
 1,
 2,
 2,
