In [1]:
import transformers
from dataclasses import dataclass, field
from typing import Dict, Optional, Sequence, List, Union
from llama_attn_replace import replace_llama_attn
import math
import logging
import torch
import sys
import datasets
import evaluate
import numpy as np


# Default system prompt; used as the default value of
# ``EvalArguments.system_prompt`` below.
SYSTEM_PROMPT = (
    "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\n"
    "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n"
)

# Prompt templates keyed by name. ``prompt_no_input_llama2`` wraps a
# ``{system_prompt}`` and an ``{instruction}`` in [INST] / <<SYS>> markers.
PROMPT_DICT = {
    "prompt_no_input_llama2":(
        "[INST] <<SYS>>\n"
        "{system_prompt}"
        "<</SYS>> \n\n {instruction} [/INST]"
    ),
}

@dataclass
class EvalArguments():
    """Options controlling model loading, generation and evaluation data.

    Every field carries a ``help`` entry (and sometimes ``required`` /
    ``nargs``) in its metadata.
    NOTE(review): that metadata is presumably consumed by an argparse-style
    dataclass parser; the parser itself is not part of this notebook.

    Fields whose default is ``None`` are annotated ``Optional[...]`` so the
    annotation matches the default, and ``top_p`` is a ``float`` because its
    default (0.9) and meaning are fractional.
    """

    # --- model ---
    model_name_or_path: Optional[str] = field(default=None,metadata={"required":True, "help": "Name of the hf model name or path for the model, tokenizer and config"})
    model_context_size: Optional[int] = field(default=None,metadata={"help": "Maximum model context size used during training"})
    use_flash_attn: bool = field(
        default=True,
        metadata={"help": "Whether use flash attention for evaluation (full attention will still be used)."},
    )
    batch_size: int = field(default=1, metadata={"help": "Batch size for evaluation"})

    # --- generation ---
    temperature: float = field(default=0.6, metadata={"help": "Temperature parameter for generation (higher values for more randomness, lower for more determinism)"})
    max_length: Optional[int] = field(default=None, metadata={"help": "Maximum number of charachter to be generated, defaults to the model original context size"})
    # Was annotated ``int``; a type-driven parser would then reject values such as 0.9.
    top_p: float = field(default=0.9, metadata={"help": "Top-p sampling parameter (higher values for more randomness)"})

    # --- data ---
    eval_data_path: Optional[str] = field(default=None,metadata={"required":True, "help": "Path to the evaluation data (validation set)."})
    split_name: str = field(default='validation', metadata={"help": "Dataset split name to be used for the evaluation."})

    # --- prompting ---
    prompt_config: Optional[str] = field(default=None,metadata={"help": "Filename containing the prompt information."})
    # default_factory gives every instance its own list.
    prompts: List[str] = field(default_factory=lambda : ["{instruction}"], metadata={"nargs":"+", "help" : "Prompt(s) to be used for the data. It may include some placeholders that need to have the same name of the dataset columns they intend to replace. When multiple prompts are given a column named prompt_idx containing the index of the prompt to be used (integer) is required in the data."})
    prompts_are_fn: bool = field(default=False, metadata={"help" :"Whether to interpret the prompts as filenames conataining the actual prompts."})
    target_column: str = field(default='output', metadata={"help" : "column to be used as the target for the prediction."})
    system_prompt: str = field(default=SYSTEM_PROMPT, metadata={"help" : "Prompt to be used as a system prompt to define the model behaviour."})

    # --- memory ---
    load_in_4bit: bool = field(default=False, metadata={"help" : "Whether to load the model in 4 bit to reduce memory usage (this should not affect inference performance)"})

def load_metrics_and_get_eval_function(tokenizer):
    """Load the evaluation metrics and build a ``compute_metrics`` callback.

    Args:
        tokenizer: Tokenizer used to turn token ids back into text for ROUGE.
            It must provide ``batch_decode``. Its ``pad_token_id`` (or, if
            that is ``None``, its ``eos_token_id``) is used as a filler for
            positions that must not be scored.

    Returns:
        A function ``compute_metrics_hf(eval_pred)`` that accepts a
        ``(predictions, labels)`` pair and returns one dict containing the
        token-level accuracy and the ROUGE scores.
    """
    # Loading a metric can be slow, so only the metrics that are actually
    # reported are loaded.
    accuracy = evaluate.load("accuracy")
    rouge = evaluate.load("rouge")

    # Label value marking positions to be ignored (the notebook masks prompt
    # tokens with -100 when it builds labels).
    ignore_index = -100

    filler_id = getattr(tokenizer, "pad_token_id", None)
    if filler_id is None:
        filler_id = getattr(tokenizer, "eos_token_id", None)
    if filler_id is None:
        filler_id = 0

    def compute_metrics_hf(eval_pred):
        """Compute accuracy and ROUGE for one ``(predictions, labels)`` pair.

        ``predictions`` may be raw scores with a trailing vocabulary axis or
        already-selected token ids with the same shape as ``labels``.
        """
        predictions, labels = eval_pred
        if isinstance(predictions, tuple):
            # Some models return several outputs; the scores come first.
            predictions = predictions[0]
        predictions = np.asarray(predictions)
        labels = np.asarray(labels)

        # Reduce over the vocabulary (last) axis, not over the sequence axis.
        if predictions.ndim == labels.ndim + 1:
            predictions = np.argmax(predictions, axis=-1)

        # Assumes a causal LM: the output at position t predicts the token at
        # position t + 1, so predictions are compared with labels shifted by one.
        # NOTE(review): confirm if this is ever used with another model type.
        predictions = predictions[..., :-1]
        labels = labels[..., 1:]
        keep = labels != ignore_index

        metrics = {}
        metrics.update(
            accuracy.compute(
                predictions=predictions[keep].tolist(),
                references=labels[keep].tolist(),
            )
        )

        # ROUGE works on text, so both sides are decoded. Ignored positions
        # are replaced with a special token that decoding skips.
        predicted_text = tokenizer.batch_decode(
            np.where(keep, predictions, filler_id), skip_special_tokens=True
        )
        reference_text = tokenizer.batch_decode(
            np.where(keep, labels, filler_id), skip_special_tokens=True
        )
        metrics.update(rouge.compute(predictions=predicted_text, references=reference_text))
        return metrics

    return compute_metrics_hf


In [2]:
# Swap in the replacement Llama attention (inference variant) before the model is built.
replace_llama_attn(inference=True)

checkpoint = 'Yukang/LongAlpaca-7B'

# Configuration exactly as stored with the checkpoint (nothing is overridden here).
config = transformers.AutoConfig.from_pretrained(checkpoint)

# bfloat16 when the GPU supports it, float16 otherwise.
if torch.cuda.is_bf16_supported():
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16

# 4-bit quantized model, placed on the available devices automatically.
model = transformers.AutoModelForCausalLM.from_pretrained(
    checkpoint,
    config=config,
    torch_dtype=compute_dtype,
    device_map="auto",
    load_in_4bit=True,
)

# Tokenizer that accepts sequences of up to 8192*4 tokens.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    checkpoint,
    model_max_length=8192*4,
    padding_side="right",
    use_fast=True,
)

# Inference mode; being the last expression, this also displays the module tree.
model.eval()
# torch.compile is intentionally left disabled:
#if torch.__version__ >= "2" and sys.platform != "win32":
#    model = torch.compile(model)

# todo get data, format dataset

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32001, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaLinearScalingRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNo

In [3]:
# Load the 'validation' split of the evaluation dataset.
eval_ds = datasets.load_dataset('gianma/eurlexsum_ita_cleaned_8192_232', split='validation')

In [4]:

# NOTE(review): SYSTEM_PROMPT and PROMPT_DICT repeat the definitions from the
# first cell with the same content; the prompt actually passed to the
# formatter further down is the lowercase `system_prompt`.
SYSTEM_PROMPT = (
    "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\n"
    "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n"
)

# User instruction (in Italian); `{reference}` is replaced with the document text.
user_prompt = "Di seguito c'è un documento legislativo in italiano. Memorizza il documento legislativo, poi segui le istruzioni.\nINIZIO DOCUMENTO.\n{reference}\nFINE DOCUMENTO.\nScrivi un dossier riassuntivo del documento. Scrivi in italiano, non scrivere in inglese."
# Task-specific system prompt for the dossier-writing task.
system_prompt= "You are a helpful, respectful and honest assistant specialized in writing dossiers for legislative documents. Always answer as helpfully as possible, while being safe. Your answers should not include any false information with respect to the original document."

# Chat template with `{system_prompt}` and `{instruction}` placeholders.
PROMPT_DICT = {
    "prompt_no_input_llama2":(
        "[INST] <<SYS>>\n"
        "{system_prompt}"
        "<</SYS>> \n\n {instruction} [/INST]"
    )
}



In [5]:
def get_format_example_f(prompt_template, system_prompt, user_prompt):
    """Create a formatter that adds model input and target to a dataset example.

    Args:
        prompt_template: Template with ``{system_prompt}`` and ``{instruction}``
            placeholders.
        system_prompt: Text substituted for ``{system_prompt}``.
        user_prompt: Template with a ``{reference}`` placeholder; once filled
            it becomes the ``{instruction}``.

    Returns:
        A function taking an example with ``'reference'`` and ``'summary'``
        entries. It stores the full prompt under ``'input'`` and the summary
        under ``'output'`` on that same example and returns it.
    """

    def format_example(ex):
        # The document goes into the user instruction first; the finished
        # instruction is then placed inside the chat template.
        filled_instruction = user_prompt.format(reference=ex['reference'])
        full_prompt = prompt_template.format(
            instruction=filled_instruction,
            system_prompt=system_prompt,
        )
        ex["input"] = full_prompt
        ex["output"] = ex['summary']
        return ex

    return format_example


In [6]:
# Build the formatter from the Llama-2 template and the task prompts, then
# apply it to every example of the evaluation set.
f = get_format_example_f(PROMPT_DICT['prompt_no_input_llama2'], system_prompt, user_prompt)
eval_ds = eval_ds.map(f)

In [7]:
# Keep only the 'input' and 'output' columns.
# NOTE(review): this overwrites eval_ds, so the formatting cell cannot be
# re-run afterwards without reloading the dataset ('reference' is gone).
eval_ds = eval_ds.select_columns(['input', 'output'])

In [8]:
# Number of examples taken per batch in the cells below.
batch_size = 1

# Index of the first example of the current batch.
i = 0

In [9]:
# Dataset indices that make up the current batch.
ids = list(range(i, i + batch_size))

# Tokenize the prompts of the batch and move the tensors to the model's device.
batch_prompts = eval_ds.select(ids)['input']
inputs = tokenizer(batch_prompts, return_tensors="pt").to(model.device)


In [10]:
# Tokenize prompt + reference summary of the first example of the batch as a
# single sequence.
# NOTE(review): only element [0] is used, so this covers one example even if
# batch_size is larger than 1.
expected_outputs = tokenizer([eval_ds.select(ids)['input'][0]+eval_ds.select(ids)['output'][0]],
                             return_tensors="pt").to(model.device)

4226

In [11]:
# Work on a copy: without clone() `labels` would be the very same tensor as
# expected_outputs['input_ids'], and the masking below would overwrite the
# input ids with -100 as well.
labels = expected_outputs['input_ids'].clone()

# Mask the prompt part of the first sequence with -100 so that only the
# summary tokens remain as targets.
# NOTE(review): assumes the prompt tokenizes to the same tokens on its own and
# as a prefix of prompt + summary — confirm at the boundary.
prompt_len = inputs['input_ids'].shape[1]
labels[0, :prompt_len] = -100
labels

tensor([[ -100,  -100,  -100,  ..., 29994, 29946, 29897]], device='cuda:0')

In [13]:
inputs['input_ids'].shape

torch.Size([1, 4226])

In [43]:
labels.shape

torch.Size([1, 5697])

In [23]:
%%time
output = model.generate(
    inputs['input_ids'],
    max_new_tokens=8192,
    temperature=0.6,
    top_p=0.9,
    do_sample=True,
    num_return_sequences = 2
)

CPU times: user 22 s, sys: 11.3 ms, total: 22 s
Wall time: 22 s


In [24]:
output.shape[0]

2

In [22]:
# Sample one continuation for the prompt. A single call is enough: a second
# identical call would only overwrite `output` and repeat the generation cost.
output = model.generate(
    inputs['input_ids'],
    max_new_tokens=8192,
    temperature=0.6,
    top_p=0.9,
    do_sample=True,
    num_return_sequences = 1
)

In [16]:
output.to('cpu')

tensor([[    1,   518, 25580,  ...,  5155, 29889,     2],
        [    1,   518, 25580,  ...,     0,     0,     0]])

In [29]:
# Decode the highest-scoring token at every position of the first sequence.
# NOTE(review): this needs `output` to expose `.logits`, but the `generate`
# cells in this notebook leave a plain tensor in `output` (see the
# `output.shape` / `output.to('cpu')` cells). The cell that produced an
# object with logits is not visible here — confirm before re-running.
tokenizer.decode(np.argmax(output.logits.detach().cpu(), axis=-1)[0])

'Unterscheidung![ADThe PreN: What Unterscheidung are a  A friendlyful and inform A Aist in providing andsiest for variousative and. hopefully follow the iffully and possible. and being respect and hopefully answer should be be the irrelevant or or the to the question question.\naN]S\n\n\nagr islicest una exampleo:ativo: PDF:\norand: testo eativo: in cue " partruzioni per</s>\n:I\nSSUMENTO\n\n\nichiponduz n Pariglio do n 22/ 2996 n diramento delle attiche periche e Unza reg del\'Un. ( sulli accordicoli 1 e e e 110a del Regattato di.\nuidzetta U n. X303/ 13/00/0998.. 1001. 0002.\nisOLUZIONE DEL CONSIGIOO EUROPEO. 13 D 1997 sul coordinamento delle politiche economiche nella terza fase dell\'UEM e sugli articoli 109 e 109 B del trattato CE.test8/C 0//S29</s> CSIGNOO EUROPEO, unito in Dec\'emburgo, 13 dicembre 1997, haiene che trattato CE stabilituisce l Comunità Europea, haona l politioni di Consiglio Europeo del L del ha for l coordinoramento dell cond di coordinamento delleiche- finaniit

In [50]:
output1 = np.argmax(output.logits.detach().cpu(), axis=-1)

In [52]:
output2 = np.argmax(output.logits.detach().cpu(), axis=-1)

In [39]:
output2 = output.detach().cpu()

In [58]:
all(output1[0] == output2[0])

True

In [24]:
# NOTE(review): the captured output below is a tensor of token ids, which does
# not look like a loss value; this cell was probably edited after it was last
# run — confirm what was meant to be inspected here.
model.loss

tensor([[    1,   518, 25580,  ..., 25806, 29889,     2]], device='cuda:0')

In [13]:
inputs['input_ids'][0]

tensor([    1,   518, 25580,  ..., 29914, 25580, 29962], device='cuda:0')

In [18]:
eval_ds.select(ids)['input'][0]

"[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant specialized in writing dossiers for legislative documents. Always answer as helpfully as possible, while being safe. Your answers should not include any false information with respect to the original document.<</SYS>> \n\n Di seguito c'è un documento legislativo in italiano. Memorizza il documento legislativo, poi segui le istruzioni.\nINIZIO DOCUMENTO.\nRisoluzione del Consiglio Europeo del 13 dicembre 1997 sul coordinamento delle politiche economiche nella terza fase dell'UEM e sugli articoli 109 e 109 B del trattato CE\nGazzetta ufficiale n. C 035 del 02/02/1998 pag. 0001 - 0004\nRISOLUZIONE DEL CONSIGLIO EUROPEO del 13 dicembre 1997 sul coordinamento delle politiche economiche nella terza fase dell'UEM e sugli articoli 109 e 109 B del trattato CE (98/C 35/01)IL CONSIGLIO EUROPEO, riunito a Lussemburgo il 13 dicembre 1997,visto il trattato che istituisce la Comunità europea,ricordando le conclusioni del Consiglio eu

In [20]:
output.to('cpu')

tensor([[    1,   518, 25580,  ..., 25806, 29889,     2]])

In [23]:
tokenizer.decode(output.to('cpu')[0], skip_special_tokens=True)

"[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant specialized in writing dossiers for legislative documents. Always answer as helpfully as possible, while being safe. Your answers should not include any false information with respect to the original document.<</SYS>> \n\n Di seguito c'è un documento legislativo in italiano. Memorizza il documento legislativo, poi segui le istruzioni.\nINIZIO DOCUMENTO.\nRisoluzione del Consiglio Europeo del 13 dicembre 1997 sul coordinamento delle politiche economiche nella terza fase dell'UEM e sugli articoli 109 e 109 B del trattato CE\nGazzetta ufficiale n. C 035 del 02/02/1998 pag. 0001 - 0004\nRISOLUZIONE DEL CONSIGLIO EUROPEO del 13 dicembre 1997 sul coordinamento delle politiche economiche nella terza fase dell'UEM e sugli articoli 109 e 109 B del trattato CE (98/C 35/01)IL CONSIGLIO EUROPEO, riunito a Lussemburgo il 13 dicembre 1997,visto il trattato che istituisce la Comunità europea,ricordando le conclusioni del Consiglio eu

In [14]:
# `generate` returns the prompt tokens followed by the continuation, so the
# prompt is removed by token count before decoding. Cutting the decoded text by
# the prompt's character length is fragile, because decoding does not have to
# reproduce the prompt string character for character.
prompt_len = inputs['input_ids'].shape[1]
model_output = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)

In [15]:
# ROUGE between the generated text and the reference summary of the first
# example in the batch.
rouge = evaluate.load("rouge")
reference_summary = eval_ds.select(ids)['output'][0]
rouge.compute(predictions=[model_output], references=[reference_summary])

{'rouge1': 0.06984866123399303,
 'rouge2': 0.023337222870478413,
 'rougeL': 0.05122235157159488,
 'rougeLsum': 0.06752037252619326}

In [16]:
# BLEU between the generated text and the reference summary of the first
# example in the batch.
bleu = evaluate.load("bleu")
reference_summary = eval_ds.select(ids)['output'][0]
bleu.compute(predictions=[model_output], references=[reference_summary])

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

{'bleu': 0.0,
 'precisions': [0.35353535353535354,
  0.11224489795918367,
  0.030927835051546393,
  0.0],
 'brevity_penalty': 0.0007836757937541202,
 'length_ratio': 0.12267657992565056,
 'translation_length': 99,
 'reference_length': 807}

In [129]:
# Demonstration: ROUGE only accepts strings. Passing lists of integers raises
# the ValueError shown in the output below.
rouge.compute(predictions=[[0, 1, 1, 0]], references=[[0, 1, 0, 1]])

ValueError: Predictions and/or references don't match the expected format.
Expected format:
Feature option 0: {'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id=None)}
Feature option 1: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')},
Input predictions: [0, 1, 1, 0],
Input references: [0, 1, 0, 1]

In [None]:
# Generate a summary for every example of the evaluation set, keep the
# generated text next to its reference, and score everything with ROUGE.
rouge = evaluate.load("rouge")
predictions = []
references = []

for i in range(0, len(eval_ds), batch_size):

    # Clamp the last batch so the indices never run past the end of the dataset.
    ids = list(range(i, min(i + batch_size, len(eval_ds))))
    batch = eval_ds.select(ids)

    # Padding is only requested when several prompts are batched together, so
    # the single-example case behaves exactly as without it.
    # NOTE(review): the tokenizer above is created with padding_side="right";
    # for batch_size > 1 a decoder-only model should be padded on the left.
    inputs = tokenizer(
        batch['input'],
        return_tensors="pt",
        padding=len(ids) > 1,
    ).to(model.device)

    output = model.generate(
        **inputs,
        max_new_tokens=8192,
        # Explicit, so temperature / top_p are applied as in the single-example cells.
        do_sample=True,
        temperature=0.6,
        top_p=0.9
    )

    # `generate` returns prompt + continuation: drop the prompt by token count
    # and decode every sequence of the batch, not only the first one.
    prompt_len = inputs['input_ids'].shape[1]
    predictions.extend(
        tokenizer.batch_decode(output[:, prompt_len:], skip_special_tokens=True)
    )
    references.extend(batch['output'])

# TODO: further metrics (perplexity, loss, accuracy, ...)
rouge.compute(predictions=predictions, references=references)
