In [1]:
import os
os.environ

environ{'USER': 'gmacri',
        'SSH_CLIENT': '94.33.225.9 64230 22',
        'XDG_SESSION_TYPE': 'tty',
        'SHLVL': '1',
        'MOTD_SHOWN': 'pam',
        'HOME': '/home/gmacri',
        'DBUS_SESSION_BUS_ADDRESS': 'unix:path=/run/user/1005/bus',
        'LOGNAME': 'gmacri',
        '_': '/home/gmacri/miniconda3/bin/python',
        'XDG_SESSION_CLASS': 'user',
        'XDG_SESSION_ID': '737',
        'PATH': '/home/gmacri/miniconda3/bin:/home/gmacri/.vscode-server/bin/d037ac076cee195194f93ce6fe2bdfe2969cc82d/bin/remote-cli:/home/gmacri/miniconda3/bin:/home/gmacri/miniconda3/condabin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin',
        'VSCODE_AGENT_FOLDER': '/home/gmacri/.vscode-server',
        'XDG_RUNTIME_DIR': '/run/user/1005',
        'LANG': 'en_US.UTF-8',
        'SHELL': '/bin/bash',
        'PWD': '/home/gmacri',
        'SSH_CONNECTION': '94.33.225.9 64230 158.110.146.233 22',
        'VSCODE_HANDLES_SIGPIPE'

In [4]:
import transformers
from dataclasses import dataclass, field
from typing import Dict, Optional, Sequence, List, Union
from llama_attn_replace import replace_llama_attn
import math
import logging
import torch
import sys
import datasets
import evaluate
import numpy as np


SYSTEM_PROMPT = (
    "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\n"
    "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n"
)

PROMPT_DICT = {
    "prompt_no_input_llama2":(
        "<s>[INST] <<SYS>>\n"
        "{system_prompt}"
        "<</SYS>> \n\n {instruction} [/INST]"
    ),
}

@dataclass
class EvalArguments():
    model_name_or_path: str = field(default=None,metadata={"required":True, "help": "Name of the hf model name or path for the model, tokenizer and config"})
    model_context_size: int = field(default=None,metadata={"help": "Maximum model context size used during training"})
    use_flash_attn: bool = field(
        default=True,
        metadata={"help": "Whether use flash attention for evaluation (full attention will still be used)."},
    )
    batch_size: int = field(default=1, metadata={"help": "Batch size for evaluation"})
    
    temperature: float = field(default=0.6, metadata={"help": "Temperature parameter for generation (higher values for more randomness, lower for more determinism)"})
    max_length: int = field(default=None, metadata={"help": "Maximum number of charachter to be generated, defaults to the model original context size"})
    top_p: int = field(default=0.9, metadata={"help": "Top-p sampling parameter (higher values for more randomness)"})

    eval_data_path: str = field(default=None,metadata={"required":True, "help": "Path to the evaluation data (validation set)."})
    split_name: str = field(default='validation', metadata={"help": "Dataset split name to be used for the evaluation."})

    prompt_config: str = field(default=None,metadata={"help": "Filename containing the prompt information."})
    prompts: List[str] = field(default_factory=lambda : ["{instruction}"], metadata={"nargs":"+", "help" : "Prompt(s) to be used for the data. It may include some placeholders that need to have the same name of the dataset columns they intend to replace. When multiple prompts are given a column named prompt_idx containing the index of the prompt to be used (integer) is required in the data."})
    prompts_are_fn: bool = field(default=False, metadata={"help" :"Whether to interpret the prompts as filenames conataining the actual prompts."})
    target_column: str = field(default='output', metadata={"help" : "column to be used as the target for the prediction."})
    system_prompt: str = field(default=SYSTEM_PROMPT, metadata={"help" : "Prompt to be used as a system prompt to define the model behaviour."})

    load_in_4bit: bool = field(default=False, metadata={"help" : "Whether to load the model in 4 bit to reduce memory usage (this should not affect inference performance)"})

def load_metrics_and_get_eval_function(tokenizer):
    # this cell may take long sometimes for some reason 
    accuracy = evaluate.load("accuracy")
    precision = evaluate.load("precision")
    recall = evaluate.load("recall")
    f1 = evaluate.load("f1")
    rouge = evaluate.load("rouge")

    def compute_metrics_hf(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        metrics = {}
        metrics.update(accuracy.compute(predictions=predictions, references=labels))
        #metrics.update(precision.compute(predictions=predictions, references=labels))
        #metrics.update(recall.compute(predictions=predictions, references=labels))
        #metrics.update(f1.compute(predictions=predictions, references=labels))
        predictions = tokenizer.decode(predictions, skip_special_tokens=True)
        metrics.update(rouge.compute(predictions=predictions, references=labels))
        return metrics
    
    return compute_metrics_hf


In [9]:
replace_llama_attn(inference=True)

# Set RoPE scaling factor
config = transformers.AutoConfig.from_pretrained(
    'Yukang/LongAlpaca-7B'
)

# Load model and tokenizer
model = transformers.AutoModelForCausalLM.from_pretrained(
    'Yukang/LongAlpaca-7B',
    config=config,
    torch_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
    device_map="auto",
    load_in_4bit=True,
)

tokenizer = transformers.AutoTokenizer.from_pretrained(
    'Yukang/LongAlpaca-7B',
    model_max_length=8192*4,
    padding_side="right",
    use_fast=True,
)

model.eval()
if torch.__version__ >= "2" and sys.platform != "win32":
    model = torch.compile(model)

# todo get data, format dataset

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
eval_ds = datasets.load_dataset('gianma/eurlexsum_ita_cleaned_8192_232', split='validation')

In [22]:

SYSTEM_PROMPT = (
    "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\n"
    "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n"
)

user_prompt = "Di seguito c'è un documento legislativo in italiano. Memorizza il documento legislativo, poi segui le istruzioni.\nINIZIO DOCUMENTO.\n{reference}\nFINE DOCUMENTO.\nScrivi un dossier riassuntivo del documento. Scrivi in italiano, non scrivere in inglese."
system_prompt= "You are a helpful, respectful and honest assistant specialized in writing dossiers for legislative documents. Always answer as helpfully as possible, while being safe. Your answers should not include any false information with respect to the original document."

PROMPT_DICT = {
    "prompt_no_input_llama2":(
        "<s>[INST] <<SYS>>\n"
        "{system_prompt}"
        "<</SYS>> \n\n {instruction} [/INST]"
    )
}



In [16]:
tokenizer(['just a test', 'just'], padding='longest')

{'input_ids': [[1, 925, 263, 1243], [1, 925, 32000, 32000]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 0, 0]]}

In [29]:
def get_format_example_f(prompt_template, system_prompt, user_prompt):

    def format_example(ex):
        instruction = user_prompt.format(reference=ex['reference'])
        ex["input"] = prompt_template.format(system_prompt=system_prompt, instruction=instruction)
        ex["output"] = ex['summary']

        return ex

    return format_example


In [30]:
f = get_format_example_f(PROMPT_DICT['prompt_no_input_llama2'], system_prompt, user_prompt)
eval_ds = eval_ds.map(f)

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

In [35]:
tokenizer(eval_ds.select([0,1])['input'])

{'input_ids': [[1, 1, 518, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 29892, 3390, 1319, 322, 15993, 20255, 4266, 1891, 297, 5007, 3248, 1039, 414, 363, 13332, 1230, 10701, 29889, 29849, 1234, 408, 1371, 3730, 408, 1950, 29892, 1550, 1641, 9109, 29889, 3575, 6089, 881, 451, 3160, 738, 2089, 2472, 411, 3390, 304, 278, 2441, 1842, 19423, 829, 14816, 29903, 6778, 29871, 13, 13, 4671, 16587, 274, 29915, 30000, 443, 1842, 29877, 13332, 11692, 297, 9032, 29889, 8133, 272, 24990, 980, 1842, 29877, 13332, 11692, 29892, 11899, 2377, 1481, 454, 338, 509, 3365, 3688, 29889, 13, 1177, 26664, 5971, 11662, 29907, 5005, 3919, 29949, 29889, 13, 29934, 275, 324, 14285, 628, 2138, 335, 5991, 4092, 29877, 628, 29871, 29896, 29941, 17309, 29871, 29896, 29929, 29929, 29955, 5394, 6615, 4487, 4884, 2832, 4070, 7766, 4070, 4952, 1935, 1362, 20851, 3572, 29915, 4462, 29924, 321, 29746, 492, 1616, 293, 5079, 29871, 29896, 29900, 29929, 321, 29871, 29896, 29900, 29929, 350, 628, 534, 1131

In [40]:
batch_size = 1

i = 0

In [41]:
ids = [id for id in range(i, i+batch_size)]

inputs = tokenizer(eval_ds.select(ids)['input'], return_tensors="pt").to(model.device)

output = model.generate(
    **inputs,
    max_new_tokens=8192,
    temperature=0.6,
    top_p=0.9
)

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


In [50]:
output[0][-10:]

tensor([  278, 17407,  9833,   322,  1601,   300,   653,  9833, 29889,     2],
       device='cuda:0')

In [54]:
tokenizer.decode(inputs['input_ids'][0][-10:])

'non scrivere in inglese. [/INST]'

In [None]:
for i in range(0, len(eval_ds), batch_size):

    ids = [id for id in range(i, i+batch_size)]

    inputs = tokenizer(eval_ds.select(ids)['input'], return_tensors="pt").to(model.device)

    output = model.generate(
        **inputs,
        max_new_tokens=8192,
        temperature=0.6,
        top_p=0.9
    )

    out = tokenizer.decode(output[0], skip_special_tokens=True)

    ## remove prompt (I think)

    # evaluate metrics (perplexity, loss, accuracy, rouge,...)
