In [1]:
import os
import torch
import pandas as pd
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, set_seed
from optimum.bettertransformer import BetterTransformer
from datasets import load_dataset, DatasetDict, Dataset
from dotenv import load_dotenv
from tqdm.auto import tqdm

# Read variables from a local .env file into the process environment
# (the Hugging Face token below is expected to come from there).
load_dotenv()

# --- Hardware report ---------------------------------------------------------
GPU_AVAILABLE = torch.cuda.is_available()
NUM_GPUS = torch.cuda.device_count()
print(f"GPU Available: {GPU_AVAILABLE} | Number of GPUs: {NUM_GPUS}")

# --- Credentials -------------------------------------------------------------
# Fails fast with a KeyError when the variable is not set.
HUGGINGFACE_TOKEN = os.environ["HUGGINGFACE_TOKEN"]

# --- Model / dataset selection -----------------------------------------------
MODEL_NAME = "meta-llama/Llama-2-7b-hf"
# Original, smaller dataset, used to train the first version of boto.
DATASET_NAME = "felipeoes/filtered_qa_blue_amazon_legislation"
BATCH_SIZE = 10

# --- Output locations --------------------------------------------------------
# Relative path (leading slash removed) so it works on each workstation.
DOCKER_VOLUME_BIND = "workspace/data"

# Both names contain a "/", so these paths include nested sub-directories.
OUTPUT_FILE = f"{DOCKER_VOLUME_BIND}/{DATASET_NAME}_{MODEL_NAME}.txt"
OUTPUT_DATASET_PATH = f"{DOCKER_VOLUME_BIND}/{DATASET_NAME}_{MODEL_NAME}.csv"

# Make sure the directory that will hold the output files exists.
# `exist_ok=True` already makes this a no-op when it is present.
os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)

# --- Reproducibility ---------------------------------------------------------
SEED = 202310
set_seed(SEED)

# --- Display -----------------------------------------------------------------
# Show up to 150 characters per pandas column before truncating.
pd.set_option("display.max_colwidth", 150)


  from .autonotebook import tqdm as notebook_tqdm


GPU Available: True | Number of GPUs: 1


In [2]:
# Load the weights in bfloat16 (half the memory of float32) and place the
# whole model on device 0.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    device_map={"": 0},
    token=HUGGINGFACE_TOKEN,
)
# NOTE(review): disabling the KV cache is usually a training-time setting;
# confirm it is intended for this inference-only notebook.
model.config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HUGGINGFACE_TOKEN)
# The EOS token doubles as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Left padding is necessary for batch inference.
tokenizer.padding_side = "left"

# Switch to evaluation mode; as the last expression of the cell this also
# displays the module summary.
model.eval()


Loading checkpoint shards: 100%|██████████| 2/2 [00:13<00:00,  6.68s/it]


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_

In [9]:
# --- Decoding hyper-parameters used for every generation call ----------------

# Upper bound on the number of generated tokens per answer.
# NOTE(review): the previous comment justified this value with the median
# length of an 'abstract' column (=150); confirm the rationale for this dataset.
MAX_NEW_TOKENS = 250

# Sampling controls.
DO_SAMPLE = True
TEMPERATURE = 0.3
TOP_K = 30
TOP_P = 0.9

# Repetition controls.
REP_PENALTY = 1.2
NO_REPEAT_NGRAM_SIZE = 10

# One answer per prompt.
NUM_RETURN_SEQUENCES = 1


# Bundle the constants above into a single reusable configuration object.
gen_config = GenerationConfig(
    do_sample=DO_SAMPLE,
    temperature=TEMPERATURE,
    top_k=TOP_K,
    top_p=TOP_P,
    repetition_penalty=REP_PENALTY,
    no_repeat_ngram_size=NO_REPEAT_NGRAM_SIZE,
    num_return_sequences=NUM_RETURN_SEQUENCES,
    max_new_tokens=MAX_NEW_TOKENS,
)


def create_boto_v1_prompt(user_question: str):
    """Build the boto v1 prompt for a single question.

    The prompt has three sections separated by blank lines: a fixed
    instruction sentence, the question under a ``### Pergunta:`` header, and
    an empty ``### Resposta:`` header for the model to complete. This version
    includes neither a context section nor an end marker.

    Args:
        user_question: The question text to embed in the prompt.

    Returns:
        The formatted prompt string.
    """
    header = "Responda à pergunta abaixo, forneça uma resposta completa e detalhada."
    question_section = "### Pergunta:" + "\n" + f"{user_question}"
    answer_section = "### Resposta:"

    # All three sections are always non-empty, so they are joined directly.
    return "\n\n".join((header, question_section, answer_section))


def apply_model_to_dataframe(
    dataframe: pd.DataFrame,
    model: AutoModelForCausalLM,
    model_name: str,
    tokenizer: AutoTokenizer,
    gen_config: GenerationConfig,
    batch_size: int,
    start_from: int = 0,
):
    """Generate an answer for every unanswered question and store it in place.

    Questions are read from ``dataframe["question"]``, turned into prompts with
    ``create_boto_v1_prompt`` and sent to the model in batches. Decoded outputs
    are written to the ``model_name`` column. A batch whose rows all already
    have a non-null value in that column is skipped, which is what makes an
    interrupted run resumable. Progress is logged to ``OUTPUT_FILE`` and the
    dataframe is checkpointed to ``OUTPUT_DATASET_PATH``.

    Args:
        dataframe: Frame with a ``question`` column. It is modified in place;
            the ``model_name`` column is created if it is missing.
        model: Causal language model used for generation.
        model_name: Name of the column that receives the generated text.
        tokenizer: Tokenizer matching ``model``.
        gen_config: Generation settings passed to ``model.generate``.
        batch_size: Number of prompts sent to the model per call.
        start_from: Accepted for backward compatibility but currently ignored;
            resumption relies on the null check described above.

    Returns:
        The same ``dataframe`` object, with the ``model_name`` column filled.
    """
    # Checkpoint the CSV after every SAVE_EVERY-th batch (1 = after each batch).
    SAVE_EVERY = 1

    # The answer column must exist and be able to hold strings. A column
    # created as all-NaN is float64, and writing text into it relies on an
    # implicit dtype upcast that pandas has deprecated.
    if model_name not in dataframe.columns:
        dataframe[model_name] = None
    if not pd.api.types.is_object_dtype(dataframe[model_name]):
        dataframe[model_name] = dataframe[model_name].astype(object)

    # Rows are addressed by position for both the skip check and the write,
    # so the function also works when the index is not a default 0..n-1 range.
    answer_col = dataframe.columns.get_loc(model_name)

    prompts = [create_boto_v1_prompt(question) for question in dataframe["question"]]
    total = len(prompts)

    # Both context managers are closed on exit, including on errors; the
    # progress bar is closed before the log file it writes to.
    with open(OUTPUT_FILE, "w") as fh, tqdm(
        range(0, total, batch_size), file=fh
    ) as progress:
        fh.write(f"Prompts length: {total}\n")

        for start in progress:
            # The last batch may be shorter than batch_size.
            end = min(start + batch_size, total)

            # Skip batches that were fully answered by a previous run.
            if dataframe.iloc[start:end, answer_col].notnull().all():
                continue

            batch = prompts[start:end]
            encodings = tokenizer(batch, return_tensors="pt", padding=True).to(
                model.device
            )

            with torch.no_grad():
                generation_outputs = model.generate(
                    **encodings,
                    generation_config=gen_config,
                )

            # NOTE(review): the full generated sequences are decoded, so the
            # stored text presumably still contains the prompt — confirm that
            # downstream consumers expect that.
            answers = tokenizer.batch_decode(
                generation_outputs, skip_special_tokens=True
            )

            dataframe.iloc[start:end, answer_col] = answers

            if start % (SAVE_EVERY * batch_size) == 0:
                dataframe.to_csv(OUTPUT_DATASET_PATH, index=False)

    return dataframe


In [4]:
# Download (or load from the local cache) the QA dataset from the Hugging Face
# Hub, authenticating with the token read from the environment.
dataset = load_dataset(path=DATASET_NAME, token=HUGGINGFACE_TOKEN)

# Display a summary of the training split (features and row count).
dataset["train"]


Dataset({
    features: ['file_index', 'file_name', 'context', 'question', 'answer', '__index_level_0__'],
    num_rows: 15964
})

In [10]:
# Resume from the CSV written by a previous run when it exists; otherwise start
# from the training split of the dataset.
try:
    dataframe = pd.read_csv(OUTPUT_DATASET_PATH)
    # Create the (empty) answer column if the CSV does not have it yet.
    if MODEL_NAME not in dataframe.columns:
        dataframe[MODEL_NAME] = np.nan

    print(f"Loaded dataframe from csv {OUTPUT_DATASET_PATH}")
except FileNotFoundError:
    print(f"File {OUTPUT_DATASET_PATH} not found. Creating new dataframe.")
    dataframe = dataset["train"].to_pandas()
    dataframe[MODEL_NAME] = np.nan

# An all-NaN column is float64. Cast it to object so generated text can be
# stored later without relying on an implicit (deprecated) dtype upcast.
# Missing values stay NaN, so null checks behave exactly as before.
dataframe[MODEL_NAME] = dataframe[MODEL_NAME].astype(object)

print("Length of dataframe: ", len(dataframe))

# Preview the prompt format with a sample question.
question = "O que é a Amazônia Azul?"
prompt = create_boto_v1_prompt(question)
print(prompt)

Loaded dataframe from csv workspace/data/felipeoes/filtered_qa_blue_amazon_legislation_meta-llama/Llama-2-7b-hf.csv
Length of dataframe:  15964
Responda à pergunta abaixo, forneça uma resposta completa e detalhada.

### Pergunta:
O que é a Amazônia Azul?

### Resposta:


In [13]:
# Fill the MODEL_NAME column with generated answers, batch by batch.
df = apply_model_to_dataframe(
    dataframe=dataframe,
    model=model,
    model_name=MODEL_NAME,
    tokenizer=tokenizer,
    gen_config=gen_config,
    batch_size=BATCH_SIZE,
    start_from=0,
)

# Keep only the columns needed downstream, then write the final CSV.
keep_columns = ["file_name", "context", "question", "answer", MODEL_NAME]
df = df[keep_columns]
df.to_csv(OUTPUT_DATASET_PATH, index=False)
