In [None]:
!pip install -U bitsandbytes
!pip install -U transformers
!pip install -U accelerate
!pip install -U peft
#!pip install huggingface_hub



In [None]:
# Hub id of the base checkpoint; used for both the model and the tokenizer.
base_model_id = "NousResearch/Meta-Llama-3-8B"
# Location of the fine-tuned adapter handed to PeftModel.from_pretrained.
# NOTE(review): relative path under a mounted Drive folder (presumably Colab) — confirm the mount exists.
new_model = "drive/MyDrive/project_cs685/model/llama-3-finetuned_v2/causal-reasoning-finetuned"

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel
import torch
from transformers import BitsAndBytesConfig

In [None]:
# Device placement passed to AutoModelForCausalLM.from_pretrained.
# NOTE(review): the "" key presumably means "the whole model" and 0 the first GPU — confirm against the accelerate docs.
device_map = {"": 0}

In [None]:
# Quantization settings used when loading the base model:
# 4-bit weights with the "nf4" quant type, double quantization enabled,
# and bfloat16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:

# Load the base checkpoint with the quantization settings and device placement defined above.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map=device_map,
    quantization_config=quantization_config,
)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Wrap the loaded base model with the fine-tuned PEFT adapter stored at `new_model`.
model = PeftModel.from_pretrained(base_model, new_model)

In [None]:
# The tokenizer is loaded from the base checkpoint, not from the adapter directory.
# NOTE(review): trust_remote_code=True allows code shipped with the repo to run — confirm it is actually required.
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# NOTE(review): get_llama3_ft_response tokenizes one prompt per call, so padding presumably never happens — confirm before batching.
tokenizer.padding_side = "right"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
import torch
import torch.nn.functional as F

# Pad token id passed to model.generate() inside get_llama3_ft_response.
# NOTE(review): hard-coded; presumably equal to tokenizer.eos_token_id for this checkpoint — verify.
pad_token_id = 128001

# Device the tokenized inputs are moved to: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def get_llama3_ft_response(prompt):
  """Greedily generate a completion for `prompt` and report the top-2 candidates at the answer step.

  The "answer step" is the first generated token whose decoded text, reduced to
  its letters and lower-cased, is exactly "yes" or "no". Exact matching is used
  so that tokens such as " not" or " know" are not mistaken for an answer.

  Relies on the notebook-level `tokenizer`, `model`, `device` and `pad_token_id`.

  :param prompt: full prompt string to feed to the model
  :return: dict with
      'generated_text'  decoded prompt + completion (special tokens skipped),
      'topk_log_probs'  log-probabilities of the 2 most likely tokens at the answer step,
      'topk_indices'    their token ids,
      'topk_texts'      their decoded text.
    If no answer token is generated, the three top-k lists still have length 2 but
    hold placeholders (NaN for log-probs, None for ids and texts), so callers that
    index [0] / [1] keep working and can detect the missing answer.
  """
  top_k = 2
  answer_words = ("yes", "no")

  # `inputs` rather than `input`, to avoid shadowing the builtin.
  inputs = tokenizer(prompt, return_tensors='pt').to(device)

  # Greedy decoding; temperature/top_p are set to None explicitly because sampling is off.
  output = model.generate(inputs['input_ids'],
                          attention_mask=inputs['attention_mask'],
                          pad_token_id=pad_token_id,
                          output_scores=True,
                          return_dict_in_generate=True,
                          do_sample=False, temperature=None, top_p=None,
                          max_length=500)

  # A single prompt is passed in, so row 0 is the only sequence in the batch.
  sequence = output['sequences'][0]
  step_scores = output['scores']  # one score tensor per generated token
  num_steps = len(step_scores)
  generated_tokens = sequence[-num_steps:]  # only indexed inside the loop, so num_steps == 0 is harmless
  output_text = tokenizer.decode(sequence, skip_special_tokens=True)

  # Placeholders returned when the model never emits a yes/no token.
  topk_log_probs = [float('nan')] * top_k
  topk_indices = [None] * top_k
  topk_texts = [None] * top_k

  for step in range(num_steps):
    token_text = tokenizer.decode(generated_tokens[step], skip_special_tokens=True)
    # Drop whitespace/punctuation so " no", "No" and "no." all reduce to "no".
    token_word = ''.join(ch for ch in token_text.lower() if ch.isalpha())
    if token_word not in answer_words:
      continue

    # log_softmax is the numerically stable form of log(softmax(x)); computed in float32.
    log_probs = torch.log_softmax(step_scores[step][0].float(), dim=-1)
    values, indices = torch.topk(log_probs, k=top_k)
    topk_log_probs = values.tolist()
    topk_indices = indices.tolist()
    topk_texts = [tokenizer.decode(token_id) for token_id in topk_indices]
    break

  return {
      'generated_text': output_text,
      'topk_log_probs': topk_log_probs,
      'topk_indices': topk_indices,
      'topk_texts': topk_texts
  }


In [None]:
%%time

# Hand-written example prompt in the same layout that create_prompt_formats produces,
# used to try the model on a single question before running the whole test set.
prompt = """<s>[INST] Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Refer to the given context to respond the question with either 'yes' or 'no'.

Input:
Imagine a self-contained, hypothetical world with only the following conditions, and without any unmentioned factors or causal relationships: Glopp has a direct effect on fritz and zibbo. Fritz has a direct effect on flump. Flump has a direct effect on zibbo. Glopp is unobserved. The overall probability of fritz is 10%. For those who are not fritz, the probability of zibbo is 30%. For those who are fritz, the probability of zibbo is 48%. Is zibbo less likely than not zibbo overall?

### Response:
 [/INST]"""

# Run one generation and display the returned dict as the cell output.
get_llama3_ft_response(prompt)

CPU times: user 890 ms, sys: 990 µs, total: 891 ms
Wall time: 888 ms


{'generated_text': "<s>[INST] Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nRefer to the given context to respond the question with either 'yes' or 'no'.\n\nInput:\nImagine a self-contained, hypothetical world with only the following conditions, and without any unmentioned factors or causal relationships: Glopp has a direct effect on fritz and zibbo. Fritz has a direct effect on flump. Flump has a direct effect on zibbo. Glopp is unobserved. The overall probability of fritz is 10%. For those who are not fritz, the probability of zibbo is 30%. For those who are fritz, the probability of zibbo is 48%. Is zibbo less likely than not zibbo overall?\n\n### Response:\n [/INST] no\n\n### End of Instruction\n",
 'topk_log_probs': [-0.6644107103347778, -0.7581607103347778],
 'topk_indices': [912, 10035],
 'topk_texts': [' no', ' yes']}

In [None]:
import pandas as pd

In [None]:
# Path of the test-set CSV whose rows are turned into prompts and run through the model.
dataset_name = "drive/MyDrive/project_cs685/dataset/test_dataset.csv"

In [None]:
# Read the test examples from the CSV at `dataset_name` into a DataFrame.
test_dataset_df = pd.read_csv(dataset_name)

In [None]:
# Instruction sentence inserted under the "### Instruction:" header of every prompt.
instruction_text = "Refer to the given context to respond the question with either 'yes' or 'no'."

# Alternative instruction wordings, currently disabled:
#"If you're unsure of the answer, simply state 'na' without guessing."
#"Use the following context to answer the question. The answer will be 'yes' or 'no'. If you don't know the answer, just say 'na', don't try to make up an answer."


def create_prompt_formats(sample):
    """
    Build the inference prompt for a single example.

    The prompt has four sections separated by blank lines (intro blurb,
    instruction, input context, empty response header) and is wrapped in
    "<s>[INST] ... [/INST]".

    :param sample: mapping (dict or DataFrame row) with 'background', 'given_info' and 'question'
    :return: the formatted prompt string
    """
    intro_section = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    instruction_section = "### Instruction:\n" + instruction_text
    # Background, given info and question are joined by single spaces on one line.
    input_section = "Input:\n{} {} {}".format(
        sample['background'], sample['given_info'], sample['question']
    )
    # The response header is left empty for the model to complete.
    response_section = "### Response:\n"

    prompt_body = "\n\n".join(
        (intro_section, instruction_section, input_section, response_section)
    )
    return "<s>[INST] " + prompt_body + " [/INST]"


In [None]:
# Keep only the columns needed for prompting and scoring, and renumber the rows from 0.
test_dataset_df2 = test_dataset_df[
    ['question_id', 'desc_id', 'given_info', 'question', 'answer', 'background']
].reset_index(drop=True)

In [None]:
from tqdm import tqdm

In [None]:
# Build one prompt per row. tqdm's pandas integration makes the progress bar track
# the apply itself; wrapping the finished result in tqdm(...) only iterated over
# already-computed values and assigned a tqdm object to the column.
tqdm.pandas(desc="formatting prompts")
test_dataset_df2["prompt_text"] = test_dataset_df2.progress_apply(create_prompt_formats, axis=1)

100%|██████████| 2464/2464 [00:00<00:00, 1272127.65it/s]


In [None]:
# Preview the first rows, including the new prompt_text column.
test_dataset_df2.head()

Unnamed: 0,question_id,desc_id,given_info,question,answer,background,prompt_text
0,3888,nonsense13-frontdoor-marginal-modelNone-spec5-q1,The overall probability of fritz is 10%. For t...,Is zibbo less likely than not zibbo overall?,yes,"Imagine a self-contained, hypothetical world w...",<s>[INST] Below is an instruction that describ...
1,8401,nonsense17-frontdoor-nie-modelNone-spec8-q0,"For those who are not bleen, the probability o...",Does bleen positively affect zibby through flarn?,no,"Imagine a self-contained, hypothetical world w...",<s>[INST] Below is an instruction that describ...
2,8132,nonsense17-confounding-ate-modelNone-spec7-q1,"For those who are not flarn and are not bleen,...",Will bleen decrease the chance of zibby?,no,"Imagine a self-contained, hypothetical world w...",<s>[INST] Below is an instruction that describ...
3,4701,nonsense14-confounding-ate-modelNone-spec0-q0,For those who are not flurnt and are not plizz...,Will plizz increase the chance of brifft?,no,"Imagine a self-contained, hypothetical world w...",<s>[INST] Below is an instruction that describ...
4,7168,nonsense16-mediation-ate-modelNone-spec9-q1,"For those who are not gruntz, the probability ...",Will gruntz decrease the chance of flurrk?,no,"Imagine a self-contained, hypothetical world w...",<s>[INST] Below is an instruction that describ...


In [None]:
# Run the model on every prompt. This is the slow step (one model.generate call per
# row), so report progress through tqdm's pandas integration.
tqdm.pandas(desc="llama3-ft generation")
test_dataset_df2["llama3_ft_response"] = test_dataset_df2["prompt_text"].progress_apply(get_llama3_ft_response)

In [None]:
# Preview the first rows, including the raw llama3_ft_response dicts.
test_dataset_df2.head()

Unnamed: 0,question_id,desc_id,given_info,question,answer,background,prompt_text,llama3_ft_response
0,3888,nonsense13-frontdoor-marginal-modelNone-spec5-q1,The overall probability of fritz is 10%. For t...,Is zibbo less likely than not zibbo overall?,yes,"Imagine a self-contained, hypothetical world w...",<s>[INST] Below is an instruction that describ...,{'generated_text': '<s>[INST] Below is an inst...
1,8401,nonsense17-frontdoor-nie-modelNone-spec8-q0,"For those who are not bleen, the probability o...",Does bleen positively affect zibby through flarn?,no,"Imagine a self-contained, hypothetical world w...",<s>[INST] Below is an instruction that describ...,{'generated_text': '<s>[INST] Below is an inst...
2,8132,nonsense17-confounding-ate-modelNone-spec7-q1,"For those who are not flarn and are not bleen,...",Will bleen decrease the chance of zibby?,no,"Imagine a self-contained, hypothetical world w...",<s>[INST] Below is an instruction that describ...,{'generated_text': '<s>[INST] Below is an inst...
3,4701,nonsense14-confounding-ate-modelNone-spec0-q0,For those who are not flurnt and are not plizz...,Will plizz increase the chance of brifft?,no,"Imagine a self-contained, hypothetical world w...",<s>[INST] Below is an instruction that describ...,{'generated_text': '<s>[INST] Below is an inst...
4,7168,nonsense16-mediation-ate-modelNone-spec9-q1,"For those who are not gruntz, the probability ...",Will gruntz decrease the chance of flurrk?,no,"Imagine a self-contained, hypothetical world w...",<s>[INST] Below is an instruction that describ...,{'generated_text': '<s>[INST] Below is an inst...


In [None]:
# Inspect the full response dict of the first example.
test_dataset_df2.llama3_ft_response[0]

{'generated_text': "<s>[INST] Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nRefer to the given context to respond the question with either 'yes' or 'no'.\n\nInput:\nImagine a self-contained, hypothetical world with only the following conditions, and without any unmentioned factors or causal relationships: Glopp has a direct effect on fritz and zibbo. Fritz has a direct effect on flump. Flump has a direct effect on zibbo. Glopp is unobserved. The overall probability of fritz is 10%. For those who are not fritz, the probability of zibbo is 30%. For those who are fritz, the probability of zibbo is 48%. Is zibbo less likely than not zibbo overall?\n\n### Response:\n [/INST] no\n\n### End of Instruction\n",
 'topk_log_probs': [-0.6644107103347778, -0.7581607103347778],
 'topk_indices': [912, 10035],
 'topk_texts': [' no', ' yes']}

In [None]:
# Flatten the response dicts into plain columns: the decoded text, plus the
# log-probability and decoded token of the top-1 and top-2 candidates.
# The candidate token ids ('topk_indices') are not extracted into columns.
_responses = test_dataset_df2["llama3_ft_response"]

test_dataset_df2["llama3_ft_generated_text"] = _responses.apply(lambda resp: resp['generated_text'])

for _rank in (1, 2):
    # Bind the list position as a default argument so each lambda keeps its own value.
    test_dataset_df2[f"top{_rank}_log_probs"] = _responses.apply(
        lambda resp, pos=_rank - 1: resp['topk_log_probs'][pos]
    )
    test_dataset_df2[f"top{_rank}_texts"] = _responses.apply(
        lambda resp, pos=_rank - 1: resp['topk_texts'][pos]
    )

In [None]:
# Inspect the decoded prompt + completion for the third example.
test_dataset_df2["llama3_ft_generated_text"][2]

"<s>[INST] Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nRefer to the given context to respond the question with either 'yes' or 'no'.\n\nInput:\nImagine a self-contained, hypothetical world with only the following conditions, and without any unmentioned factors or causal relationships: Flarn has a direct effect on bleen and zibby. Bleen has a direct effect on zibby. For those who are not flarn and are not bleen, the probability of zibby is 21%. For those who are not flarn and are bleen, the probability of zibby is 27%. For those who are flarn and are not bleen, the probability of zibby is 20%. For those who are flarn and are bleen, the probability of zibby is 53%. The overall probability of flarn is 37%. Will bleen decrease the chance of zibby?\n\n### Response:\n [/INST] no\n\n### End of Instruction\n"

In [None]:
def process_llama3_ft_response(response_text):
  """Reduce a decoded generation to the label 'yes', 'no' or 'na'.

  Only the text after the first "[/INST]" marker is inspected. The first whole
  word equal to "yes" or "no" (case-insensitive, punctuation and newlines
  ignored) decides the label. Whole-word matching keeps words such as "not",
  "know" or "cannot" from being read as "no".

  :param response_text: decoded prompt + completion
  :return: 'yes' or 'no' when such a word follows the marker; 'na' when the
    marker is missing, the input is not a string, or no answer word is found
  """
  import re  # local import so this cell does not depend on another cell's imports

  if not isinstance(response_text, str):
    return 'na'

  marker = "[/INST]"
  marker_pos = response_text.find(marker)
  if marker_pos == -1:
    return 'na'

  completion = response_text[marker_pos + len(marker):]
  # Split on anything that is not a letter, so "no\n\n###" yields the word "no".
  for word in re.findall(r"[a-z]+", completion.lower()):
    if word in ('yes', 'no'):
      return word
  return 'na'

In [None]:
# Reduce each decoded generation to a short answer label via process_llama3_ft_response.
test_dataset_df2["llama3_ft_response_processed"] = (
    test_dataset_df2["llama3_ft_generated_text"].apply(process_llama3_ft_response)
)

In [None]:
import os

# DataFrame.to_csv does not create missing parent directories; make sure the target
# folder exists so the write cannot fail after the long generation run.
output_csv_path = 'results/output_data_llama3_finetuned.csv'
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
test_dataset_df2.to_csv(output_csv_path, index=False, header=True)

In [None]:
# Final preview of the first rows, including the extracted top-k columns and the processed label.
test_dataset_df2.head()

Unnamed: 0,question_id,desc_id,given_info,question,answer,background,prompt_text,llama3_ft_response,llama3_ft_generated_text,top1_log_probs,top1_texts,top2_log_probs,top2_texts,llama3_ft_response_processed
0,3888,nonsense13-frontdoor-marginal-modelNone-spec5-q1,The overall probability of fritz is 10%. For t...,Is zibbo less likely than not zibbo overall?,yes,"Imagine a self-contained, hypothetical world w...",<s>[INST] Below is an instruction that describ...,{'generated_text': '<s>[INST] Below is an inst...,<s>[INST] Below is an instruction that describ...,-0.664411,no,-0.758161,yes,no
1,8401,nonsense17-frontdoor-nie-modelNone-spec8-q0,"For those who are not bleen, the probability o...",Does bleen positively affect zibby through flarn?,no,"Imagine a self-contained, hypothetical world w...",<s>[INST] Below is an instruction that describ...,{'generated_text': '<s>[INST] Below is an inst...,<s>[INST] Below is an instruction that describ...,-0.478671,yes,-1.009921,no,yes
2,8132,nonsense17-confounding-ate-modelNone-spec7-q1,"For those who are not flarn and are not bleen,...",Will bleen decrease the chance of zibby?,no,"Imagine a self-contained, hypothetical world w...",<s>[INST] Below is an instruction that describ...,{'generated_text': '<s>[INST] Below is an inst...,<s>[INST] Below is an instruction that describ...,-0.138821,no,-2.248196,yes,no
3,4701,nonsense14-confounding-ate-modelNone-spec0-q0,For those who are not flurnt and are not plizz...,Will plizz increase the chance of brifft?,no,"Imagine a self-contained, hypothetical world w...",<s>[INST] Below is an instruction that describ...,{'generated_text': '<s>[INST] Below is an inst...,<s>[INST] Below is an instruction that describ...,-0.565405,no,-0.909155,yes,no
4,7168,nonsense16-mediation-ate-modelNone-spec9-q1,"For those who are not gruntz, the probability ...",Will gruntz decrease the chance of flurrk?,no,"Imagine a self-contained, hypothetical world w...",<s>[INST] Below is an instruction that describ...,{'generated_text': '<s>[INST] Below is an inst...,<s>[INST] Below is an instruction that describ...,-0.069531,no,-3.163281,yes,no
