In [1]:
## ---------------------------------------------------------------------
## set up configs for huggingface hub and OS paths on HPC cluster -- make sure config.ini is correct
## ---------------------------------------------------------------------
import configparser
def auth_token():

    config = configparser.ConfigParser()
    config.read("config.ini")
    return config["hugging_face"]["token"]

def scratch_path():
    config = configparser.ConfigParser()
    config.read("config.ini")
    return "/scratch/" + config["user"]["username"] + "/"

import os
if os.path.isdir(scratch_path()):
    os.environ['TRANSFORMERS_CACHE'] = scratch_path() + '.cache/huggingface'
    os.environ['HF_DATASETS_CACHE'] = scratch_path() + '.cache/huggingface/datasets'
print(os.getenv('TRANSFORMERS_CACHE'))
print(os.getenv('HF_DATASETS_CACHE'))

## ---------------------------------------------------------------------
## Load libraries
## ---------------------------------------------------------------------

import numpy as np
import pandas as pd

import torch
import transformers
from transformers import AutoTokenizer, AutoModel, LlamaForCausalLM, LlamaTokenizer

import torch.nn.functional as F

from entailma import * ## these are where the QA and prompting functions live now
from easyeditor.custom import EditedModel
from easyeditor import LoRAHyperParams

## ---------------------------------------------------------------------
## Ensure GPU is available -- device should == 'cuda'
## ---------------------------------------------------------------------

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device = ", device)

/scratch/dmpowell/.cache/huggingface
/scratch/dmpowell/.cache/huggingface/datasets


  warn(


device =  cuda


In [2]:
## ---------------------------------------------------------------------
## load llama-2 as a EditedModel class (not pipeline, to integrate better with other scripts/notebooks)
## ---------------------------------------------------------------------

MODEL_NAME = "meta-llama/Llama-2-7b-hf" 

# tokenizer = LlamaTokenizer.from_pretrained(MODEL_NAME)
# model = LlamaForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, device_map = "auto")

hparams = LoRAHyperParams.from_hparams('hparams/LoRA/llama-7b-canonical.yaml')
model = EditedModel(hparams, auth_token())

2024-08-09 17:45:27,824 - easyeditor.editors.editor - INFO - Instantiating model
08/09/2024 17:45:27 - INFO - easyeditor.editors.editor -   Instantiating model


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.
2024-08-09 17:46:34,483 - easyeditor.editors.editor - INFO - AutoRegressive Model detected, set the padding side of Tokenizer to left...
08/09/2024 17:46:34 - INFO - easyeditor.editors.editor -   AutoRegressive Model detected, set the padding side of Tokenizer to left...


In [4]:
df = pd.read_csv("data/obqa/test.tsv", sep='\t')
df.columns = df.columns.str.replace(' ', '_')
df.columns = df.columns.str.lower()

df2 = df.copy().tail(10) # smaller df for testing
df2.head(5)

Unnamed: 0,id,question_stem,choices,complete_question,answer_key
490,9-743,where might a bunny live?,(A) a thicket (B) atop palm trees (C) a sewer ...,where might a bunny live? (A) a thicket (B) at...,A
491,9-645,A shark will be unable to survive on eating al...,(A) it is a predator (B) it is a vegetarian (C...,A shark will be unable to survive on eating al...,A
492,8-250,"A meadow vole just gave birth, and needs to fe...",(A) oil (B) deer (C) bugs (D) recycled plastic...,"A meadow vole just gave birth, and needs to fe...",C
493,283,The Grand Canyon was formed by,(A) a volcano erupting in 1782 (B) a river nam...,The Grand Canyon was formed by (A) a volcano e...,C
494,8-183,"A woman, with a pale complexion, wants to spen...",(A) UV rays are harmful (B) sunlight will be f...,"A woman, with a pale complexion, wants to spen...",A


## ~~answer_questions()~~ mc_choose_answer() function

This function will read a multiple choice question from the dataset and output a single letter response.

In [14]:
df2['model_answer'] = df2.apply(
    lambda row: mc_choose_answer(row['complete_question'], model.model, model.tok),
    axis=1
)


In [15]:
# sum(df2["Answer Key"] == df2["Model Answer"])  

7

This is getting ~58% accuracy. For reference, the original GPT-3 with 32-shot examples got 65.8% ([Brown et al., 2020](https://arxiv.org/abs/2005.14165v4)). So that seems not-too-bad.

## generate_premises() function
~~This function will read the model's statement from the data set and provide two premises that would make the statement true.~~

UPDATE: This seems to work better if we include the original question and answer, which eliminates a point of failure and gives more context for the explanation / premise generation.

UPDATE 2: This is in the `entailma` library in this repo, but I've reproduced it here to make it easier to play around with as you/we tweak prompts.


In [104]:
with open("entailma/prompt3b.txt", 'r') as file:
    premises_prompt = file.read()
    

def generate_premises(question, answer, model, tokenizer):
    
    input_str = f"\n\n{premises_prompt}Question: {question}\nAnswer: {answer}\n"

    pipe = transformers.pipeline(
        "text-generation",
        model = model,
        tokenizer = tokenizer,
        torch_dtype=torch.float16,
        # device_map="cuda",
        device = model.device,
        # use_auth_token = auth_token()
    )

    # sequences = pipe(
    #     input_str,
    #     do_sample=True,
    #     top_p = .95,
    #     # num_beams = 9, # beam search may be better ...
    #     # num_beam_groups = 3,
    #     max_new_tokens = 75,
    #     temperature = 0.7,
    #     num_return_sequences = 5
    # )

    sequences = pipe(
        input_str,
        penalty_alpha=0.9, 
        top_k=5,
        max_new_tokens = 75,
        num_return_sequences = 1
    )
    
    generated_texts = [s['generated_text'] for s in sequences]
    premises = [t[len(input_str):-1] for t in generated_texts]
    premlist = [p.split(".\n")[:2] for p in premises] 

    return premlist if len(premlist) > 1 else premlist[0]


# df2['Generated Premises'] = df2.tail(1).copy().apply(
#     lambda row: generate_premises(row['Complete Question'], row['Answer Key'], model.model, model.tok),
#     axis=1
# )

for row in df2.tail(5).itertuples():
    out = generate_premises(row.complete_question, row.answer_key, model.model, model.tok)
    print(out)

    

['Water boils at 100 degrees Celsius', 'Water that is 100 degrees Celsius is hot and can cause a burn']
['Water bubbles when it is heated', 'Water bubbles when it is heated']
['Diseases can be prevented by vaccinations and healthy lifestyles', 'A decrease in diseases leads to less sick people']
['Soil is made up of mineral particles and organic matter', 'Soil is not made up of living things']
['Sweat is a liquid produced by the skin that helps to cool the body', 'Animals living in hot environments sweat to regulate their body temperature']


In [103]:
premises_prompt

"Question: Electricity causes less damage to the Earth's atmosphere than (A) Gasoline (B) Potatoes (C) The sun (D) Water\nAnswer: A\nBurning gasoline releases pollutants.\nPollutants damage the Earth's Atmosphere.\n\nQuestion: If a river is rushing southwest on a sunny day, then it is safe to assume that (A) southwest is a good place to be (B) the land gently inclines in that direction (C) the world is mostly land (D) the land is supple\nAnswer: B\nWater flows from a high point to a lower point.\nThe direction of a river is the incline of the land.\n\nQuestion: Using a metal kitchen tool on a cheese can create (A) milk (B) blue cheese (C) melted cheese (D) small pieces\nAnswer: D\nMetal kitchen tools can cut food into small pieces.\nCheese can be easily cut with metal kitchen tools.\n\nQuestion: Poison causes harm to which of the following? (A) a Tree (B) a robot (C) a house (D) a car\nAnswer: A\nPoison causes harm to living things.\nA tree is a living thing.\n\nQuestion: There is most

In [99]:
df2.iloc[5].complete_question

'A person is heating water in order to cook pasta. He spills the pot of water on his leg and finds that the water (A) scalds (B) cools (C) toasts (D) freezes'

In [89]:
def reverse_statement(statement, model, tokenizer):
    
    with open("rephrase-prompt.txt", 'r') as file:
        reverse_prompt = file.read()
    input_str = reverse_prompt + "\n\n[original] " + statement + '\n[reversed]'
    # print(input_str)

    input_len = len(tokenizer(statement)['input_ids'])

    pipe = transformers.pipeline(
        "text-generation",
        model = model,
        tokenizer = tokenizer,
        torch_dtype=torch.float16,
        device = model.device,
    )

    sequences = pipe(
        input_str,
        # do_sample = False,
        penalty_alpha=0.9, 
        top_k=5,
        # num_beams = 5, # beam search may be better ...
        max_new_tokens = input_len + 10
    )
    
    generated_texts = [s['generated_text'] for s in sequences]
    generation = [t[len(input_str):-1] for t in generated_texts]


    return generation


reverse_statement('Soil is composed of mineral particles, organic matter, and microorganisms.', model.model, model.tok)

[' Soil is composed of microorganisms, mineral particles, and organic matter.\n\n[original] A tsunami is ']

In [122]:
def seq_logprob(text, model):
   # here: avged per token
   return(model.completion_logprob(text, text) / len(model.tok(text)['input_ids']))

seq_logprob('Soil is composed of mineral particles, and organic matter', model)

tensor(-3.3723, device='cuda:0')

In [161]:
model.choose(mc_answer_prompt + '\n\nQuestion: A person is heating water in order to cook pasta. He spills the pot of water on his leg and finds that the water (A) scalds (B) cools (C) toasts (D) freezes\nAnswer:', choices = ["A", "B", "C", "D"])

0

In [91]:
# somehow this doesn't work?

model.tok.decode(model.tok('hello world')['input_ids'][1:])

'hello world'

In [201]:
# def premise_logprob():
def tok_logprobs(text, model, start_ind = 1):
   x = model.logprobs(text)
   tok_idx = x['tokens']['input_ids'].squeeze()
   logits = x['logprobs']

   return logits[0, :, tok_idx[1:]].squeeze().diag()

def seq_logprob(text, model, start_ind = 1, norm = False):
   return tok_logprobs(text, model, start_ind).sum() if not norm else tok_logprobs(text, model).sum()/(len(model.tok(text)['input_ids']) - 1)

seq_logprob('dogs meow', model, norm = True)

# x['logprobs'][]

# def seq_logprob(text, model):
#    # here: avged per token
#    return(model.completion_logprob(text, text) / len(model.tok(text)['input_ids']))



# def validate_premises():
# def validate_premises(premises, question, answer, model):
#    premise_str = "\n".join(premises)
#    input_str =  "Question:" + question+"\n" + premise_str + "Answer:"
#    model.completion_logprob(input_str + )

# append the premises ahead of the (raw) question and check change in odds/probability of specified answer

tensor(-7.9128, device='cuda:0')

In [107]:
tok_logprobs('hello world', model).sum()

tensor(-13.2433, device='cuda:0')

In [197]:
x = model.logprobs('hello how are you today?')
tok_idx = x['tokens']['input_ids'].squeeze()
logits = x['logprobs']

logits[0, :, tok_idx[1:]].squeeze().diag()

tensor([-12.0732,  -6.5442,  -0.8319,  -0.1216,  -2.8828,  -1.2245],
       device='cuda:0')

In [196]:
# model.tok.decode(tokens.squeeze()[1:])

# tok_idx[1:]
# logits[0,1:, tok_idx[1:]]
print(tok_idx)
# model.tok.decode(tok_idx[1:])
logits[0][3][tok_idx[3]]

## looks like, 0th place, look at token index for 1st

tensor([    1, 22172,   920,   526,   366,  9826, 29973], device='cuda:0')


tensor(-8.5676, device='cuda:0')