In [1]:
## ---------------------------------------------------------------------
## set up configs for huggingface hub and OS paths on HPC cluster -- make sure config.ini is correct
## ---------------------------------------------------------------------
import configparser
def auth_token():
    """Return the Hugging Face access token stored in ``config.ini``.

    ``config.ini`` is looked up relative to the current working directory.
    The value comes from the ``token`` key of its ``[hugging_face]`` section;
    a missing section or key raises ``KeyError``.
    """
    parser = configparser.ConfigParser()
    parser.read("config.ini")
    hub_section = parser["hugging_face"]
    return hub_section["token"]

def scratch_path():
    """Return the user's scratch directory as ``/scratch/<username>/``.

    ``<username>`` is the ``username`` key of the ``[user]`` section of
    ``config.ini``, which is read from the current working directory.
    """
    parser = configparser.ConfigParser()
    parser.read("config.ini")
    username = parser["user"]["username"]
    return f"/scratch/{username}/"

import os
# Redirect the Hugging Face caches to scratch storage, but only when the
# scratch directory exists; otherwise the environment is left untouched.
if os.path.isdir(scratch_path()):
    os.environ.update({
        'TRANSFORMERS_CACHE': scratch_path() + '.cache/huggingface',
        'HF_DATASETS_CACHE': scratch_path() + '.cache/huggingface/datasets',
    })
# Echo the effective settings (prints None for a variable that is not set).
print(os.getenv('TRANSFORMERS_CACHE'), os.getenv('HF_DATASETS_CACHE'), sep="\n")

## ---------------------------------------------------------------------
## Load libraries
## ---------------------------------------------------------------------

import numpy as np
import pandas as pd

import torch
import transformers
from transformers import AutoTokenizer, AutoModel, LlamaForCausalLM, LlamaTokenizer

import torch.nn.functional as F


## ---------------------------------------------------------------------
## Ensure GPU is available -- device should == 'cuda'
## ---------------------------------------------------------------------

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("device = ", device)

/scratch/dmpowell/.cache/huggingface
/scratch/dmpowell/.cache/huggingface/datasets
device =  cuda


In [2]:
## ---------------------------------------------------------------------
## load llama-2 and set up a pipeline
## ---------------------------------------------------------------------

MODEL_NAME = "meta-llama/Llama-2-7b-hf"

# The pipeline below authenticates its download with the Hub token; give the
# tokenizer the same token so both loads of MODEL_NAME are authenticated
# consistently instead of relying on a separately cached login.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=auth_token())

# Reuse the tokenizer loaded above so the pipeline and the generation helpers
# (which read tokenizer.eos_token_id) share one tokenizer instance.
pipeline = transformers.pipeline(
    "text-generation",
    model=MODEL_NAME,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
    use_auth_token=auth_token(),
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [221]:
df = pd.read_csv("test.tsv", sep='\t')
# Slice first, then copy: df2 holds the same last 10 rows as an independent
# frame, without duplicating the whole dataset beforehand.
df2 = df.tail(10).copy() # smaller df for testing

# Read the prompt with an explicit encoding so the text does not depend on
# the platform's default locale.
# NOTE(review): a later cell rebuilds answer_prompt from rows of df; on a
# top-to-bottom run that value replaces the one read here.
with open("prompt1.txt", 'r', encoding="utf-8") as file:
    answer_prompt = file.read()

<h2>answer_questions() function</h2>
<p>This function will read a multiple choice question from the dataset and output a single letter response.</p>

In [216]:
# Build the few-shot prompt from the first 40 rows of df: one
# "Question: ... / Answer: ..." pair per row, joined with newlines.
plist = [
    "Question: " + df.iloc[i]["Complete Question"] + "\nAnswer: " + df.iloc[i]["Answer Key"]
    for i in range(40)
]

answer_prompt = "\n".join(plist)
print(answer_prompt)

Question: A person wants to start saving money so that they can afford a nice vacation at the end of the year. After looking over their budget and expenses, they decide the best way to save money is to (A) make more phone calls (B) quit eating lunch out (C) buy less with monopoly money (D) have lunch with friends
Answer: B
Question: There is most likely going to be fog around: (A) a marsh (B) a tundra (C) the plains (D) a desert
Answer: A
Question: Predators eat (A) lions (B) humans (C) bunnies (D) grass
Answer: C
Question: Oak tree seeds are planted and a sidewalk is paved right next to that spot, until eventually, the tree is tall and the roots must extend past the sidewalk, which means (A) roots may be split (B) roots may begin to die (C) parts may break the concrete (D) roots may fall apart
Answer: C
Question: An electric car runs on electricity via (A) gasoline (B) a power station (C) electrical conductors (D) fuel
Answer: C
Question: As the rain forest is deforested the atmospher

In [217]:
def answer_questions(question, model, tokenizer, prompt=None):
    """Answer one multiple-choice question with a single character.

    Parameters
    ----------
    question : str
        Question text, including its lettered answer options.
    model : callable
        Called as ``model(text, **generation_kwargs)``. It must return a
        sequence whose first item is a dict with a ``'generated_text'`` entry.
    tokenizer
        Object exposing ``eos_token_id``, forwarded to the generation call.
    prompt : str, optional
        Few-shot context placed before the question. When omitted, the
        module-level ``answer_prompt`` is used.

    Returns
    -------
    str
        The last non-whitespace character of the generated continuation, or
        ``""`` when the model produced nothing but whitespace.
    """
    context = answer_prompt if prompt is None else prompt
    input_str = context + f"\nQuestion: {question}\nAnswer:"

    sequences = model(
        input_str,
        do_sample=False,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=1,
    )

    generated_text = sequences[0]['generated_text']
    # Drop the echoed prompt when it is present, so an empty generation cannot
    # be mistaken for the prompt's own trailing ":".
    if generated_text.startswith(input_str):
        generated_text = generated_text[len(input_str):]

    # Ignore whitespace around the single generated token (e.g. " B").
    return generated_text.strip()[-1:]

# Ask the model every question in the sample and store its one-character
# reply next to the question.
df2['Model Answer'] = df2['Complete Question'].apply(
    lambda question: answer_questions(question, pipeline, tokenizer)
)




In [218]:
# Number of rows where the model's answer equals the answer key. This is a
# count, not a rate: divide by len(df2) to get accuracy.
(df2["Answer Key"] == df2["Model Answer"]).sum()

58

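The model answers 58 of the evaluation questions correctly, i.e. ~58% accuracy (assuming a 100-question evaluation sample). For reference, the original GPT-3 with 32-shot examples got 65.8% ([Brown et al., 2020](https://arxiv.org/abs/2005.14165v4)), so this result seems not too bad.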

### ~~generate_statement() function~~
~~This function will read the multiple choice question and the model's answer from the dataset and output a statement.~~

**This ends up not being needed**

In [71]:
# Read the statement-generation prompt with an explicit encoding so the text
# does not depend on the platform's default locale.
with open("prompt2.txt", 'r', encoding="utf-8") as file:
    statement_prompt = file.read()
    
def generate_statement(question, answer, model, tokenizer):
    """Turn a question/answer pair into a single declarative statement.

    The module-level ``statement_prompt`` is placed before the question and
    answer, and ``model`` is asked to continue the text after ``Statement:``
    using top-k sampling.

    Parameters
    ----------
    question, answer
        Values interpolated into the prompt.
    model : callable
        Called as ``model(text, **generation_kwargs)``. It must return a
        sequence whose first item is a dict with a ``'generated_text'`` entry
        that begins with the prompt text.
    tokenizer
        Object exposing ``eos_token_id``, forwarded to the generation call.

    Returns
    -------
    str
        The continuation up to (not including) its first ``"."``.
    """
    prompt_text = f"{statement_prompt}\n\nQuestion: {question}\nAnswer: {answer}\nStatement:"
    generation_kwargs = dict(
        do_sample=True,
        top_k=5,
        max_new_tokens=80,
        temperature=0.7,
        eos_token_id=tokenizer.eos_token_id,
    )
    outputs = model(prompt_text, **generation_kwargs)

    full_text = outputs[0]['generated_text']
    # Keep only the text after the prompt, minus its final character
    # (prompt2.txt asks for a trailing '*' to help with post-processing).
    continuation = full_text[len(prompt_text):-1]

    first_sentence, _, _ = continuation.partition(".")
    return first_sentence


# print(generate_statement(df2.iloc[1]["Complete Question"], df2.iloc[1]["Model Answer"], pipeline, tokenizer))
# One generated statement per row, built from the question and the model's
# own answer, kept aligned with df2's index.
df2['Generated Statement'] = pd.Series(
    [
        generate_statement(question, model_answer, pipeline, tokenizer)
        for question, model_answer in zip(df2['Complete Question'], df2['Model Answer'])
    ],
    index=df2.index,
)



## generate_premises() function
~~This function will read the model's statement from the dataset and provide two premises that would make the statement true.~~

UPDATE: This seems to work better if we include the original question and answer, which eliminates a point of failure and gives more context for the explanation / premise generation.


In [238]:
# Read the premise-generation prompt with an explicit encoding so the text
# does not depend on the platform's default locale.
with open("prompt3b.txt", 'r', encoding="utf-8") as file:
    premises_prompt = file.read()
    
def generate_premises(question, answer, model, tokenizer):
    """Generate up to two premises supporting ``answer`` to ``question``.

    The module-level ``premises_prompt`` is followed by the question and the
    answer, and ``model`` continues the text using beam search.

    Parameters
    ----------
    question, answer
        Values interpolated into the prompt.
    model : callable
        Called as ``model(text, **generation_kwargs)``. It must return a
        sequence whose first item is a dict with a ``'generated_text'`` entry
        that begins with the prompt text.
    tokenizer
        Accepted for signature parity with the other helpers; not used here.

    Returns
    -------
    list of str
        The first two newline-separated lines of the continuation (fewer if
        the continuation has fewer lines).
    """
    prompt_text = f"\n\n{premises_prompt}Question: {question}\nAnswer: {answer}\n"

    # Beam search is used here; sampling (do_sample / top_k) was tried as an
    # alternative and is currently disabled.
    # NOTE(review): temperature presumably has no effect unless sampling is
    # enabled -- confirm against the transformers generation docs.
    outputs = model(
        prompt_text,
        num_beams=5,
        max_new_tokens=150,
        temperature=0.7,
    )

    # Keep only the text after the prompt, minus its final character.
    continuation = outputs[0]['generated_text'][len(prompt_text):-1]
    premise_lines = continuation.split("\n")
    return premise_lines[:2]

# One list of premises per row, generated from the question and the answer
# key, kept aligned with df2's index.
df2['Generated Premises'] = pd.Series(
    [
        generate_premises(question, answer_key, pipeline, tokenizer)
        for question, answer_key in zip(df2['Complete Question'], df2['Answer Key'])
    ],
    index=df2.index,
)



In [248]:
# Spot-check one row of the sample: the question and its generated premises.
i = 9
df2["Complete Question"].iloc[i], df2["Generated Premises"].iloc[i]

('Some animals use a liquid coming from their skin to adjust to (A) cold (B) water (C) heat (D) humidity',
 ['Sweat is a liquid produced by the skin that helps regulate body temperature.',
  'Sweat helps animals regulate their body temperature in hot and humid environments.'])

In [190]:
# note: used chatgpt to help with prompt3b.txt -- worked pretty well

# Spot-check one row of the full dataset: the question and its answer key.
i = 31
df["Complete Question"].iloc[i], df["Answer Key"].iloc[i]

('What is the best way to guess a babies eye color? (A) The surroundings they are born in. (B) Their parents usual diet. (C) Just take a random guess. (D) The genealogy records of their family.',
 'D')

500