In [3]:
## ---------------------------------------------------------------------
## set up configs for huggingface hub and OS paths on HPC cluster -- make sure config.ini is correct
## ---------------------------------------------------------------------
import configparser
def auth_token():
    """Return the Hugging Face token stored in ``config.ini``.

    The value is taken from the ``token`` key of the ``[hugging_face]``
    section of ``config.ini`` in the current working directory. A missing
    section or key raises ``KeyError``.
    """
    parser = configparser.ConfigParser()
    parser.read("config.ini")
    hub_section = parser["hugging_face"]
    return hub_section["token"]

def scratch_path():
    """Return the user's scratch directory as ``/scratch/<username>/``.

    ``<username>`` is the ``username`` key of the ``[user]`` section of
    ``config.ini`` in the current working directory. The returned string
    always ends with a slash. A missing section or key raises ``KeyError``.
    """
    parser = configparser.ConfigParser()
    parser.read("config.ini")
    username = parser["user"]["username"]
    return f"/scratch/{username}/"

import os
## Redirect the Hugging Face caches to the scratch directory, but only when
## that directory actually exists on this machine.
if os.path.isdir(scratch_path()):
    os.environ.update({
        'TRANSFORMERS_CACHE': scratch_path() + '.cache/huggingface',
        'HF_DATASETS_CACHE': scratch_path() + '.cache/huggingface/datasets',
    })

## Echo the resulting settings (prints None for a variable that is not set)
print(os.environ.get('TRANSFORMERS_CACHE'))
print(os.environ.get('HF_DATASETS_CACHE'))

## ---------------------------------------------------------------------
## Load libraries
## ---------------------------------------------------------------------

import numpy as np
import pandas as pd

import torch
import transformers
from transformers import AutoTokenizer, AutoModel, LlamaForCausalLM, LlamaTokenizer

import torch.nn.functional as F

from entailma import * ## these are where the QA and prompting functions live now


## ---------------------------------------------------------------------
## Ensure GPU is available -- device should == 'cuda'
## ---------------------------------------------------------------------

## Use the GPU when CUDA is usable, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("device = ", device)

/scratch/dmpowell/.cache/huggingface
/scratch/dmpowell/.cache/huggingface/datasets
device =  cuda


In [11]:
## ---------------------------------------------------------------------
## load llama-2 as a model (not pipeline, for reasons relating to other scripts)
## ---------------------------------------------------------------------

## Checkpoint identifier passed to both from_pretrained() calls below
MODEL_NAME = "meta-llama/Llama-2-7b-hf"

tokenizer = LlamaTokenizer.from_pretrained(MODEL_NAME)

## Weights are loaded as float16; device placement is delegated to device_map="auto"
model = LlamaForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16,
)


normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
## Read the tab-separated test file
df = pd.read_csv("data/obqa/test.tsv", delimiter='\t')

## Last 10 rows as an independent copy -- a smaller df for testing
df2 = df.tail(10).copy()

## ~~answer_questions()~~ mc_choose_answer() function

This function reads a multiple-choice question from the dataset and outputs a single-letter response.

In [14]:
## Only the question text is needed, so map over that single column
df2['Model Answer'] = df2['Complete Question'].map(
    lambda question_text: mc_choose_answer(question_text, model, tokenizer)
)


In [15]:
## Number of rows where the model's answer matches the answer key
int((df2["Answer Key"] == df2["Model Answer"]).sum())

7

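This is getting ~58% accuracy (note: the 7/10 shown above comes from the 10-question `df2` subset, so it will not match this figure). For reference, the original GPT-3 with 32-shot examples got 65.8% ([Brown et al., 2020](https://arxiv.org/abs/2005.14165v4)), so that seems not too bad.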

## generate_premises() function
~~This function will read the model's statement from the data set and provide two premises that would make the statement true.~~

UPDATE: This seems to work better if we include the original question and answer, which eliminates a point of failure and gives more context for the explanation / premise generation.

UPDATE 2: This is in the `entailma` library in this repo, but I've reproduced it here to make it easier to play around with as you/we tweak prompts.


In [17]:
## Prompt text that generate_premises() places in front of every question.
## The encoding is pinned so the read does not depend on the machine's locale
## (assumes the prompt file is UTF-8, which also covers plain ASCII).
with open("entailma/prompt3b.txt", 'r', encoding="utf-8") as file:
    premises_prompt = file.read()
    

def generate_premises(question, answer, model, tokenizer, num_beams=5, max_new_tokens=150):
    """Generate up to two premises that support ``answer`` for ``question``.

    The input is the module-level ``premises_prompt`` followed by
    ``Question: ...`` and ``Answer: ...`` lines. The model's continuation is
    split on newlines and the first two lines are returned.

    Parameters
    ----------
    question : str
        Question text inserted after ``Question: ``.
    answer : str
        Answer text inserted after ``Answer: ``.
    model, tokenizer
        Model and tokenizer handed to ``transformers.pipeline``.
    num_beams : int, optional
        Beam width for beam search (default 5).
    max_new_tokens : int, optional
        Maximum number of generated tokens (default 150).

    Returns
    -------
    list of str
        At most two lines of generated text.
    """
    input_str = f"\n\n{premises_prompt}Question: {question}\nAnswer: {answer}\n"

    pipe = transformers.pipeline(
        "text-generation",
        model = model,
        tokenizer = tokenizer,
        torch_dtype=torch.float16,
        device = model.device,
    )

    # Beam search without sampling. `temperature` only has an effect when
    # do_sample=True, so it is not passed here; to experiment with sampling,
    # add do_sample=True together with temperature / top_k.
    sequences = pipe(
        input_str,
        num_beams = num_beams,
        max_new_tokens = max_new_tokens,
    )

    generated_text = sequences[0]['generated_text']

    # The pipeline returns prompt + continuation, so drop the prompt.
    completion = generated_text[len(input_str):]

    # Remove one trailing newline if there is one. Slicing off the last
    # character unconditionally would truncate the final premise whenever
    # generation stops without a newline.
    if completion.endswith("\n"):
        completion = completion[:-1]

    return completion.split("\n")[:2]


## Row-wise premise generation, using the answer key as the answer text
df2['Generated Premises'] = df2.apply(
    lambda record: generate_premises(
        question=record['Complete Question'],
        answer=record['Answer Key'],
        model=model,
        tokenizer=tokenizer,
    ),
    axis=1,
)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [18]:
## Display the 10-row test frame, including the 'Model Answer' and
## 'Generated Premises' columns added in the cells above
df2

Unnamed: 0,ID,Question Stem,Choices,Complete Question,Answer Key,Model Answer,Generated Premises
490,9-743,where might a bunny live?,(A) a thicket (B) atop palm trees (C) a sewer ...,where might a bunny live? (A) a thicket (B) at...,A,A,[Bunnies are small mammals that live in burrow...
491,9-645,A shark will be unable to survive on eating al...,(A) it is a predator (B) it is a vegetarian (C...,A shark will be unable to survive on eating al...,A,A,[Sharks are carnivorous predators that feed on...
492,8-250,"A meadow vole just gave birth, and needs to fe...",(A) oil (B) deer (C) bugs (D) recycled plastic...,"A meadow vole just gave birth, and needs to fe...",C,C,"[Meadow voles are herbivores, meaning they eat..."
493,283,The Grand Canyon was formed by,(A) a volcano erupting in 1782 (B) a river nam...,The Grand Canyon was formed by (A) a volcano e...,C,B,[The Grand Canyon was formed by the Colorado R...
494,8-183,"A woman, with a pale complexion, wants to spen...",(A) UV rays are harmful (B) sunlight will be f...,"A woman, with a pale complexion, wants to spen...",A,A,[Ultraviolet (UV) rays from the sun can cause ...
495,9-284,A person is heating water in order to cook pas...,(A) scalds (B) cools (C) toasts (D) freezes,A person is heating water in order to cook pas...,A,A,[Water boils at a temperature of 212°F (100°C)...
496,7-1186,Pasta may be cooked in water when,(A) the water is warm (B) the water is on the ...,Pasta may be cooked in water when (A) the wate...,C,A,[Water boils at a temperature of 212°F (100°C)...
497,926,A decrease in diseases,(A) has no impact on a population (B) leads to...,A decrease in diseases (A) has no impact on a ...,C,C,[A decrease in diseases leads to less sick peo...
498,7-519,"When soil is viewed in a scientific way, what ...",(A) insects like big beetles (B) tiny lifeform...,"When soil is viewed in a scientific way, what ...",B,B,"[Soil is composed of mineral particles, organi..."
499,7-7,Some animals use a liquid coming from their sk...,(A) cold (B) water (C) heat (D) humidity,Some animals use a liquid coming from their sk...,C,B,[Sweat is a liquid produced by the skin that h...
