In [1]:
## ---------------------------------------------------------------------
## set up configs for huggingface hub and OS paths on HPC cluster -- make sure config.ini is correct
## ---------------------------------------------------------------------
import configparser
def auth_token():
    """Return the Hugging Face access token stored in ``config.ini``.

    The file is looked up relative to the current working directory and must
    provide a ``token`` option inside a ``[hugging_face]`` section.

    Raises:
        KeyError: if the section or the option is missing. This includes the
            case where ``config.ini`` could not be read, because
            ``ConfigParser.read`` skips files it cannot open.
    """
    parser = configparser.ConfigParser()
    parser.read("config.ini")
    hf_section = parser["hugging_face"]
    return hf_section["token"]

def scratch_path():
    """Return the user's scratch directory, ``/scratch/<username>/``.

    ``<username>`` is the ``username`` option of the ``[user]`` section in
    ``config.ini`` (read from the current working directory). The returned
    string always ends with a slash.

    Raises:
        KeyError: if the section or the option is missing.
    """
    parser = configparser.ConfigParser()
    parser.read("config.ini")
    username = parser["user"]["username"]
    return "/scratch/" + username + "/"

import os
# Parse config.ini once and reuse the result instead of re-reading the file
# for every use of the scratch path.
_scratch_dir = scratch_path()
# Redirect the Hugging Face caches only when the user's scratch directory
# exists on this machine; otherwise the environment is left untouched.
if os.path.isdir(_scratch_dir):
    os.environ['TRANSFORMERS_CACHE'] = _scratch_dir + '.cache/huggingface'
    os.environ['HF_DATASETS_CACHE'] = _scratch_dir + '.cache/huggingface/datasets'
# Echo the effective settings (prints None for a variable that is unset).
print(os.getenv('TRANSFORMERS_CACHE'))
print(os.getenv('HF_DATASETS_CACHE'))

## ---------------------------------------------------------------------
## Load libraries
## ---------------------------------------------------------------------

import numpy as np
import pandas as pd

import torch
import transformers
from transformers import AutoTokenizer, AutoModel, LlamaForCausalLM, LlamaTokenizer

import torch.nn.functional as F


## ---------------------------------------------------------------------
## Ensure GPU is available -- device should == 'cuda'
## ---------------------------------------------------------------------

# Use the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("device = ", device)

/scratch/cmusfel1/.cache/huggingface
/scratch/cmusfel1/.cache/huggingface/datasets
device =  cuda


In [2]:
## ---------------------------------------------------------------------
## load llama-2 and set up a pipeline
## ---------------------------------------------------------------------

MODEL_NAME = "meta-llama/Llama-2-7b-hf"

# The model download authenticates with the Hub token from config.ini. The
# tokenizer comes from the same repository, so it is given the same token.
# Read the token once and reuse it for both calls.
_hf_token = auth_token()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=_hf_token)

# Hand the already-loaded tokenizer to the pipeline so the notebook and the
# pipeline share one tokenizer instance instead of loading it twice.
pipeline = transformers.pipeline(
    "text-generation",
    model=MODEL_NAME,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
    use_auth_token=_hf_token,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


<h2>answer_questions() function</h2>
<p>This function reads a multiple-choice question from the dataset and returns a single-letter response.</p>

In [4]:
# Load the tab-separated question set.
data = pd.read_csv("/home/cmusfel1/test.tsv", sep='\t')
# Smaller df for testing: take the last 10 rows first and copy only those, so
# data2 is an independent frame and the full table is never duplicated.
data2 = data.tail(10).copy()

# Prompt text that answer_questions() places in front of every question.
with open("/home/cmusfel1/prompt1.txt", 'r') as file:
    answer_prompt = file.read()

def answer_questions(question, model, tokenizer):
    """Ask the model one question and return a one-character answer.

    Args:
        question: Question text appended to the module-level ``answer_prompt``.
        model: Callable invoked as ``model(prompt, **generation_kwargs)`` that
            returns a list of dicts carrying a ``'generated_text'`` entry.
        tokenizer: Object whose ``eos_token_id`` is forwarded to ``model``.

    Returns:
        The last non-whitespace character of the newly generated text, or an
        empty string when nothing (or only whitespace) was generated.
    """
    input_str = answer_prompt + f"\nQuestion: {question}\nAnswer:"
    sequences = model(
        input_str,
        do_sample=False,  # no sampling
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=1,  # request a single new token
    )

    generated_text = sequences[0]['generated_text']
    # Look only at what the model added. Indexing the full text would return
    # the prompt's trailing ':' whenever the completion is empty.
    if generated_text.startswith(input_str):
        completion = generated_text[len(input_str):]
    else:
        # The model returned the completion without echoing the prompt.
        completion = generated_text
    completion = completion.strip()
    return completion[-1] if completion else ""

def _answer_for_row(row):
    """Return the model's answer for the question held in one row of data2."""
    return answer_questions(row['Complete Question'], pipeline, tokenizer)


# Run the model once per row and store the results in a new column.
data2.loc[:, 'Model Answer'] = data2.apply(_answer_for_row, axis=1)


<h2>generate_statement() function</h2>
<p>This function reads the multiple-choice question and the model's answer from the dataset and returns a statement.</p>

In [5]:
# Read the prompt text that generate_statement() places before each question.
# The file is opened in the default text read mode.
with open("/home/cmusfel1/prompt2.txt") as file:
    statement_prompt = file.read()
    
def generate_statement(question, answer, model, tokenizer):
    """Generate a statement from a question and the answer chosen for it.

    Args:
        question: Question text inserted after the module-level
            ``statement_prompt``.
        answer: Answer inserted on the ``Answer:`` line of the prompt.
        model: Callable invoked as ``model(prompt, **generation_kwargs)`` that
            returns a list of dicts carrying a ``'generated_text'`` entry.
        tokenizer: Object whose ``eos_token_id`` is forwarded to ``model``.

    Returns:
        The newly generated text up to the first ``'Statement:'`` marker or
        ``'*'`` delimiter, whichever comes first. Surrounding whitespace is
        kept as generated.
    """
    input_str = f"\n{statement_prompt}\nQuestion: {question}\nAnswer: {answer}\nStatement:"
    sequences = model(
        input_str,
        do_sample=True,  # sampling is on, so repeated calls can differ
        top_p=0.95,
        max_new_tokens=80,
        temperature=0.7,
        eos_token_id=tokenizer.eos_token_id
    )

    generated_text = sequences[0]['generated_text']
    # Isolate the model's continuation by removing the prompt prefix. This
    # replaces a hard-coded split index that was only right when the prompt
    # file held exactly four 'Statement:' markers and the question/answer
    # held none.
    if generated_text.startswith(input_str):
        completion = generated_text[len(input_str):]
    else:
        # The model returned the completion without echoing the prompt.
        completion = generated_text
    # Drop anything from a follow-on 'Statement:' block onwards, then cut at
    # the '*' end marker that prompt2.txt asks the model to emit.
    statement = completion.split('Statement:')[0].split('*')[0]
    return statement

def _statement_for_row(row):
    """Return the generated statement for one row's question and model answer."""
    return generate_statement(
        row['Complete Question'],
        row['Model Answer'],
        pipeline,
        tokenizer,
    )


# Run the model once per row and store the results in a new column.
data2.loc[:, 'Generated Statement'] = data2.apply(_statement_for_row, axis=1)



<h2>generate_premises() function</h2>
<p>This function reads the model's statement from the dataset and generates two premises that would make the statement true.</p>


In [7]:
# Read the prompt text that generate_premises() places before each statement.
# The file is opened in the default text read mode.
with open("/home/cmusfel1/prompt3.txt") as file:
    premises_prompt = file.read()
    
def generate_premises(statement, model, tokenizer):
    """Generate premises for a statement.

    Args:
        statement: Statement text inserted after the module-level
            ``premises_prompt``.
        model: Callable invoked as ``model(prompt, **generation_kwargs)`` that
            returns a list of dicts carrying a ``'generated_text'`` entry.
        tokenizer: Not used by this function; accepted so the signature
            matches the other generation helpers.

    Returns:
        The first line of the newly generated text, cut at a ``'Premises:'``
        marker if one appears on that line. Leading whitespace is kept as
        generated.
    """
    input_str = f"\n{premises_prompt}\nStatement: {statement}\nPremises:"
    sequences = model(
        input_str,
        do_sample=True,  # sampling is on, so repeated calls can differ
        top_p=0.95,
        max_new_tokens=150,
        temperature=0.7
    )

    generated_text = sequences[0]['generated_text']
    # Isolate the model's continuation by removing the prompt prefix. This
    # replaces a hard-coded split index that was only right when the prompt
    # file held exactly four 'Premises:' markers and the statement held none.
    if generated_text.startswith(input_str):
        completion = generated_text[len(input_str):]
    else:
        # The model returned the completion without echoing the prompt.
        completion = generated_text
    # Keep only the first generated line; later lines belong to follow-on
    # text the model may have continued with.
    premises = completion.split('Premises:')[0].split('\n')[0]
    return premises

def _premises_for_row(row):
    """Return the generated premises for one row's generated statement."""
    return generate_premises(row['Generated Statement'], pipeline, tokenizer)


# Run the model once per row and store the results in a new column.
data2.loc[:, 'Generated Premises'] = data2.apply(_premises_for_row, axis=1)



In [8]:
# Display the test subset with the 'Model Answer', 'Generated Statement' and
# 'Generated Premises' columns added by the cells above.
data2

Unnamed: 0,ID,Question Stem,Choices,Complete Question,Answer Key,Model Answer,Generated Statement,Generated Premises
490,9-743,where might a bunny live?,(A) a thicket (B) atop palm trees (C) a sewer ...,where might a bunny live? (A) a thicket (B) at...,A,A,where might a bunny live?,(1) Bunnies prefer to live in the country. (2...
491,9-645,A shark will be unable to survive on eating al...,(A) it is a predator (B) it is a vegetarian (C...,A shark will be unable to survive on eating al...,A,A,A shark will be unable to survive on eating a...,(1) Sharks are predators. (2) Moss and algae ...
492,8-250,"A meadow vole just gave birth, and needs to fe...",(A) oil (B) deer (C) bugs (D) recycled plastic...,"A meadow vole just gave birth, and needs to fe...",C,C,"A meadow vole just gave birth, and needs to f...",(1) A meadow vole eats bugs. (2) A meadow vol...
493,283,The Grand Canyon was formed by,(A) a volcano erupting in 1782 (B) a river nam...,The Grand Canyon was formed by (A) a volcano e...,C,B,The Grand Canyon was formed by a river named ...,(1) Rivers can cause valleys and canyons to f...
494,8-183,"A woman, with a pale complexion, wants to spen...",(A) UV rays are harmful (B) sunlight will be f...,"A woman, with a pale complexion, wants to spen...",A,A,"A woman, with a pale complexion, wants to spe...",(1) Sunscreen is used to protect skin from UV...
495,9-284,A person is heating water in order to cook pas...,(A) scalds (B) cools (C) toasts (D) freezes,A person is heating water in order to cook pas...,A,A,A person is heating water in order to cook pa...,(1) A rock dropped in a pond creates a rippli...
496,7-1186,Pasta may be cooked in water when,(A) the water is warm (B) the water is on the ...,Pasta may be cooked in water when (A) the wate...,C,A,Pasta may be cooked in water when the water i...,(1) Pasta is cooked in water (2) Water can be...
497,926,A decrease in diseases,(A) has no impact on a population (B) leads to...,A decrease in diseases (A) has no impact on a ...,C,C,A decrease in diseases leads to less sick peo...,(1) Diseases are the main cause of illness. (...
498,7-519,"When soil is viewed in a scientific way, what ...",(A) insects like big beetles (B) tiny lifeform...,"When soil is viewed in a scientific way, what ...",B,B,"When soil is viewed in a scientific way, what...",(1) Soil is made up of tiny lifeforms. (2) Th...
499,7-7,Some animals use a liquid coming from their sk...,(A) cold (B) water (C) heat (D) humidity,Some animals use a liquid coming from their sk...,C,C,Some animals use a liquid coming from their s...,"(1) Animals with glands, such as porcupines, ..."
