In [None]:
# Enable the IPython autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import pandas as pd

from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
from tqdm.notebook import tqdm

from eig import compute_eig_fast
from eig.battleship import Ship, BattleshipHypothesis, Parser, Executor
from eig.battleship.program import ProgramSyntaxError

from battleship.translator import Translator

In [None]:
# Read the Hugging Face auth token from ../.hf_auth_token, keep it in
# HF_AUTH_TOKEN, and export it through the HF_AUTH_TOKEN environment variable.
token_path = os.path.join("../", ".hf_auth_token")
with open(token_path, "r") as token_file:
    HF_AUTH_TOKEN = token_file.read().strip()

os.environ["HF_AUTH_TOKEN"] = HF_AUTH_TOKEN

# Inference with a pre-trained LLM

In [None]:
MODEL_NAME = "WizardLM/WizardCoder-Python-7B-V1.0"

# Load the tokenizer and the causal LM for MODEL_NAME, authenticating with the
# token read earlier; the model is requested with automatic device placement
# and 8-bit loading.
model_load_kwargs = {
    "token": HF_AUTH_TOKEN,
    "device_map": "auto",
    "load_in_8bit": True,
}
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_AUTH_TOKEN)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_load_kwargs)

In [None]:
# Wrap the loaded model and tokenizer in the project's Translator, which is
# used below to turn natural-language questions into code.
translator = Translator(model=model, tokenizer=tokenizer)

In [None]:
# Smoke test: translate a single question and show the generated program.
question = "How many blocks is the blue ship?"
completion = translator.question_to_code(question)
print(completion)

In [None]:
# Second smoke test, this time with a question about a specific tile.
question = "What is at A7?"
completion = translator.question_to_code(question)
print(completion)

# Evaluate the model on test examples

In [None]:
# Load the test examples; the cells below read its 'question' and 'code' columns.
test_examples_path = '../battleship/prompts/test_examples.csv'
df_test = pd.read_csv(test_examples_path)
df_test

In [None]:
# Translate every test question with the current translator and store the
# generated programs in the 'completion' column (tqdm shows progress).
completions = []
for question in tqdm(df_test['question']):
    completions.append(translator.question_to_code(question))

df_test['completion'] = completions

In [None]:
# Display the test set together with the new 'completion' column.
df_test

In [None]:
def check_parse(program):
    """Report whether `program` is accepted by `Parser.parse`.

    Returns True when parsing completes and False when it raises
    ProgramSyntaxError. Any other exception propagates to the caller.
    """
    try:
        Parser.parse(program)
    except ProgramSyntaxError:
        parsed = False
    else:
        parsed = True
    return parsed

# Score each completion: does it parse, and is it string-identical to the
# reference program in the 'code' column?
df_test['parseable'] = list(map(check_parse, df_test['completion']))
df_test['exact_match'] = [
    predicted == expected
    for predicted, expected in zip(df_test['completion'], df_test['code'])
]

In [None]:
# Display the test set with the 'parseable' and 'exact_match' columns added.
df_test

# Evaluate multiple models

In [None]:
# NOTE(review): this repeats the check_parse defined in the single-model
# evaluation section above; consider keeping only one definition.
def check_parse(program):
    """Return True if `Parser.parse(program)` succeeds, False on ProgramSyntaxError.

    Exceptions other than ProgramSyntaxError are not caught.
    """
    is_valid = True
    try:
        Parser.parse(program)
    except ProgramSyntaxError:
        is_valid = False
    return is_valid

In [None]:
# Models to compare; each one is loaded in turn and run over the full test set.
MODEL_NAMES = [
    "bigcode/starcoder",
    "codellama/CodeLlama-7b-hf",
    "WizardLM/WizardCoder-15B-V1.0",
]

HF_AUTH_TOKEN = os.environ["HF_AUTH_TOKEN"]

for model_name in MODEL_NAMES:
    print(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_AUTH_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, token=HF_AUTH_TOKEN, device_map="auto", load_in_8bit=True,
    )
    translator = Translator(model=model, tokenizer=tokenizer)

    # Generate this model's completions once and score exactly those.
    # The result columns are keyed by the loop variable `model_name` (not the
    # global MODEL_NAME from the single-model section), so each model gets its
    # own columns instead of overwriting the previous model's results.
    completions = [translator.question_to_code(q) for q in tqdm(df_test['question'])]
    df_test[f'completion-{model_name}'] = completions
    df_test[f'parseable-{model_name}'] = [check_parse(p) for p in completions]
    df_test[f'exact_match-{model_name}'] = [
        p == a for p, a in zip(completions, df_test['code'])
    ]

df_test

In [None]:
# Persist the test set with all completion and scoring columns; index=False
# keeps the DataFrame index out of the CSV.
df_test.to_csv('../battleship/prompts/test_examples_with_completions.csv', index=False)