In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import asyncio
import os
import pandas as pd

from hfppl.llms import CachedCausalLM
from hfppl.inference import smc_standard

from battleship.v1.board import Board
from battleship.scoring import compute_score
from battleship.models import QuestionGenerationModel, SingleStepQuestionGenerationModel

In [3]:
# Read the HuggingFace auth token from the .hf_auth_token file two directories
# up, then expose it both as a notebook global and as an environment variable.
token_path = os.path.join("../../", ".hf_auth_token")
with open(token_path, "r") as token_file:
    HF_AUTH_TOKEN = token_file.read().strip()

os.environ["HF_AUTH_TOKEN"] = HF_AUTH_TOKEN

In [4]:
# Load the CodeLlama checkpoint through hfppl's CachedCausalLM wrapper,
# authenticating with HF_AUTH_TOKEN.
MODEL_ID = "codellama/CodeLlama-13b-hf"
lm = CachedCausalLM.from_pretrained(MODEL_ID, auth_token=HF_AUTH_TOKEN)



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

# Prompting utils

In [None]:
# Few-shot examples; the prompt builders read its `question` and `code` columns.
EXAMPLES_CSV = "../../battleship/prompts/examples.csv"
df = pd.read_csv(EXAMPLES_CSV)

def format_example(user_input: str, response: str = None):
    """Render one User/Assistant exchange as two lines of prompt text.

    Args:
        user_input: Text placed after ``User: `` on the first line.
        response: Text placed after ``Assistant: `` on the second line. When it
            is ``None`` or empty, the second line is just ``Assistant:`` with
            no trailing space, leaving the turn open for completion.

    Returns:
        The two-line string, without a trailing newline.
    """
    assistant_suffix = " " + response if response else ""
    return f"User: {user_input}\nAssistant:{assistant_suffix}"

def make_question_prompt(df, board=None, instructions=None):
    """Build the question-generation prompt from optional context and examples.

    The prompt is the concatenation, in this order, of:

    * ``Instructions:`` followed by ``instructions`` (only when given),
    * ``Board:`` followed by ``board.to_textual_description()`` (only when given),
    * ``Questions:`` followed by every entry of ``df.question``, one per line.

    Args:
        df: Object whose ``question`` attribute is an iterable of strings
            (here, the examples DataFrame).
        board: Optional object exposing ``to_textual_description()``.
        instructions: Optional instruction text.

    Returns:
        The assembled prompt string, ending with a newline.
    """
    sections = []
    # Identity checks rather than `!= None`: `!=` dispatches to the object's own
    # __eq__/__ne__, so a board with custom equality could be silently skipped.
    if instructions is not None:
        sections.append(f"Instructions:\n{instructions}\n")
    if board is not None:
        sections.append("Board:\n" + board.to_textual_description() + "\n")
    sections.append("Questions:\n" + "\n".join(df.question) + "\n")
    return "".join(sections)

def make_question_to_code_prompt(df):
    """Build the few-shot question-to-code translation prompt.

    Each (``df.question``, ``df.code``) pair is rendered with
    ``format_example``; the rendered examples are joined with newlines and a
    final newline is appended.
    """
    rendered_examples = (
        format_example(question, code)
        for question, code in zip(df.question, df.code)
    )
    return "\n".join(rendered_examples) + "\n"

In [None]:
# Natural-language task instructions. They are placed under the "Instructions:"
# header of the question prompt whenever `make_question_prompt` is called with
# `instructions=...` (the "combined" / "COMBINED" model variants below).
# The text is sent to the model verbatim, so any edit here changes the prompt.
instructions = "User input will be a series of sentences representing a board from Battleship, the board game, that you should aim to win. Tiles in the board can either be 'Water' tiles, 'Blue Ship' tiles, 'Red Ship' tiles, and 'Purple Ship' tiles (there are only these three battleships). Some tiles may also be 'Hidden' tiles, meaning they could be any of the others but have not been revealed yet. The user will denote coordinates as follows: columns are numbered from 1 onwards, where column 1 is the leftmost column, and rows are given a letter from A onwards where row A is the topmost row (so the cell at the second row and second column is B2). Your role is to ask the most informative possible question from the context given: strictly output the question only, and make sure the questions are relevant to the context: 'Which cells should I target to sink the battleships with the least number of moves?' is not a relevant question because it is the general goal of Battleship. Questions also need to be answerable with yes or no, no other questions will be considered in scope."

# Single board evaluation

In [None]:

async def single_smc_baseline(board_id, particle_num, instructions):
    """Run standard SMC on one board with two differently-prompted models.

    Two ``QuestionGenerationModel`` instances are built for the board loaded
    from ``board_id``:

    * a baseline whose question prompt contains only the example questions;
    * a "combined" one whose question prompt also carries ``instructions`` and
      the board's textual description.

    Both use the same translation prompt and are sampled with ``particle_num``
    particles. Relies on the notebook globals ``lm`` and ``df``.

    Returns:
        A two-element list: the baseline run's particles, then the combined
        run's particles.
    """
    board = Board.from_trial_id(board_id)
    # NOTE(review): the description is computed but its value is discarded;
    # presumably a leftover sanity check. Confirm whether it can be dropped.
    board.to_textual_description()

    translation_prompt = make_question_to_code_prompt(df)

    baseline_model = QuestionGenerationModel(
        lm=lm,
        board=board,
        question_prompt=make_question_prompt(df),
        translation_prompt=translation_prompt,
    )
    combined_model = QuestionGenerationModel(
        lm=lm,
        board=board,
        question_prompt=make_question_prompt(df, board=board, instructions=instructions),
        translation_prompt=translation_prompt,
    )

    baseline_particles = await smc_standard(baseline_model, n_particles=particle_num)
    print("Done with standard model...")
    combined_particles = await smc_standard(combined_model, n_particles=particle_num)
    print("Done with combined model...")
    return [baseline_particles, combined_particles]

In [None]:
# Gather the final results of every particle from both runs into one frame.
# NOTE(review): `particles` / `particles_c` are not assigned at notebook level
# in any cell shown here; presumably they come from unpacking the return value
# of `single_smc_baseline(...)`. Confirm before running on a fresh kernel.
particle_types = [particles, particles_c]

result_frames = []
for particle_type in particle_types:
    for i, p in enumerate(particle_type):
        df_p = pd.DataFrame(p.get_final_results())
        df_p["particle"] = i  # index restarts at 0 for each particle set
        result_frames.append(df_p)

# Concatenate once, after both loops. Doing it inside the outer loop turns the
# accumulator into a DataFrame after the first particle set, which breaks the
# next `.append(...)`.
df_results = pd.concat(result_frames).reset_index(drop=True)
df_results

In [None]:
# Print, for each particle: its question, the translated program, the program's
# score on the board (labelled "EIG"), and the particle weight.
# NOTE(review): `board` is not assigned at notebook level in any cell shown
# here; it has to exist in the kernel before this cell runs.
for particle in particles:
    print(f"Question: {particle.context!s}")
    translation = particle.result['translation']
    print(f"|- Program: {translation}")
    score = compute_score(board=board, program=translation)
    print(f"|- EIG: {score}")
    print(f"|- Particle weight: {particle.weight:.4f}")
    print()

# Multiple board evaluation

In [None]:
async def run_smc_baseline(
    n_particles=5,
    trial_ids=range(1, 19),
    model_types=("REGULAR", "COMBINED"),
    model_cls=QuestionGenerationModel,
    results_file="hfppl_results.csv",
    verbose=False,
):
    """Run standard SMC for every (trial, model type) pair and collect results.

    For each pair, a ``model_cls`` instance is built on the trial's board and
    sampled with ``smc_standard``. The board description is always part of the
    question prompt; the global ``instructions`` text is added for every model
    type except ``"REGULAR"``. Relies on the notebook globals ``lm``, ``df`` and
    ``instructions``.

    Args:
        n_particles: Number of particles passed to ``smc_standard``.
        trial_ids: Iterable of trial ids passed to ``Board.from_trial_id``.
        model_types: Iterable of labels. ``"REGULAR"`` omits the instructions;
            any other label includes them. The label is stored in the
            ``model_type`` column.
        model_cls: Model class to instantiate for each run.
        results_file: CSV path rewritten after every run with all results
            accumulated so far.
        verbose: Forwarded to ``model_cls``.

    Returns:
        A DataFrame with one block of rows per particle, tagged with
        ``particle``, ``model_type`` and ``trial_id``. It is empty when
        ``trial_ids`` or ``model_types`` is empty, and no CSV is written then.
    """
    results_all = []
    # Bound up front so an empty sweep returns an empty frame instead of
    # raising UnboundLocalError at the `return`.
    df_results = pd.DataFrame()
    # The translation prompt depends only on `df`, so build it once.
    translation_prompt = make_question_to_code_prompt(df)

    for trial_id in trial_ids:
        for model_type in model_types:
            print("-" * 80)
            print(f"Trial {trial_id}")
            print(f"Model type: {model_type}")
            print("-" * 80)

            board = Board.from_trial_id(trial_id)
            instructions_used = None if model_type == "REGULAR" else instructions
            model = model_cls(
                lm=lm,
                board=board,
                question_prompt=make_question_prompt(df, board=board, instructions=instructions_used),
                translation_prompt=translation_prompt,
                verbose=verbose,
            )
            particles = await smc_standard(model, n_particles=n_particles)

            particle_frames = [
                pd.DataFrame(p.get_final_results()).assign(particle=i, model_type=model_type)
                for i, p in enumerate(particles)
            ]
            df_trial = pd.concat(particle_frames).reset_index(drop=True)
            df_trial["trial_id"] = trial_id
            results_all.append(df_trial)

            # Rewrite the CSV after every run; presumably so that partial
            # results survive a crash in a later run.
            df_results = pd.concat(results_all).reset_index(drop=True)
            df_results.to_csv(results_file, index=False)
    return df_results

In [None]:
TRIAL_IDS = range(1, 19)
N_PARTICLES = 5

await run_smc_baseline(n_particles=N_PARTICLES, trial_ids=TRIAL_IDS)

# One-step SMC

In [14]:
TRIAL_IDS = range(1, 19)
# TRIAL_IDS = [13]
N_PARTICLES = 100

df_results = await run_smc_baseline(
    n_particles=N_PARTICLES,
    trial_ids=TRIAL_IDS,
    model_types=["COMBINED"],
    model_cls=SingleStepQuestionGenerationModel,
    results_file="hfppl_results_single_step.csv",
)

--------------------------------------------------------------------------------
Trial 1
Model type: COMBINED
--------------------------------------------------------------------------------
Does
The
Does
Does
Wh
Wh
Wh
How
What
What
Is
Is
How
Is
What
Can
How
Are
Wh
Are
Are
Are
How
Wh
Does
What
Does the
How


Where
Input
How
Do
Qu
Is
How
Are
Test
Al
is
Qu
How
What
How


The other
What
Are
Are
Is
What
How
How
What
What
How
Wh
How


Does the
How
What
How
Does the


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.55 GiB. GPU 0 has a total capacty of 22.19 GiB of which 835.50 MiB is free. Including non-PyTorch memory, this process has 21.37 GiB memory in use. Of the allocated memory 19.53 GiB is allocated by PyTorch, and 1.53 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [26]:
# Rank all result rows by score, highest first.
# NOTE(review): the recorded output lists REGULAR / trial 13 rows, which does
# not match the COMBINED-only sweep in the preceding cell, so it appears to
# come from a different run. Re-execute on a fresh kernel to confirm.
df_results.sort_values("score", ascending=False)

Unnamed: 0,prefix,completion,translation,score,type,particle,model_type,trial_id
27,How many blocks are there?,How many blocks are there?,(++ (map (lambda x0 (size x0)) (set AllColors))),2.029328,final,27,REGULAR,13
55,Note: Take cargo space into consideration.,Note: Take cargo space into consideration.,(++ (map (lambda x0 (size x0)) (set AllColors))),2.029328,final,55,REGULAR,13
29,Note: Take cargo space into consideration.,Note: Take cargo space into consideration.,(++ (map (lambda x0 (size x0)) (set AllColors))),2.029328,final,29,REGULAR,13
30,How many pieces could the purple deck fit into...,How many pieces could the purple deck fit into...,(++ (map (lambda x0 (size x0)) (set AllColors))),2.029328,final,30,REGULAR,13
31,How many total tiles can be seen?,How many total tiles can be seen?,(++ (map (lambda x0 (size x0)) (set AllColors))),2.029328,final,31,REGULAR,13
...,...,...,...,...,...,...,...,...
73,\n,\n,(== (size Red) 3),0.000000,final,73,REGULAR,13
19,\end{code}\n,\end{code}\n,(== (size Red) 3),0.000000,final,19,REGULAR,13
75,Re: Board Game Logic Tile Problem Unsolved\n,Re: Board Game Logic Tile Problem Unsolved\n,(define (coloredTiles color),0.000000,final,75,REGULAR,13
76,Is there a ship at 2E?,Is there a ship at 2E?,(not (== (color 2-E) Water)),0.000000,final,76,REGULAR,13
