In [43]:
from battleship.prompting import QuestionGenerationPrompt, TranslationPrompt
from battleship.board import Board

import pandas as pd
import os
import time
from tqdm import tqdm
from math import ceil 

from eig.battleship import Parser

from battleship.grammar import BattleshipGrammar
from battleship.scoring import compute_score_parallel, compute_score
from battleship.board import Board

## CodeLlama

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import List

MODEL_NAME = "codellama/CodeLlama-7b-hf"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    load_in_8bit=True,
)

In [None]:
def get_completions(prompts: List[str], max_new_tokens: int = 32) -> List[str]:
    if isinstance(prompts, str):
        prompts = [prompts]

    inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(
        device=model.device
    )
    #print(inputs)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Return only the completion
    completions = tokenizer.batch_decode(
        outputs[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True
    )
    # Remove everything after the first newline and strip whitespace
    completions = [completion.split("\n")[0].strip() for completion in completions]
    return completions

In [None]:
def generate_questions_local(generation_prompt: QuestionGenerationPrompt, n: int):
    generation_prompts, seeds = [], []
    for i in range(n):
        generation_prompt_temp = QuestionGenerationPrompt(
                target_trial_id= generation_prompt.target_trial_id,
                board_format= generation_prompt.board_format,
                n_example_trials=generation_prompt.n_example_trials,
                n_examples_per_trial=generation_prompt.n_questions_per_trial,
                include_system_prompt=generation_prompt.include_system_prompt,
                include_instructions=generation_prompt.include_instructions,
                include_board=generation_prompt.include_board,
                random_seed= generation_prompt.random_seed #TODO: Do we want different seeds?
        )
        generation_prompts.append(generation_prompt_temp)
        seeds.append(generation_prompt_temp.random_seed)

    questions = get_completions(prompts=[str(i) for i in generation_prompts])
    return (questions,seeds)

def translate_programs_local(questions: List[str], translation_prompt: TranslationPrompt, n: int):
    translation_prompts, seeds = [], []
    for i in range(n):
        translation_prompt_temp = TranslationPrompt(
            target_question=questions[i],
            target_trial_id=translation_prompt.target_trial_id,
            n_example_trials=translation_prompt.n_example_trials,
            n_examples_per_trial=translation_prompt.n_questions_per_trial,
            random_seed=translation_prompt.random_seed,
            include_system_prompt=translation_prompt.include_system_prompt,
            include_instructions=translation_prompt.include_instructions,
            include_board=translation_prompt.include_board,
        )
        translation_prompts.append(translation_prompt_temp)
        seeds.append(translation_prompt_temp.random_seed)
    
    programs = get_completions(prompts=[str(i) for i in translation_prompts])
    return (programs,seeds)

def generate_programs_local(generation_prompt: QuestionGenerationPrompt, translation_prompt: TranslationPrompt, n: int):
    #TODO: split up requests of large n into smaller n to avoid CUDA OutOfMemoryError
    questions, generation_seeds = generate_questions_local(generation_prompt, n)
    programs, translation_seeds = translate_programs_local(questions, translation_prompt, n)
    seed_couples = [(generation_seeds[i],translation_seeds[i]) for i in range(len(generation_seeds))]
    return (programs, seed_couples)

## GPT-4

In [None]:
from openai import OpenAI
from enum import StrEnum

client = OpenAI()

class OpenAIModels(StrEnum):
    TEXT = "gpt-4"
    VISION = "gpt-4-vision-preview"

In [None]:
def generate_questions_gpt(generation_prompt: QuestionGenerationPrompt, n: int):
        completion = client.chat.completions.create(
            model=OpenAIModels.TEXT,
            messages=generation_prompt.to_chat_format(),
            n=n,
            temperature=0.7
        )
        questions = [str(completion.choices[i].message.content).replace("Question:","").strip() for i in range(n)]
        seeds = [generation_prompt.random_seed for _ in range(n)]
        return (questions,seeds)

def translate_questions_gpt(questions: List[str], translation_prompt: TranslationPrompt, n: int):
    programs, seeds = [], []
    for i in range(n):
        translation_prompt_temp = TranslationPrompt(
            target_question=questions[i],
            target_trial_id=translation_prompt.target_trial_id,
            n_example_trials=translation_prompt.n_example_trials,
            n_examples_per_trial=translation_prompt.n_questions_per_trial,
            random_seed=translation_prompt.random_seed,
            include_system_prompt=translation_prompt.include_system_prompt,
            include_instructions=translation_prompt.include_instructions,
            include_board=translation_prompt.include_board,
        )
        completion = client.chat.completions.create(
            model=OpenAIModels.TEXT,
            messages=translation_prompt_temp.to_chat_format(),
            n=1,
            temperature=0.7
        )
        seeds.append(translation_prompt_temp.random_seed)
        program_temp = str(completion.choices[0].message.content).replace("Query:","").strip()
        programs.append(program_temp)
    return (programs,seeds)

        

def generate_programs_gpt(generation_prompt: QuestionGenerationPrompt, translation_prompt: TranslationPrompt, n: int):
    questions, generation_seeds = generate_questions_gpt(generation_prompt, n)
    programs, translation_seeds = translate_questions_gpt(questions, translation_prompt, n)
    seed_couples = [(generation_seeds[i],translation_seeds[i]) for i in range(len(generation_seeds))]
    return (programs, seed_couples)

## Baseline

In [None]:
prompt = QuestionGenerationPrompt(
    target_trial_id=None,
    board_format="textual",
    n_example_trials=3,
    n_examples_per_trial=3,
    include_system_prompt=True,
    include_instructions=True,
    include_board=True,
    random_seed=None,
)

translation_prompt = TranslationPrompt(
    target_question=None,
    target_trial_id=None,
    n_example_trials=10,
    n_examples_per_trial=1,
    random_seed=None,
    include_instructions=False,
)

In [None]:
dataDict = {
    'program':[],
    'board_id':[],
    'score':[],
    'model':[],
    'example_trials':[],
    'examples_per_trial':[],
    'random_seeds':[]
}
df = pd.DataFrame(dataDict)

if not os.path.isfile('llm_data.csv'):
   df.to_csv('llm_data.csv', header='column_names')

In [None]:
def llm_baseline(cores: int, generation_prompt: QuestionGenerationPrompt, translation_prompt: TranslationPrompt, n: int):
    scores = []
    for model in ["codellama","gpt4"]:
        for id in range(1,18+1):
            start_time = time.time()
            print(f"board {id}", end=" | ") 
            generation_prompt.target_trial_id = id

            if model == "codellama":
                programs, seed_couples = generate_programs_local(generation_prompt=generation_prompt, translation_prompt=translation_prompt, n=n)
            if model == "gpt4":
                programs, seed_couples = generate_programs_gpt(generation_prompt=generation_prompt, translation_prompt=translation_prompt, n=n)
            
            print(f"finished generating programs in {round(time.time()-start_time,2)}s", end=" | ")
            
            if cores > 1:
                #If more than one core is used, breaks it up into chunks of 50 programs to score in parallel as to not overwork the machine and kill the EC2 instance or the kernel.
                for i in tqdm(range(ceil(n/50))):
                    program_selection = [i for i in programs[i*50:((i+1)*50)]]
                    program_scores = compute_score_parallel(programs=program_selection,board=Board.from_trial_id(id),processes=cores,show_progress=False)
                    scores.extend(program_scores)
                    dataDict['program'].extend(program_selection)
                    dataDict['board_id'].extend([id for _ in range(len(program_scores))])
                    dataDict['score'].extend(program_scores)
                    dataDict['model'].extend([model for _ in range(len(program_scores))])
                    dataDict['example_trials'].extend([generation_prompt.n_example_trials for _ in range(len(program_scores))])
                    dataDict['examples_per_trial'].extend([generation_prompt.n_questions_per_trial for _ in range(len(program_scores))])
                    dataDict['random_seeds'].extend(seed_couples)
            else:
                #If only one core is used, computes programs scores sequentially
                for prog in programs:
                    score = compute_score(program=prog[0], board=Board.from_trial_id(id))
                    scores.append(score)

            
            print(f"finished scoring in {round(time.time()-start_time,2)}s from the start")
        print(f"finished with model {model}")

    df = pd.DataFrame(dataDict)
    df.to_csv('llm_data.csv', mode='a', header=False)
    print(f"finished {n}-shot sampling with model {model}")
    return df

In [None]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

llm_baseline(cores=os.cpu_count()-2, generation_prompt=prompt, translation_prompt=translation_prompt, n=100)

board 1 | 

OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU 0 has a total capacty of 22.19 GiB of which 3.18 GiB is free. Including non-PyTorch memory, this process has 19.01 GiB memory in use. Of the allocated memory 15.38 GiB is allocated by PyTorch, and 3.32 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF