In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm.auto import tqdm

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import pandas as pd
import numpy as np
import string
import random
import os

In [None]:
# set params
# Model checkpoint and batching.
checkpoint_path = 'unsloth/Llama-3.2-3B-Instruct'
batch_size = 4

# Reproducibility and verbosity switches.
seed = 52
debug = False

# Run on the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# System instruction placed in front of every question. The literal's internal
# whitespace is sent to the model as-is, so it is kept exactly as written.
prompt = '''You are a concise and precise assistant. Answer the questions directly and as briefly as possible.
           Your answers should be one of the following:
            1. "Yes" if the answer is affirmative.
            2. "No" if the answer is negative.
            3. "Insufficient information" if you don't have enough information to answer.
            4. The specific entity related to the question (such as a personal name, company, etc.), if applicable.

            Do not explain or provide additional details. Just give the most relevant answer based on the question and your knowledge.
        '''

In [None]:
# Load the test questions; the first CSV column becomes the DataFrame index.
test_csv_path = '/kaggle/input/ioai-contest-3/test.csv'
test_df = pd.read_csv(test_csv_path, index_col=0)
test_df

In [None]:
# create custom dataset
class MyDataset(Dataset):
    """Wrap a DataFrame of questions as a torch ``Dataset`` of LLM prompts.

    Args:
        dataset: DataFrame with a ``'questions'`` column; its index labels are
            used as the identifiers of the returned examples.
        tokenizer: Stored on the instance but not used by this class.
        prompt: System instruction placed in front of every question.
    """

    def __init__(self, dataset, tokenizer, prompt):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.prompt = prompt

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(row_id, prompt_text)`` for the row at position ``idx``.

        ``row_id`` is the DataFrame index label of that row rather than the
        position itself, so identifiers stay correct when the index is not
        simply ``0..n-1``.
        """
        example = self.dataset.iloc[idx]
        row_id = self.dataset.index[idx]

        # form example for LLM
        input_text = (
            f"<|system|> {self.prompt}\n"
            f"<|user|> {example['questions']}\n"
            "<|assistant|> Answer:"
        )
        return row_id, input_text

def collate_fn(batch):
    """Turn a list of ``(id, prompt_text)`` pairs into a model-ready batch.

    Uses the module-level ``tokenizer`` and ``device``.

    Returns:
        A tuple ``(ids, prompts, encoded)`` where ``ids`` and ``prompts`` are
        tuples in batch order and ``encoded`` is the tokenizer output, padded
        to the longest prompt in the batch and moved to ``device``.
    """
    row_ids, prompts = zip(*batch)

    # Pad only up to the longest prompt of this batch.
    encoded = tokenizer(
        list(prompts),
        padding='longest',
        truncation=True,
        return_tensors="pt",
    )
    encoded = encoded.to(device)

    return row_ids, prompts, encoded

In [None]:
# init tokenizer and model

# NOTE(review): trust_remote_code=True lets the checkpoint repository run its
# own Python code at load time; keep it only for repositories you trust.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint_path,
    trust_remote_code=True,
)
# Batched generation with a decoder-only model needs the padding on the left:
# new tokens are appended after the last position of every row, and the
# decoding step slices the output at the padded prompt length.
tokenizer.padding_side = "left"
# padding='longest' requires a pad token; fall back to EOS when none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    checkpoint_path,
    device_map="auto",
    trust_remote_code=True,
)
# Use the tokenizer's single pad id: config.eos_token_id may hold several ids
# for some checkpoints, which is not a valid pad_token_id.
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# init ds, loader
dataset = MyDataset(test_df, tokenizer, prompt)
# Inference only: iterate in file order. Every prediction is stored under its
# row id, so shuffling adds nothing and only changes which rows share a batch.
test_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)

In [None]:
# For answer postprocessing
def get_clean_text(text):
    """Normalise a model answer: lowercase it, drop ASCII punctuation, trim.

    Args:
        text: Raw answer string.

    Returns:
        The lowercased string with every character of ``string.punctuation``
        removed and leading/trailing whitespace stripped.
    """
    # A translation table mapping each punctuation character to None deletes it.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    return text.lower().translate(drop_punctuation).strip()

In [None]:
def seed_everything(seed):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU, the current CUDA device
    and all CUDA devices), writes ``PYTHONHASHSEED`` into the process
    environment, and sets cuDNN to deterministic mode with benchmarking off.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python and NumPy generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators.
    for seed_torch in (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_torch(seed)

    # Trade cuDNN autotuning for repeatable kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Use the seed from the params cell instead of a separate hard-coded value.
seed_everything(seed)

In [None]:
# generate answers
result_dict = {}
for idxs, queries, tokens in tqdm(test_loader):
    # All rows of a batch are padded to one length, so a single offset marks
    # where the prompt ends and the generated tokens begin.
    # NOTE(review): slicing at this offset assumes the tokenizer pads on the
    # left -- confirm tokenizer.padding_side.
    prompt_len = tokens["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            input_ids=tokens["input_ids"],
            attention_mask=tokens["attention_mask"],
            max_new_tokens=256,
            pad_token_id=model.config.pad_token_id,
        )

    # Decode only the newly generated part of every row.
    responses = tokenizer.batch_decode(outputs[:, prompt_len:], skip_special_tokens=True)

    for row_id, query, response in zip(idxs, queries, responses):
        pred = get_clean_text(response)
        result_dict[row_id] = pred
        if debug:
            print(f'Model input: {query}\n')
            print(f'Model answer: {pred}\n\n')


In [None]:
# One row per prediction, ordered by ID.
df = pd.DataFrame(list(result_dict.items()), columns=["ID", "answer"])
df = df.sort_values(by="ID")
# Lowercase the answers and drop any dots.
df['answer'] = [text.lower().replace('.', '') for text in df['answer']]

In [None]:
def _normalize_yes_no(answer):
    """Collapse an answer to 'no' or 'yes' when that word occurs in it.

    'no' is checked first, so an answer containing both words becomes 'no'.
    Answers containing neither word are returned unchanged.
    """
    # split() without arguments splits on any whitespace, so words separated
    # by newlines or tabs in a multi-line generation are recognised too.
    words = answer.split()
    if 'no' in words:
        return 'no'
    if 'yes' in words:
        return 'yes'
    return answer


df['answer'] = [_normalize_yes_no(x) for x in df['answer']]
df.to_csv('baseline.csv', index=False)