In [None]:
!pip install transformers peft accelerate bitsandbytes \
    -U --no-index --find-links /kaggle/input/lmsys-wheel-files/lmsys

In [2]:
import time
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor

import json
import unicodedata
import torch
import sklearn
import numpy as np
import pandas as pd
from transformers import Gemma2ForSequenceClassification, GemmaTokenizerFast, BitsAndBytesConfig, AutoTokenizer
from transformers.data.data_collator import pad_without_fast_tokenizer_warning
from peft import PeftModel

2024-08-05 05:35:51.302430: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-05 05:35:51.302535: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-05 05:35:51.427175: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# The model-loading cells further down place one model copy on cuda:0 and another
# on cuda:1, so fail fast unless exactly two GPUs are visible.
assert torch.cuda.device_count() == 2

## Configurations

In [4]:
@dataclass
class Config:
    """Inference settings for this notebook.

    Every attribute is type-annotated so that ``@dataclass`` registers it as a
    real field: defaults are unchanged, but values can now be overridden per
    instance (``Config(max_length=1024)``) and appear in ``repr``/``==``.
    """

    gemma_dir: str = '/kaggle/input/gemma-2-9b-hf'  # base model weights
    lora_dir: str = '/kaggle/input/gemma2-finetuned-0716'  # fine-tuned LoRA adapter
    max_length: int = 2400  # token budget per sample
    batch_size: int = 2
    device: torch.device = torch.device("cuda")
    tta: bool = False  # test time augmentation. <prompt>-<model-b's response>-<model-a's response>
    spread_max_length: bool = False  # whether to apply max_length//3 on each input or max_length on the concatenated input


cfg = Config()

# Load & pre-process Data 

In [5]:
# Read the competition test set; later cells use its id, prompt, response_a and
# response_b columns.
test = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/test.csv')

In [6]:
def process(text):
    """Flatten a JSON-encoded array into one space-separated string.

    Elements that are ``None`` or the empty string are replaced by the literal
    text ``"null"``. Every element is then rendered with ``repr()`` (so strings
    keep their surrounding quotes) and NFKC-normalised before joining.
    """

    def _render(item):
        # None and "" collapse to the same placeholder.
        if item is None or item == "":
            item = "null"
        return unicodedata.normalize('NFKC', repr(item))

    return " ".join(_render(item) for item in json.loads(text))


# Flatten each JSON-encoded conversation column into a single plain string.
for column in ('prompt', 'response_a', 'response_b'):
    test.loc[:, column] = test[column].apply(process)

display(test.head(5))

Unnamed: 0,id,prompt,response_a,response_b
0,136060,"'I have three oranges today, I ate an orange y...",'You have two oranges today.','You still have three oranges. Eating an orang...
1,211333,"""You are a mediator in a heated political deba...","""Thank you for sharing the details of the situ...","""Mr Reddy and Ms Blue both have valid points i..."
2,1233961,'How to initialize the classification head whe...,"""When you want to initialize the classificatio...","""To initialize the classification head when pe..."


In [7]:
# Assemble the full "prompt / Model A / Model B" text in the template fed to the model
test['text'] = (
    'User prompt: ' + test['prompt']
    + '\n\nModel A :\n' + test['response_a']
    + '\n\n--------\n\nModel B:\n' + test['response_b']
)
print(test['text'][0])

User prompt: 'I have three oranges today, I ate an orange yesterday. How many oranges do I have?'

Model A :
'You have two oranges today.'

--------

Model B:
'You still have three oranges. Eating an orange yesterday does not affect the number of oranges you have today.'


In [8]:
%%time

# tokenizer = GemmaTokenizerFast.from_pretrained(cfg.gemma_dir)
# Load the tokenizer stored under the fine-tuned adapter's dataset directory
# (not the one in cfg.gemma_dir, see the disabled line above).
tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/gemma2-finetuned-0716/gemma-tokenizer')
tokenizer.add_eos_token = True
tokenizer.padding_side = "right"

# Work on a copy; `test` is rebound to the processed frame at the end of this cell.
data = test.copy()

# Utility: number of tokens per text
def get_token_lengths(texts):
    """Return a list with the token count of every entry in ``texts``.

    ``texts`` must expose ``.tolist()`` (e.g. a pandas Series of strings).
    Uses the module-level ``tokenizer``.
    """
    encoded = tokenizer(texts.tolist(), return_tensors='np')
    return list(map(len, encoded['input_ids']))

def get_inputs(texts):
    """Tokenize every entry in ``texts`` and return the token ids per text.

    ``texts`` must expose ``.tolist()`` (e.g. a pandas Series of strings).
    Uses the module-level ``tokenizer``.

    Returns a 1-D ``object`` array whose i-th element is the array of token ids
    of the i-th text. The container is built explicitly instead of relying on
    ``return_tensors='np'``: that path yields a regular 2-D integer array when
    all sequences happen to have the same length (e.g. a single-row frame), and
    a 2-D array cannot hold the shorter, truncated rows that ``assign_tokens``
    writes back element by element.
    """
    token_lists = tokenizer(texts.tolist())['input_ids']
    input_ids = np.empty(len(token_lists), dtype=object)
    for i, tokens in enumerate(token_lists):
        input_ids[i] = np.asarray(tokens)
    return input_ids

# Token ids of each segment on its own, kept positionally aligned with `data`.
prompt_inputs, res_a_inputs, res_b_inputs = (
    get_inputs(data[column]) for column in ('prompt', 'response_a', 'response_b')
)
data['total_tokens'] = get_token_lengths(data['text'])


# Index labels of the rows whose full text exceeds max_length
indicies = data.loc[data['total_tokens'].gt(cfg.max_length)].index

# Re-budget the tokens of over-long rows, decode them, and write the shortened text back to the frame

from tqdm import tqdm
def assign_tokens(data, indicies, max_length):
    """Shorten prompt / response_a / response_b of the selected rows.

    For every index label in ``indicies`` the ``max_length`` token budget is
    split between the three segments in proportion to their token counts, each
    segment's token ids are cut to its share, and the decoded text is written
    back into ``data``.

    Reads and updates the module-level token containers ``prompt_inputs``,
    ``res_a_inputs`` and ``res_b_inputs``, which are positionally aligned with
    the rows of ``data``. Uses the module-level ``tokenizer`` for decoding.

    Args:
        data: DataFrame with ``prompt``, ``response_a`` and ``response_b``
            columns; modified in place. Its index must be unique.
        indicies: index labels of the rows to shorten.
        max_length: total token budget shared by the three segments.

    Returns:
        The same ``data`` object that was passed in.
    """
    for ind in tqdm(indicies):
        # `ind` is an index *label*, while the token containers are positional:
        # translate it so a non-default index cannot select the wrong row.
        pos = data.index.get_loc(ind)

        len_p = len(prompt_inputs[pos])
        len_a = len(res_a_inputs[pos])
        len_b = len(res_b_inputs[pos])
        total = len_p + len_a + len_b

        # Proportional shares; the last segment takes whatever the two
        # rounded-down shares leave over so the three add up to max_length.
        p_assign = int((len_p / total) * max_length)
        a_assign = int((len_a / total) * max_length)
        b_assign = max_length - (p_assign + a_assign)

        prompt_inputs[pos] = prompt_inputs[pos][:p_assign]
        res_a_inputs[pos] = res_a_inputs[pos][:a_assign]
        res_b_inputs[pos] = res_b_inputs[pos][:b_assign]

        # Decode the shortened ids and store the text back in the frame
        data.loc[ind, 'prompt'] = tokenizer.decode(prompt_inputs[pos], skip_special_tokens=True)
        data.loc[ind, 'response_a'] = tokenizer.decode(res_a_inputs[pos], skip_special_tokens=True)
        data.loc[ind, 'response_b'] = tokenizer.decode(res_b_inputs[pos], skip_special_tokens=True)

    return data

# assign_tokens returns the very frame it was given, so `test` now refers to the
# processed copy (including the extra `total_tokens` column).
test = assign_tokens(data, indicies, cfg.max_length)
print('text_preprocessed')

# Drop the temporary names; `test` still references the processed frame.

del tokenizer, data

0it [00:00, ?it/s]

text_preprocessed
CPU times: user 758 ms, sys: 136 ms, total: 894 ms
Wall time: 1.07 s





In [9]:
# Display the prompt column after preprocessing.
test['prompt']

0    'I have three oranges today, I ate an orange y...
1    "You are a mediator in a heated political deba...
2    'How to initialize the classification head whe...
Name: prompt, dtype: object

# Tokenize

In [10]:
def tokenize(
    tokenizer, prompt, response_a, response_b, max_length=cfg.max_length, spread_max_length=cfg.spread_max_length
):
    """Turn prompt/response triples into model inputs.

    Each sample is rendered as ``"User prompt: ..."``, ``"\\n\\nModel A :\\n..."``
    and ``"\\n\\n--------\\n\\nModel B:\\n..."``.

    Args:
        tokenizer: callable tokenizer returning ``input_ids`` / ``attention_mask``.
        prompt, response_a, response_b: iterables of strings, one entry per sample.
        max_length: truncation limit in tokens.
        spread_max_length: if true, every segment is tokenized on its own with a
            limit of ``max_length // 3`` and the id lists are concatenated;
            otherwise the joined text is tokenized once with ``max_length``.

    Returns:
        ``(input_ids, attention_mask)`` as lists with one entry per sample.
    """
    prompt_texts = ["User prompt: " + p for p in prompt]
    a_texts = ["\n\nModel A :\n" + r_a for r_a in response_a]
    b_texts = ["\n\n--------\n\nModel B:\n" + r_b for r_b in response_b]

    if not spread_max_length:
        # Single pass over the concatenated text.
        joined = [p + r_a + r_b for p, r_a, r_b in zip(prompt_texts, a_texts, b_texts)]
        encoded = tokenizer(joined, max_length=max_length, truncation=True, padding=False)
        return encoded.input_ids, encoded.attention_mask

    # Per-segment pass: each part gets a third of the budget.
    segment_limit = max_length // 3

    def _encode(texts):
        return tokenizer(texts, max_length=segment_limit, truncation=True, padding=False).input_ids

    prompt_ids = _encode(prompt_texts)
    a_ids = _encode(a_texts)
    b_ids = _encode(b_texts)
    input_ids = [p + r_a + r_b for p, r_a, r_b in zip(prompt_ids, a_ids, b_ids)]
    attention_mask = [[1] * len(ids) for ids in input_ids]
    return input_ids, attention_mask

In [11]:
%%time

tokenizer = GemmaTokenizerFast.from_pretrained('/kaggle/input/gemma2-finetuned-0716/gemma-tokenizer')
tokenizer.add_eos_token = True
tokenizer.padding_side = "right"

# Original order: <prompt>-<response_a>-<response_b>
data = pd.DataFrame({"id": test["id"]})
data["input_ids"], data["attention_mask"] = tokenize(
    tokenizer, test["prompt"], test["response_a"], test["response_b"]
)
data["length"] = data["input_ids"].map(len)

# Augmented order: the two responses are swapped
aug_data = pd.DataFrame({"id": test["id"]})
aug_data['input_ids'], aug_data['attention_mask'] = tokenize(
    tokenizer, test["prompt"], test["response_b"], test["response_a"]
)
aug_data["length"] = aug_data["input_ids"].map(len)

CPU times: user 715 ms, sys: 75.8 ms, total: 791 ms
Wall time: 790 ms


In [None]:
# Inspect the text before and after augmentation (swapping the two responses)

# print(tokenizer.decode(data["input_ids"][1],skip_special_tokens=True))
# print(tokenizer.decode(aug_data["input_ids"][0]))

# Load model

In [None]:
# BitsAndBytes configuration: load the base weights quantized to 8 bit
bnb_config =  BitsAndBytesConfig(
    load_in_8bit=True,
#     bnb_8bit_compute_dtype=torch.float16,
#     bnb_8bit_use_double_quant=False
)

# Load base model on GPU 0
# (3 labels: the inference cell maps logits 0/1/2 to winner_model_a / winner_model_b / winner_tie)
device_0 = torch.device('cuda:0')
model_0 = Gemma2ForSequenceClassification.from_pretrained(
    cfg.gemma_dir,
    device_map=device_0,
    num_labels=3,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    use_cache=False,
)

model_0.config.pad_token_id = tokenizer.pad_token_id    # align the model's pad token id with the tokenizer's

# Load base model on GPU 1 (second, identically configured copy so both GPUs can run in parallel)
device_1 = torch.device('cuda:1')
model_1 = Gemma2ForSequenceClassification.from_pretrained(
    cfg.gemma_dir,
    device_map=device_1,
    num_labels=3,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    use_cache=False,
)

model_1.config.pad_token_id = tokenizer.pad_token_id    # align the model's pad token id with the tokenizer's


#### Load LoRA adapter

In [15]:
# Wrap both base models with the fine-tuned LoRA adapter stored in cfg.lora_dir.
model_0 = PeftModel.from_pretrained(model_0, cfg.lora_dir)
model_1 = PeftModel.from_pretrained(model_1, cfg.lora_dir)

# Inference


In [16]:
@torch.no_grad()
@torch.autocast(device_type="cuda")  # replaces the deprecated torch.cuda.amp.autocast()
def inference(df, model, device, batch_size=cfg.batch_size, max_length=cfg.max_length):
    """Run the classifier over ``df`` in batches and attach class probabilities.

    Args:
        df: DataFrame with ``input_ids`` and ``attention_mask`` columns holding
            one list of ints per row. It is modified in place.
        model: sequence-classification model returning ``.logits`` with three
            classes.
        device: device the padded batch is moved to before the forward pass.
        batch_size: number of rows per forward pass.
        max_length: unused; kept so existing callers keep working.

    Returns:
        ``df`` itself, with ``winner_model_a``, ``winner_model_b`` and
        ``winner_tie`` columns set to the softmax of logits 0, 1 and 2.

    Note:
        Padding uses the module-level ``tokenizer``.
    """
    a_win, b_win, tie = [], [], []

    for start_idx in range(0, len(df), batch_size):
        # iloc slicing clamps at the end of the frame, so no explicit min() is needed.
        batch = df.iloc[start_idx:start_idx + batch_size]
        inputs = pad_without_fast_tokenizer_warning(
            tokenizer,
            {
                "input_ids": batch["input_ids"].to_list(),
                "attention_mask": batch["attention_mask"].to_list(),
            },
            padding="longest",  # pad only up to the longest row of this batch
            pad_to_multiple_of=None,
            return_tensors="pt",
        )
        outputs = model(**inputs.to(device))
        proba = outputs.logits.softmax(-1).cpu()

        a_win.extend(proba[:, 0].tolist())
        b_win.extend(proba[:, 1].tolist())
        tie.extend(proba[:, 2].tolist())

    df["winner_model_a"] = a_win
    df["winner_model_b"] = b_win
    df["winner_tie"] = tie

    return df

In [17]:
st = time.time()

# sort by input length to fully leverage dynamic padding
data = data.sort_values("length", ascending=False)
# the total #tokens in sub_1 and sub_2 should be more or less the same
# (alternating rows of the length-sorted frame go to each GPU)
sub_1 = data.iloc[0::2].copy()
sub_2 = data.iloc[1::2].copy()

# One worker thread per GPU: sub_1 runs on model_0/cuda:0, sub_2 on model_1/cuda:1.
with ThreadPoolExecutor(max_workers=2) as executor:
    results = executor.map(inference, (sub_1, sub_2), (model_0, model_1), (device_0, device_1))

# Rows end up ordered sub_1 then sub_2; each row still carries its id and index label.
result_df = pd.concat(list(results), axis=0)
proba = result_df[["winner_model_a", "winner_model_b", "winner_tie"]].values

print(f"elapsed time: {time.time() - st}")

elapsed time: 4.9093921184539795


In [18]:
st = time.time()

if cfg.tta:
    data = aug_data.sort_values("length", ascending=False)  # sort by input length to boost speed
    sub_1 = data.iloc[0::2].copy()
    sub_2 = data.iloc[1::2].copy()

    with ThreadPoolExecutor(max_workers=2) as executor:
        results = executor.map(inference, (sub_1, sub_2), (model_0, model_1), (device_0, device_1))

    tta_result_df = pd.concat(list(results), axis=0)
    # aug_data was sorted by its *own* token lengths, and the swapped text is not
    # guaranteed to tokenize to the same length as the original. Its row order can
    # therefore differ from result_df, so realign on the shared index labels;
    # otherwise predictions of different samples would be averaged together.
    tta_result_df = tta_result_df.loc[result_df.index]
    # recall TTA's order is flipped
    tta_proba = tta_result_df[["winner_model_b", "winner_model_a", "winner_tie"]].values
    # average original result and TTA result.
    proba = (proba + tta_proba) / 2

print(f"elapsed time: {time.time() - st}")

elapsed time: 0.0001900196075439453


In [19]:
# Store the final (possibly TTA-averaged) probabilities and write the submission file.
result_df.loc[:, ["winner_model_a", "winner_model_b", "winner_tie"]] = proba
submission_df = result_df.loc[:, ["id", 'winner_model_a', 'winner_model_b', 'winner_tie']]
submission_df.to_csv('submission.csv', index=False)
display(submission_df)

Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
2,1233961,0.103607,0.692299,0.204094
0,136060,0.002074,0.968699,0.029227
1,211333,0.239302,0.410234,0.350464
