In [1]:
!pip install bitsandbytes --no-index --find-links=/kaggle/input/my-inference-packages-data/my_offline_packages

Looking in links: /kaggle/input/my-inference-packages-data/my_offline_packages
Processing /kaggle/input/my-inference-packages-data/my_offline_packages/bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl
INFO: pip is looking at multiple versions of torch to determine which version is compatible with other requirements. This could take a while.
Processing /kaggle/input/my-inference-packages-data/my_offline_packages/torch-2.9.1-cp311-cp311-manylinux_2_28_x86_64.whl (from bitsandbytes)
Processing /kaggle/input/my-inference-packages-data/my_offline_packages/sympy-1.14.0-py3-none-any.whl (from torch<3,>=2.3->bitsandbytes)
Processing /kaggle/input/my-inference-packages-data/my_offline_packages/nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl (from torch<3,>=2.3->bitsandbytes)
Processing /kaggle/input/my-inference-packages-data/my_offline_packages/nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (from torch<

In [2]:
!pip install torchvision==0.24.1 --no-index --find-links=/kaggle/input/torchvision0-24-1

Looking in links: /kaggle/input/torchvision0-24-1
Processing /kaggle/input/torchvision0-24-1/torchvision-0.24.1-cp311-cp311-manylinux_2_28_x86_64.whl
Installing collected packages: torchvision
  Attempting uninstall: torchvision
    Found existing installation: torchvision 0.21.0+cu124
    Uninstalling torchvision-0.21.0+cu124:
      Successfully uninstalled torchvision-0.21.0+cu124
Successfully installed torchvision-0.24.1


In [3]:
# ==========================================
# Protobuf Patch
# ==========================================
# If the installed protobuf's `MessageFactory` has no `GetPrototype` attribute,
# install a shim that delegates to `GetMessageClass`. The patch is best-effort:
# any ordinary failure (protobuf missing, unexpected API) is ignored so the rest
# of the notebook still runs. Only `Exception` is caught, so Ctrl-C / SystemExit
# are never swallowed.
try:
    from google.protobuf.message_factory import MessageFactory

    if not hasattr(MessageFactory, 'GetPrototype'):
        def get_prototype_replacement(self, descriptor):
            """Stand-in for `GetPrototype`: return the message class for `descriptor`.

            Falls back to a do-nothing `Message` subclass when `GetMessageClass`
            fails, so callers always receive a class object.
            """
            try:
                return self.GetMessageClass(descriptor)
            except Exception:
                from google.protobuf.message import Message

                class DefaultMessage(Message):
                    # Inert placeholder: parsing is a no-op, serialising yields nothing.
                    def ParseFromString(self, s): pass
                    def SerializeToString(self): return b""

                return DefaultMessage

        MessageFactory.GetPrototype = get_prototype_replacement
except Exception:
    # Deliberately ignored: the shim is optional.
    pass

# ==========================================
# Imports
# ==========================================
import os
import json
import warnings
warnings.filterwarnings("ignore")
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from concurrent.futures import ThreadPoolExecutor
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers.data.data_collator import pad_without_fast_tokenizer_warning
from peft import PeftModel

# ==========================================
# Config
# ==========================================
# CSV loaded into the `test` frame that is scored below.
TEST_CSV = '/kaggle/input/lmsys-chatbot-arena/test.csv'
# Directory passed to AutoTokenizer / AutoModelForCausalLM.from_pretrained.
BASE_MODEL_PATH = "/kaggle/input/gemma29b-base/transformers/default/1/gemma_base_model"
# Directory passed to PeftModel.from_pretrained on top of the base model.
ADAPTER_PATH = "/kaggle/input/gemma29b-lora/transformers/default/1/gemma_lora_final"
# `max_length` handed to the tokenizer (with truncation enabled) in run_inference.
MAX_LEN = 1024
# Number of rows per forward pass in inference().
BATCH_SIZE = 4
# Test-time augmentation: when True, a second pass with responses A/B swapped is
# run and averaged with the first pass.
TTA = True  # Test Time Augmentation

# ==========================================
# Helpers
# ==========================================
def process_text(text):
    """Return the first element of a JSON-encoded list as a plain string.

    The value is converted with ``str`` and parsed as JSON. When it decodes to a
    non-empty list, the first element is returned as text; in every other case
    (not JSON, not a list, empty list) the stringified input is returned as is.

    The previous implementation rewrote every ``null`` to ``'null'`` *before*
    parsing. That produces invalid JSON (single quotes), so any list containing
    a JSON null fell through to the raw-string fallback, and the word "null"
    inside ordinary text was altered. JSON is now parsed untouched and a null
    first element is mapped to the string ``"null"`` after decoding.

    NOTE(review): if the adapter was trained with the old preprocessing, keep
    the training-side helper in sync with this one.

    Args:
        text: Any value; typically a JSON string such as ``'["turn 1", "turn 2"]'``.

    Returns:
        str: Always a string, so callers can slice it safely.
    """
    raw = str(text)
    try:
        parsed = json.loads(raw)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError: the input is simply not JSON.
        return raw

    if isinstance(parsed, list) and len(parsed) > 0:
        first = parsed[0]
        # A JSON null decodes to None; keep the textual form the old code aimed for.
        return "null" if first is None else str(first)
    return raw

def format_prompt(row, swap=False):
    """Build the chat-style prompt asking which of two responses is better.

    Args:
        row: Record exposing 'prompt', 'response_a' and 'response_b' entries.
        swap: When True, the two responses trade places in the prompt
            (used for the A/B-swapped test-time-augmentation pass).

    Returns:
        str: The user turn followed by the opening of the model turn.
    """
    # Character-level caps: 512 for the prompt, 1024 for each response.
    p = process_text(row['prompt'])[:512]
    first = process_text(row['response_a'])[:1024]
    second = process_text(row['response_b'])[:1024]

    # Decide which response is shown under the "Response A" heading.
    a, b = (second, first) if swap else (first, second)

    return f"<start_of_turn>user\nWhich model's answer is better? Directly answer with 'A', 'B', or 'tie'.\n\n### Prompt\n{p}\n\n### Response A\n{a}\n\n### Response B\n{b}<end_of_turn>\n<start_of_turn>model\n"

def get_probs(logits, tokenizer):
    """Turn next-token logits into normalised [A, B, tie] probabilities per row.

    For each row the ten most probable tokens are decoded; each decoded token is
    lower-cased and stripped of surrounding whitespace and dots. Probability
    mass of tokens reading 'a', 'b', or 'tie'/'draw' is accumulated into the
    matching outcome and the three sums are rescaled to add up to one.

    Args:
        logits: 2-D tensor indexed as (row, vocabulary entry).
        tokenizer: Object whose ``decode([token_id])`` returns a string.

    Returns:
        list: One ``[p_a, p_b, p_tie]`` list per row. Rows whose matched mass is
        below 1e-6 get the fixed fallback ``[0.33, 0.33, 0.34]``.
    """
    # Decoded label -> position in the [A, B, tie] accumulator.
    slot_of = {'a': 0, 'b': 1, 'tie': 2, 'draw': 2}

    top_probs, top_ids = torch.topk(torch.softmax(logits, dim=-1), 10)

    results = []
    for row in range(logits.shape[0]):
        mass = [0.0, 0.0, 0.0]
        row_probs = top_probs[row].cpu().numpy()
        row_ids = top_ids[row].cpu().numpy()
        for p, idx in zip(row_probs, row_ids):
            label = tokenizer.decode([idx]).lower().strip().strip('.')
            slot = slot_of.get(label)
            if slot is not None:
                mass[slot] += p

        pa, pb, pt = mass
        total = pa + pb + pt
        if total < 1e-6:
            # None of the top tokens was a recognised answer: near-uniform guess.
            results.append([0.33, 0.33, 0.34])
        else:
            results.append([pa/total, pb/total, pt/total])
    return results

@torch.no_grad()
# `torch.cuda.amp.autocast()` is deprecated; `torch.autocast` with an explicit
# device type and float16 dtype is the equivalent, supported spelling.
@torch.autocast(device_type="cuda", dtype=torch.float16)
def inference(df, model, tokenizer, device):
    """Score every row of `df` with `model` and attach the outcome probabilities.

    Rows are processed in slices of BATCH_SIZE. Each slice is padded to its
    longest sequence, moved to `device`, and run through the model; the logits
    at the last position are converted to [A, B, tie] probabilities by
    `get_probs`. Reading the last position relies on the tokenizer padding on
    the left, which the notebook configures before calling this function.

    Args:
        df: DataFrame with "input_ids" and "attention_mask" columns holding
            per-row token lists.
        model: Causal LM returning an object with a `.logits` attribute.
        tokenizer: Tokenizer used for padding and for decoding candidate tokens.
        device: Device the padded batch is moved to (should match the model's).

    Returns:
        A copy of `df` with "winner_model_a", "winner_model_b" and "winner_tie"
        columns added, in the same row order as the input.
    """
    a_win, b_win, tie = [], [], []

    for start in range(0, len(df), BATCH_SIZE):
        batch = df.iloc[start:start + BATCH_SIZE]
        inputs = pad_without_fast_tokenizer_warning(
            tokenizer,
            {"input_ids": batch["input_ids"].tolist(),
             "attention_mask": batch["attention_mask"].tolist()},
            padding="longest",
            return_tensors="pt",
        ).to(device)

        # Only the distribution over the next token after the prompt is needed.
        logits = model(**inputs).logits[:, -1, :]
        for p_a, p_b, p_tie in get_probs(logits, tokenizer):
            a_win.append(p_a)
            b_win.append(p_b)
            tie.append(p_tie)

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()
    df["winner_model_a"] = a_win
    df["winner_model_b"] = b_win
    df["winner_tie"] = tie
    return df

def run_inference(test, tokenizer, model_0, model_1, swap=False):
    """Tokenize all rows, score them on two GPUs in parallel, and merge the results.

    Args:
        test: DataFrame with an "id" column plus the fields read by format_prompt.
        tokenizer: Tokenizer shared by both models.
        model_0: Model whose batches are sent to cuda:0.
        model_1: Model whose batches are sent to cuda:1.
        swap: Forwarded to format_prompt; True swaps responses A and B.

    Returns:
        DataFrame sorted by "id" (fresh 0..n-1 index) carrying the tokenized
        inputs and the three probability columns added by inference().
    """
    def build_prompt(row):
        return format_prompt(row, swap=swap)

    # NOTE(review): truncation presumably trims from the right (the tokenizer's
    # truncation side is not set in this notebook), so an over-long prompt would
    # lose its trailing model-turn marker - confirm this matches training.
    prompts = test.apply(build_prompt, axis=1).tolist()
    tok = tokenizer(prompts, max_length=MAX_LEN, truncation=True, padding=False)

    data = pd.DataFrame({
        "id": test["id"],
        "input_ids": tok.input_ids,
        "attention_mask": tok.attention_mask,
        "length": list(map(len, tok.input_ids)),
    })

    # Longest first, so each batch groups sequences of similar length.
    data = data.sort_values("length", ascending=False)

    # Alternate rows between the two workers: even positions -> GPU 0, odd -> GPU 1.
    jobs = [
        (data.iloc[0::2].copy(), model_0, tokenizer, torch.device("cuda:0")),
        (data.iloc[1::2].copy(), model_1, tokenizer, torch.device("cuda:1")),
    ]

    with ThreadPoolExecutor(max_workers=2) as ex:
        results = list(ex.map(lambda job: inference(*job), jobs))

    return pd.concat(results).sort_values("id").reset_index(drop=True)

# ==========================================
# Main
# ==========================================
# Load data
test = pd.read_csv(TEST_CSV)

# Tokenizer: pad on the left so the last position of every row is a real token.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH)
tokenizer.padding_side = "left"
if tokenizer.pad_token is None:
    # No dedicated pad token: reuse the end-of-sequence token.
    tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization, computing in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)


def _load_adapted_model(gpu_index):
    """Load the quantized base model onto GPU `gpu_index`, attach the adapter, set eval mode."""
    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_PATH, quantization_config=bnb_config,
        device_map={"": gpu_index}, torch_dtype=torch.float16, use_cache=False
    )
    adapted = PeftModel.from_pretrained(base, ADAPTER_PATH)
    adapted.eval()
    return adapted


# One full copy of the model per GPU so both halves of the data run in parallel.
model_0 = _load_adapted_model(0)
model_1 = _load_adapted_model(1)

# ==========================================
# Original inference
# ==========================================
result_orig = run_inference(test, tokenizer, model_0, model_1, swap=False)

if not TTA:
    result = result_orig[["id", "winner_model_a", "winner_model_b", "winner_tie"]]
else:
    # ==========================================
    # TTA: Swap A/B and run again
    # ==========================================
    result_swap = run_inference(test, tokenizer, model_0, model_1, swap=True)

    # The swapped pass has to be mapped back to the original labelling:
    #   its "A wins" column is really the original "B wins",
    #   its "B wins" column is really the original "A wins",
    #   and the tie column is unchanged.
    swapped_source = {
        "winner_model_a": "winner_model_b",
        "winner_model_b": "winner_model_a",
        "winner_tie": "winner_tie",
    }

    # Average the two passes, column by column.
    averaged = {
        column: (result_orig[column].values + result_swap[source].values) / 2
        for column, source in swapped_source.items()
    }

    # Normalize to sum = 1
    total = averaged["winner_model_a"] + averaged["winner_model_b"] + averaged["winner_tie"]

    result = pd.DataFrame({
        "id": result_orig["id"],
        "winner_model_a": averaged["winner_model_a"] / total,
        "winner_model_b": averaged["winner_model_b"] / total,
        "winner_tie": averaged["winner_tie"] / total,
    })

# Save
result.to_csv("submission.csv", index=False)

2025-11-26 17:08:34.198303: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764176914.417894      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764176914.480922      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
The following generation flags are not valid and may be ignored: ['cache_implementation']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['cache_implementation']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['cache_implementation']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['cache_implementation']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]