In [None]:
import os
import json
import torch
import pandas as pd
import numpy as np
import torch.nn.functional as F
from tqdm import tqdm

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from peft import PeftConfig, PeftModelForSequenceClassification
from transformers.modeling_outputs import SequenceClassifierOutput

# ==========================================
# CONFIGURATION
# ==========================================

# Model artefacts: the pretrained base checkpoint and the adapter checkpoint
# that is loaded on top of it.
BASE_MODEL_PATH = "/kaggle/input/gemma-3/transformers/gemma-3-1b-it/1"
ADAPTER_PATH = "/kaggle/input/gemma-3-preference-adapter/transformers/v1/1"

# Data: test set to score and the CSV file the predictions are written to.
TEST_CSV_PATH = "/kaggle/input/lmsys-chatbot-arena/test.csv"
SUBMISSION_PATH = "submission.csv"

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

# ==========================================
# CUSTOM CLASSES & FUNCTIONS
# ==========================================


class Gemma3ForSequenceClassification(PeftModelForSequenceClassification):
    """PEFT wrapper that uses a causal LM's output head as a sequence classifier.

    ``forward`` runs the wrapped model, takes the head output at the last
    attended token of every sequence and returns it as the classification
    logits (optionally with a loss when ``labels`` are supplied).

    Note that the constructor takes ``(peft_config, model)`` whereas the PEFT
    base class takes ``(model, peft_config)``.
    """

    def __init__(self, peft_config: PeftConfig, model: AutoModelForCausalLM, adapter_name="default"):
        super().__init__(model, peft_config, adapter_name)
        # Read from the wrapped model's config; the caller is responsible for
        # keeping it in sync with the width of the classification head.
        self.num_labels = model.config.num_labels
        self.problem_type = "single_label_classification"

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs):
        """Classify each sequence from the head output at its last real token.

        Args:
            input_ids / inputs_embeds: model inputs, forwarded unchanged.
            attention_mask: 1 for real tokens, 0 for padding. When omitted,
                the final position of every sequence is used.
            labels: optional targets; when given, a loss matching
                ``self.problem_type`` is computed.
            output_attentions / output_hidden_states / kwargs: forwarded to
                the wrapped model.
            return_dict: return a ``SequenceClassifierOutput`` when true,
                otherwise a plain tuple. Defaults to the model config.

        Returns:
            ``SequenceClassifierOutput`` or a tuple ``(loss?, logits, ...)``.
        """
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            **kwargs)

        # With return_dict=False the wrapped model hands back a plain tuple,
        # which has no `.logits` attribute; the logits are its first element
        # because no labels are forwarded to it.
        token_logits = outputs[0] if isinstance(outputs, tuple) else outputs.logits
        batch_size, seq_len = token_logits.shape[0], token_logits.shape[1]

        if attention_mask is None:
            # No mask: every position is real, so the last one is the target.
            last_token_indices = torch.full(
                (batch_size,), seq_len - 1, dtype=torch.long, device=token_logits.device)
        else:
            # Index of the right-most attended position. Equals `sum(mask) - 1`
            # for right-padded batches and stays correct for left padding.
            # argmax returns the first maximum, i.e. the first 1 in the
            # reversed mask.
            mask = attention_mask.to(device=token_logits.device, dtype=torch.long)
            last_token_indices = (seq_len - 1) - mask.flip(dims=[1]).argmax(dim=1)

        # Keep only the head output at the selected token of each sequence.
        batch_indices = torch.arange(batch_size, device=token_logits.device)
        logits = token_logits[batch_indices, last_token_indices, :]

        loss = None
        if labels is not None:
            if self.problem_type == "regression":
                loss_fct = torch.nn.MSELoss()
                loss = loss_fct(logits.squeeze(), labels.squeeze())
            elif self.problem_type == "single_label_classification":
                loss_fct = torch.nn.CrossEntropyLoss()
                # Use the actual head width so the reshape cannot disagree
                # with a stale `num_labels` taken from the config.
                loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
            elif self.problem_type == "multi_label_classification":
                loss_fct = torch.nn.BCEWithLogitsLoss()
                loss = loss_fct(logits, labels.float())

        if not return_dict:
            output = (logits,) + tuple(outputs[1:])
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions)

# Helper to Format Input (Sandwich Strategy)
def prepare_inference_input(row, tokenizer, max_length=8192):
    """Build the classification prompt for one test row ("sandwich" truncation).

    The prompt keeps its first tokens and each response keeps its last
    tokens, so that the combined text fits a token budget derived from
    ``max_length``.

    Args:
        row: mapping with ``'prompt'``, ``'response_a'`` and ``'response_b'``
            entries, each normally a JSON-encoded list of strings.
        tokenizer: callable returning ``{'input_ids': [...]}`` for a text and
            offering ``decode(ids, skip_special_tokens=True)``.
        max_length: total token budget; 200 tokens are reserved for the
            template, 20% of the rest goes to the prompt and 40% to each
            response.

    Returns:
        The formatted text handed to the model.
    """
    # Configuration from training
    TEMPLATE_BUFFER = 200
    PROMPT_RATIO = 0.2
    RESP_RATIO = 0.4

    # Clamp at zero: a negative budget would turn the slices below into
    # "drop N tokens from the wrong end" instead of "keep nothing".
    available_tokens = max(max_length - TEMPLATE_BUFFER, 0)
    max_prompt_len = int(available_tokens * PROMPT_RATIO)
    max_resp_len = int(available_tokens * RESP_RATIO)

    # Parse the JSON-encoded turn lists into plain text. If any field fails to
    # parse or join, all three fall back to their raw string form.
    # NOTE(review): a JSON `null` turn makes the join raise TypeError and so
    # triggers this whole-row fallback — confirm this matches training.
    try:
        prompt_text, resp_a_text, resp_b_text = (
            "\n".join(json.loads(row[field]))
            for field in ('prompt', 'response_a', 'response_b')
        )
    except (json.JSONDecodeError, TypeError):
        prompt_text = str(row['prompt'])
        resp_a_text = str(row['response_a'])
        resp_b_text = str(row['response_b'])

    def _token_ids(text):
        # Special tokens are left out here; they are added when the final
        # formatted string is tokenized for the model.
        return tokenizer(text, add_special_tokens=False)['input_ids']

    def _keep_tail(token_ids, budget):
        # `token_ids[-0:]` is the *whole* sequence, so a zero budget needs
        # explicit handling to really keep nothing.
        return token_ids[-budget:] if budget > 0 else token_ids[:0]

    # Prompt: keep the start. Responses: keep the end.
    prompt_ids = _token_ids(prompt_text)[:max_prompt_len]
    resp_a_ids = _keep_tail(_token_ids(resp_a_text), max_resp_len)
    resp_b_ids = _keep_tail(_token_ids(resp_b_text), max_resp_len)

    # Decode back to text
    final_prompt = tokenizer.decode(prompt_ids, skip_special_tokens=True)
    final_resp_a = tokenizer.decode(resp_a_ids, skip_special_tokens=True)
    final_resp_b = tokenizer.decode(resp_b_ids, skip_special_tokens=True)

    # Construct the final formatted string. The literal backslash-n sequences,
    # the trailing space on the first line and the four-space indents are part
    # of the template and are spelled out explicitly so that editors cannot
    # strip them.
    return (
        "# **Based on the following prompt choose which of the two responses "
        "you think humans would prefer the most:** \\n \n"
        "    ## **Prompt:**\n"
        f"    `{final_prompt}`\\n\n"
        "    ## **Response A:**\n"
        f"    `{final_resp_a}`\\n\n"
        "    ## **Response B:**\n"
        f"    `{final_resp_b}`"
    )


2025-11-30 15:47:53.426492: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764517673.812730      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764517673.920564      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Using device: cuda


In [5]:

# ==========================================
# LOAD MODEL
# ==========================================

print("Loading Tokenizer...")
# Prefer the tokenizer stored next to the adapter; fall back to the base
# model's tokenizer when the adapter directory does not provide one.
# NOTE(review): `add_bos=True` is passed through as an extra keyword — confirm
# the tokenizer actually honours it (the usual option name is `add_bos_token`).
try:
    tokenizer = AutoTokenizer.from_pretrained(ADAPTER_PATH, padding_side='right', add_bos=True, trust_remote_code=True)
except Exception as exc:
    # `Exception` rather than a bare `except` so Ctrl-C / SystemExit still
    # propagate, and the reason for the fallback is visible in the log.
    print(f"Could not load tokenizer from adapter path ({exc!r}); using base model tokenizer.")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_PATH, padding_side='right', add_bos=True, trust_remote_code=True)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# The base model is loaded in plain bfloat16. A 4-bit NF4 BitsAndBytesConfig
# (double quantization, bfloat16 compute) was previously wired in here as
# "matching training config" and is currently disabled.
print("Loading Base Model (bf16)...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_PATH,
    torch_dtype=torch.bfloat16,
    device_map=DEVICE,
    trust_remote_code=True
)

# Initialize Classification Head (3 classes)
num_labels = 3
# Keep the config in sync with the new head: the wrapper class reads
# `model.config.num_labels` in its constructor.
base_model.config.num_labels = num_labels
print(f"Resizing lm_head to {num_labels}...")
base_model.lm_head = torch.nn.Linear(
    base_model.config.hidden_size,
    num_labels,
    bias=False,
    device=DEVICE
).to(torch.bfloat16)

print("Loading Adapter...")
peft_config = PeftConfig.from_pretrained(ADAPTER_PATH)

# Wrap base model in custom class
model = Gemma3ForSequenceClassification(peft_config, base_model)

# Load adapter weights
# This is expected to also restore the trained lm_head weights, provided the
# adapter was saved with lm_head listed in `modules_to_save`.
model.load_adapter(ADAPTER_PATH, adapter_name="default")
model.eval()

print("Model ready!")



Loading Tokenizer...
Loading Base Model (4-bit)...




Resizing lm_head to 3...
Loading Adapter...
Model ready!


In [7]:
# ==========================================
# INFERENCE
# ==========================================

# Single source of truth for the token budget: used both when building the
# truncated prompt text and when tokenizing it for the model.
MAX_LENGTH = 8192
BATCH_SIZE = 4 # Adjust based on GPU memory

# Load Data
test_df = pd.read_csv(TEST_CSV_PATH)
ids = test_df['id'].tolist()
print(f"Total samples: {len(test_df)}")

# Pre-format inputs
print("Formatting inputs (Sandwich Strategy)...")
formatted_texts = [
    prepare_inference_input(row, tokenizer, max_length=MAX_LENGTH)
    for _, row in tqdm(test_df.iterrows(), total=len(test_df))
]

# Run Inference
all_probs = []

print("Running inference...")
with torch.no_grad():
    for start in tqdm(range(0, len(formatted_texts), BATCH_SIZE)):
        batch_texts = formatted_texts[start : start + BATCH_SIZE]

        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_LENGTH
        ).to(DEVICE)

        outputs = model(**inputs)

        # The model returns a tuple when return_dict is disabled.
        logits = outputs[0] if isinstance(outputs, tuple) else outputs.logits

        # Softmax in float32: the model runs in bfloat16 and numpy has no
        # bfloat16 dtype.
        probs = F.softmax(logits.float(), dim=-1).cpu().numpy()
        all_probs.extend(probs)

# Every test row must have exactly one prediction, in the same order as `ids`.
assert len(all_probs) == len(ids), (
    f"Got {len(all_probs)} predictions for {len(ids)} test rows"
)



Total samples: 3
Formatting inputs (Sandwich Strategy)...


100%|██████████| 3/3 [00:00<00:00, 314.62it/s]


Running inference...


100%|██████████| 1/1 [00:03<00:00,  3.78s/it]


In [8]:
# ==========================================
# SUBMISSION
# ==========================================

print("Creating submission file...")

# Output columns in class-index order, following the training class2id:
# 0 -> winner_model_a
# 1 -> winner_model_b
# 2 -> winner_tie
LABEL_COLUMNS = ['winner_model_a', 'winner_model_b', 'winner_tie']

probs_array = np.asarray(all_probs)
if probs_array.size == 0:
    # An empty prediction list becomes a 1-D array; give it the expected
    # 2-D shape so an empty test set still yields a valid, header-only file.
    probs_array = probs_array.reshape(0, len(LABEL_COLUMNS))

# Fail early with a clear message rather than an IndexError or a pandas
# length-mismatch error further down.
expected_shape = (len(ids), len(LABEL_COLUMNS))
if probs_array.shape != expected_shape:
    raise ValueError(
        f"Prediction array has shape {probs_array.shape}, expected {expected_shape}"
    )

submission = pd.DataFrame({
    'id': ids,
    **{column: probs_array[:, index] for index, column in enumerate(LABEL_COLUMNS)}
})

print(submission.head())
submission.to_csv(SUBMISSION_PATH, index=False)
print(f"Saved to {SUBMISSION_PATH}")

Creating submission file...
        id  winner_model_a  winner_model_b  winner_tie
0   136060        0.321311        0.609741    0.068948
1   211333        0.341412        0.637842    0.020745
2  1233961        0.982696        0.008289    0.009015
Saved to submission.csv
