In [None]:
import os
import json
import torch
import pandas as pd
import numpy as np
import torch.nn.functional as F
from tqdm import tqdm
import concurrent.futures
import gc

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from peft import PeftConfig, PeftModelForSequenceClassification
from transformers.modeling_outputs import SequenceClassifierOutput

# ==========================================
# CONFIGURATION
# ==========================================

BASE_MODEL_PATH = "/kaggle/input/gemma-3/transformers/gemma-3-1b-it/1"
ADAPTER_PATH = "/kaggle/input/gemma-3-preference-adapter/transformers/v1/1"
TEST_CSV_PATH = "/kaggle/input/lmsys-chatbot-arena/test.csv"
SUBMISSION_PATH = "submission.csv"

# ==========================================
# CUSTOM CLASSES & FUNCTIONS
# ==========================================

class Gemma3ForSequenceClassification(PeftModelForSequenceClassification):
    """PEFT wrapper that reads classification logits off a causal LM.

    The wrapped model is run as usual; its per-token ``logits`` are then
    reduced to one vector per sequence by picking the position of the last
    token that the attention mask marks as real. Those pooled logits are what
    the returned ``SequenceClassifierOutput`` carries.

    Note the constructor takes ``(peft_config, model)`` while the parent class
    takes ``(model, peft_config)``; the arguments are swapped in ``__init__``.
    """

    def __init__(self, peft_config: PeftConfig, model: AutoModelForCausalLM, adapter_name="default"):
        super().__init__(model, peft_config, adapter_name)
        # Kept for callers that inspect it; the loss below does not rely on it
        # because the config default may not match the width of the head.
        self.num_labels = model.config.num_labels
        self.problem_type = "single_label_classification"

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs):
        """Run the wrapped model and pool its logits at the last real token.

        Args:
            input_ids, attention_mask, inputs_embeds, output_attentions,
            output_hidden_states, **kwargs: forwarded to ``self.base_model``.
            attention_mask: 1 for real tokens, 0 for padding. When ``None``,
                the final position of every sequence is used.
            labels: optional targets; when given, a loss is computed according
                to ``self.problem_type``.
            return_dict: when falsy (after falling back to
                ``self.config.return_dict``), a tuple is returned instead of a
                ``SequenceClassifierOutput``.

        Returns:
            ``SequenceClassifierOutput`` (or the equivalent tuple) whose
            ``logits`` have shape ``(batch, num_classes)``.
        """
        return_dict = return_dict if return_dict is not None else self.config.return_dict

        # Always ask the wrapped model for a ModelOutput: the attribute access
        # below would raise on a plain tuple. The caller's preference is
        # honoured when building the return value.
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            **kwargs)

        token_logits = outputs.logits
        batch_size, seq_len = token_logits.shape[0], token_logits.shape[1]
        device = token_logits.device

        if attention_mask is None:
            # No mask means no padding information: use the final position.
            last_token_indices = torch.full(
                (batch_size,), seq_len - 1, dtype=torch.long, device=device)
        else:
            # Index of the right-most attended position. For right-padded
            # batches this equals sum(mask) - 1; unlike that formula it also
            # stays correct when the padding is on the left.
            mask = attention_mask.to(device=device, dtype=torch.long)
            positions = torch.arange(seq_len, device=device)
            last_token_indices = (positions * mask).argmax(dim=-1)

        # One logit vector per sequence.
        logits = token_logits[torch.arange(batch_size, device=device), last_token_indices, :]

        loss = None
        if labels is not None:
            if self.problem_type == "regression":
                loss_fct = torch.nn.MSELoss()
                loss = loss_fct(logits.squeeze(), labels.squeeze())
            elif self.problem_type == "single_label_classification":
                loss_fct = torch.nn.CrossEntropyLoss()
                # Reshape by the actual class dimension so a stale
                # ``num_labels`` cannot scramble the batch.
                loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
            elif self.problem_type == "multi_label_classification":
                loss_fct = torch.nn.BCEWithLogitsLoss()
                loss = loss_fct(logits, labels.float())

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions)

def prepare_inference_input(row, tokenizer, max_length=8192, template_buffer=200,
                            prompt_ratio=0.2, resp_ratio=0.4):
    """Build the comparison prompt for one test row, truncated to a token budget.

    Args:
        row: mapping (e.g. a pandas row) with 'prompt', 'response_a' and
            'response_b'. Each value is expected to be a JSON-encoded list of
            strings, joined with newlines. If any of the three fails to parse
            or join, all three are used as raw ``str(...)`` values instead.
        tokenizer: callable as ``tokenizer(text, add_special_tokens=False)``
            returning a mapping with 'input_ids', and providing
            ``decode(ids, skip_special_tokens=True)``.
        max_length: total token budget for the formatted text.
        template_buffer: tokens reserved for the fixed template wording.
        prompt_ratio: share of the remaining budget given to the prompt,
            which is truncated from the end (its beginning is kept).
        resp_ratio: share of the remaining budget given to each response,
            which is truncated from the start (its ending is kept).

    Returns:
        The formatted prompt string.
    """
    # Clamp so a small ``max_length`` yields empty fields, not negative slices.
    available_tokens = max(0, max_length - template_buffer)
    max_prompt_len = max(0, int(available_tokens * prompt_ratio))
    max_resp_len = max(0, int(available_tokens * resp_ratio))

    try:
        prompt_text = "\n".join(json.loads(row['prompt']))
        resp_a_text = "\n".join(json.loads(row['response_a']))
        resp_b_text = "\n".join(json.loads(row['response_b']))
    except (TypeError, ValueError):
        # Invalid JSON (JSONDecodeError is a ValueError), non-string input, or
        # non-string list items such as null. The fallback is deliberately
        # all-or-nothing, as before, so the text layout stays unchanged.
        prompt_text = str(row['prompt'])
        resp_a_text = str(row['response_a'])
        resp_b_text = str(row['response_b'])

    prompt_ids = tokenizer(prompt_text, add_special_tokens=False)['input_ids']
    resp_a_ids = tokenizer(resp_a_text, add_special_tokens=False)['input_ids']
    resp_b_ids = tokenizer(resp_b_text, add_special_tokens=False)['input_ids']

    # Prompt keeps its head. Responses keep their tail; the start index is
    # computed explicitly because ``ids[-0:]`` would keep everything when the
    # budget is zero.
    prompt_ids = prompt_ids[:max_prompt_len]
    resp_a_ids = resp_a_ids[max(0, len(resp_a_ids) - max_resp_len):]
    resp_b_ids = resp_b_ids[max(0, len(resp_b_ids) - max_resp_len):]

    final_prompt = tokenizer.decode(prompt_ids, skip_special_tokens=True)
    final_resp_a = tokenizer.decode(resp_a_ids, skip_special_tokens=True)
    final_resp_b = tokenizer.decode(resp_b_ids, skip_special_tokens=True)

    # The template (including its literal "\n" markers and indentation) is
    # reproduced exactly; do not reformat it.
    return f"""# **Based on the following prompt choose which of the two responses you think humans would prefer the most:** \\n 
    ## **Prompt:**
    `{final_prompt}`\\n
    ## **Response A:**
    `{final_resp_a}`\\n
    ## **Response B:**
    `{final_resp_b}`"""

2025-11-30 16:17:49.406683: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764519469.560060      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764519469.605985      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

In [4]:
# ==========================================
# PARALLEL SETUP
# ==========================================

import os
# Disable internal tokenizer parallelism to avoid deadlocks within the threads
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# Shared Tokenizer Logic: prefer the adapter directory as tokenizer source and
# fall back to the base model directory when a tokenizer cannot be loaded from
# it. The probe tokenizer is discarded; each worker loads its own copy later.
try:
    AutoTokenizer.from_pretrained(ADAPTER_PATH, trust_remote_code=True)
except Exception as exc:
    # Best-effort fallback, but say why, and let KeyboardInterrupt/SystemExit
    # propagate instead of being swallowed.
    print(
        f"Tokenizer not loadable from {ADAPTER_PATH} "
        f"({type(exc).__name__}: {exc}); using {BASE_MODEL_PATH} instead."
    )
    TOKENIZER_SOURCE = BASE_MODEL_PATH
else:
    TOKENIZER_SOURCE = ADAPTER_PATH

# Function to Load a Model Replica on a specific GPU
def load_model_on_device(device_id, num_labels=3):
    """Load one model replica (base checkpoint + adapter) onto ``cuda:<device_id>``.

    Args:
        device_id: CUDA device index the replica is placed on.
        num_labels: output width of the classification head that replaces
            ``lm_head``. Defaults to 3, the value the submission cell expects.

    Returns:
        A ``Gemma3ForSequenceClassification`` in eval mode.
    """
    device_name = f"cuda:{device_id}"
    print(f"Loading Model Replica on {device_name}...")

    # Load Base
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_PATH,
        torch_dtype=torch.bfloat16,
        device_map=device_name,
        trust_remote_code=True
    )

    # Keep the config consistent with the head built below; the wrapper copies
    # ``config.num_labels`` in its constructor.
    base_model.config.num_labels = num_labels

    # Init Head: swap the vocabulary projection for a small classifier.
    # It is freshly initialised here; the trained weights are presumably
    # restored by the adapter load below. TODO confirm the adapter stores lm_head.
    base_model.lm_head = torch.nn.Linear(
        base_model.config.hidden_size, num_labels, bias=False, device=device_name
    ).to(torch.bfloat16)

    # Load Adapter
    peft_config = PeftConfig.from_pretrained(ADAPTER_PATH)
    model = Gemma3ForSequenceClassification(peft_config, base_model)
    model.load_adapter(ADAPTER_PATH, adapter_name="default")
    model.eval()
    return model

# Worker Function for Threading
def inference_worker(model, subset_df, device_id, batch_size=4, max_length=8192):
    """Score one chunk of the test set on one GPU; intended to run in a thread.

    Args:
        model: replica already placed on ``cuda:<device_id>``.
        subset_df: rows to score; each row is passed to
            ``prepare_inference_input``.
        device_id: CUDA device index; also used as the progress-bar position.
        batch_size: number of formatted texts per forward pass.
        max_length: token budget used both when formatting each row and when
            tokenizing the batch, so the two limits cannot drift apart.

    Returns:
        A list with one probability vector (NumPy array) per input row, in the
        same order as ``subset_df``.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    device_name = f"cuda:{device_id}"

    # Load a fresh tokenizer for this specific thread.
    # ``add_bos_token`` is the keyword the Gemma tokenizer defines; the
    # previous ``add_bos`` spelling is not one of its parameters.
    # Right padding is what the model's last-token pooling assumes.
    local_tokenizer = AutoTokenizer.from_pretrained(
        TOKENIZER_SOURCE,
        padding_side='right',
        add_bos_token=True,
        trust_remote_code=True
    )
    if local_tokenizer.pad_token is None:
        local_tokenizer.pad_token = local_tokenizer.eos_token

    print(f"Worker {device_id}: Starting inference on {len(subset_df)} samples...")

    # Format inputs using the LOCAL tokenizer
    formatted_texts = [
        prepare_inference_input(row, local_tokenizer, max_length=max_length)
        for _, row in subset_df.iterrows()
    ]

    local_probs = []

    with torch.no_grad():
        batch_starts = range(0, len(formatted_texts), batch_size)
        for start in tqdm(batch_starts, desc=f"GPU {device_id}", position=device_id):
            batch_texts = formatted_texts[start : start + batch_size]

            # Tokenize using LOCAL tokenizer
            inputs = local_tokenizer(
                batch_texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length
            ).to(device_name)

            logits = model(**inputs).logits

            # Softmax in float32 rather than bfloat16, then move to host.
            probs = F.softmax(logits.float(), dim=-1).cpu().numpy()
            local_probs.extend(probs)

    return local_probs

In [5]:
# ==========================================
# EXECUTION
# ==========================================

# Check GPUs: one replica per GPU, using at most two devices.
if torch.cuda.device_count() < 2:
    print("⚠️ WARNING: Less than 2 GPUs found. Using Single GPU Mode on cuda:0")
    gpu_indices = [0]
else:
    print("✅ 2 GPUs found. Initializing dual-model setup.")
    gpu_indices = [0, 1]
models = [load_model_on_device(gpu_id) for gpu_id in gpu_indices]

# Load Data
test_df = pd.read_csv(TEST_CSV_PATH)
ids = test_df['id'].tolist()

# Split Data
# Split row positions rather than the frame itself: np.array_split on a
# DataFrame goes through the deprecated DataFrame.swapaxes and emits a
# FutureWarning. The chunk sizes and order are the same.
row_splits = np.array_split(np.arange(len(test_df)), len(models))
chunks = [test_df.iloc[rows] for rows in row_splits]
print(f"Data split into {len(chunks)} chunks.")

# Run Parallel Inference
results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=len(models)) as executor:
    futures = [
        executor.submit(inference_worker, model, chunk, gpu_id)
        for model, chunk, gpu_id in zip(models, chunks, gpu_indices)
    ]

    # Collect in submission order (not completion order) so that ``results``
    # stays aligned with ``ids``.
    for future in futures:
        results.extend(future.result())

✅ 2 GPUs found. Initializing dual-model setup.
Loading Model Replica on cuda:0...
Loading Model Replica on cuda:1...


  return bound(*args, **kwds)


Data split into 2 chunks.
Worker 1: Starting inference on 1 samples...



GPU 1:   0%|          | 0/1 [00:00<?, ?it/s][A

Worker 0: Starting inference on 2 samples...


GPU 0: 100%|██████████| 1/1 [00:00<00:00,  1.11it/s]

GPU 1: 100%|██████████| 1/1 [00:01<00:00,  1.42s/it][A


In [6]:
# ==========================================
# SUBMISSION
# ==========================================

print("Creating submission file...")
probs_array = np.asarray(results)
if probs_array.size == 0:
    # No predictions at all: keep a well-formed, zero-row table.
    probs_array = probs_array.reshape(0, 3)

# Fail early with a readable message instead of an IndexError from the column
# slicing below or a silently misaligned file.
if probs_array.ndim != 2 or probs_array.shape[1] < 3:
    raise ValueError(
        f"Expected predictions of shape (n_rows, 3), got {probs_array.shape}"
    )
if probs_array.shape[0] != len(ids):
    raise ValueError(
        f"Got {probs_array.shape[0]} predictions for {len(ids)} test ids"
    )

# Column order follows the model's class order: index 0, 1, 2.
submission = pd.DataFrame({
    'id': ids,
    'winner_model_a': probs_array[:, 0],
    'winner_model_b': probs_array[:, 1],
    'winner_tie': probs_array[:, 2]
})

print(submission.head())
submission.to_csv(SUBMISSION_PATH, index=False)
print(f"Saved to {SUBMISSION_PATH}")

Creating submission file...
        id  winner_model_a  winner_model_b  winner_tie
0   136060        0.435424        0.033552    0.531024
1   211333        0.307735        0.022732    0.669534
2  1233961        0.001448        0.994020    0.004532
Saved to submission.csv
