In [1]:
%%writefile gemma2_inference.py

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import os
from IPython.display import display, Math, Latex
import torch
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from datasets import Dataset
import pandas as pd, numpy as np
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from peft import PeftModel
from scipy.special import softmax
from tqdm import tqdm


# Process environment: disable the tokenizers library's own parallelism and
# make GPUs 0 and 1 visible to this process.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    "CUDA_VISIBLE_DEVICES": "0,1",
})

# Directory of the fine-tuned adapter (the tokenizer is loaded from here too).
lora_path = "/kaggle/input/gemma2-9b-it-cv945"
# Maximum number of tokens per prompt; increased from 256 for better context capture.
MAX_LEN = 320
# helpers
def format_input(row):
    """Build the classification prompt for one sample.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing
            ``is_correct``, ``QuestionText``, ``MC_Answer`` and
            ``StudentExplanation``.

    Returns:
        A five-line prompt string. The "Answer Status" line reads
        ``CORRECT`` when ``row['is_correct']`` is truthy, otherwise
        ``INCORRECT``.
    """
    answer_status = "CORRECT" if row['is_correct'] else "INCORRECT"
    prompt_lines = [
        f"Mathematical Question: {row['QuestionText']}",
        f"Student's Answer: {row['MC_Answer']}",
        f"Answer Status: {answer_status}",
        f"Student's Reasoning: {row['StudentExplanation']}",
        "Task: Identify the misconception category and specific misconception.",
    ]
    return "\n".join(prompt_lines)

# Tokenization function
def tokenize(batch):
    """Tokenize a batch of prompts, truncating each to ``MAX_LEN`` tokens.

    No padding is applied here. The ``DataCollatorWithPadding`` passed to the
    DataLoader pads every mini-batch to its own longest sequence, so padding
    each example to ``MAX_LEN`` up front only made the model process pad
    tokens for every row.

    Args:
        batch: Mapping with a ``"text"`` entry holding a list of prompt strings
            (the shape ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        The tokenizer output for the batch (unpadded, variable-length
        sequences).
    """
    return tokenizer(batch["text"], truncation=True, max_length=MAX_LEN)



# --- Label space -----------------------------------------------------------
# Each target is "<Category>:<Misconception>", with missing misconceptions
# written as 'NA'. The encoder is fitted on the training targets so predicted
# class indices can be decoded back to these strings later.
train = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/train.csv')
train['Misconception'] = train['Misconception'].fillna('NA')
train['target'] = train['Category'] + ":" + train['Misconception']

le = LabelEncoder()
train['label'] = le.fit_transform(train['target'])
target_classes = le.classes_
n_classes = len(target_classes)
print(f"Train shape: {train.shape} with {n_classes} target classes")

# --- Correct-answer lookup ---------------------------------------------------
# Rows whose Category starts with "True" are treated as correct answers. For
# every question keep the (QuestionId, MC_Answer) pair that occurs most often
# among those rows.
is_true_category = train['Category'].map(lambda category: category.split('_')[0]) == 'True'
correct = train.loc[is_true_category].copy()
correct['c'] = correct.groupby(['QuestionId', 'MC_Answer'])['MC_Answer'].transform('count')
correct = (
    correct
    .sort_values('c', ascending=False)
    .drop_duplicates(['QuestionId'])[['QuestionId', 'MC_Answer']]
    .assign(is_correct=1)
)

# --- Test data ---------------------------------------------------------------
# Left-merge so unmatched (question, answer) pairs get NaN, then mark them 0.
test = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/test.csv')
test = test.merge(correct, on=['QuestionId', 'MC_Answer'], how='left')
test['is_correct'] = test['is_correct'].fillna(0)
test['text'] = test.apply(format_input, axis=1)


# load model & tokenizer
# The tokenizer comes from the adapter directory; the base weights come from a
# separate bf16 checkpoint and the adapter is attached on top of them.
base_model_path = "/kaggle/input/gemma2-9b-it-bf16"

tokenizer = AutoTokenizer.from_pretrained(lora_path)

base_model = AutoModelForSequenceClassification.from_pretrained(
    base_model_path,
    num_labels=n_classes,
    torch_dtype=torch.bfloat16,  # bfloat16 for better numerical stability
    device_map="auto",
)

# NOTE(review): transformers reports `score.weight` as newly initialized for
# the base checkpoint; the classification head is presumably restored by the
# adapter loaded below -- confirm the adapter actually stores it.
model = PeftModel.from_pretrained(base_model, lora_path)
model.eval()

# Tokenize dataset: wrap the prompts in a Dataset and replace the raw text
# column with the tokenizer output.
ds_test = Dataset.from_pandas(test[['text']]).map(
    tokenize,
    batched=True,
    remove_columns=['text'],
)

# Collator that pads each mini-batch and returns PyTorch tensors
collator_options = dict(
    tokenizer=tokenizer,
    max_length=MAX_LEN,
    return_tensors="pt",
)
data_collator = DataCollatorWithPadding(**collator_options)

# Sequential (unshuffled) loader so predictions stay in the row order of `test`
loader_options = dict(
    batch_size=6,  # Reduced for stability with longer sequences
    shuffle=False,
    collate_fn=data_collator,
    pin_memory=True,
    num_workers=2,
)
dataloader = DataLoader(ds_test, **loader_options)

# Fast inference loop
# Inputs are sent to the device that holds the model's first parameter.
device = next(model.parameters()).device
all_logits = []

with torch.no_grad():
    for batch in tqdm(dataloader, desc="Gemma2 Inference"):
        inputs = {name: tensor.to(device) for name, tensor in batch.items()}
        batch_logits = model(**inputs).logits
        # Upcast to float32 before leaving the GPU, then keep as a NumPy array
        all_logits.append(batch_logits.float().cpu().numpy())

# Stack the per-batch arrays into one (rows x classes) array
predictions = np.concatenate(all_logits, axis=0)

# Apply temperature scaling for better calibration
temperature = 1.5
predictions = predictions / temperature

# Convert to probs
probs = softmax(predictions, axis=1)

# Rank ALL classes per row, most probable first
top_indices = np.argsort(-probs, axis=1)

# Number of ranked classes exported for ensembling. Clamped to the size of the
# label space so the export loop below cannot raise IndexError when the model
# has fewer than 35 classes.
TOP_K_EXPORT = min(35, probs.shape[1])

# Decode to class names (same shape as top_indices)
flat_indices = top_indices.flatten()
decoded_labels = le.inverse_transform(flat_indices)
top_labels = decoded_labels.reshape(top_indices.shape)

# Create submission (top 3)
joined_preds = [" ".join(row[:3]) for row in top_labels]
row_ids = test.row_id.values

sub = pd.DataFrame({
    "row_id": row_ids,
    "Category:Misconception": joined_preds
})
sub.to_csv("submission_gemma.csv", index=False)

# Save the top-ranked probabilities and class names for ensembling.
# Columns: prob_0..prob_{K-1} (descending), row_id, top_classes (space-separated).
prob_data = []
for i in range(len(test)):
    prob_dict = {f"prob_{j}": probs[i, top_indices[i, j]] for j in range(TOP_K_EXPORT)}
    prob_dict['row_id'] = row_ids[i]
    prob_dict['top_classes'] = " ".join(top_labels[i, :TOP_K_EXPORT])
    prob_data.append(prob_dict)

prob_df = pd.DataFrame(prob_data)
prob_df.to_csv("submission_gemma_prob.csv", index=False)
print("Gemma2 inference completed!")

Writing gemma2_inference.py


In [2]:
%%writefile qwen3_deepseek_inference.py

# we do parallel inference, for deepseek and qwen3
import os
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
import threading
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding
from scipy.special import softmax
from tqdm import tqdm
import time

# Disable the tokenizers library's own parallelism for this process
os.environ.update(TOKENIZERS_PARALLELISM="false")

# Competition data: train is read first, then test
train, test = (
    pd.read_csv(f'/kaggle/input/map-charting-student-math-misunderstandings/{split}.csv')
    for split in ('train', 'test')
)

# Fine-tuned checkpoints, in the order they are assigned to GPUs below
model_paths = [
    "/kaggle/input/deekseepmath-7b-map-competition/MAP_EXP_09_FULL",  # index 0
    "/kaggle/input/qwen3-8b-map-competition/MAP_EXP_16_FULL",         # index 1
]

# Enhanced prompts with more context
def format_input_deepseek(row):
    """Build the DeepSeek prompt (math-focused wording) for one sample.

    Args:
        row: Mapping-like record providing ``is_correct``, ``QuestionText``,
            ``MC_Answer`` and ``StudentExplanation``.

    Returns:
        A five-line prompt string whose "Answer Status" line is
        ``CORRECT ANSWER`` when ``row['is_correct']`` is truthy and
        ``INCORRECT ANSWER`` otherwise.
    """
    status = "CORRECT ANSWER" if row['is_correct'] else "INCORRECT ANSWER"
    parts = (
        f"Math Problem: {row['QuestionText']}",
        f"Student Selected: {row['MC_Answer']}",
        f"Answer Status: {status}",
        f"Student's Explanation: {row['StudentExplanation']}",
        "Analyze the mathematical misconception.",
    )
    return "\n".join(parts)

def format_input_qwen(row):
    """Build the Qwen prompt (reasoning-focused wording) for one sample.

    Args:
        row: Mapping-like record providing ``is_correct``, ``QuestionText``,
            ``MC_Answer`` and ``StudentExplanation``.

    Returns:
        A five-line prompt string whose "Assessment" line states that the
        answer is correct when ``row['is_correct']`` is truthy, and that it is
        incorrect otherwise.
    """
    if row['is_correct']:
        assessment = "This answer is correct and shows proper understanding."
    else:
        assessment = "This answer is incorrect and reveals a misconception."
    parts = (
        f"Question: {row['QuestionText']}",
        f"Answer Given: {row['MC_Answer']}",
        f"Assessment: {assessment}",
        f"Student Reasoning: {row['StudentExplanation']}",
        "Identify the category and misconception type.",
    )
    return "\n".join(parts)


# Targets are "<Category>:<Misconception>" strings ('NA' when no misconception
# is recorded); the encoder maps them to integer class ids and back.
train['Misconception'] = train['Misconception'].fillna('NA')
train['target'] = train['Category'] + ':' + train['Misconception']

le = LabelEncoder()
train['label'] = le.fit_transform(train['target'])

n_classes = len(le.classes_)
print(f"Train shape: {train.shape} with {n_classes} target classes")

# Correct-answer lookup: among rows whose Category starts with "True", keep for
# each question the (QuestionId, MC_Answer) pair seen most often.
true_rows = train['Category'].map(lambda category: category.split('_')[0]) == 'True'
correct = train.loc[true_rows].copy()
correct['c'] = correct.groupby(['QuestionId', 'MC_Answer'])['MC_Answer'].transform('count')
correct = (
    correct
    .sort_values('c', ascending=False)
    .drop_duplicates(['QuestionId'])[['QuestionId', 'MC_Answer']]
    .assign(is_correct=1)
)

# Flag each test row: 1 when its answer matches the lookup, 0 otherwise
test = test.merge(correct, on=['QuestionId', 'MC_Answer'], how='left')
test['is_correct'] = test['is_correct'].fillna(0)


def run_inference_on_gpu(model_path, gpu_id, test_data, output_name, format_func):
    """Run inference for one model on one GPU and write its result files.

    Two CSV files are written to the current working directory:
    ``submission_{output_name}.csv`` (top-3 labels per row) and
    ``submission_{output_name}_probabilities.csv`` (ranked probabilities and
    class names for ensembling).

    Args:
        model_path: Directory of the sequence-classification checkpoint; the
            tokenizer is loaded from the same directory.
        gpu_id: CUDA device index the model and inputs are placed on.
        test_data: DataFrame with a ``row_id`` column plus whatever columns
            ``format_func`` reads. It is copied, not modified.
        output_name: Short model name. Used in file names and log lines; the
            value ``"deepseek"`` also selects batch size 6 / temperature 1.3
            (any other name gets 5 / 1.4).
        format_func: Callable turning one row into the prompt text.

    Returns:
        None.

    Notes:
        Class indices are decoded with the module-level ``le`` encoder, so the
        model's output classes must line up with that encoder's classes.
    """
    max_length = 320       # token budget per prompt (increased context)
    top_k_requested = 35   # ranked classes exported for the ensemble

    device = f"cuda:{gpu_id}"
    print(f"Loading {output_name} on {device}...")

    # Prepare data with model-specific formatting
    test_copy = test_data.copy()
    test_copy['text'] = test_copy.apply(format_func, axis=1)

    # Load model
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path,
        device_map=device,
        torch_dtype=torch.bfloat16  # bfloat16 for better stability
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # Make the model use the tokenizer's pad token id
    model.config.pad_token_id = tokenizer.pad_token_id
    model.eval()

    # Tokenize function (truncation only; padding is done per batch by the collator)
    def tokenize(batch):
        return tokenizer(batch["text"],
                        truncation=True,
                        max_length=max_length)

    ds_test = Dataset.from_pandas(test_copy[['text']])
    ds_test = ds_test.map(tokenize, batched=True, remove_columns=['text'])

    # Data collator
    data_collator = DataCollatorWithPadding(
        tokenizer=tokenizer,
        padding=True,
        return_tensors="pt"
    )

    # DataLoader (unshuffled so outputs stay aligned with test_data rows)
    dataloader = DataLoader(
        ds_test,
        batch_size=6 if output_name == "deepseek" else 5,  # Optimized batch sizes
        shuffle=False,
        collate_fn=data_collator,
        pin_memory=True,
        num_workers=0
    )

    # Inference
    all_logits = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc=f"{output_name}"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            all_logits.append(outputs.logits.float().cpu().numpy())

    predictions = np.concatenate(all_logits, axis=0)

    # Temperature scaling per model
    temperature = 1.3 if output_name == "deepseek" else 1.4
    predictions = predictions / temperature

    # Process results: probabilities and a full per-row class ranking
    probs = softmax(predictions, axis=1)
    top_indices = np.argsort(-probs, axis=1)

    # Never ask for more ranked classes than the model outputs; the original
    # hard-coded range(35) raised IndexError for smaller label spaces.
    top_k_export = min(top_k_requested, probs.shape[1])

    # Decode labels
    flat_indices = top_indices.flatten()
    decoded_labels = le.inverse_transform(flat_indices)
    top_labels = decoded_labels.reshape(top_indices.shape)

    # Save top-3 submission
    row_ids = test_data.row_id.values
    joined_preds = [" ".join(row[:3]) for row in top_labels]
    sub = pd.DataFrame({
        "row_id": row_ids,
        "Category:Misconception": joined_preds
    })
    sub.to_csv(f"submission_{output_name}.csv", index=False)

    # Save probabilities for ensemble.
    # Columns: prob_0..prob_{K-1} (descending), row_id, top_classes.
    prob_data = []
    for i in range(len(predictions)):
        prob_dict = {f"prob_{j}": probs[i, top_indices[i, j]] for j in range(top_k_export)}
        prob_dict['row_id'] = row_ids[i]
        prob_dict['top_classes'] = " ".join(top_labels[i, :top_k_export])
        prob_data.append(prob_dict)

    prob_df = pd.DataFrame(prob_data)
    prob_df.to_csv(f"submission_{output_name}_probabilities.csv", index=False)

    print(f"✓ {output_name} completed - saved submission and probabilities")

    # Clean up GPU memory
    del model, tokenizer
    torch.cuda.empty_cache()

print("🚀 Starting multi-GPU inference...")
start_time = time.time()

threads = []
gpu_assignments = [
    (model_paths[0], 0, "deepseek", format_input_deepseek),
    (model_paths[1], 1, "qwen3", format_input_qwen),
]

skipped = []   # model names that had no GPU to run on
failures = {}  # model name -> exception raised inside its worker thread


def _run_and_record(model_path, gpu_id, name, format_func):
    """Thread target: run one model and remember any exception it raises."""
    try:
        run_inference_on_gpu(model_path, gpu_id, test, name, format_func)
    except Exception as exc:
        failures[name] = exc
        raise  # let threading print the traceback as it normally would


# Start threads
available_gpus = torch.cuda.device_count()
for model_path, gpu_id, name, format_func in gpu_assignments:
    if gpu_id >= available_gpus:
        # Previously this case was skipped silently, and the missing output
        # file only surfaced later in the ensemble step.
        print(f"WARNING: skipping {name}: GPU {gpu_id} is not available "
              f"({available_gpus} visible)")
        skipped.append(name)
        continue
    thread = threading.Thread(
        target=_run_and_record,
        args=(model_path, gpu_id, name, format_func)
    )
    threads.append(thread)
    thread.start()
    time.sleep(10)  # Stagger starts to avoid memory issues

# Wait for completion
for thread in threads:
    thread.join()

end_time = time.time()

# Only report success when every model actually produced its outputs;
# otherwise exit with a non-zero status and say which models are missing.
if skipped or failures:
    problems = [f"{name} (skipped: no GPU)" for name in skipped]
    problems += [f"{name} (failed: {exc!r})" for name, exc in failures.items()]
    raise SystemExit(
        f"Inference incomplete after {end_time - start_time:.2f} seconds: "
        + "; ".join(problems)
    )

print(f"✅ All inference completed in {end_time - start_time:.2f} seconds!")

Writing qwen3_deepseek_inference.py


In [3]:
import time 
import os

print("=" * 60)
print("Starting Gemma2 Inference...")
print("=" * 60)
!python /kaggle/working/gemma2_inference.py

time.sleep(15)  # Increased wait time for memory cleanup

print("\n" + "=" * 60)
print("Starting DeepSeek & Qwen3 Parallel Inference...")
print("=" * 60)
!python /kaggle/working/qwen3_deepseek_inference.py

# Verify all outputs exist
required_files = [
    'submission_gemma_prob.csv',
    'submission_deepseek_probabilities.csv',
    'submission_qwen3_probabilities.csv'
]

print("\n" + "=" * 60)
print("Verifying output files...")
for file in required_files:
    if os.path.exists(f'/kaggle/working/{file}'):
        print(f"✓ {file} exists")
    else:
        print(f"✗ {file} MISSING!")
print("=" * 60)

Starting Gemma2 Inference...
2025-10-05 20:42:57.775593: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759696978.129187      80 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759696978.231574      80 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Train shape: (36696, 9) with 65 target classes
Loading checkpoint shards: 100%|██████████████████| 4/4 [02:36<00:00, 39.10s/it]
Some weights of Gemma2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/gemma2-9b-it-bf16 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

In [4]:
import pandas as pd
import numpy as np
from collections import defaultdict
from scipy.special import softmax
from scipy.stats import rankdata

def extract_class_probabilities(row, model_suffix='', top_k=35):
    """Return ``{class_name: probability}`` for one model's columns in ``row``.

    Args:
        row: Mapping-like record (e.g. a merged DataFrame row) holding a
            space-separated ``top_classes{model_suffix}`` entry and
            ``prob_{i}{model_suffix}`` entries.
        model_suffix: Column suffix identifying the model ('' for the first).
        top_k: Maximum number of leading classes to read.

    Returns:
        Dict mapping the i-th listed class to the value of its
        ``prob_{i}`` column, in listed order. Classes whose probability
        column is absent are left out; the dict is empty when the
        ``top_classes`` column itself is absent.
    """
    classes_col = f'top_classes{model_suffix}'
    if classes_col not in row:
        return {}

    ranked_classes = row[classes_col].split(' ')[:top_k]
    # zip with range(top_k) stops at whichever is shorter: top_k or the list
    indexed_columns = (
        (class_name, f'prob_{position}{model_suffix}')
        for position, class_name in zip(range(top_k), ranked_classes)
    )
    return {
        class_name: row[prob_col]
        for class_name, prob_col in indexed_columns
        if prob_col in row
    }

def advanced_ensemble(prob_files, model_weights=None, top_k=3):
    """Blend several models' ranked class probabilities into top-k labels.

    Every CSV in ``prob_files`` needs a ``row_id`` column, a space-separated
    ``top_classes`` column and ``prob_0 .. prob_N`` columns. The files are
    inner-merged on ``row_id``. For each merged row, every class listed by at
    least one model gets the score::

        0.35 * avg_prob + 0.25 * rank_score + 0.15 * agreement
        + 0.15 * max_confidence + 0.10 * position_bonus

    where, over the models that list the class:

    * ``avg_prob``: sum of ``prob * weight`` divided by the number of models
      listing the class;
    * ``rank_score``: sum of ``weight / rank`` (rank 1 = highest probability);
    * ``agreement``: fraction of all models that list the class;
    * ``max_confidence``: largest ``prob * weight``;
    * ``position_bonus``: sum of ``exp(-position / 5) * weight`` with the
      0-based position of the class in that model's list.

    Args:
        prob_files: Paths of the per-model probability CSVs.
        model_weights: One weight per file, in the same order. ``None`` means
            equal weights of 1.0 (the old default raised ``TypeError``).
        top_k: Number of labels to return per row.

    Returns:
        List of strings, one per merged row in merged order, each holding up
        to ``top_k`` class names separated by single spaces.
    """
    n_models = len(prob_files)
    if model_weights is None:
        model_weights = [1.0] * n_models

    prob_dfs = [pd.read_csv(file_path) for file_path in prob_files]

    # Merge on row_id; columns of model i (i >= 2) carry the suffix "_model{i}"
    merged_df = prob_dfs[0]
    for i, df in enumerate(prob_dfs[1:], 1):
        merged_df = pd.merge(merged_df, df, on='row_id', suffixes=('', f'_model{i+1}'))

    print(f"Processing {len(merged_df)} test samples...")

    suffixes = [f'_model{i+1}' if i > 0 else '' for i in range(n_models)]
    final_predictions = []

    for _, row in merged_df.iterrows():
        # Per-model lookups, all keyed by class name
        all_class_probs = []
        all_class_ranks = []
        all_class_positions = []

        for suffix in suffixes:
            class_probs = extract_class_probabilities(row, suffix, top_k=35)
            all_class_probs.append(class_probs)

            if class_probs:
                probs = np.array(list(class_probs.values()))
                # Rank: 1 for highest prob, 2 for second highest, etc.
                ranks = rankdata(-probs, method='ordinal')
                all_class_ranks.append(dict(zip(class_probs, ranks)))
            else:
                all_class_ranks.append({})

            # 0-based list position, built once instead of list.index() per class
            all_class_positions.append(
                {class_name: position for position, class_name in enumerate(class_probs)}
            )

        # Candidate classes in first-seen order. A plain set made tie-breaking
        # depend on string hashing, which differs from run to run.
        all_classes = dict.fromkeys(
            class_name for class_probs in all_class_probs for class_name in class_probs
        )

        final_scores = {}
        for class_name in all_classes:
            prob_sum = 0
            prob_count = 0
            rank_score = 0
            max_confidence = 0
            position_bonus = 0

            for i, class_probs in enumerate(all_class_probs):
                if class_name not in class_probs:
                    continue
                weight = model_weights[i]
                weighted_prob = class_probs[class_name] * weight

                prob_sum += weighted_prob
                prob_count += 1
                # Reciprocal rank: lower rank = higher score
                rank_score += (1.0 / all_class_ranks[i][class_name]) * weight
                max_confidence = max(max_confidence, weighted_prob)
                # Exponential decay: top positions get a much higher bonus
                position_bonus += np.exp(-all_class_positions[i][class_name] / 5.0) * weight

            avg_prob = prob_sum / max(prob_count, 1)
            agreement = prob_count / n_models

            final_scores[class_name] = (
                avg_prob * 0.35 +           # 35% weighted probability
                rank_score * 0.25 +         # 25% rank-based score
                agreement * 0.15 +          # 15% model agreement
                max_confidence * 0.15 +     # 15% peak confidence
                position_bonus * 0.10       # 10% position bonus
            )

        # Sort by descending score (stable: ties keep first-seen order)
        sorted_classes = sorted(final_scores.items(), key=lambda x: -x[1])
        top_classes = [class_name for class_name, _ in sorted_classes[:top_k]]

        final_predictions.append(' '.join(top_classes))

    return final_predictions

# Model weights based on individual scores + calibration
# DeepSeek: 0.944 -> weight 1.5 (best performer, math-specialized)
# Gemma2: 0.942 -> weight 1.3 (strong generalist)
# Qwen3: 0.943 -> weight 1.4 (good reasoning)

banner = "=" * 60
print(banner, "Starting Advanced Ensemble", banner, sep="\n")

# Probability files, listed in the same order as the weights below
prob_files = [
    '/kaggle/working/submission_deepseek_probabilities.csv',
    '/kaggle/working/submission_gemma_prob.csv',
    '/kaggle/working/submission_qwen3_probabilities.csv'
]

# Optimized weights
w1, w2, w3 = (
    1.5,  # DeepSeek (best on math)
    1.3,  # Gemma2
    1.4,  # Qwen3
)

predictions = advanced_ensemble(prob_files, model_weights=[w1, w2, w3], top_k=3)

# Pair the blended predictions with the row ids of the competition test file
test_df = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/test.csv')
submission = pd.DataFrame({
    'row_id': test_df.row_id.values,
    'Category:Misconception': predictions
})
submission.to_csv('submission.csv', index=False)

print("\n" + banner)
print("✅ Submission created successfully!")
print(f"Total predictions: {len(submission)}")
print(banner)
print("\nFirst 5 predictions:")
print(submission.head())

Starting Advanced Ensemble
Processing 3 test samples...

✅ Submission created successfully!
Total predictions: 3

First 5 predictions:
   row_id                             Category:Misconception
0   36696   False_Neither:NA True_Correct:NA True_Neither:NA
1   36697  False_Neither:NA False_Misconception:WNB False...
2   36698   False_Neither:NA True_Neither:NA True_Correct:NA
