In [None]:
# --- Cell 1: Imports and Constants (Inference Focus) ---
import pandas as pd
import os
from tqdm import tqdm
import torch
# AutoConfig is needed for Option 2 model loading
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig 
from sklearn.preprocessing import LabelEncoder # Needed for EFFECTIVENESS_CLASSES
from torch.utils.data import Dataset, DataLoader 
import numpy as np 
import time # For timing inference

# --- Runtime environment ---
# Kaggle kernels export KAGGLE_KERNEL_RUN_TYPE; its presence selects the Kaggle paths below.
IS_KAGGLE_ENV = os.environ.get('KAGGLE_KERNEL_RUN_TYPE') is not None

# --- Model settings ---
MODEL_NAME = 'bert-base-uncased'  # In this notebook only used to build the checkpoint file name
MAX_LEN = 512                     # Sequence length passed to the tokenizer
NUM_LABELS = 3                    # Re-derived from the train.csv labels in Cell 3
BATCH_SIZE = 16                   # Inference batch size

# The checkpoint file name is the same in every environment.
BEST_MODEL_FILENAME = f"{MODEL_NAME}-best.pt"

# --- Environment-specific locations ---
if IS_KAGGLE_ENV:
    print("Running in Kaggle environment.")
    BASE_PATH = "/kaggle/input/feedback-prize-effectiveness"
    # Attached Kaggle input folder expected to hold the tokenizer files, config.json and the .pt file.
    KAGGLE_MODEL_INPUT_DIR = "/kaggle/input/feedback-prize-bert-base-uncased-epoch-2/transformers/default/1/epoch_2"
    BEST_MODEL_DIR = KAGGLE_MODEL_INPUT_DIR
    OUTPUT_DIR = "/kaggle/working/"
    SUBMISSION_FILE = os.path.join(OUTPUT_DIR, "submission.csv")
else:
    print("Running in local environment (for testing inference).")
    BASE_PATH = "./feedback-prize-effectiveness/"
    # Local folder expected to hold the tokenizer files, config.json and the .pt file.
    BEST_MODEL_DIR = "./models/epoch_2/"
    OUTPUT_DIR = "./"  # Submission is written to the current directory when testing locally
    SUBMISSION_FILE = "submission.csv"

# Full path to the saved state_dict.
BEST_MODEL_PATH = os.path.join(BEST_MODEL_DIR, BEST_MODEL_FILENAME)

# --- Competition data files ---
TRAIN_CSV = os.path.join(BASE_PATH, "train.csv")    # Read in Cell 3 to fit the LabelEncoder
TEST_CSV = os.path.join(BASE_PATH, "test.csv")
TEST_ESSAYS_DIR = os.path.join(BASE_PATH, "test/")  # One <essay_id>.txt per test essay
print("\n--- Path Check (Inference Mode) ---")

# config.json lives next to the tokenizer files and the checkpoint.
CONFIG_JSON_PATH = os.path.join(BEST_MODEL_DIR, "config.json")

# Label -> filesystem location; every entry is reported as Found / NOT FOUND below.
paths_to_check = {
    "Competition Base Path": BASE_PATH,
    "Train CSV (for LabelEncoder)": TRAIN_CSV,
    "Test CSV": TEST_CSV,
    "Test Essays Dir": TEST_ESSAYS_DIR,
    "Best Model Directory (for tokenizer, config & model)": BEST_MODEL_DIR,
    "Best Model State Dict File Path (.pt)": BEST_MODEL_PATH,
    "Output Directory": OUTPUT_DIR,
    "Submission File Path": SUBMISSION_FILE,
    "Model Config File Path (config.json)": CONFIG_JSON_PATH,
}

for label, location in paths_to_check.items():
    if os.path.exists(location):
        verdict = "Found"
    else:
        verdict = "NOT FOUND"
        # Some missing paths get an extra hint, depending on the environment.
        if label == "Best Model State Dict File Path (.pt)":
            if not IS_KAGGLE_ENV:
                verdict += " (Expected if model not yet trained/placed)"
        elif label == "Best Model Directory (for tokenizer, config & model)":
            if IS_KAGGLE_ENV:
                verdict += " (CRITICAL: This path must exist on Kaggle with your model files!)"
        elif label == "Model Config File Path (config.json)":
            verdict += " (CRITICAL for Option 2 loading: Ensure config.json is in the model directory!)"
    print(f"{label}: {location} ... {verdict}")

print("--- End Path Check ---\n")

# Loud reminder if the template placeholder was never replaced in the Kaggle model path.
if IS_KAGGLE_ENV and "your-model-dataset-slug" in KAGGLE_MODEL_INPUT_DIR:
    for banner_line in (
        "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!",
        "!!! WARNING: 'KAGGLE_MODEL_INPUT_DIR' still contains placeholder           !!!",
        "!!! 'your-model-dataset-slug/epoch_X'. Update with your Kaggle dataset slug!!!",
        "!!! and the correct model folder path.                                     !!!",
        "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n",
    ):
        print(banner_line)

# Location of the training essays; not read by the cells shown in this notebook.
TRAIN_ESSAYS_DIR = os.path.join(BASE_PATH, "train/")

Running in Kaggle environment.

--- Path Check (Inference Mode) ---
Competition Base Path: /kaggle/input/feedback-prize-effectiveness ... Found
Train CSV (for LabelEncoder): /kaggle/input/feedback-prize-effectiveness/train.csv ... Found
Test CSV: /kaggle/input/feedback-prize-effectiveness/test.csv ... Found
Test Essays Dir: /kaggle/input/feedback-prize-effectiveness/test/ ... Found
Best Model Directory (for tokenizer, config & model): /kaggle/input/feedback-prize-bert-base-uncased-epoch-2/transformers/default/1/epoch_2 ... Found
Best Model State Dict File Path (.pt): /kaggle/input/feedback-prize-bert-base-uncased-epoch-2/transformers/default/1/epoch_2/bert-base-uncased-best.pt ... Found
Output Directory: /kaggle/working/ ... Found
Submission File Path: /kaggle/working/submission.csv ... NOT FOUND
Model Config File Path (config.json): /kaggle/input/feedback-prize-bert-base-uncased-epoch-2/transformers/default/1/epoch_2/config.json ... Found
--- End Path Check ---



In [None]:
# --- Path Checking ---
# NOTE(review): Cell 1 already runs this exact check; consider keeping only one copy.
print("\n--- Path Check (Inference Mode) ---")

# config.json is expected in the same directory as the tokenizer files and checkpoint.
CONFIG_JSON_PATH = os.path.join(BEST_MODEL_DIR, "config.json")

# Human-readable label -> path to verify.
paths_to_check = {
    "Competition Base Path": BASE_PATH,
    "Train CSV (for LabelEncoder)": TRAIN_CSV,
    "Test CSV": TEST_CSV,
    "Test Essays Dir": TEST_ESSAYS_DIR,
    "Best Model Directory (for tokenizer, config & model)": BEST_MODEL_DIR,
    "Best Model State Dict File Path (.pt)": BEST_MODEL_PATH,
    "Output Directory": OUTPUT_DIR,
    "Submission File Path": SUBMISSION_FILE,
    "Model Config File Path (config.json)": CONFIG_JSON_PATH,
}

for label, location in paths_to_check.items():
    if os.path.exists(location):
        verdict = "Found"
    else:
        verdict = "NOT FOUND"
        # Append an environment-specific hint for the paths that matter most.
        if label == "Best Model State Dict File Path (.pt)":
            if not IS_KAGGLE_ENV:
                verdict += " (Expected if model not yet trained/placed)"
        elif label == "Best Model Directory (for tokenizer, config & model)":
            if IS_KAGGLE_ENV:
                verdict += " (CRITICAL: This path must exist on Kaggle with your model files!)"
        elif label == "Model Config File Path (config.json)":
            verdict += " (CRITICAL for Option 2 loading: Ensure config.json is in the model directory!)"
    print(f"{label}: {location} ... {verdict}")

print("--- End Path Check ---\n")

# Warn when the Kaggle model path still carries the template placeholder.
if IS_KAGGLE_ENV and "your-model-dataset-slug" in KAGGLE_MODEL_INPUT_DIR:
    for banner_line in (
        "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!",
        "!!! WARNING: 'KAGGLE_MODEL_INPUT_DIR' still contains placeholder           !!!",
        "!!! 'your-model-dataset-slug/epoch_X'. Update with your Kaggle dataset slug!!!",
        "!!! and the correct model folder path.                                     !!!",
        "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n",
    ):
        print(banner_line)


--- Path Check (Inference Mode) ---
Competition Base Path: /kaggle/input/feedback-prize-effectiveness ... Found
Train CSV (for LabelEncoder): /kaggle/input/feedback-prize-effectiveness/train.csv ... Found
Test CSV: /kaggle/input/feedback-prize-effectiveness/test.csv ... Found
Test Essays Dir: /kaggle/input/feedback-prize-effectiveness/test/ ... Found
Best Model Directory (for tokenizer, config & model): /kaggle/input/feedback-prize-bert-base-uncased-epoch-2/transformers/default/1/epoch_2 ... Found
Best Model State Dict File Path (.pt): /kaggle/input/feedback-prize-bert-base-uncased-epoch-2/transformers/default/1/epoch_2/bert-base-uncased-best.pt ... Found
Output Directory: /kaggle/working/ ... Found
Submission File Path: /kaggle/working/submission.csv ... NOT FOUND
Model Config File Path (config.json): /kaggle/input/feedback-prize-bert-base-uncased-epoch-2/transformers/default/1/epoch_2/config.json ... Found
--- End Path Check ---



In [3]:
# --- Cell 2: Helper functions ---
def load_essay_texts(essay_ids, essays_dir):
    """Read the full text of each essay into a dict keyed by essay id.

    Args:
        essay_ids: Iterable of essay ids; each is looked up as
            ``<essays_dir>/<essay_id>.txt``.
        essays_dir: Directory containing the essay ``.txt`` files.

    Returns:
        dict mapping every requested essay id to its file contents. Ids whose
        file is missing map to an empty string (a message is printed instead
        of raising, so a partial test set does not abort the run).
    """
    essay_texts = {}
    for essay_id in tqdm(essay_ids, desc=f"Loading essays from {essays_dir}"):
        essay_path = os.path.join(essays_dir, f"{essay_id}.txt")
        try:
            # Decode explicitly as UTF-8 instead of the locale default, so the
            # same text is read on every platform.
            # NOTE(review): assumes the essay files are UTF-8 encoded — confirm.
            with open(essay_path, 'r', encoding='utf-8') as f:
                essay_texts[essay_id] = f.read()
        except FileNotFoundError:
            if IS_KAGGLE_ENV and "test" in essays_dir.lower():
                print(f"Info: Test essay file not found {essay_path} (may be normal for sample run)")
            else:
                print(f"Warning: Essay file not found {essay_path}")
            # Keep the id present so a later .map() yields '' rather than NaN.
            essay_texts[essay_id] = ""
    return essay_texts

def format_time(elapsed_seconds):
    """Render a duration in seconds as pandas' timedelta string (e.g. ``0 days 00:00:09``).

    The value is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed_seconds))
    duration = pd.to_timedelta(whole_seconds, unit='s')
    return str(duration)

In [None]:
# --- Cell 3: Load Data (Test Data and Train Data for Label Encoding) ---
print(f"Loading {TEST_CSV} for inference...")
df_test_original = pd.read_csv(TEST_CSV)  # Kept untouched; Cell 9 reads discourse_id from it
print(f"Test data shape: {df_test_original.shape}")

# Read each distinct essay once, then attach its text to every discourse row.
test_essay_ids = df_test_original['essay_id'].unique()
test_essay_texts_map = load_essay_texts(test_essay_ids, TEST_ESSAYS_DIR)

# Working copy with the essay text added and missing text replaced by ''.
df_test = df_test_original.assign(
    essay_full_text=df_test_original['essay_id'].map(test_essay_texts_map).fillna(''),
    discourse_text=df_test_original['discourse_text'].fillna(''),
)

# Fit a LabelEncoder on the training labels to recover the class order
# used to name the submission columns.
try:
    df_train_for_labels = pd.read_csv(TRAIN_CSV, usecols=['discourse_effectiveness'])
    label_encoder = LabelEncoder().fit(df_train_for_labels['discourse_effectiveness'])
    EFFECTIVENESS_CLASSES = label_encoder.classes_
    NUM_LABELS = len(EFFECTIVENESS_CLASSES)
    print("\nLabel Encoding Mapping (for submission columns):")
    for class_index, class_name in enumerate(EFFECTIVENESS_CLASSES):
        print(f"{class_name}: {class_index}")
    print(f"Number of unique labels: {NUM_LABELS}")
except Exception as e:
    # Best-effort fallback: continue with a default class order.
    print(f"Could not load train.csv or fit LabelEncoder: {e}")
    print("Submission file column order might be incorrect. Defining default.")
    EFFECTIVENESS_CLASSES = np.array(['Adequate', 'Effective', 'Ineffective'])
    NUM_LABELS = 3

Loading /kaggle/input/feedback-prize-effectiveness/test.csv for inference...
Test data shape: (10, 4)


Loading essays from /kaggle/input/feedback-prize-effectiveness/test/: 100%|██████████| 1/1 [00:00<00:00, 87.20it/s]



Label Encoding Mapping (for submission columns):
Adequate: 0
Effective: 1
Ineffective: 2
Number of unique labels: 3


In [None]:
# --- Cell 4: Tokenizer Initialization ---
# The tokenizer is read from the same directory as the saved model files.
try:
    print(f"Loading tokenizer from {BEST_MODEL_DIR}")
    tokenizer = AutoTokenizer.from_pretrained(BEST_MODEL_DIR)
except OSError as load_error:
    print(f"Could not load tokenizer from {BEST_MODEL_DIR}: {load_error}")
    print(f"This is critical for inference. Ensure '{BEST_MODEL_DIR}' contains tokenizer files or adjust path.")
    if IS_KAGGLE_ENV and "your-model-dataset-slug" in BEST_MODEL_DIR:
        print("REMINDER: Update 'your-model-dataset-slug' in KAGGLE_MODEL_INPUT_DIR in Cell 1.")
    raise  # Nothing downstream can run without a tokenizer
else:
    print("Tokenizer loaded.")

Loading tokenizer from /kaggle/input/feedback-prize-bert-base-uncased-epoch-2/transformers/default/1/epoch_2
Tokenizer loaded.


In [6]:
# --- Cell 5: PyTorch Dataset Class (Inference Focus) ---
class FeedbackPrizeDataset(Dataset):
    """Dataset that tokenizes (discourse_text, essay_full_text) pairs for inference.

    Each item is a dict with flattened ``input_ids``, ``attention_mask`` and
    ``token_type_ids`` tensors; no label is returned.
    """

    # Tokenizer outputs copied into every item, in this order.
    _FIELDS = ('input_ids', 'attention_mask', 'token_type_ids')

    def __init__(self, dataframe, tokenizer, max_len):
        """Store the source frame, the tokenizer and the target sequence length."""
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the underlying dataframe."""
        return len(self.dataframe)

    @staticmethod
    def _text_or_blank(value):
        """Return ``str(value)``, or a single space if it is missing or whitespace-only."""
        if pd.notna(value):
            text = str(value)
            if text.strip():
                return text
        return " "

    def __getitem__(self, idx):
        """Tokenize row ``idx`` (by position) and return its tensors as a dict."""
        row = self.dataframe.iloc[idx]
        # Discourse text is the first segment, the full essay the second.
        encoded = self.tokenizer.encode_plus(
            self._text_or_blank(row.discourse_text),
            self._text_or_blank(row.essay_full_text),
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation='longest_first',
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors='pt',
        )
        # Flatten each returned tensor so the DataLoader can stack items into a batch.
        return {field: encoded[field].flatten() for field in self._FIELDS}

In [7]:
# --- Cell 6: DataLoader for Test Set ---
test_torch_dataset = FeedbackPrizeDataset(df_test, tokenizer, MAX_LEN)

# Use half of the CPUs for loader workers: prefer the set this process may run on,
# and fall back to the machine-wide count where sched_getaffinity is unavailable.
try:
    num_avail_workers = len(os.sched_getaffinity(0)) // 2
except AttributeError:
    cpu_total = os.cpu_count() or 0  # cpu_count() may return None
    num_avail_workers = cpu_total // 2
num_avail_workers = max(0, num_avail_workers)
# num_avail_workers = 0 # For debugging
print(f"Using {num_avail_workers} workers for Test DataLoader.")

test_dataloader = DataLoader(
    test_torch_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,  # Row order must match df_test so predictions line up with discourse ids
    num_workers=num_avail_workers,
    pin_memory=torch.cuda.is_available(),
)
print(f"\nTest DataLoader created: {len(test_dataloader)} batches.")

Using 2 workers for Test DataLoader.

Test DataLoader created: 1 batches.


In [None]:
# --- Cell 7: Load Model Architecture and Weights ---
# Three steps: read config.json, build an uninitialised model from it, then
# load the saved state_dict into that model.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Target device: {device}")

# 1. Load Config
try:
    print(f"Loading config from {BEST_MODEL_DIR}")
    # num_labels is passed explicitly so the classifier head has NUM_LABELS outputs.
    config = AutoConfig.from_pretrained(BEST_MODEL_DIR, num_labels=NUM_LABELS)
    print("Model config loaded successfully.")
except OSError as e:
    print(f"Error loading config.json from {BEST_MODEL_DIR}: {e}")
    print("Ensure config.json exists in the model directory (it's needed for Option 2 loading).")
    print("You might need to manually add it or use the save_pretrained method during training.")
    raise  # Bare raise keeps the original traceback intact

# 2. Load Architecture from Config
print("Defining model architecture from loaded config...")
model_architecture = AutoModelForSequenceClassification.from_config(config)
print("Model architecture defined.")

# 3. Load State Dict (Weights)
loaded_model = model_architecture
try:
    print(f"Loading model weights (state_dict) from: {BEST_MODEL_PATH}")
    if not os.path.exists(BEST_MODEL_PATH):
        raise FileNotFoundError(f"Model state_dict file not found at {BEST_MODEL_PATH}")

    # weights_only=True restricts unpickling to tensors and plain containers, so a
    # tampered checkpoint cannot run arbitrary code while being loaded.
    # NOTE(review): assumes the .pt file is a plain state_dict, as the messages
    # below describe; a checkpoint holding other pickled objects would be rejected.
    state_dict = torch.load(BEST_MODEL_PATH, map_location=device, weights_only=True)
    loaded_model.load_state_dict(state_dict)
    print("Model weights loaded successfully.")
except Exception as e:
    print(f"Error loading model weights: {e}")
    print(f"Attempted to load from: {BEST_MODEL_PATH}")
    print("Ensure the BEST_MODEL_PATH is correct and points to a valid .pt state_dict file.")
    print("Ensure your model dataset is correctly added to the Kaggle notebook if running on Kaggle.")
    if IS_KAGGLE_ENV and "your-model-dataset-slug" in BEST_MODEL_DIR:
        print("CRITICAL REMINDER: Update 'your-model-dataset-slug' in KAGGLE_MODEL_INPUT_DIR in Cell 1.")
    raise  # Re-raise unchanged so the real cause stays visible

loaded_model.to(device)
loaded_model.eval()  # Inference mode: disables dropout
print("Model is on device and ready for inference.")

Target device: cpu
Loading config from /kaggle/input/feedback-prize-bert-base-uncased-epoch-2/transformers/default/1/epoch_2
Model config loaded successfully.
Defining model architecture from loaded config...


2025-05-08 15:34:38.100500: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746718478.313818      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746718478.377293      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Model architecture defined.
Loading model weights (state_dict) from: /kaggle/input/feedback-prize-bert-base-uncased-epoch-2/transformers/default/1/epoch_2/bert-base-uncased-best.pt


  loaded_model.load_state_dict(torch.load(BEST_MODEL_PATH, map_location=device))


Model weights loaded successfully.
Model is on device and ready for inference.


In [9]:
# --- Cell 8: Inference on Test Set ---
print("\nStarting inference on the test set...")
all_test_predictions_probs = []  # One probability row per test example, in loader order
t0_inference = time.time()

for batch in tqdm(test_dataloader, total=len(test_dataloader), desc="Test Batches"):
    # Move the three model inputs of this batch to the target device.
    on_device = {
        key: batch[key].to(device)
        for key in ('input_ids', 'attention_mask', 'token_type_ids')
    }

    # No gradients are needed for prediction.
    with torch.no_grad():
        outputs = loaded_model(
            on_device['input_ids'],
            token_type_ids=on_device['token_type_ids'],
            attention_mask=on_device['attention_mask'],
        )

    # Softmax across the class dimension turns logits into per-class probabilities.
    batch_probs = torch.softmax(outputs.logits.detach().cpu(), dim=1).numpy()
    all_test_predictions_probs.extend(batch_probs)

inference_time = format_time(time.time() - t0_inference)
print(f"Inference on test set completed in: {inference_time}")

# Stack the collected rows into a single 2-D array.
predictions_array = np.vstack(all_test_predictions_probs)
print(f"Shape of predictions_array: {predictions_array.shape}")


Starting inference on the test set...


Test Batches: 100%|██████████| 1/1 [00:09<00:00,  9.49s/it]

Inference on test set completed in: 0 days 00:00:09
Shape of predictions_array: (10, 3)





In [10]:
# --- Cell 9: Create Submission File ---
print("\nCreating submission file...")

# Ids come from the untouched copy of test.csv.
submission_df = pd.DataFrame({'discourse_id': df_test_original['discourse_id']})

# Fall back to a default class order if Cell 3 did not leave a usable one.
if 'EFFECTIVENESS_CLASSES' not in globals() or len(EFFECTIVENESS_CLASSES) != NUM_LABELS:
    print("Warning: EFFECTIVENESS_CLASSES not properly defined. Using default for submission columns.")
    EFFECTIVENESS_CLASSES = np.array(['Adequate', 'Effective', 'Ineffective']) # Fallback

# Class name -> column index in predictions_array.
# NOTE(review): assumes the model's output order matches this class order — confirm against training.
col_map = dict(zip(EFFECTIVENESS_CLASSES, range(len(EFFECTIVENESS_CLASSES))))

try:
    # Columns are added in this fixed order regardless of the encoder's class order.
    for column in ('Ineffective', 'Adequate', 'Effective'):
        submission_df[column] = predictions_array[:, col_map[column]]
except KeyError as e:
    print(f"KeyError creating submission columns: {e}. Mismatch between EFFECTIVENESS_CLASSES ({EFFECTIVENESS_CLASSES}) and required columns?")
    raise

submission_df.to_csv(SUBMISSION_FILE, index=False)
print(f"\nSubmission file created: {SUBMISSION_FILE}")
print("First 5 rows of submission file:")
print(submission_df.head())
print(f"\nSubmission file saved to: {os.path.abspath(SUBMISSION_FILE)}")


Creating submission file...

Submission file created: /kaggle/working/submission.csv
First 5 rows of submission file:
   discourse_id  Ineffective  Adequate  Effective
0  a261b6e14276     0.005158  0.286008   0.708834
1  5a88900e7dc1     0.035271  0.849891   0.114838
2  9790d835736b     0.006935  0.441666   0.551399
3  75ce6d68b67b     0.020652  0.578649   0.400700
4  93578d946723     0.024025  0.833997   0.141977

Submission file saved to: /kaggle/working/submission.csv
