# Feedback Prize Effectiveness: Prediction and Submission (Longformer Ensemble)

This notebook is designed for generating predictions on the test set using an ensemble of trained Longformer models (one from each fold of K-Fold cross-validation). It assumes the models and their tokenizers have been saved using `save_pretrained` and are available via a Kaggle Dataset.

**Key Steps:**
1. **Setup**: Imports, constants, path configurations for all K-Fold models.
2. **Data Loading**: Load `test.csv` and corresponding essay texts.
3. **Label Encoding Info**: Load only the `discourse_effectiveness` column of `train.csv` and fit a `LabelEncoder` on it, so that the model's output indices can be mapped to the correct submission columns.
4. **Dataset and DataLoader**: Prepare the test data for inference (done once).
5. **Inference Loop**: Iterate through each saved model fold:
   a. Load the model for the current fold (the tokenizer is loaded only once, from the fold-0 directory, when the test dataset is built in step 4).
   b. Run predictions on the test set.
   c. Store the predicted probabilities.
6. **Averaging Predictions**: Average the probabilities from all model folds.
7. **Submission File Creation**: Format averaged predictions into `submission.csv`.

In [None]:
# --- Cell 1: Imports and Constants (Inference Focus) ---
import pandas as pd
import os
from tqdm.auto import tqdm 
import torch
from transformers import LongformerTokenizerFast, LongformerForSequenceClassification 
from sklearn.preprocessing import LabelEncoder 
from torch.utils.data import Dataset, DataLoader 
import numpy as np 
import time
import datetime
import gc

# Turn off parallelism inside the tokenizers library via its environment switch.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# The presence of KAGGLE_KERNEL_RUN_TYPE is treated as "running on Kaggle"
# and selects which path layout is used below.
IS_KAGGLE_ENV = os.environ.get("KAGGLE_KERNEL_RUN_TYPE") is not None

# --- Inference settings ---
MODEL_BASE_NAME = "longformer-base-4096"  # prefix of every fold directory name
MAX_LEN = 1024                            # fixed token length of every model input
NUM_LABELS = 3                            # default; re-derived from train.csv in Cell 3
BATCH_SIZE = 16
N_FOLDS = 5                               # Number of folds used during training
AMP_ENABLED = True                        # Enable AMP for inference if using CUDA

# --- Environment-specific locations ---
if not IS_KAGGLE_ENV:
    print("Running in local environment (for testing inference).")
    PROJECT_ROOT = "../"
    COMPETITION_DATA_PATH = os.path.join(PROJECT_ROOT, "data/feedback-prize-effectiveness/")
    MODEL_FOLDS_BASE_PATH = os.path.join(PROJECT_ROOT, "models/")
    OUTPUT_DIR = os.path.join(PROJECT_ROOT, "outputs/")
    # Only the local output directory is created here.
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
else:
    print("Running in Kaggle environment.")
    COMPETITION_DATA_PATH = "/kaggle/input/feedback-prize-effectiveness"
    MODEL_FOLDS_BASE_PATH = "/kaggle/input/longformer-feedback-prize-ensemble/pytorch/default/1/longformer-feedback-prize/"  # USER VERIFIED THIS PATH
    OUTPUT_DIR = "/kaggle/working/"

# --- Files and directories derived from the locations above ---
SUBMISSION_FILE = os.path.join(OUTPUT_DIR, "submission.csv")
TRAIN_CSV = os.path.join(COMPETITION_DATA_PATH, "train.csv")
TEST_CSV = os.path.join(COMPETITION_DATA_PATH, "test.csv")
TEST_ESSAYS_DIR = os.path.join(COMPETITION_DATA_PATH, "test/")
TRAIN_ESSAYS_DIR = os.path.join(COMPETITION_DATA_PATH, "train/")

# One directory per fold, named "<MODEL_BASE_NAME>-fold-<k>-best".
MODEL_DIR_PATHS = [
    os.path.join(MODEL_FOLDS_BASE_PATH, f"{MODEL_BASE_NAME}-fold-{fold_idx}-best")
    for fold_idx in range(N_FOLDS)
]

# Report which of the configured inputs are present before doing any heavy work.
print("\n--- Path Check (Inference Mode) ---")
paths_to_check = {
    "Competition Data Path": COMPETITION_DATA_PATH,
    "Train CSV (for LabelEncoder)": TRAIN_CSV,
    "Test CSV": TEST_CSV,
    "Test Essays Dir": TEST_ESSAYS_DIR,
    "Configured Model Folds Base Path": MODEL_FOLDS_BASE_PATH,
    "Output Directory": OUTPUT_DIR,
}
for name, path_val in paths_to_check.items():
    status = "Found" if os.path.exists(path_val) else "NOT FOUND"
    print(f"{name}: {path_val} ... {status}")

# Fold directories are mandatory: a single missing one aborts the notebook below.
print("\nChecking Constructed Model Fold Directories:")
all_fold_paths_exist = True
for i, path_val in enumerate(MODEL_DIR_PATHS):
    if os.path.exists(path_val):
        status = "Found"
    else:
        all_fold_paths_exist = False
        status = "NOT FOUND (CRITICAL: This path must exist with model and tokenizer files!)"
    print(f"Model Fold {i} Dir: {path_val} ... {status}")

print("--- End Path Check ---\n")
if not all_fold_paths_exist:
    # Loud banner first, then fail fast so later cells never run with missing folds.
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    print("!!! WARNING: One or more model fold directories were NOT FOUND.                  !!!")
    print("!!! Please VERIFY the 'MODEL_FOLDS_BASE_PATH' in Cell 1.                         !!!")
    print(f"!!! It's currently set to: {MODEL_FOLDS_BASE_PATH}                                 !!!")
    print("!!! Ensure this path points to the directory directly CONTAINING your fold folders   !!!")
    print("!!! (e.g., 'longformer-base-4096-fold-0-best', etc.).                            !!!")
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n")
    raise FileNotFoundError("One or more model fold directories not found. Please check paths in Cell 1.")

In [None]:
# --- Cell 2: Helper functions ---
def load_essay_texts(essay_ids, essays_dir):
    """Read the ``<essay_id>.txt`` file of every id in ``essay_ids``.

    Args:
        essay_ids: Iterable of essay identifiers; each one names a UTF-8 text
            file ``<essays_dir>/<essay_id>.txt``.
        essays_dir: Directory that holds the essay files.

    Returns:
        A dict mapping each essay id to the file's full contents. Ids whose
        file does not exist are reported on stdout and mapped to ``""``.
    """
    texts_by_id = {}
    progress = tqdm(essay_ids, desc=f"Loading essays from {essays_dir}")
    for essay_id in progress:
        txt_path = os.path.join(essays_dir, f"{essay_id}.txt")
        try:
            with open(txt_path, 'r', encoding='utf-8') as handle:
                content = handle.read()
        except FileNotFoundError:
            # On Kaggle a missing *test* essay is only reported as info;
            # everywhere else it is reported as a warning.
            if IS_KAGGLE_ENV and "test" in essays_dir.lower():
                 print(f"Info: Test essay file not found {txt_path} (may be normal for sample run)")
            else:
                print(f"Warning: Essay file not found {txt_path}")
            content = ""
        texts_by_id[essay_id] = content
    return texts_by_id

def format_time(elapsed_seconds):
    """Render a duration in seconds as text, e.g. ``3661.2`` -> ``"1:01:01"``.

    The value is rounded to the nearest whole second and formatted the way
    ``datetime.timedelta`` prints itself.
    """
    whole_seconds = int(round(elapsed_seconds))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
# --- Cell 3: Load Data (Test Data and Train Data for Label Encoding) ---
# --- Test rows and the essays they belong to ---
print(f"Loading {TEST_CSV} for inference...")
df_test_original = pd.read_csv(TEST_CSV)
df_test = df_test_original.copy()  # working copy; the original keeps the raw CSV contents
print(f"Test data shape: {df_test.shape}")

test_essay_ids = df_test['essay_id'].unique()
test_essay_texts_map = load_essay_texts(test_essay_ids, TEST_ESSAYS_DIR)

# Missing values become empty strings so both columns are always str.
for text_column in ('discourse_text', 'discourse_type'):
    df_test[text_column] = df_test[text_column].fillna('').astype(str)

# --- Label order for the submission columns ---
# Only the label column of train.csv is read; fitting a LabelEncoder on it
# yields the class order stored in EFFECTIVENESS_CLASSES.
try:
    df_train_for_labels = pd.read_csv(TRAIN_CSV, usecols=['discourse_effectiveness'])
    label_encoder = LabelEncoder().fit(df_train_for_labels['discourse_effectiveness'])
    EFFECTIVENESS_CLASSES = label_encoder.classes_
    NUM_LABELS = len(EFFECTIVENESS_CLASSES)
    print("\nLabel Encoding Mapping (for submission columns):")
    for i, class_name in enumerate(EFFECTIVENESS_CLASSES):
        print(f"{class_name}: {i}")
    print(f"Number of unique labels: {NUM_LABELS}")
except Exception as e:
    # Best-effort fallback: keep going with a hard-coded class order.
    print(f"Could not load train.csv or fit LabelEncoder: {e}")
    print("Submission file column order might be incorrect. Defining default.")
    NUM_LABELS = 3
    EFFECTIVENESS_CLASSES = np.array(['Adequate', 'Effective', 'Ineffective'])

In [None]:
# --- Cell 4: PyTorch Dataset Class (Inference - using the same as training for consistency) ---
class FeedbackPrizeDataset(Dataset):
    """Map-style dataset that encodes one discourse row as a fixed-length input.

    Each item is laid out as::

        [CLS] type [SEP] discourse [SEP] context-before [SEP] context-after [SEP]

    and is padded (or, as a last resort, truncated) to ``max_len`` token ids.
    The context segments are the parts of the essay that precede and follow
    the discourse text; they only receive whatever token budget is left after
    the discourse type and text have been placed.

    Args:
        dataframe: pandas DataFrame with the columns ``discourse_type``,
            ``discourse_text`` and ``essay_id``.
        tokenizer: Object providing ``cls_token``, ``sep_token``,
            ``pad_token_id``, ``tokenize(str)`` and
            ``convert_tokens_to_ids(list)``.
        max_len: Length of every returned sequence.
        essay_texts_map: Dict mapping ``essay_id`` to the full essay text.
        has_labels: Stored on the instance only; ``__getitem__`` never adds
            labels to the returned item.
    """

    # Special tokens in the layout above: one CLS plus four SEP.
    _NUM_SPECIAL_TOKENS = 5

    def __init__(self, dataframe, tokenizer, max_len, essay_texts_map, has_labels=False):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.has_labels = has_labels
        self.essay_texts_map = essay_texts_map
        self.cls_token = self.tokenizer.cls_token
        self.sep_token = self.tokenizer.sep_token
        self.pad_token_id = self.tokenizer.pad_token_id

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.dataframe)

    def _find_discourse_indices(self, essay_text, discourse_text):
        """Locate ``discourse_text`` inside ``essay_text``.

        The exact string is tried first, then its whitespace-stripped form.

        Returns:
            ``(start, end)`` character offsets of the first match, or
            ``(-1, -1)`` when there is no match or either argument is not a
            string.
        """
        try:
            start_idx = essay_text.find(discourse_text)
            if start_idx == -1:
                stripped = discourse_text.strip()
                start_idx = essay_text.find(stripped)
                if start_idx != -1:
                    discourse_text = stripped
        except (AttributeError, TypeError):
            # Non-string input (e.g. None) is treated as "not found" rather
            # than aborting the whole batch.
            return -1, -1
        if start_idx == -1:
            return -1, -1
        return start_idx, start_idx + len(discourse_text)

    def __getitem__(self, idx):
        """Encode row ``idx``.

        Returns:
            Dict with ``input_ids`` and ``attention_mask``, both ``torch.long``
            tensors. The mask is 1 for real tokens and 0 for padding.
        """
        row = self.dataframe.iloc[idx]
        discourse_type_str = str(row.discourse_type).strip()
        discourse_text_str = str(row.discourse_text).strip()
        essay_full_text_str = self.essay_texts_map.get(row.essay_id, "").strip()

        t_type = self.tokenizer.tokenize(discourse_type_str)
        t_text = self.tokenizer.tokenize(discourse_text_str)

        # Type + text must fit next to the special tokens. Trim the tail of
        # the text first; only if that is not enough, drop the text entirely
        # and cut the type.
        num_special_tokens = self._NUM_SPECIAL_TOKENS
        max_payload = self.max_len - num_special_tokens
        overflow = len(t_type) + len(t_text) - max_payload
        if overflow > 0:
            if len(t_text) > overflow:
                t_text = t_text[:len(t_text) - overflow]
            else:
                t_text = []
                t_type = t_type[:max_payload]
        payload_len = len(t_type) + len(t_text)

        # Whatever is left is split between the essay text before and after
        # the discourse (the "after" side gets the odd token).
        t_context_before = []
        t_context_after = []
        remaining_budget = self.max_len - (payload_len + num_special_tokens)

        if remaining_budget > 0 and essay_full_text_str:
            start_idx, end_idx = self._find_discourse_indices(essay_full_text_str, discourse_text_str)
            if start_idx != -1:
                context_before_str = essay_full_text_str[:start_idx].strip()
                context_after_str = essay_full_text_str[end_idx:].strip()
                budget_for_before = remaining_budget // 2
                budget_for_after = remaining_budget - budget_for_before
                # A zero budget must yield no tokens. ``tokens[-0:]`` is the
                # WHOLE list, which would overflow max_len and make the
                # safety truncation below cut off the trailing segments.
                # NOTE(review): if the training notebook's copy of this class
                # has the same slice, apply this guard there too so both
                # encode the edge case identically.
                if context_before_str and budget_for_before > 0:
                    before_tokens = self.tokenizer.tokenize(context_before_str)
                    t_context_before = before_tokens[-budget_for_before:]
                if context_after_str:
                    after_tokens = self.tokenizer.tokenize(context_after_str)
                    t_context_after = after_tokens[:budget_for_after]

        tokens = (
            [self.cls_token] + t_type
            + [self.sep_token] + t_text
            + [self.sep_token] + t_context_before
            + [self.sep_token] + t_context_after
            + [self.sep_token]
        )

        # Safety net: never exceed max_len, and always end on a separator.
        if len(tokens) > self.max_len:
            tokens = tokens[:self.max_len - 1] + [self.sep_token]

        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_ids)

        padding_length = self.max_len - len(input_ids)
        if padding_length > 0:
            input_ids = input_ids + [self.pad_token_id] * padding_length
            attention_mask = attention_mask + [0] * padding_length

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
        }

In [None]:
# --- Cell 5: DataLoader for Test Set ---
# The dataset is built once and reused for every fold. It is tokenised with the
# tokenizer stored in the fold-0 directory.
# NOTE(review): this assumes all folds were saved with the same tokenizer — confirm.
test_torch_dataset = FeedbackPrizeDataset(
    df_test,
    LongformerTokenizerFast.from_pretrained(MODEL_DIR_PATHS[0]),
    MAX_LEN,
    test_essay_texts_map,
    has_labels=False,
)
gc.collect()

# Set num_workers=0 for Kaggle to avoid multiprocessing issues
num_inference_workers = 0
print(f"Using {num_inference_workers} workers for Test DataLoader.")

# shuffle=False keeps the batches in dataframe row order.
test_dataloader = DataLoader(
    test_torch_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=num_inference_workers,
    pin_memory=torch.cuda.is_available(),
)
print(f"\nTest DataLoader created: {len(test_dataloader)} batches.")

In [None]:
# --- Cell 6: Inference on Test Set (Looping through Folds) ---
# Run every fold's model over the test DataLoader and average the per-fold
# class probabilities into `averaged_predictions_array`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Target device for inference: {device}")
if device.type == 'cpu' and AMP_ENABLED:
    print("AMP is set to enabled, but no CUDA device found. AMP will not be used for inference.")
    LOCAL_AMP_ENABLED = False
else:
    LOCAL_AMP_ENABLED = AMP_ENABLED

all_folds_predictions_probs = []  # one (num_test_rows, NUM_LABELS) array per fold
overall_inference_start_time = time.time()

for fold_num, model_dir_path in enumerate(MODEL_DIR_PATHS):
    print(f"\n--- Processing Fold {fold_num + 1}/{N_FOLDS} ---")
    print(f"Loading model from {model_dir_path}")

    try:
        if not os.path.exists(model_dir_path):
            raise FileNotFoundError(f"Model directory for fold {fold_num} not found at {model_dir_path}")

        model_fold = LongformerForSequenceClassification.from_pretrained(model_dir_path, num_labels=NUM_LABELS)
        model_fold.to(device)
        model_fold.eval()
        print(f"Model for fold {fold_num + 1} loaded successfully.")
    except Exception as e:
        # Add context to the log, then let the original error abort the run.
        print(f"Error loading model for fold {fold_num + 1} from {model_dir_path}: {e}")
        if IS_KAGGLE_ENV and "your-longformer-model-dataset-slug" in model_dir_path:
             print("REMINDER: Ensure 'MODEL_FOLDS_BASE_PATH' in Cell 1 is correctly set to your Kaggle dataset path.")
        raise

    fold_predictions_probs = []  # one (batch, NUM_LABELS) array per batch
    fold_inference_start_time = time.time()

    for batch in tqdm(test_dataloader, desc=f"Fold {fold_num+1} Test Batches"):
        b_input_ids = batch['input_ids'].to(device)
        b_attention_mask = batch['attention_mask'].to(device)

        with torch.no_grad():
            with torch.amp.autocast(device_type=device.type, enabled=(LOCAL_AMP_ENABLED and device.type == 'cuda')):
                outputs = model_fold(input_ids=b_input_ids, attention_mask=b_attention_mask)
            # Under CUDA autocast the logits may come back as float16. Cast to
            # float32 before the softmax so the probabilities are computed in
            # full precision and the code does not depend on half-precision
            # softmax support on the CPU.
            probs = torch.softmax(outputs.logits.float(), dim=1)

        fold_predictions_probs.append(probs.cpu().numpy())

    # Batches arrive in dataset order (shuffle=False), so concatenating them
    # restores one row per test example.
    all_folds_predictions_probs.append(np.concatenate(fold_predictions_probs, axis=0))
    print(f"Inference for fold {fold_num + 1} completed in: {format_time(time.time() - fold_inference_start_time)}")

    # Release this fold's model before the next one is loaded.
    del model_fold
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print(f"\nOverall inference for {N_FOLDS} folds completed in: {format_time(time.time() - overall_inference_start_time)}")

if all_folds_predictions_probs:
    # Element-wise mean over the fold axis.
    averaged_predictions_array = np.mean(all_folds_predictions_probs, axis=0)
    print(f"Shape of averaged_predictions_array: {averaged_predictions_array.shape}")
else:
    # Only reachable when MODEL_DIR_PATHS is empty; load errors re-raise above.
    print("No predictions were generated. Check for errors in the loop.")
    averaged_predictions_array = np.zeros((len(df_test), NUM_LABELS))



In [None]:
# --- Cell 7: Create Submission File ---
# Write the averaged probabilities to submission.csv, one column per class.
print("\nCreating submission file...")
submission_df = df_test_original[['discourse_id']].copy()

# Guard against this cell being run without a usable class list from Cell 3.
if 'EFFECTIVENESS_CLASSES' not in globals() or len(EFFECTIVENESS_CLASSES) != NUM_LABELS:
    print("Warning: EFFECTIVENESS_CLASSES not properly defined. Using default for submission columns.")
    EFFECTIVENESS_CLASSES = np.array(['Adequate', 'Effective', 'Ineffective'])

# class name -> column index in averaged_predictions_array
col_map = dict(zip(EFFECTIVENESS_CLASSES, range(len(EFFECTIVENESS_CLASSES))))
# Column order written to the CSV, independent of the encoder's class order.
submission_cols_ordered = ['Ineffective', 'Adequate', 'Effective']

try:
    for col_name in submission_cols_ordered:
        if col_name not in col_map:
            raise KeyError(f"Column '{col_name}' not found in label encoder mapping: {EFFECTIVENESS_CLASSES}")
        submission_df[col_name] = averaged_predictions_array[:, col_map[col_name]]
except KeyError as e:
    print(f"KeyError creating submission columns: {e}.")
    print("Ensure your label_encoder.classes_ order during training matches the expected submission columns or adjust mapping here.")
    raise
except Exception as e:
    print(f"An error occurred during submission file creation: {e}")
    raise

submission_df.to_csv(SUBMISSION_FILE, index=False)
print(f"\nSubmission file created: {SUBMISSION_FILE}")
print("First 5 rows of submission file:")
print(submission_df.head())
print(f"\nSubmission file saved to: {os.path.abspath(SUBMISSION_FILE)}")