In [1]:
# Install dependencies from Kaggle datasets
# Every package is installed from a wheel file attached under /kaggle/input/, so no
# download from a package index is needed.
#   --no-deps : install only the named wheel, skip dependency resolution.
#   -U        : replace an already-installed version of the package.
# NOTE: the dataset folder is named "faiss-gpu-173" but the wheel it contains is faiss-gpu 1.7.2.
!pip install -U --no-deps /kaggle/input/faiss-gpu-173-python310/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install -U --no-deps /kaggle/input/datasets-214/datasets-2.14.5-py3-none-any.whl
!pip install -U --no-deps /kaggle/input/optimum-113/optimum-1.13.2-py3-none-any.whl
# NOTE(review): tokenizers is the only install without -U — confirm this is intentional.
!pip install --no-deps /kaggle/input/tokenizers-0-13-3/tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install -U --no-deps /kaggle/input/transformers-432/transformers-4.32.1-py3-none-any.whl

Processing /kaggle/input/faiss-gpu-173-python310/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
Processing /kaggle/input/datasets-214/datasets-2.14.5-py3-none-any.whl
Installing collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 3.1.0
    Uninstalling datasets-3.1.0:
      Successfully uninstalled datasets-3.1.0
Successfully installed datasets-2.14.5
Processing /kaggle/input/optimum-113/optimum-1.13.2-py3-none-any.whl
Installing collected packages: optimum
Successfully installed optimum-1.13.2
Processing /kaggle/input/tokenizers-0-13-3/tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: tokenizers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.20.3
    Uninstalling tokenizers-0.20.3:
      Successfully uninstalled tokenizers-0.20.3
Success

# Loading Dependencies

In [2]:
import gc
import logging
import ctypes
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from pathlib import Path
from time import time
from functools import partial
from concurrent.futures import ThreadPoolExecutor
from threading import Condition
from tqdm.auto import tqdm

import faiss
from torch.utils.data import DataLoader
from datasets import load_from_disk, Dataset

from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, AutoModel
from accelerate import init_empty_weights
from accelerate.utils.modeling import set_module_tensor_to_device
from safetensors.torch import load_file
from optimum.bettertransformer import BetterTransformer

  torch.utils._pytree._register_pytree_node(


# Set up hyperparameters & helper function

In [3]:
# Number of nearest-neighbour passages fetched from the FAISS index for each question.
NUM_TITLES = 5
# Truncation length (in tokens) used by the embedding model's tokenizer.
MAX_SEQ_LEN = 512
# Using bge-faiss-index
# EMBED_MODEL_DIR = "/kaggle/input/bge-faiss-index-llmscience/"
# Directory that holds the embedding model files and the matching "faiss.index" file.
# The trailing slash matters: the index path is built by plain string concatenation.
EMBED_MODEL_DIR = "/kaggle/input/bge-small-faiss/"

# Using all-mpnet-base-faiss index
# EMBED_MODEL_DIR = "/kaggle/input/all-mpnet-base-v2-faiss-llmscience/"
# Upper bound on the LLM sequence length: used as the tokenizer max_length for the
# prompt pieces and as the side length of the precomputed causal attention mask.
MAX_TOKENS = 4096
# Cap on the number of tokens kept from the system prompt plus retrieved context.
MAX_CONTEXT_TOKENS = 1200
# Number of chunks each device's share of the questions is split into; the model is
# called once per chunk.
N_BATCHES = 5

def clear_memory():
    """Best-effort release of unused memory on both CPU and GPU.

    Runs the Python garbage collector, asks glibc to hand free heap pages back to
    the OS via ``malloc_trim(0)``, and releases PyTorch's cached CUDA blocks.

    The ``malloc_trim`` step only exists on glibc platforms; where ``libc.so.6``
    cannot be loaded or has no such symbol, that step is skipped instead of
    raising, so the helper never turns a cleanup attempt into a crash.
    """
    gc.collect()
    try:
        ctypes.CDLL("libc.so.6").malloc_trim(0)
    except (OSError, AttributeError):
        # OSError: library not found (non-Linux / non-glibc).
        # AttributeError: library loaded but exposes no malloc_trim.
        pass
    torch.cuda.empty_cache()

# LLM and RAG pipeline

In [4]:
class SentenceEmbedder:
    """
    Offline sentence encoder built directly on a Hugging Face transformer.

    Mirrors what a SentenceTransformer would do for query encoding, but loads the
    tokenizer and model from a local directory because the competition kernel
    runs without internet access and cannot download SentenceTransformer assets.
    """
    def __init__(self, model_dir, device="cuda:0"):
        """Load tokenizer and model from ``model_dir``; the model is cast to fp16 and moved to ``device``."""
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
        encoder = AutoModel.from_pretrained(model_dir)
        self.model = encoder.half().to(device)

    def _preprocess(self, text_batch):
        """Tokenize ``text_batch["text"]`` (padded, truncated to MAX_SEQ_LEN) and move the tensors to the device."""
        tokenizer_options = dict(
            truncation=True,
            padding=True,
            return_tensors="pt",
            max_length=MAX_SEQ_LEN,
        )
        encoded = self.tokenizer(text_batch["text"], **tokenizer_options)
        return encoded.to(self.device)

    def _dataloader(self, sentences, batch_size=32):
        """Wrap ``sentences`` (each prefixed with the retrieval instruction) in an ordered DataLoader."""
        query_instruction = "Represent this sentence for searching relevant passages: "
        decorated = [query_instruction + sentence for sentence in sentences]
        dataset = Dataset.from_dict({"text": decorated})
        # Tokenization happens lazily, when items are fetched from the dataset.
        dataset.set_transform(self._preprocess)
        return DataLoader(dataset, batch_size=batch_size, shuffle=False)

    def encode(self, texts, show_progress=False, batch_size=32):
        """
        Embed ``texts`` and return one L2-normalised vector per text.

        Args:
            texts: iterable of strings to embed.
            show_progress: wrap the batch loop in a tqdm progress bar.
            batch_size: number of texts per forward pass.

        Returns:
            numpy array with the per-batch results concatenated along axis 0.
        """
        batches = self._dataloader(texts, batch_size)
        if show_progress:
            batches = tqdm(batches)

        chunks = []
        with torch.no_grad():
            for model_inputs in batches:
                pooled = self.model(**model_inputs).pooler_output
                unit_vectors = F.normalize(pooled, p=2, dim=1)
                chunks.append(unit_vectors.cpu().numpy())
        return np.concatenate(chunks, axis=0)


class WeightsSynchronizer:
    """
    Rendezvous point that lets several per-device workers share one disk read per layer.

    Each device announces the layer it needs; once every device has announced, the
    layer's safetensors file is read once into CPU memory and all waiting devices
    are released to pick up the same state dict.
    """
    def __init__(self, base_path, device_list):
        """Remember the checkpoint directory and start with no pending request for any device."""
        self.checkpoint_path = Path(base_path)
        self.state_lock = Condition()
        # device -> layer name currently requested (None means "nothing pending").
        self.states = dict.fromkeys(device_list)
        self.cached_state_dict = None

    def request_weights(self, layer_name, device):
        """Register that ``device`` wants ``layer_name``; the last device to arrive performs the load."""
        with self.state_lock:
            self.states[device] = layer_name
            pending = list(self.states.values())
            if not all(pending):
                # Some device has not asked yet; it will trigger the load when it does.
                return

            # Confirm all devices are requesting the same layer
            assert len(set(pending)) == 1, "Mismatch in requested layer names"
            weights_file = self.checkpoint_path / (layer_name + ".safetensors")
            self.cached_state_dict = load_file(weights_file, device="cpu")

            # Clearing every slot is the signal that the weights are ready.
            for key in self.states:
                self.states[key] = None
            self.state_lock.notify_all()

    def retrieve_state_dict(self, device):
        """Block until the request made by ``device`` has been served, then return the shared state dict."""
        with self.state_lock:
            self.state_lock.wait_for(lambda: self.states[device] is None)
            loaded = self.cached_state_dict
            self.states[device] = None
            nothing_pending = not any(self.states.values())
            if nothing_pending:
                self.state_lock.notify_all()
            return loaded


class LayeredLlama:
    """
    Loads and runs a large model in a sharded manner.
    Layers are loaded on-demand to minimize memory usage.

    A weightless model skeleton is built under ``init_empty_weights``. On every call
    the layers are visited in order: the weights for one layer are obtained through
    ``weight_sync`` and placed on ``device``, all inputs are pushed through that
    layer, and the layer is then moved back to the "meta" device.
    """
    def __init__(self, model_path, weight_sync, device="cuda:0", dtype=torch.float16):
        """
        Args:
            model_path: directory from which the model config and tokenizer are loaded.
            weight_sync: object exposing ``request_weights(layer_name, device)`` and
                ``retrieve_state_dict(device)``; used to fetch each layer's weights.
            device: device string the layers and inputs are placed on.
            dtype: dtype used when placing buffers and weights on the device.
        """
        self.ckpt_path = Path(model_path)
        self.weight_sync = weight_sync
        self.device = device
        self.dtype = dtype

        # Load configuration and tokenizer
        self.config = AutoConfig.from_pretrained(self.ckpt_path)
        self.tokenizer = AutoTokenizer.from_pretrained(self.ckpt_path)
        # EOS doubles as the pad token; __call__ relies on this (with right padding)
        # to locate the last real token of each suffix.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "right"

        # Build a model skeleton without weights
        self._create_empty_model()
        # Store layer names for sequential loading
        # These names are passed to weight_sync; the last one, "value_head", is
        # remapped onto lm_head in _load_and_place.
        self.layer_sequence = ["model.embed_tokens"] + \
                              [f"model.layers.{i}" for i in range(len(self.model.model.layers))] + \
                              ["model.norm", "value_head"]

    def _create_empty_model(self):
        """Build the weightless model and the ordered list of its layer modules (``self.layers``)."""
        with init_empty_weights():
            self.model = AutoModelForCausalLM.from_config(self.config)
            # NOTE(review): 8192 presumably equals the model's hidden size and 8 the
            # width of the stored value head — confirm against the checkpoint.
            self.model.lm_head = torch.nn.Linear(8192, 8, bias=False)  # Adjusted head dimension
            self.model.eval()
            self.model = BetterTransformer.transform(self.model)
            self.model.tie_weights()

        # Move buffers to device
        for name, buffer in self.model.named_buffers():
            set_module_tensor_to_device(self.model, name, self.device, value=buffer, dtype=self.dtype)

        # Same order as self.layer_sequence: embedding, decoder layers, final norm, head.
        self.layers = ([self.model.model.embed_tokens] + list(self.model.model.layers) + 
                       [self.model.model.norm, self.model.lm_head])

    def _load_and_place(self, layer_name):
        """Fetch the weights of ``layer_name`` via ``weight_sync`` and place them on ``self.device``.

        Submitted to a worker thread by ``__call__`` so loading overlaps with compute.
        """
        self.weight_sync.request_weights(layer_name, self.device)
        weights = self.weight_sync.retrieve_state_dict(self.device)
        # Handle final layer rename if necessary
        if "value_head.weight" in weights:
            weights = {"lm_head.weight": weights["value_head.weight"]}

        for param_name, param in weights.items():
            set_module_tensor_to_device(self.model, param_name, self.device, value=param, dtype=self.dtype)

    def __call__(self, batched_data):
        """
        Run every item of ``batched_data`` through the model, one layer at a time.

        Args:
            batched_data: sequence of ``(prefix, suffix)`` token-id tensor pairs.
                NOTE(review): assumes prefix has batch size 1 and every suffix has the
                same number of rows (the kv-cache is expanded to that count) — confirm
                against the caller.

        Returns:
            list with one numpy array per input item, holding one score per suffix row.
        """
        # Reset model and free memory before processing
        del self.model
        clear_memory()
        self._create_empty_model()

        # Prepare data
        batches = [(pfx.to(self.device), sfx.to(self.device)) for pfx, sfx in batched_data]
        # Number of suffix rows, read from the first item and reused for all items.
        suffix_count = len(batches[0][1])
        # Index of the last non-pad token in each suffix row.
        suffix_ends = [(sfx != self.tokenizer.pad_token_id).sum(1) - 1 for _, sfx in batches]

        # Generate large attention mask and position IDs
        # Boolean causal mask of shape (1, 1, MAX_TOKENS, MAX_TOKENS): True on and below the diagonal.
        attn_mask = (torch.ones(MAX_TOKENS, MAX_TOKENS).triu(diagonal=1)[None, None, ...] == 0).to(self.device)
        pos_ids = torch.arange(MAX_TOKENS, dtype=torch.long, device=self.device)[None, :]

        # Execute layers in sequence with threading for async loading
        with ThreadPoolExecutor() as executor, torch.inference_mode():
            pending = executor.submit(self._load_and_place, "model.embed_tokens")

            for idx, (lname, layer) in tqdm(enumerate(zip(self.layer_sequence, self.layers)), desc=self.device, total=len(self.layers)):
                pending.result()  # Ensure weights are loaded
                # Start loading the next layer while the current one is computing.
                if (idx + 1) < len(self.layer_sequence):
                    pending = executor.submit(self._load_and_place, self.layer_sequence[idx + 1])

                # Forward pass through the current layer
                for i, (prefix, suffix) in enumerate(batches):
                    if lname == "model.embed_tokens":
                        # Initial embedding
                        batches[i] = (layer(prefix), layer(suffix))
                    elif lname == "model.norm":
                        # Take the final token from suffix
                        # Only the hidden state at each row's last real token is kept; the prefix is dropped.
                        final_tokens = suffix[torch.arange(suffix_count), suffix_ends[i]][:, None]
                        batches[i] = (None, layer(final_tokens))
                    elif lname == "value_head":
                        # Apply the head to the single kept position of each suffix row and
                        # average over the head's output dimension, giving one score per row.
                        batches[i] = layer(suffix)[:, 0].mean(1).cpu().numpy()
                    else:
                        plen, slen = prefix.shape[1], suffix.shape[1]

                        # Process prefix with kv-cache
                        # The bottom-right plen x plen corner of the causal mask is itself causal.
                        pre_out, (k_cache, v_cache) = layer(prefix, use_cache=True, 
                                                             attention_mask=attn_mask[:, :, -plen:, -plen:])
                        # Process suffix using expanded cache
                        # Suffix positions continue after the prefix; each suffix token may attend
                        # to the whole prefix plus the earlier tokens of its own row.
                        pos = pos_ids[:, plen:plen + slen].expand(suffix_count, -1)
                        att = attn_mask[:, :, -slen:, -plen - slen:].expand(suffix_count, -1, -1, -1)
                        kv_cache = (k_cache.expand(suffix_count, -1, -1, -1), v_cache.expand(suffix_count, -1, -1, -1))
                        sfx_out = layer(suffix, past_key_value=kv_cache, position_ids=pos, attention_mask=att)[0]

                        batches[i] = (pre_out, sfx_out)

                # Move layer back to meta to save memory
                layer.to("meta")
                clear_memory()

        return batches


def load_and_prepare_data():
    """Read the competition test CSV (indexed by ``id``) and flag whether it is the real test set.

    Returns:
        (df, is_test): the question frame, and True when the row count differs
        from 200, which this notebook treats as the signal for a real scoring run.
    """
    placeholder_row_count = 200
    questions = pd.read_csv(
        "/kaggle/input/kaggle-llm-science-exam/test.csv",
        index_col="id",
    )
    return questions, len(questions) != placeholder_row_count


def embed_prompts(df):
    """
    Embed every question together with its five answer options.

    Args:
        df: frame with the text columns ``prompt``, ``A``, ``B``, ``C``, ``D``, ``E``.

    Returns:
        (prompt_emb, start_time): the embedding matrix returned by the embedder
        (one row per question) and the wall-clock start time, which later stages
        use to print elapsed-time messages.
    """
    start_time = time()
    print(f"Initializing embeddings at t={time() - start_time:.1f}s")

    embedding_model = SentenceEmbedder(EMBED_MODEL_DIR, device="cuda:0")

    text_columns = ("prompt", "A", "B", "C", "D", "E")

    def combine_text(row):
        # str() guards against cells that pandas parsed as non-strings (for example
        # an answer read as NaN or as a number), which would make join() raise.
        return " ".join(str(row[col]) for col in text_columns)
    combined_inputs = df.apply(combine_text, axis=1).values

    prompt_emb = embedding_model.encode(combined_inputs, show_progress=False)
    return prompt_emb, start_time


# Using FAISS index
def retrieve_context(df, prompt_emb, start_t):
    """
    Attach retrieved passages to ``df`` as a new ``context`` column (in place).

    For each question the NUM_TITLES nearest passages are looked up in the FAISS
    index and joined into a dash-prefixed, newline-separated string.

    Args:
        df: question frame; one row per row of ``prompt_emb``.
        prompt_emb: embedding matrix produced for the questions.
        start_t: start timestamp used only for the elapsed-time log messages.
    """
    print(f"Loading FAISS index at t={time() - start_t:.1f}s")
    faiss_idx = faiss.read_index(EMBED_MODEL_DIR + 'faiss.index')
    # faiss_idx = faiss.read_index(EMBED_MODEL_DIR + '/bge-small-faiss.index')
    # faiss_idx = faiss.read_index(EMBED_MODEL_DIR + '/all-mpnet-base-v2-faiss.index')
    # faiss_idx = faiss.read_index('/kaggle/input/all-mp-net-base-v2-embedings/wikipedia_embs_768_all-mp-net-base-v2_faiss.index')

    print(f"Performing vector search at t={time() - start_t:.1f}s")
    # search() returns (distances, labels); only the passage ids are needed.
    results = faiss_idx.search(np.float32(prompt_emb), NUM_TITLES)[1]

    print(f"Loading dataset for retrieval at t={time() - start_t:.1f}s")
    ds = load_from_disk("/kaggle/input/all-paraphs-parsed-expanded")
    contexts = []
    for neighbour_ids in results:
        # FAISS pads missing neighbours with -1; skip them rather than letting a
        # negative index silently select the last passage of the dataset.
        passages = [ds[int(idx)]["text"] for idx in neighbour_ids if idx >= 0]
        contexts.append("-" + "\n-".join(passages))
    # Assign by position. The frame is indexed by question id, so label-based
    # df.loc[i, ...] would append new rows whenever ids are not exactly 0..n-1.
    df["context"] = contexts

    faiss_idx.reset()
    del faiss_idx, prompt_emb, ds
    clear_memory()
    print(f"Context retrieval complete at t={time() - start_t:.1f}s")

# NOT Using FAISS index
# def retrieve_context(df, prompt_emb, start_t):
#     print(f"Loading dataset for retrieval at t={time() - start_t:.1f}s")
#     ds = load_from_disk("/kaggle/input/all-paraphs-parsed-expanded")
    
#     # Instead of using FAISS results, we can simply pick no context or a dummy context.
#     # For a fair comparison, I just leave these as empty strings.
#     for i in range(len(df)):
#         df.loc[i, "context"] = ""

#     del prompt_emb, ds
#     clear_memory()
#     print(f"Context retrieval complete at t={time() - start_t:.1f}s")


def generate_symlinks(base_path="/root/.cache/", source_dirs=None):
    """
    Gather the files of several source directories into one directory via symlinks.

    Args:
        base_path: directory that receives the links; created if missing.
            Defaults to "/root/.cache/".
        source_dirs: iterable of directories whose direct children are linked.
            Defaults to the three "/kaggle/input/platypus2-chuhac2-part{1,2,3}"
            dataset folders.

    Returns:
        ``base_path`` as a ``Path``.

    Existing entries are left untouched so the function can be re-run safely. A
    dangling symlink is replaced, because ``Path.exists()`` reports False for it
    and ``symlink_to`` would otherwise fail with FileExistsError.
    """
    # Recreate model directory structure
    base_path = Path(base_path)
    base_path.mkdir(exist_ok=True, parents=True)

    if source_dirs is None:
        source_dirs = [f"/kaggle/input/platypus2-chuhac2-part{part_num}" for part_num in (1, 2, 3)]

    for src_dir in source_dirs:
        for item in Path(src_dir).glob("*"):
            link_path = base_path / item.name
            if link_path.is_symlink() and not link_path.exists():
                # Stale link whose target is gone: drop it so it can be recreated.
                link_path.unlink()
            if not link_path.exists():
                link_path.symlink_to(item)
    return base_path


def prepare_inputs(row, tok):
    """
    Tokenize one question into a shared prefix and five per-answer suffixes.

    Args:
        row: mapping with ``prompt``, ``context`` and the options ``A``..``E``.
        tok: tokenizer callable returning a dict with ``input_ids``.

    Returns:
        (prefix_tokens, suffix_ids): the context + question token ids, and the
        padded token ids of the five "answer + response header" suffixes.
        The first token of the question and suffix encodings is dropped
        (presumably the BOS token — confirm against the tokenizer).
    """
    instruction = ("Your task is to analyze the question and answer below. If the answer is correct, respond yes, "
                   "if it is not correct respond no. As a potential aid to your answer, background context from "
                   "Wikipedia articles is at your disposal, even if they might not always be relevant.")
    sys_prompt = (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n### Instruction:\n{instr}\n\n### Input:\nContext:\n{ctx}"
    )

    # One suffix per answer option, all padded to a common length.
    answer_texts = []
    for letter in "ABCDE":
        answer_texts.append(f"{row[letter]}\n\n### Response:\n")
    answers_encoded = tok(answer_texts, return_tensors="pt", truncation=True, max_length=MAX_TOKENS, padding=True)
    suffix_ids = answers_encoded["input_ids"][:, 1:]
    suffix_len = suffix_ids.shape[1]

    # The question gets whatever token budget the suffix leaves over.
    question_budget = max(0, MAX_TOKENS - suffix_len)
    question_encoded = tok(
        f"\nQuestion: {row['prompt']}\nProposed answer: ",
        return_tensors="pt",
        truncation=True,
        max_length=question_budget,
    )
    question_ids = question_encoded["input_ids"][:, 1:]

    # The context is capped both by its own limit and by the remaining budget.
    remaining = max(0, MAX_TOKENS - question_ids.shape[1] - suffix_len)
    context_budget = min(MAX_CONTEXT_TOKENS, remaining)
    context_text = sys_prompt.format(instr=instruction, ctx=row["context"])
    context_ids = tok(context_text, return_tensors="pt", truncation=True, max_length=context_budget)["input_ids"]

    return torch.cat([context_ids, question_ids], dim=1), suffix_ids


def run_inference(df, model_path):
    """
    Score every question on all available GPUs and write a ``prediction`` column.

    The frame is split into one contiguous partition per CUDA device; each
    partition is processed by its own model instance in its own thread, with layer
    loading coordinated through a shared WeightsSynchronizer.

    Args:
        df: question frame (modified in place: gains a ``prediction`` column with
            the answer letters ordered from highest to lowest score).
        model_path: directory holding the model config, tokenizer and layer files.

    Raises:
        RuntimeError: if no CUDA device is available.
    """
    # Determine devices and initialize weights synchronization
    gpu_list = [f"cuda:{i}" for i in range(torch.cuda.device_count())]
    if not gpu_list:
        raise RuntimeError("run_inference requires at least one CUDA device")
    if len(df) == 0:
        df["prediction"] = []
        return
    # Never use more devices than there are rows, so no partition is empty.
    gpu_list = gpu_list[:len(df)]
    weights_sync = WeightsSynchronizer(model_path, gpu_list)

    # One partition per device (the split used to be hard-coded to 2, which dropped
    # half of the rows on a single-GPU machine).
    row_groups = np.array_split(np.arange(len(df)), len(gpu_list))
    parts = [df.iloc[rows] for rows in row_groups]
    # Every device must call the model the same number of times because layer
    # loading is synchronised across devices, and no chunk may be empty.
    n_chunks = max(1, min(N_BATCHES, min(len(part) for part in parts)))

    # Split DataFrame and run model in parallel
    def process_partition(device, sub_df):
        model = LayeredLlama(model_path, weights_sync, device=device)
        tokenize_fn = partial(prepare_inputs, tok=model.tokenizer)
        data_inputs = sub_df.apply(tokenize_fn, axis=1).values
        scores = []
        for chunk in np.array_split(data_inputs, n_chunks):
            scores += model(chunk)
        return scores

    # All devices must run concurrently, otherwise the synchroniser would wait forever.
    with ThreadPoolExecutor(max_workers=len(gpu_list)) as pool:
        results = list(pool.map(process_partition, gpu_list, parts))
    final_out = sum(results, [])

    # Get prediction order (A,B,C,D,E) from the returned scores
    # Assigned by position: the frame is indexed by question id, so label-based
    # df.loc[i, ...] is only correct when ids happen to be 0..n-1.
    df["prediction"] = [
        " ".join("ABCDE"[j] for j in np.argsort(scores)[::-1])
        for scores in final_out
    ]


# Running

In [5]:
def main():
    """Run the whole pipeline: load questions, retrieve context, predict, write ``submission.csv``."""
    questions, is_real_test = load_and_prepare_data()
    prompt_vectors, started_at = embed_prompts(questions)
    retrieve_context(questions, prompt_vectors, started_at)
    checkpoint_dir = generate_symlinks()

    if not is_real_test:
        # If not the test set scenario, produce a placeholder submission
        questions["prediction"] = "A B C"
    else:
        run_inference(questions, checkpoint_dir)

    questions[["prediction"]].to_csv("submission.csv")


if __name__ == "__main__":
    main()

Initializing embeddings at t=0.0s


  torch.utils._pytree._register_pytree_node(
  return torch.load(checkpoint_file, map_location=map_location)


Loading FAISS index at t=4.3s
Performing vector search at t=31.2s
Loading dataset for retrieval at t=33.7s


  table = cls._concat_blocks(blocks, axis=0)


Context retrieval complete at t=60.5s
