In [10]:
import pickle
import pandas as pd
import re

# Load the two vocabulary lookup tables used by the cells below:
# `vocab_to_int` is queried with word strings and `int_to_vocab` with integer ids.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load these files if they come from a trusted source.
with open('dataprep/combined_vocab/tkn_vocab_to_int.pkl', 'rb') as f:
    vocab_to_int = pickle.load(f)
with open('dataprep/combined_vocab/tkn_int_to_vocab.pkl', 'rb') as f:
    int_to_vocab = pickle.load(f)

# get special token IDs
# NOTE(review): the defaults (0 for <PAD>, 1 for <UNK>) are used silently when
# the token is missing from the vocabulary. Confirm that both tokens exist, or
# that ids 0 and 1 are not already assigned to ordinary words.
PAD_ID = vocab_to_int.get('<PAD>', 0)
UNK_ID = vocab_to_int.get('<UNK>', 1)


In [11]:
# Read the train / validation / test splits from parquet (in that order),
# then report how many rows each split contains.
train_df, val_df, test_df = (
    pd.read_parquet(split_path)
    for split_path in (
        "dataprep/ms_marco_combined/marco_data_splits/train.parquet",
        "dataprep/ms_marco_combined/marco_data_splits/validation.parquet",
        "dataprep/ms_marco_combined/marco_data_splits/test.parquet",
    )
)
print(f"Train {len(train_df)}, Val {len(val_df)}, Test {len(test_df)}")

Train 79004, Val 9875, Test 9876


In [12]:
def preprocess(text: str) -> list[str]:
    """Lower-case ``text`` and split it into whitespace-separated tokens.

    Before splitting, punctuation marks are swapped for space-padded
    placeholder tokens (for example ``.`` becomes ``<PERIOD>``), so every mark
    comes out as a token of its own. A double hyphen ``--`` becomes
    ``<HYPHENS>``; a single hyphen is left untouched.

    Args:
        text: Raw input string.

    Returns:
        The list of tokens; empty when ``text`` contains no non-whitespace
        characters.
    """
    # Single-character marks are handled in one pass. None of the placeholders
    # contains a character that is itself a key, so this matches replacing the
    # marks one after another.
    punctuation_table = str.maketrans({
        '.': ' <PERIOD> ',
        ',': ' <COMMA> ',
        '"': ' <QUOTATION_MARK> ',
        ';': ' <SEMICOLON> ',
        '!': ' <EXCLAMATION_MARK> ',
        '?': ' <QUESTION_MARK> ',
        '(': ' <LEFT_PAREN> ',
        ')': ' <RIGHT_PAREN> ',
        ':': ' <COLON> ',
    })
    lowered = text.lower()
    marked = lowered.translate(punctuation_table).replace('--', ' <HYPHENS> ')
    return marked.split()

In [17]:
# tokenisation with <UNK> ratio filter
def text_to_ids(text: str, vocab_to_int: dict[str, int], max_unk_ratio: float = 0.2) -> list[int]:
    """Convert ``text`` to vocabulary ids, rejecting texts with too many unknowns.

    The text is tokenised with ``preprocess``; tokens missing from
    ``vocab_to_int`` are mapped to the id of ``"<UNK>"``.

    Args:
        text: Raw input string.
        vocab_to_int: Token -> id mapping; must contain the key ``"<UNK>"``.
        max_unk_ratio: Largest tolerated fraction of ``<UNK>`` ids.

    Returns:
        The id list, or ``[]`` when the text yields no tokens or when the
        fraction of ``<UNK>`` ids is strictly greater than ``max_unk_ratio``.

    Raises:
        KeyError: If ``"<UNK>"`` is not a key of ``vocab_to_int``.
    """
    unk_id = vocab_to_int["<UNK>"]
    ids = [vocab_to_int.get(token, unk_id) for token in preprocess(text)]

    # Nothing to encode: also avoids dividing by zero below.
    if not ids:
        return []

    unk_fraction = ids.count(unk_id) / len(ids)
    return [] if unk_fraction > max_unk_ratio else ids

In [18]:
# For every split: tokenise the three text columns, record the length of each
# id list, and print the mean lengths. Rows rejected by text_to_ids hold []
# and therefore count as length 0 in the means.
for df, name in [(train_df, "train"), (val_df, "validation"), (test_df, "test")]:
    # First pass adds the id-list columns (query_ids, pos_ids, neg_ids) ...
    for text_col, ids_col in (
        ("query", "query_ids"),
        ("positive_passage", "pos_ids"),
        ("negative_passage", "neg_ids"),
    ):
        df[ids_col] = df[text_col].apply(lambda x: text_to_ids(x, vocab_to_int))

    # ... second pass adds the length columns, keeping the original column order.
    for ids_col, len_col in (
        ("query_ids", "q_len"),
        ("pos_ids", "p_len"),
        ("neg_ids", "n_len"),
    ):
        df[len_col] = df[ids_col].str.len()

    print(f"{name}: avg q {df['q_len'].mean():.1f}, "
          f"p {df['p_len'].mean():.1f}, "
          f"n {df['n_len'].mean():.1f}")

train: avg q 6.0, p 83.2, n 80.4
validation: avg q 6.0, p 83.1, n 80.0
test: avg q 6.0, p 83.1, n 80.4


In [19]:
def display_triplets(df: pd.DataFrame, vocab: dict[int, str], n: int = 5) -> None:
    """Print up to ``n`` tokenised (query, positive, negative) triplets.

    Rows in which any of the three id lists is empty are skipped.

    Args:
        df: Frame with the columns ``query``, ``positive_passage``,
            ``negative_passage``, ``query_ids``, ``pos_ids`` and ``neg_ids``.
        vocab: id -> token mapping used to decode the ids; ids missing from
            it are rendered as ``<UNK>``.
        n: Maximum number of triplets to print. Values ``<= 0`` print only
            the header line.
    """
    def decode(ids) -> str:
        # Decode with the mapping passed by the caller rather than a
        # notebook-level global, so the function works with any vocabulary.
        return ' '.join(vocab.get(i, '<UNK>') for i in ids)

    print("\n📋 Sample tokenised triplets:")
    if n <= 0:
        # Without this guard the counter check below would never stop the loop.
        return

    shown = 0
    for _, row in df.iterrows():
        if not row["query_ids"] or not row["pos_ids"] or not row["neg_ids"]:
            continue  # skip dropped ones

        print("\n---")
        print(f"🔹 Query: {row['query']}")
        print(f"   Tokens: {row['query_ids']}")
        print(f"   Words:  {decode(row['query_ids'])}")

        # Passages are long: show only the first 100 characters / 10 tokens.
        print(f"✅ Positive: {row['positive_passage'][:100]}...")
        print(f"   Tokens: {row['pos_ids'][:10]}... ({len(row['pos_ids'])} tokens)")
        print(f"   Words:  {decode(row['pos_ids'][:10])}...")

        print(f"❌ Negative: {row['negative_passage'][:100]}...")
        print(f"   Tokens: {row['neg_ids'][:10]}... ({len(row['neg_ids'])} tokens)")
        print(f"   Words:  {decode(row['neg_ids'][:10])}...")

        shown += 1
        if shown >= n:
            break

# Show 5 from each set (train, validation, test — in that order)
for split_df in (train_df, val_df, test_df):
    display_triplets(split_df, int_to_vocab)



📋 Sample tokenised triplets:

---
🔹 Query: what is a landmark in colombia
   Tokens: [71, 9, 6, 9258, 8, 5731]
   Words:  what is a landmark in colombia
✅ Positive: Bogota. Colombia's capital of Bogota is in the central part of the country. The city is situated in ...
   Tokens: [33224, 2, 97385, 963, 4, 33224, 9, 8, 1, 400]... (46 tokens)
   Words:  bogota <PERIOD> colombia's capital of bogota is in the central...
❌ Negative: The second level of the food chains is called the Primary Consumer. These consume the green plants. ...
   Tokens: [1, 284, 238, 4, 1, 153, 3452, 9, 84, 1]... (38 tokens)
   Words:  the second level of the food chains is called the...

---
🔹 Query: how to prepare a field for planting pumpkins
   Tokens: [85, 7, 2815, 6, 505, 10, 4617, 17733]
   Words:  how to prepare a field for planting pumpkins
✅ Positive: 1. Prepare your soil. If you are anticipating growing in the spring, the soil can be prepared in the...
   Tokens: [35, 2, 2815, 27, 690, 2, 41, 23, 14, 349

In [20]:
# Persist each tokenised split (original columns plus the id and length
# columns added above) as a pickle file.
for tokenised_df, out_path in (
    (train_df, "dataprep/ms_marco_combined/tokenised/train_tokenised.pkl"),
    (val_df, "dataprep/ms_marco_combined/tokenised/validation_tokenised.pkl"),
    (test_df, "dataprep/ms_marco_combined/tokenised/test_tokenised.pkl"),
):
    tokenised_df.to_pickle(out_path)
