In [3]:
import pandas as pd
import numpy as np

# Load the tokenised train / validation / test splits, in that order.
# NOTE(review): pickle files can execute arbitrary code when loaded — only read
# files produced by this project's own preprocessing step.
split_paths = (
    "dataprep/ms_marco_combined/tokenised/train_tokenised.pkl",
    "dataprep/ms_marco_combined/tokenised/validation_tokenised.pkl",
    "dataprep/ms_marco_combined/tokenised/test_tokenised.pkl",
)
train_df, val_df, test_df = map(pd.read_pickle, split_paths)

def describe_ids(col: pd.Series, unk_id: int = 1) -> dict:
    """Summarise sequence lengths and UNK coverage for a column of token-id sequences.

    Parameters
    ----------
    col : pd.Series
        Each element is a sequence (e.g. a list) of token ids.
    unk_id : int, default 1
        Token id counted as "unknown". The default matches the value that was
        previously hard-coded in this function.

    Returns
    -------
    dict
        'avg', 'med', 'min', 'max'
            Mean, median, minimum and maximum of the per-row sequence length.
        'pct_unk'
            Percentage of ROWS that contain at least one ``unk_id`` token.
            This is a row-level rate, so it approaches 100% once sequences
            are long enough to almost always contain one unknown token.
        'pct_unk_tokens'
            Percentage of ALL TOKENS in the column equal to ``unk_id``
            (NaN when the column holds no tokens at all).
    """
    lengths = col.str.len()

    # Count UNK occurrences per row once; both UNK metrics derive from it.
    unk_counts = col.apply(lambda ids: sum(1 for tok in ids if tok == unk_id))

    total_tokens = lengths.sum()
    if total_tokens > 0:
        pct_unk_tokens = unk_counts.sum() / total_tokens * 100
    else:
        # No tokens to divide by — report "undefined" instead of raising.
        pct_unk_tokens = float('nan')

    return {
        'avg': lengths.mean(),
        'med': lengths.median(),
        'min': lengths.min(),
        'max': lengths.max(),
        'pct_unk': (unk_counts > 0).mean() * 100,
        'pct_unk_tokens': pct_unk_tokens,
    }


In [4]:
# Per-split report: sequence-length statistics and row-level UNK rate
# for the query, positive-passage and negative-passage id columns.
for df, name in ((train_df, "Train"), (val_df, "Val"), (test_df, "Test")):
    # Compute all three summaries up front, before anything is printed for this split.
    q, p, n = (describe_ids(df[column]) for column in ('query_ids', 'pos_ids', 'neg_ids'))

    report = [f"{name} ({len(df)} ex.)"]
    # Labels are padded to equal width so the arrows line up in the output.
    for label, stats in (("Query", q), ("Pos  ", p), ("Neg  ", n)):
        report.append(
            f"  {label} → avg {stats['avg']:.1f}, med {stats['med']}, "
            f"max {stats['max']}, %UNK {stats['pct_unk']:.1f}%"
        )
    # Trailing "\n" leaves one blank line between consecutive splits.
    print("\n".join(report) + "\n")


Train (79004 ex.)
  Query → avg 6.1, med 6.0, max 26, %UNK 41.9%
  Pos   → avg 83.2, med 83.0, max 254, %UNK 100.0%
  Neg   → avg 80.4, med 79.0, max 236, %UNK 100.0%

Val (9875 ex.)
  Query → avg 6.0, med 6.0, max 19, %UNK 42.2%
  Pos   → avg 83.2, med 83.0, max 199, %UNK 100.0%
  Neg   → avg 80.0, med 79.0, max 225, %UNK 100.0%

Test (9876 ex.)
  Query → avg 6.1, med 6.0, max 27, %UNK 43.0%
  Pos   → avg 83.1, med 83.0, max 208, %UNK 99.9%
  Neg   → avg 80.4, med 79.0, max 211, %UNK 100.0%

