In [21]:
import pandas as pd
import collections
from collections import Counter
import pickle
import requests
import tqdm

In [22]:
def preprocess(text: str) -> list[str]:
    """Tokenize ``text`` into lowercase words and punctuation placeholders.

    The input is lowercased, each punctuation mark listed below is swapped
    for a space-padded placeholder such as ``<PERIOD>``, and the result is
    split on whitespace. A single hyphen is left alone; only ``--`` is
    replaced.

    Args:
        text: Raw text to tokenize.

    Returns:
        The tokens in their original order. Empty input gives an empty list.
    """
    # Pairs are applied in this order, one full pass over the text per pair.
    punctuation_tokens = (
        ('.', ' <PERIOD> '),
        (',', ' <COMMA> '),
        ('"', ' <QUOTATION_MARK> '),
        (';', ' <SEMICOLON> '),
        ('!', ' <EXCLAMATION_MARK> '),
        ('?', ' <QUESTION_MARK> '),
        ('(', ' <LEFT_PAREN> '),
        (')', ' <RIGHT_PAREN> '),
        ('--', ' <HYPHENS> '),
        (':', ' <COLON> '),
    )
    normalized = text.lower()
    for mark, placeholder in punctuation_tokens:
        normalized = normalized.replace(mark, placeholder)
    return normalized.split()

In [23]:
# download and load text8
# The timeout keeps a stalled connection from hanging the kernel indefinitely.
r = requests.get(
    'https://huggingface.co/datasets/ardMLX/text8/resolve/main/text8',
    timeout=60,
)
# Fail loudly on a 4xx/5xx response so an error page is never saved and
# tokenized as if it were the corpus.
r.raise_for_status()
with open('text8', 'wb') as f:
    f.write(r.content)
# Explicit encoding so decoding does not depend on the platform default.
with open('text8', encoding='utf-8') as f:
    text8 = f.read()

# Tokenize once and persist the token list for reuse.
# NOTE: the dataprep/combined_vocab/ directory must already exist.
text8_tokens = preprocess(text8)
with open('dataprep/combined_vocab/text8_corpus.pkl', 'wb') as f:
    pickle.dump(text8_tokens, f)

In [24]:
# Load the combined MS MARCO dataset from parquet into a DataFrame.
# Nothing is preprocessed in this cell; tokenization happens in the next cell,
# which reads the "query" column and the "passage_text" entry of "passages".
df = pd.read_parquet("dataprep/ms_marco_combined/combined.parquet")

In [25]:

# extract and preprocess queries and passages
# Both lists are flat token streams: tokens from every row are appended in
# row order, with no per-row boundaries kept.
query_tokens = []
passage_tokens = []

# Iterate the two needed columns directly instead of df.iterrows(), which
# builds a throwaway Series for every row.
for query, passages in tqdm.tqdm(
    zip(df["query"], df["passages"]),
    total=len(df),
    desc="Processing MS MARCO",
):
    query_tokens.extend(preprocess(str(query)))
    for passage in passages["passage_text"]:
        passage_tokens.extend(preprocess(str(passage)))

# Persist both token streams so later steps can reload them without re-tokenizing.
with open("dataprep/combined_vocab/ms_marco_queries.pkl", "wb") as f:
    pickle.dump(query_tokens, f)
with open("dataprep/combined_vocab/ms_marco_passages.pkl", "wb") as f:
    pickle.dump(passage_tokens, f)

Processing MS MARCO: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102023/102023 [00:39<00:00, 2580.07it/s]


In [26]:
# build combined corpus and single vocabulary
# Concatenation order: text8 first, then MS MARCO queries, then passages.
combined_corpus = text8_tokens + query_tokens + passage_tokens
with open("dataprep/combined_vocab/combined_corpus.pkl", "wb") as f:
    pickle.dump(combined_corpus, f)

print(f"Combined corpus created with {len(combined_corpus):,} tokens")

# count global frequencies and filter
# A word is kept only if it occurs strictly more than RARE_WORD_MAX_COUNT times
# across the whole combined corpus; rarer words are dropped from filtered_corpus.
RARE_WORD_MAX_COUNT = 5
word_counts = Counter(combined_corpus)
filtered_corpus = [
    word for word in combined_corpus if word_counts[word] > RARE_WORD_MAX_COUNT
]

Combined corpus created with 86,128,145 tokens


In [27]:
# sort vocab and create lookup tables
# Order by descending frequency, breaking ties alphabetically. Sorting a set by
# count alone leaves equal-count words in set-iteration order, which changes
# between interpreter runs because string hashing is randomized, so the saved
# ids would not be reproducible.
sorted_vocab = sorted(set(filtered_corpus), key=lambda w: (-word_counts[w], w))
# ids 1..N follow frequency rank; id 0 is reserved for the padding token.
int_to_vocab = {idx: word for idx, word in enumerate(sorted_vocab, start=1)}
int_to_vocab[0] = "<PAD>"
# Inverse mapping: token -> id.
vocab_to_int = {word: idx for idx, word in int_to_vocab.items()}

In [28]:
# add <UNK> token
# On the first run <UNK> gets the next free id (ids so far are 0..N, so
# len(vocab_to_int) is unused). setdefault keeps that id if the cell is run
# again; recomputing len() each time would give <UNK> a second id and leave a
# stale duplicate entry in int_to_vocab.
unk_id = vocab_to_int.setdefault("<UNK>", len(vocab_to_int))
int_to_vocab[unk_id] = "<UNK>"

In [29]:
# save vocab
# Each lookup table goes to its own pickle file: token -> id first, then id -> token.
with open("dataprep/combined_vocab/tkn_vocab_to_int.pkl", "wb") as f:
    pickle.dump(vocab_to_int, f)

with open("dataprep/combined_vocab/tkn_int_to_vocab.pkl", "wb") as f:
    pickle.dump(int_to_vocab, f)

# Report the final vocabulary size, special tokens included.
print(
    f"Vocabulary created with {len(vocab_to_int):,} unique tokens (including <PAD> and <UNK>)"
)

Vocabulary created with 150,164 unique tokens (including <PAD> and <UNK>)


In [30]:
# Collect every corpus token, duplicates included, that has no vocabulary
# entry, then show the first ten and the total count.
unk_words = list(filter(lambda token: token not in vocab_to_int, combined_corpus))
print("Unknown examples:", unk_words[:10])
print(f"Total unknowns: {len(unk_words):,}")

Unknown examples: ['lahontan', 'septentrionale', 'anarchiste', 'propri', 'mutuellisme', 'amoralism', 'individualistically', 'experimenal', 'yarros', 'signficiant']
Total unknowns: 957,481
