
### In this section, we are building a tokenizer. To do that, we want to:

    - create a list of words by splitting text by whitespace
    - make all words lower case
    - filter out rare words that occurred fewer than N times in the corpus
    - set the vocab length to the total number of unique words


In [3]:
from collections import Counter
import re

# Path to the raw corpus file (relative to the current working directory).
text_file = "../data/wiki_text_data.txt"

# Read the whole corpus into memory as a single string. The encoding is given
# explicitly so decoding does not depend on the platform's locale default.
with open(text_file, "r", encoding="utf-8") as file:
    text = file.read()

def tokenizer(text, top_k=30000):
    """Build a top-k word vocabulary and a filtered corpus from raw text.

    The text is stripped of punctuation, lower-cased and split on runs of
    whitespace. The ``top_k`` most frequent words form the vocabulary, and
    every other word is dropped from the returned corpus. Summary statistics
    are printed along the way.

    Args:
        text: Raw corpus as a single string.
        top_k: Maximum number of distinct words to keep, most frequent first.
            Defaults to 30000. ``None`` keeps every word.

    Returns:
        A tuple ``(word_to_id, id_to_word, corpus)`` where
        ``word_to_id`` maps each kept word to an integer id (0 is the most
        frequent word), ``id_to_word`` is the inverse mapping, and ``corpus``
        is the list of kept words in their original order.
    """
    # Drop every character that is neither a regex word character (letters,
    # digits, underscore) nor whitespace, i.e. punctuation and symbols.
    without_punctuation = re.sub(r'[^\w\s]', '', text)

    # split() with no argument splits on any run of whitespace (spaces,
    # newlines, tabs) and never yields empty strings, unlike split(' ').
    words = without_punctuation.lower().split()

    print(f"Number of words before filtering: {len(words)}")

    # Rank words by frequency and keep the top_k most common ones.
    word_counts = Counter(words)
    top_words = dict(word_counts.most_common(top_k))
    word_to_id = {word: i for i, word in enumerate(top_words)}
    id_to_word = {i: word for word, i in word_to_id.items()}

    # Number of corpus tokens covered by the kept vocabulary.
    total_count = sum(top_words.values())
    print(f"Total count of top {top_k} words: {total_count}")

    # Share of all tokens that the vocabulary covers; guard the empty corpus
    # so an empty or whitespace-only text does not divide by zero.
    total_words = len(words)
    percentage = (total_count / total_words) * 100 if total_words else 0.0
    print(f"This represents {percentage:.2f}% of all words in the corpus")

    # Keep only in-vocabulary words (dict membership is a hash lookup).
    corpus = [word for word in words if word in top_words]
    print("corpus length:", len(corpus))

    return word_to_id, id_to_word, corpus


In [4]:
# Build the vocabulary lookups and the filtered corpus from the raw text.
# tokenizer() also prints its summary statistics as a side effect.
word_to_id, id_to_word, corpus = tokenizer(text)

# Uncomment to inspect the first 100 tokens of the filtered corpus.
#print(corpus[:100])



Number of words before filtering: 17005208
Total count of top 30000 words: 16315126
This represents 95.94% of all words in the corpus
corpus length: 16315126


In [5]:
# Generate the training data from the corpus
# The training data looks like a list of tuples, 
# where each tuple contains a list of context words and the target word (not the IDs)

def generate_training_data(corpus, window=2):
    """Build (context_words, target_word) pairs with a sliding window.

    Each target word is paired with the ``window`` words before it and the
    ``window`` words after it. Positions too close to either end of the
    corpus to have a full context are skipped, so every context holds exactly
    ``2 * window`` words.

    Args:
        corpus: List of word tokens (the words themselves, not their ids).
        window: Number of context words taken on each side of the target.
            Defaults to 2, i.e. a 5-word sliding window.

    Returns:
        A list of ``(context_words, target_word)`` tuples in corpus order.
        The list is empty when the corpus has fewer than ``2 * window + 1``
        words.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be at least 1")

    data = []

    # Start `window` positions in and stop `window` positions before the end
    # so there are always `window` words on both sides of the target.
    for i in range(window, len(corpus) - window):
        # Words before the target followed by words after the target.
        context_words = corpus[i - window:i] + corpus[i + 1:i + window + 1]
        target_word = corpus[i]
        data.append((context_words, target_word))

    return data

In [6]:
# Build the (context_words, target_word) pairs from the filtered corpus.
training_data = generate_training_data(corpus)
print("CBOW training data generated")



CBOW training data generated


In [5]:
# save the training data, word to id mappings, id to word mappings, and corpus to pytorch tensors
import torch

# Persist every artifact with torch.save. Each entry pairs an object with its
# destination file: the training pairs, the word -> id mapping, the
# id -> word mapping, and the filtered corpus, saved in that order.
artifacts_to_save = (
    (training_data, "../data/eve_training_data.pt"),
    (word_to_id, "../data/eve_word_to_id.pt"),
    (id_to_word, "../data/eve_id_to_word.pt"),
    (corpus, "../data/eve_corpus.pt"),
)
for artifact, destination in artifacts_to_save:
    torch.save(artifact, destination)



In [11]:
# Sanity check: number of (context, target) pairs. With two context words
# required on each side, this should equal len(corpus) - 4.
len(training_data)

16315122