### Create training data for CBOW:
- For each word in your corpus, create a 5-word sliding window: the center (target) word plus the 2 words on either side of it
- The input will be the context words, and the target will be the center word
- We'll need to convert these words into word IDs from your tokenizer

### Build the CBOW architecture:
- Input layer (context words)
- Hidden layer (embedding dimension)
- Output layer (prediction of center word using softmax)

CREATE TRAINING DATA

In [1]:
import sys
sys.path.append('../scripts')

from tokenizer import tokenizer

# Load the wiki data
filename = "../data/wiki_text_data.txt"

# Read through a context manager so the file handle is closed as soon as the
# text has been loaded, instead of being left open for the kernel's lifetime.
with open(filename, 'r', encoding='utf-8') as wiki_file:
    text = wiki_file.read().lower()

# First tokenize the wiki text
word_to_id, id_to_word, corpus, unique_word_counts = tokenizer(text)
print("Tokenisation complete")
# NOTE(review): in the recorded run this length equals the "unique words after
# filtering" count, and the previewed training pairs contain no repeated words.
# That suggests `corpus` may be a de-duplicated word list rather than the full
# running token sequence -- confirm against the tokenizer implementation.
print(len(corpus))


Number of unique words included in word count before filtering:  253854
Number of unique words included in word count after filtering:  71290
Tokenisation complete
71290


In [2]:
# Generate the training data from the corpus
# The training data looks like a list of tuples, 
# where each tuple contains a list of context words and the target word (not the IDs)

def generate_training_data(corpus, window_size=2):
    """Build CBOW (context, target) pairs by sliding a window over ``corpus``.

    For every position that has ``window_size`` items available on both sides,
    the items before and after it form the context and the item at that
    position is the target. The items are taken from ``corpus`` unchanged
    (no conversion to IDs happens here).

    Args:
        corpus: Ordered list of tokens to slide the window over.
        window_size: Number of context items taken on EACH side of the
            target. The default of 2 yields a 5-item window (4 context
            items plus the target).

    Returns:
        A list of ``(context_words, target_word)`` tuples, where
        ``context_words`` is a list of ``2 * window_size`` items. The list is
        empty when ``corpus`` has fewer than ``2 * window_size + 1`` items.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    data = []

    # Start `window_size` positions in and stop `window_size` positions before
    # the end, so every target has a full context on both sides.
    for i in range(window_size, len(corpus) - window_size):
        # [i-window_size:i] -> items before the target
        # [i+1:i+window_size+1] -> items after the target
        context_words = corpus[i - window_size:i] + corpus[i + 1:i + window_size + 1]

        # The item at index `i` is the target to be predicted from its context
        target_word = corpus[i]

        data.append((context_words, target_word))

    return data

In [3]:
# Build the (context words, target word) pairs for the whole corpus
training_data = generate_training_data(corpus=corpus)
print("CBOW training data generated")



CBOW training data generated


In [4]:
# Spot-check the result: display the first 200 (context, target) pairs
training_data[0:200]

[(['anarchism', 'originated', 'a', 'term'], 'as'),
 (['originated', 'as', 'term', 'of'], 'a'),
 (['as', 'a', 'of', 'abuse'], 'term'),
 (['a', 'term', 'abuse', 'first'], 'of'),
 (['term', 'of', 'first', 'used'], 'abuse'),
 (['of', 'abuse', 'used', 'against'], 'first'),
 (['abuse', 'first', 'against', 'early'], 'used'),
 (['first', 'used', 'early', 'working'], 'against'),
 (['used', 'against', 'working', 'class'], 'early'),
 (['against', 'early', 'class', 'radicals'], 'working'),
 (['early', 'working', 'radicals', 'including'], 'class'),
 (['working', 'class', 'including', 'the'], 'radicals'),
 (['class', 'radicals', 'the', 'diggers'], 'including'),
 (['radicals', 'including', 'diggers', 'english'], 'the'),
 (['including', 'the', 'english', 'revolution'], 'diggers'),
 (['the', 'diggers', 'revolution', 'and'], 'english'),
 (['diggers', 'english', 'and', 'sans'], 'revolution'),
 (['english', 'revolution', 'sans', 'culottes'], 'and'),
 (['revolution', 'and', 'culottes', 'french'], 'sans'),


In [6]:
# Persist the training data, both vocabulary mappings and the corpus to disk.
# Each object is serialized as-is with torch.save into its own .pt file
# (nothing is converted to a tensor here).
import torch

# Destination path -> object to save; insertion order is the write order
artifacts = {
    "../data/eve_training_data.pt": training_data,  # (context, target) pairs
    "../data/eve_word_to_id.pt": word_to_id,        # word -> id mapping
    "../data/eve_id_to_word.pt": id_to_word,        # id -> word mapping
    "../data/eve_corpus.pt": corpus,                # tokenized corpus
}

for artifact_path, artifact in artifacts.items():
    torch.save(artifact, artifact_path)

