In [1]:
# generate_context_word_pairs + pretty print example
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer



In [2]:
# Toy two-sentence corpus of raw (whitespace-separated) strings;
# swap in your own list of sentence strings.
sentences = [
    "deep learning also known as structured learning",
    (
        "structured learning is part of a broader family "
        "of machine learning methods based"
    ),
]

In [7]:

# 1) Fit a tokenizer on the corpus and derive the word <-> id lookup tables.
tokenizer = Tokenizer(oov_token=None)   # demo corpus is closed, so no OOV token
tokenizer.fit_on_texts(sentences)

word2idx = tokenizer.word_index
# Inverse lookup (id -> word), used later to turn ids back into readable text.
id2word = dict(zip(word2idx.values(), word2idx.keys()))
# Word ids start at 1; the extra slot accounts for id 0, which is used as padding.
vocab_size = 1 + len(word2idx)

print(word2idx)

{'learning': 1, 'structured': 2, 'of': 3, 'deep': 4, 'also': 5, 'known': 6, 'as': 7, 'is': 8, 'part': 9, 'a': 10, 'broader': 11, 'family': 12, 'machine': 13, 'methods': 14, 'based': 15}


In [8]:
# Convert each sentence into its list of word ids (the corpus of word-id lists).
# Encoding goes through the fitted tokenizer so that the same normalisation that
# built the vocabulary (by default: lower-casing and punctuation filtering) is
# applied here too. A plain `s.split()` + `word2idx[w]` lookup raises KeyError as
# soon as a sentence contains a capital letter or a punctuation mark.
wids = tokenizer.texts_to_sequences(sentences)

print(wids)

[[4, 1, 5, 6, 7, 2, 1], [2, 1, 8, 9, 3, 10, 11, 12, 3, 13, 1, 14, 15]]


In [5]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
    """Yield one (context, target) training pair per word position in the corpus.

    Args:
        corpus: iterable of sentences, each a sequence of integer word ids.
        window_size: number of neighbours taken on each side of the centre word.
        vocab_size: number of classes used for the one-hot target.

    Yields:
        (x, y) where
        x -> padded context ids, shape (1, 2 * window_size); positions near a
             sentence edge have fewer neighbours and are pre-padded
        y -> one-hot encoding of the centre word, shape (1, vocab_size)
    """
    context_length = 2 * window_size
    for word_ids in corpus:
        n_words = len(word_ids)
        for position, target_id in enumerate(word_ids):
            # Clamp the window to the sentence so only valid indices are visited.
            first = max(0, position - window_size)
            stop = min(n_words, position + window_size + 1)

            neighbours = []
            for j in range(first, stop):
                if j == position:
                    continue  # the centre word is the target, not context
                neighbours.append(word_ids[j])

            # Wrap in a list so each yield is a single-sample batch.
            x = pad_sequences([neighbours], maxlen=context_length, padding='pre')
            y = to_categorical([target_id], num_classes=vocab_size)
            yield (x, y)


In [6]:
# Show the first `max_print` (context -> target) pairs as readable words.
# Samples near a sentence edge carry zero padding; it is stripped before display,
# so those rows simply list fewer context words.
window_size = 2
max_print = 20
gen = generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size)

printed = 0
for printed, (x, y) in enumerate(gen, start=1):
    # x is a single-sample batch: take row 0 and drop the padding zeros
    context_ids = [int(word_id) for word_id in x[0] if word_id != 0]
    context_words = [id2word[word_id] for word_id in context_ids]

    # the target is the index of the hot entry in the one-hot row
    target_id = int(np.argmax(y[0]))
    target_word = id2word[target_id]

    print("Context (X):", context_words, "-> Target (Y):", target_word)

    if printed >= max_print:
        break


Context (X): ['learning', 'also'] -> Target (Y): deep
Context (X): ['deep', 'also', 'known'] -> Target (Y): learning
Context (X): ['deep', 'learning', 'known', 'as'] -> Target (Y): also
Context (X): ['learning', 'also', 'as', 'structured'] -> Target (Y): known
Context (X): ['also', 'known', 'structured', 'learning'] -> Target (Y): as
Context (X): ['known', 'as', 'learning'] -> Target (Y): structured
Context (X): ['as', 'structured'] -> Target (Y): learning
Context (X): ['learning', 'is'] -> Target (Y): structured
Context (X): ['structured', 'is', 'part'] -> Target (Y): learning
Context (X): ['structured', 'learning', 'part', 'of'] -> Target (Y): is
Context (X): ['learning', 'is', 'of', 'a'] -> Target (Y): part
Context (X): ['is', 'part', 'a', 'broader'] -> Target (Y): of
Context (X): ['part', 'of', 'broader', 'family'] -> Target (Y): a
Context (X): ['of', 'a', 'family', 'of'] -> Target (Y): broader
Context (X): ['a', 'broader', 'of', 'machine'] -> Target (Y): family
Context (X): ['broa