In [7]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load only the "train" split of the "wikitext-103-v1" configuration of the
# "wikitext" dataset through the `datasets` library.
dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating test split: 100%|██████████| 4358/4358 [00:00<00:00, 123598.78 examples/s]
Generating train split: 100%|██████████| 1801350/1801350 [00:02<00:00, 620522.10 examples/s]
Generating validation split: 100%|██████████| 3760/3760 [00:00<00:00, 458433.85 examples/s]


In [None]:
# Inspect the concrete type of the object returned by load_dataset.
type(dataset)

datasets.arrow_dataset.Dataset

In [8]:


# -- 1. Tokenizer & dataset loader --------------------------------

def get_english_tokenizer() -> Callable[[str], List[str]]:
    """Return the ``basic_english`` tokenizer requested for language ``en``."""
    tokenizer = get_tokenizer("basic_english", language="en")
    return tokenizer


def get_data_iterator(
    ds_name: str,
    split: str,
    root: str
):
    """Load a WikiText split and wrap it as a map-style dataset.

    Args:
        ds_name: Either ``"WikiText2"`` or ``"WikiText103"``.
        split: Split identifier forwarded to the dataset constructor.
        root: Data directory forwarded to the dataset constructor.

    Returns:
        The result of ``to_map_style_dataset`` applied to the loaded split.

    Raises:
        ValueError: If ``ds_name`` is not one of the two supported names.
    """
    # Reject unknown names up front, before touching any dataset class.
    if ds_name not in ("WikiText2", "WikiText103"):
        raise ValueError("Choose dataset from: WikiText2, WikiText103")

    dataset_cls = WikiText2 if ds_name == "WikiText2" else WikiText103
    raw_split = dataset_cls(root=root, split=split)

    from torchtext.data import to_map_style_dataset
    return to_map_style_dataset(raw_split)

# -- 2. Vocabulary builder -----------------------------------------

def build_vocab_from_corpus(
    data_iter,
    tokenizer: Callable[[str], List[str]],
    min_freq: int = MIN_WORD_FREQUENCY,
    specials: Optional[List[str]] = None
):
    """Build a vocabulary from an iterable of raw text lines.

    Args:
        data_iter: Iterable of strings; each item is passed through
            ``tokenizer`` before being counted.
        tokenizer: Callable mapping one string to its list of tokens.
        min_freq: Minimum frequency forwarded to ``build_vocab_from_iterator``.
        specials: Special tokens forwarded to ``build_vocab_from_iterator``.
            Defaults to ``['<unk>']``. The list must lead to ``'<unk>'`` being
            present in the vocabulary, because its index is looked up below.

    Returns:
        The vocabulary object, with the index of ``'<unk>'`` registered as its
        default index.
    """
    # A None sentinel replaces the former mutable list default, and the copy
    # keeps the caller's list from being shared with the vocabulary builder.
    special_tokens = ['<unk>'] if specials is None else list(specials)

    vocab = build_vocab_from_iterator(
        map(tokenizer, data_iter),
        specials=special_tokens,
        min_freq=min_freq
    )
    vocab.set_default_index(vocab['<unk>'])
    return vocab

# -- 3. Pair generation helper ------------------------------------

def generate_pairs(
    token_ids: List[int],
    window: int,
    model: str
) -> List[Tuple[List[int], int]]:
    """Turn one token-id sequence into word2vec training pairs.

    For every position, the context is made of up to ``window`` ids on each
    side of the center id (fewer near the sequence edges).

    Args:
        token_ids: Sequence of token ids for a single text.
        window: Number of neighbours taken on each side of the center.
        model: ``'cbow'`` emits one ``(context_ids, center_id)`` pair per
            position with a non-empty context; any other value emits one
            ``(center_id, context_id)`` pair per neighbour.

    Returns:
        The list of pairs, in sequence order.
    """
    total = len(token_ids)
    use_cbow = model == 'cbow'
    result = []

    for position in range(total):
        center_id = token_ids[position]
        left = token_ids[max(0, position - window):position]
        right = token_ids[position + 1:min(total, position + window + 1)]
        neighbours = left + right

        if use_cbow:
            # Positions without any neighbour produce no CBOW example.
            if neighbours:
                result.append((neighbours, center_id))
            continue

        result.extend((center_id, neighbour) for neighbour in neighbours)

    return result

# -- 4. Unified collate with PyTorch utilities --------------------

def collate_word2vec(
    batch: List[str],
    text_pipeline: Callable[[str], List[int]],
    window: int,
    model: str
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Collate a batch of raw texts into word2vec (input, target) tensors.

    Args:
        batch: Raw text strings.
        text_pipeline: Callable mapping one text to its list of token ids. It
            must also expose a ``vocab`` attribute supporting
            ``vocab['<unk>']``, which supplies the CBOW padding value.
        window: Context size on each side of the center token.
        model: ``'cbow'`` or anything else (treated as skip-gram), forwarded to
            ``generate_pairs``.

    Returns:
        For ``'cbow'``: ``(contexts, centers)`` where ``contexts`` is a 2-D long
        tensor of padded context ids and ``centers`` is a 1-D long tensor.
        Otherwise: ``(centers, contexts)`` as two 1-D long tensors.
        If no text in the batch is long enough, empty tensors are returned
        (``contexts`` has shape ``(0, 0)`` in the CBOW case).
    """
    min_length = window * 2 + 1
    inputs, targets = [], []

    for text in batch:
        ids = text_pipeline(text)
        # Skip texts too short to hold one full window around a center token.
        if len(ids) < min_length:
            continue
        if MAX_SEQUENCE_LENGTH:
            ids = ids[:MAX_SEQUENCE_LENGTH]
        for inp, tgt in generate_pairs(ids, window, model):
            inputs.append(inp)
            targets.append(tgt)

    target_tensor = torch.tensor(targets, dtype=torch.long)

    if model == 'cbow':
        if not inputs:
            # pad_sequence rejects an empty list; return an empty batch instead,
            # matching what the skip-gram branch yields for the same input.
            return torch.empty((0, 0), dtype=torch.long), target_tensor
        # Contexts near sequence edges are shorter, so pad them to a rectangle.
        # NOTE(review): padding reuses the '<unk>' id, so padded slots cannot be
        # told apart from real unknown tokens downstream — confirm this is intended.
        padded = pad_sequence(
            [torch.tensor(context, dtype=torch.long) for context in inputs],
            batch_first=True,
            padding_value=text_pipeline.vocab['<unk>']
        )
        return padded, target_tensor

    return torch.tensor(inputs, dtype=torch.long), target_tensor

# -- 5. High-level dataloader + vocab getter ----------------------

class _TextPipeline:
    """Callable that chains tokenizer -> vocab lookup -> truncation for one text."""

    def __init__(self, tokenizer, vocab, max_len):
        self.tokenizer = tokenizer
        # Kept as a plain attribute so the collate function can read
        # ``text_pipeline.vocab['<unk>']`` for padding.
        self.vocab = vocab
        self._to_ids = VocabTransform(vocab)
        self._truncate = TruncateTransform(max_len)

    def __call__(self, text):
        return self._truncate(self._to_ids(self.tokenizer(text)))


def get_dataloader_and_vocab(
    model_name: str,
    ds_name: str,
    ds_type: str,
    data_dir: str,
    batch_size: int,
    shuffle: bool,
    vocab: Optional[object] = None
):
    """Create a word2vec ``DataLoader`` and the vocabulary it uses.

    Args:
        model_name: ``'cbow'`` selects ``CBOW_N_WORDS`` as the window size; any
            other value selects ``SKIPGRAM_N_WORDS``. Also forwarded to the
            collate function.
        ds_name: Dataset name passed to ``get_data_iterator``.
        ds_type: Split identifier passed to ``get_data_iterator``.
        data_dir: Data root passed to ``get_data_iterator``.
        batch_size: Number of raw texts per batch.
        shuffle: Whether the loader shuffles the dataset.
        vocab: Existing vocabulary to reuse; when ``None`` a new one is built
            from the loaded split.

    Returns:
        ``(loader, vocab)``.
    """
    data_iter = get_data_iterator(ds_name, ds_type, data_dir)
    tokenizer = get_english_tokenizer()

    if vocab is None:
        vocab = build_vocab_from_corpus(data_iter, tokenizer)

    # Compose the transforms with a plain callable rather than a module
    # container: an nn.Sequential-style container only accepts Module children
    # (the tokenizer is obtained as a bare callable), and setting a Module as an
    # attribute on such a container would register it as one more stage.
    # NOTE(review): this assumes the previous ``Sequential`` was torchtext's
    # nn.Sequential subclass — confirm against the notebook's imports.
    text_pipeline = _TextPipeline(tokenizer, vocab, MAX_SEQUENCE_LENGTH)

    window = CBOW_N_WORDS if model_name == 'cbow' else SKIPGRAM_N_WORDS
    collate_fn = partial(
        collate_word2vec,
        text_pipeline=text_pipeline,
        window=window,
        model=model_name
    )

    loader = DataLoader(
        data_iter,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=collate_fn
    )
    return loader, vocab


AttributeError: 'str' object has no attribute 'param_groups'

In [4]:
import torch
import torch.nn as nn

In [11]:
# Toy corpus: five already-tokenized sentences (lists of word strings) used by
# the from-scratch word2vec cells below.
corpus = [
    ["the", "sky", "is", "blue"],
    ["the", "sun", "is", "bright"],
    ["we", "love", "natural", "language", "processing"],
    ["word2vec", "learns", "word", "embeddings"],
    ["pytorch", "is", "great", "for", "deep", "learning"]
]


In [40]:
# Give every distinct word an integer id in order of first appearance.
# dict.fromkeys keeps insertion order and drops repeated words.
unique_words = list(
    dict.fromkeys(word for sentence in corpus for word in sentence)
)

word2idx = {word: position for position, word in enumerate(unique_words)}
idx2word = dict(enumerate(unique_words))
idx = len(unique_words)  # next unused id, i.e. the vocabulary size

In [41]:
def get_training_pairs(corpus, window_size=2, model_type="cbow", word_to_idx=None):
    """Build word2vec training pairs of word ids from tokenized sentences.

    Args:
        corpus: Iterable of sentences, each a list of word strings.
        window_size: Number of neighbours taken on each side of the center word.
        model_type: ``"cbow"`` yields ``(context_ids, center_id)`` per position
            with a non-empty context; any other value yields one
            ``(center_id, context_id)`` pair per neighbour.
        word_to_idx: Mapping from word to integer id. When ``None``, the
            module-level ``word2idx`` is used, so existing calls keep working.

    Returns:
        List of training pairs in corpus order.

    Raises:
        KeyError: If a word in ``corpus`` is missing from the mapping.
    """
    # Prefer an explicit mapping; fall back to the notebook-global one only
    # when the caller did not supply any.
    mapping = word2idx if word_to_idx is None else word_to_idx
    training_pairs = []

    for sentence in corpus:
        # Convert the sentence once so each word is looked up a single time.
        sentence_ids = [mapping[word] for word in sentence]
        length = len(sentence_ids)

        for position, center_id in enumerate(sentence_ids):
            # Left and right neighbours, clipped at the sentence boundaries.
            start = max(0, position - window_size)
            end = min(length, position + window_size + 1)
            context_ids = sentence_ids[start:position] + sentence_ids[position + 1:end]

            if model_type == "cbow":
                if context_ids:
                    training_pairs.append((context_ids, center_id))
            else:
                training_pairs.extend(
                    (center_id, context_id) for context_id in context_ids
                )

    return training_pairs


In [42]:
# Build CBOW pairs (context ids, center id) for the toy corpus with a window of
# two words per side, then preview the first five.
training_pairs = get_training_pairs(corpus, window_size=2, model_type="cbow")
training_pairs[:5]

[([1, 2], 0), ([0, 2, 3], 1), ([0, 1, 3], 2), ([1, 2], 3), ([4, 2], 0)]

In [45]:
class Word2vecModel(nn.Module):
    """Embedding layer followed by a linear layer, with CBOW and skip-gram passes.

    Both passes share the same ``embedding`` and ``linear`` layers; they differ
    only in whether the embedded input is averaged before the linear layer.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the layers.

        Args:
            vocab_size: Number of embedding rows and size of the linear output.
            embedding_dim: Width of each embedding vector.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward_cbow(self, X):  # X: (batch_size, context_size)
        """Embed the context ids, average over dim 1, and apply the linear layer.

        Every position along dim 1 contributes equally to the mean; no masking
        is applied here.
        """
        embedded = self.embedding(X)        # (batch_size, context_size, emb_dim)
        averaged = torch.mean(embedded, dim=1)  # (batch_size, emb_dim)
        return self.linear(averaged)        # (batch_size, vocab_size)

    def forward_skipgram(self, X):  # X: (batch_size,)
        """Embed the center ids and apply the linear layer directly."""
        embedded = self.embedding(X)        # (batch_size, emb_dim)
        return self.linear(embedded)        # (batch_size, vocab_size)