In [27]:
def train_batch_cbow(sentences, model=None, alpha=None, work=None, neu1=None, compute_loss=False,
                     window=5, reduced_window=2):
    """Print the CBOW context window of every word in a sequence of sentences.

    Demonstration variant of Gensim's pure-Python ``train_batch_cbow``. Compared with
    the real implementation, this version deliberately leaves out:

    * vocabulary lookup and sub-sampling -- every token of every sentence is used as-is;
    * the random window reduction -- a fixed ``reduced_window`` is used instead, so the
      output is deterministic;
    * the actual training step -- the context words are only printed.

    Parameters
    ----------
    sentences : iterable of iterable
        The corpus. Each sentence is an iterable of tokens (typically ``str``, but any
        printable object works). May be a one-shot iterator; it is consumed once.
    model : object, optional
        Unused. Kept so the signature matches the Gensim function.
    alpha : float, optional
        Unused.
    work : object, optional
        Unused.
    neu1 : object, optional
        Unused.
    compute_loss : bool, optional
        Unused.
    window : int, optional
        Maximum distance between the centre word and a context word before reduction.
    reduced_window : int, optional
        Amount by which the window is shrunk on both sides (``b`` in the original
        word2vec code). The effective context spans ``window - reduced_window``
        positions to the left and to the right of the centre word.

    Returns
    -------
    int
        Total number of tokens processed, i.e. the sum of the sentence lengths.

    Raises
    ------
    ValueError
        If ``reduced_window`` is not in the range ``0 <= reduced_window <= window``.

    """
    if not 0 <= reduced_window <= window:
        raise ValueError(
            f'reduced_window must satisfy 0 <= reduced_window <= window, '
            f'got reduced_window={reduced_window!r}, window={window!r}'
        )

    result = 0
    for sentence in sentences:
        # Assume that all words in the sentence are also in the vocabulary.
        word_vocabs = list(sentence)
        for pos, word in enumerate(word_vocabs):
            # Clamp the left edge at the sentence start; the right edge may overshoot
            # because slicing truncates at the end of the list by itself.
            start = max(0, pos - window + reduced_window)
            stop = pos + window + 1 - reduced_window
            # Enumerating from `start` keeps pos2 an absolute sentence position, so the
            # centre word can be excluded by position (repeated words stay in context).
            context = [
                word2
                for pos2, word2 in enumerate(word_vocabs[start:stop], start)
                if word2 is not None and pos2 != pos
            ]
            print(f'{word} --> {context}')
        result += len(word_vocabs)
    return result

In [28]:
# Sanity check on one ten-token sentence: prints each centre word with its context
# words; the returned token count is shown as the cell's result.
train_batch_cbow([['to', 'be', 'or', 'not', 'to', 'be', 'that', 'is', 'the', 'question']])

to --> ['be', 'or', 'not']
be --> ['to', 'or', 'not', 'to']
or --> ['to', 'be', 'not', 'to', 'be']
not --> ['to', 'be', 'or', 'to', 'be', 'that']
to --> ['be', 'or', 'not', 'be', 'that', 'is']
be --> ['or', 'not', 'to', 'that', 'is', 'the']
that --> ['not', 'to', 'be', 'is', 'the', 'question']
is --> ['to', 'be', 'that', 'the', 'question']
the --> ['be', 'that', 'is', 'question']
question --> ['that', 'is', 'the']


10

In [46]:
import itertools

# Every 5-element combination of the tokens '1'..'10'; each combination (a tuple)
# is treated as one sentence.
# NOTE: `itertools.combinations` returns a one-shot iterator, so `sentences` is
# exhausted once train_batch_cbow has looped over it.
sentences = itertools.combinations([str(x) for x in range(1,11)], 5)

train_batch_cbow(sentences)

1 --> ['2', '3', '4']
2 --> ['1', '3', '4', '5']
3 --> ['1', '2', '4', '5']
4 --> ['1', '2', '3', '5']
5 --> ['2', '3', '4']
1 --> ['2', '3', '4']
2 --> ['1', '3', '4', '6']
3 --> ['1', '2', '4', '6']
4 --> ['1', '2', '3', '6']
6 --> ['2', '3', '4']
1 --> ['2', '3', '4']
2 --> ['1', '3', '4', '7']
3 --> ['1', '2', '4', '7']
4 --> ['1', '2', '3', '7']
7 --> ['2', '3', '4']
1 --> ['2', '3', '4']
2 --> ['1', '3', '4', '8']
3 --> ['1', '2', '4', '8']
4 --> ['1', '2', '3', '8']
8 --> ['2', '3', '4']
1 --> ['2', '3', '4']
2 --> ['1', '3', '4', '9']
3 --> ['1', '2', '4', '9']
4 --> ['1', '2', '3', '9']
9 --> ['2', '3', '4']
1 --> ['2', '3', '4']
2 --> ['1', '3', '4', '10']
3 --> ['1', '2', '4', '10']
4 --> ['1', '2', '3', '10']
10 --> ['2', '3', '4']
1 --> ['2', '3', '5']
2 --> ['1', '3', '5', '6']
3 --> ['1', '2', '5', '6']
5 --> ['1', '2', '3', '6']
6 --> ['2', '3', '5']
1 --> ['2', '3', '5']
2 --> ['1', '3', '5', '7']
3 --> ['1', '2', '5', '7']
5 --> ['1', '2', '3', '7']
7 --> ['2', '3', '

1260

In [47]:
# A single ten-token sentence ('1'..'10'). It is wrapped in a list for the call
# because train_batch_cbow iterates over a collection of sentences.
sentences2 = [str(x) for x in range(1,11)]

train_batch_cbow([sentences2])
# Last expression of the cell: display the token list next to the printed windows.
sentences2

1 --> ['2', '3', '4']
2 --> ['1', '3', '4', '5']
3 --> ['1', '2', '4', '5', '6']
4 --> ['1', '2', '3', '5', '6', '7']
5 --> ['2', '3', '4', '6', '7', '8']
6 --> ['3', '4', '5', '7', '8', '9']
7 --> ['4', '5', '6', '8', '9', '10']
8 --> ['5', '6', '7', '9', '10']
9 --> ['6', '7', '8', '10']
10 --> ['7', '8', '9']


['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']