# In [1]:
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import json
from tqdm.auto import tqdm


class DocsIterator:
    """Restartable stream of tokenized documents read from a JSON Lines file.

    Every non-blank line of the file is parsed as one JSON object. Its title
    and body fields are joined into a single string, tokenized with
    ``simple_preprocess`` and filtered against ``STOPWORDS``.

    The file is reopened on each ``__iter__`` call, so one instance can be
    iterated any number of times (for example, once per training pass).

    Args:
        filepath: Path to the UTF-8 encoded JSON Lines corpus.
        title_key: Key of the title field in each JSON object.
        body_key: Key of the body field in each JSON object.
    """

    def __init__(self, filepath, title_key="title", body_key="body"):
        self.filepath = filepath
        self.title_key = title_key
        self.body_key = body_key

    def __iter__(self):
        """Yield one list of stopword-free tokens per document.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a document lacks the title or body key.
        """
        with open(self.filepath, 'r', encoding='utf-8') as f:
            for line in f:
                # Blank lines (e.g. a trailing newline at the end of the
                # file) are not documents; json.loads would raise on them.
                if not line.strip():
                    continue
                doc = json.loads(line)
                text = f"{doc[self.title_key]} {doc[self.body_key]}"
                # max_len=50 is passed explicitly as the upper bound on
                # token length accepted by simple_preprocess.
                tokens = simple_preprocess(text, max_len=50)
                # remove stopwords
                yield [t for t in tokens if t not in STOPWORDS]

class EpochProgress(CallbackAny2Vec):
    """Training callback that advances a tqdm progress bar once per epoch.

    Args:
        total_epochs: Number of epochs used as the bar's total.
    """

    def __init__(self, total_epochs):
        self.epoch = 0  # count of epochs completed so far
        self.pbar = tqdm(total=total_epochs, desc="Training Epochs")

    def on_epoch_end(self, model):
        """Record one finished epoch and move the bar forward by one step."""
        self.epoch += 1
        self.pbar.update(1)

    def on_train_end(self, model):
        """Close the bar once training finishes so it is not left open."""
        self.pbar.close()

# Corpus is streamed from disk rather than loaded into memory.
iterator = DocsIterator(filepath='data/docs.jsonl')

# Number of training passes; also used as the progress bar's total.
epochs = 5

# Hyperparameters gathered in one place so they are easy to scan and tweak.
w2v_params = dict(
    vector_size=100,  # dimensionality of embeddings
    window=8,         # context window size
    negative=5,
    workers=1,
    epochs=epochs,
)
progress = EpochProgress(total_epochs=epochs)

# Train Word2Vec on the streamed corpus, reporting progress per epoch.
model = Word2Vec(sentences=iterator, callbacks=[progress], **w2v_params)

# Persist the trained model for later use.
model.save("word2vec_stopwords_removed.model")

# Output: Training Epochs:   0%|          | 0/5 [00:00<?, ?it/s]