In [16]:
from multiprocessing import cpu_count
from gensim.models import Word2Vec, callbacks

class _EpochLossLogger(callbacks.CallbackAny2Vec):
    """Print the loss accumulated during each training epoch.

    The value returned by ``model.get_latest_training_loss()`` is treated as a
    running total for the current ``train()`` call, so the figure printed for
    an epoch is the difference from the reading taken at the end of the
    previous epoch.
    """

    def __init__(self):
        self.epoch = 0
        # Reading at the end of the previous epoch. Starting from zero makes
        # the first epoch's delta equal to the raw reading and guarantees the
        # attribute exists before on_epoch_end first runs.
        self.loss_previous_step = 0.0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        print('Loss after epoch {}: {}'.format(self.epoch, loss - self.loss_previous_step))
        self.epoch += 1
        self.loss_previous_step = loss


def train_embeddings(df, sg=0, negative=5, window=8, min_count=5, size=300, epochs=10,
                     text_column="body_clean", workers=None, seed=1):
    """Train a Word2Vec model on one text column of ``df``.

    Parameters
    ----------
    df :
        Object indexable by column name whose columns expose ``to_list()``
        (a pandas DataFrame fits). Each entry of the selected column is
        handed to gensim as one sentence -- presumably a list of tokens;
        confirm against the corpus cleaning step.
    sg, negative, window, min_count :
        Forwarded unchanged to ``Word2Vec``.
    size :
        Embedding dimensionality, forwarded as ``vector_size``.
    epochs :
        Number of passes over the corpus.
    text_column :
        Name of the column holding the sentences.
    workers :
        Number of training threads; ``None`` means ``cpu_count()``.
    seed :
        Forwarded to ``Word2Vec`` as ``seed``.
        NOTE(review): multi-threaded training is generally not bit-for-bit
        repeatable even with a fixed seed -- confirm against the gensim docs.

    Returns
    -------
    The trained ``Word2Vec`` model. The per-epoch loss is printed to stdout
    while training runs.
    """
    sentences = df[text_column].to_list()
    if workers is None:
        workers = cpu_count()

    model = Word2Vec(
        sg=sg,
        negative=negative,
        window=window,
        min_count=min_count,
        vector_size=size,
        workers=workers,
        seed=seed,
    )
    model.build_vocab(corpus_iterable=sentences)
    model.train(
        corpus_iterable=sentences,
        compute_loss=True,  # required for get_latest_training_loss() in the logger
        callbacks=[_EpochLossLogger()],
        epochs=epochs,
        total_examples=model.corpus_count,
    )
    return model

In [18]:
from loader import load_dataset

def evaluate(ds, prefix="../data/embeddings/embeddings/"):
    """Load a saved model and print the 5 words nearest to "man" minus "woman".

    Parameters
    ----------
    ds :
        Dataset name; the model is read from ``prefix + ds + ".bin"``.
    prefix :
        Path prefix (including the trailing separator) of the saved models.
    """
    model = Word2Vec.load(prefix + ds + ".bin")
    # Pass the query words as explicit one-element lists. A bare string is
    # itself iterable, and (NOTE(review): confirm against the installed gensim
    # version) some releases only wrap a bare string when `negative` is empty,
    # which would turn the query into one over individual characters.
    print(model.wv.most_similar(positive=["man"], negative=["woman"], topn=5))

# Path prefixes used by the loop below. Naming them once keeps the location a
# model is saved to and the location evaluate() reloads it from in sync.
CORPUS_PREFIX = "../data/cleaned_corpora/clean/"
EMBEDDINGS_PREFIX = "../data/embeddings/embeddings/"

dataset_names = ["fourthwavewomen", "incels", "braincels", "trufemcels", "mensrights", "incels_full", "feminism_full"]

# For each corpus: load it, train embeddings, save the model, then print the
# man-minus-woman neighbours of the saved model.
for name in dataset_names:
    print(name)
    ds = load_dataset(name, prefix=CORPUS_PREFIX)
    model = train_embeddings(ds, epochs=10)
    model.save(EMBEDDINGS_PREFIX + name + ".bin")
    # Pass the prefix explicitly so the file just written is the one evaluated,
    # rather than relying on evaluate()'s default matching the save path.
    evaluate(name, prefix=EMBEDDINGS_PREFIX)

fourthwavewomen
Loss after epoch 0: 578634.5625
Loss after epoch 1: 549650.4375
Loss after epoch 2: 514351.375
Loss after epoch 3: 502573.125
Loss after epoch 4: 495701.75
Loss after epoch 5: 489172.75
Loss after epoch 6: 481805.0
Loss after epoch 7: 491668.75
Loss after epoch 8: 423997.25
Loss after epoch 9: 408947.5
[('he', 0.40082764625549316), ('buddy', 0.372963011264801), ('bf', 0.3628980219364166), ('him', 0.3555917739868164), ('scrote', 0.3543185293674469)]
incels
Loss after epoch 0: 3664846.25
Loss after epoch 1: 3217743.75
Loss after epoch 2: 2898399.0
Loss after epoch 3: 2656797.0
Loss after epoch 4: 2637712.0
Loss after epoch 5: 2474372.0
Loss after epoch 6: 2204720.0
Loss after epoch 7: 2163728.0
Loss after epoch 8: 2133010.0
Loss after epoch 9: 2073482.0
[('bro', 0.39054936170578003), ('dude', 0.3546479344367981), ('incelbro', 0.33318039774894714), ('lad', 0.32877224683761597), ('mofo', 0.3132885992527008)]
braincels
Loss after epoch 0: 7659301.0
Loss after epoch 1: 617397