In [19]:
from gensim.models import Word2Vec
import multiprocessing
import pickle
import pandas as pd
import os
from time import time
import logging
# Route INFO-level log records (including gensim's progress messages) to the
# notebook output, prefixed with the level name and a wall-clock timestamp
logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt='%H:%M:%S',
)

In [3]:
#defining a restartable iterable that reads the txt file line by line, lowercases each line and
#splits it on whitespace, yielding one list of tokens per tweet. The corpus is thus streamed as a sequence of token lists

In [4]:
class MyCorpus:
    """Restartable stream of tokenized lines read from a UTF-8 text file.

    Every call to ``__iter__`` re-opens ``file``, so the same instance can be
    iterated several times without holding the whole corpus in memory.
    """

    def __init__(self, file):
        """Remember the path of the text file; nothing is opened yet."""
        self.file = file

    def __iter__(self):
        """Yield one list of lowercase, whitespace-separated tokens per line.

        A blank line yields an empty list.
        """
        with open(self.file, encoding="utf-8") as handle:
            yield from (line.lower().split() for line in handle)

In [5]:
#creating the corpus as an iterator

In [None]:
# Lazy corpus: the file is only opened when the object is iterated, and it is
# re-read from the start on every pass
sentences = MyCorpus(file="word2vec_tweets.txt")

In [6]:
#initializing the word2vec model with context_window = 1, vector_size = 300

In [20]:
cores = multiprocessing.cpu_count()
# Leave two cores free for the rest of the system, but never ask for fewer than
# one worker: on a 1- or 2-core machine `cores - 2` would be 0 or negative,
# leaving training without any worker thread.
n_workers = max(1, cores - 2)

w2v_model = Word2Vec(
    min_count = 20,          # ignore words that occur fewer than 20 times in the corpus
    window = 1,              # context is limited to the immediately adjacent words
    vector_size = 300,       # dimensionality of the word vectors
    sample = 6e-05,          # threshold for down-sampling high-frequency words
    alpha = 0.03,            # initial learning rate
    min_alpha = 0.0007,      # learning rate decays towards this value during training
    negative = 20,           # number of noise words drawn for negative sampling
    workers = n_workers,     # training threads
    ns_exponent = 0.75,      # exponent shaping the negative-sampling distribution
    sg = 0)                  # 0 = CBOW, 1 = skip-gram

INFO - 14:44:53: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.03>', 'datetime': '2023-03-31T14:44:53.150452', 'gensim': '4.2.0', 'python': '3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'created'}


In [21]:
#building the vocabulary

In [None]:
# First pass over the corpus: collect the vocabulary, then report how long it took
t = time()
w2v_model.build_vocab(
    sentences,
    progress_per = 10000,
)

elapsed_mins = round((time() - t) / 60, 2)
print("time to build vocab: {} mins".format(elapsed_mins))

In [23]:
#training the word2vec model

In [None]:
# Train for 30 epochs over the same corpus that was used to build the
# vocabulary, then report how long it took
t = time()
w2v_model.train(
    sentences,
    total_examples = w2v_model.corpus_count,
    epochs = 30,
    report_delay = 1,
)

elapsed_mins = round((time() - t) / 60, 2)
print("time taken to train model: {} mins".format(elapsed_mins))

In [24]:
#saving the word2vec model to disk

In [None]:
# pickle.dump needs both the object to serialize and the destination file.
# The context manager closes the file handle even if serialization fails.
with open("wv_model.pkl", "wb") as model_file:
    pickle.dump(w2v_model, model_file)

In [25]:
#if you are replicating the full thesis workflow, you also need to copy the saved model file (wv_model.pkl)
#into the folder containing the code and files for the sentiment analysis task