In [1]:
from gensim.models import Word2Vec
import multiprocessing
import pickle
import pandas as pd
import os
from time import time
import logging
# Send INFO-level log records (gensim reports its progress this way) to the
# notebook output, prefixed with the level name and an HH:MM:SS timestamp.
logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt='%H:%M:%S',
)

In [3]:
#defining an iterator that reads the txt file line by line, converts all the letters to lowercase and
#splits them on whitespace, creating a list of tokens for each tweet. The corpus is thus streamed as an
#iterable of token lists (one list per line of the file) rather than held in memory as a list of lists

In [2]:
class MyCorpus:
    """Re-iterable corpus that streams tokenized lines from a UTF-8 text file.

    The file is re-opened on every call to ``__iter__``, so a single instance
    can be iterated any number of times without loading the file into memory.
    """

    def __init__(self, file):
        """Store the path of the text file; nothing is opened or read here."""
        self.file = file

    def __iter__(self):
        """Yield one list of lowercase, whitespace-separated tokens per line.

        Blank lines are not skipped: they yield an empty list.
        """
        with open(self.file, encoding="utf-8") as handle:
            # The generator expression is drained inside the `with` block, so
            # the file stays open exactly as long as iteration is in progress.
            yield from (line.lower().split() for line in handle)

In [5]:
#creating the corpus as an iterator

In [3]:
# Only the path is stored here; the file is opened each time `sentences` is iterated.
sentences = MyCorpus(file="word2vec_tweets.txt")

In [6]:
#initializing the word2vec model with context_window = 1, vector_size = 300, CBOW architecture

In [4]:
# Use all but two of the machine's cores for training, but never fewer than
# one worker: on a 1- or 2-core machine `cores - 2` would be 0 or negative,
# which is not a usable worker count.
cores = multiprocessing.cpu_count()
n_workers = max(1, cores - 2)

w2v_model = Word2Vec(
    min_count = 20,        # drop words that occur fewer than 20 times in the corpus
    window = 1,            # context window: one word on each side of the target
    vector_size = 300,     # dimensionality of the learned word vectors
    sample = 6e-05,        # down-sampling threshold for very frequent words
    alpha = 0.03,          # initial learning rate
    min_alpha = 0.0007,    # learning rate decays towards this value during training
    negative = 20,         # number of negative ("noise") samples per positive example
    workers = n_workers,   # training worker threads (see clamp above)
    ns_exponent = 0.75,    # exponent shaping the negative-sampling distribution
    sg = 0)                # 0 = CBOW architecture (1 would be skip-gram)

INFO - 18:22:36: Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=300, alpha=0.03>', 'datetime': '2023-04-01T18:22:36.742560', 'gensim': '4.2.0', 'python': '3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'created'}


In [21]:
#building the vocabulary for the word2vec model

In [5]:
# Scan the corpus once to collect word counts and build the model vocabulary,
# logging progress every 10000 sentences, then report the wall-clock time.
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
vocab_mins = round((time() - t) / 60, 2)

print("time to build vocab: {} mins".format(vocab_mins))

INFO - 18:22:38: collecting all words and their counts
INFO - 18:22:38: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 18:22:38: PROGRESS: at sentence #10000, processed 133543 words, keeping 16127 word types
INFO - 18:22:38: PROGRESS: at sentence #20000, processed 268382 words, keeping 24156 word types
INFO - 18:22:39: PROGRESS: at sentence #30000, processed 406570 words, keeping 30617 word types
INFO - 18:22:39: PROGRESS: at sentence #40000, processed 546710 words, keeping 36117 word types
INFO - 18:22:39: PROGRESS: at sentence #50000, processed 689132 words, keeping 41262 word types
INFO - 18:22:39: PROGRESS: at sentence #60000, processed 837726 words, keeping 45253 word types
INFO - 18:22:39: PROGRESS: at sentence #70000, processed 984773 words, keeping 49023 word types
INFO - 18:22:39: PROGRESS: at sentence #80000, processed 1123420 words, keeping 52680 word types
INFO - 18:22:39: PROGRESS: at sentence #90000, processed 1269111 words, keeping 56296 word ty

time to build vocab: 0.02 mins


In [23]:
#training the word2vec model

In [6]:
# Train for 30 epochs over the streamed corpus. `total_examples` is taken from
# the sentence count stored on the model, and progress is logged about once a
# second (report_delay = 1). Wall-clock time is reported afterwards.
t = time()
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)
train_mins = round((time() - t) / 60, 2)

print("time taken to train model: {} mins".format(train_mins))

INFO - 18:22:44: Word2Vec lifecycle event {'msg': 'training model with 6 workers on 7853 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=1 shrink_windows=True', 'datetime': '2023-04-01T18:22:44.912229', 'gensim': '4.2.0', 'python': '3.8.13 (default, Mar 28 2022, 06:59:08) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'train'}
INFO - 18:22:45: EPOCH 0 - PROGRESS: at 38.58% examples, 392642 words/s, in_qsize 0, out_qsize 0
INFO - 18:22:46: EPOCH 0 - PROGRESS: at 85.02% examples, 435351 words/s, in_qsize 0, out_qsize 1
INFO - 18:22:47: EPOCH 0: training on 2491052 raw words (1030410 effective words) took 2.3s, 440712 effective words/s
INFO - 18:22:48: EPOCH 1 - PROGRESS: at 45.44% examples, 459411 words/s, in_qsize 0, out_qsize 0
INFO - 18:22:49: EPOCH 1 - PROGRESS: at 91.38% examples, 464689 words/s, in_qsize 0, out_qsize 0
INFO - 18:22:49: EPOCH 1: training on 2491052 raw words (1029238 effective words) took 2.2s, 465249 eff

INFO - 18:23:42: EPOCH 25 - PROGRESS: at 44.21% examples, 449493 words/s, in_qsize 0, out_qsize 0
INFO - 18:23:43: EPOCH 25 - PROGRESS: at 86.14% examples, 441531 words/s, in_qsize 0, out_qsize 1
INFO - 18:23:44: EPOCH 25: training on 2491052 raw words (1030573 effective words) took 2.4s, 436371 effective words/s
INFO - 18:23:45: EPOCH 26 - PROGRESS: at 43.78% examples, 440231 words/s, in_qsize 0, out_qsize 0
INFO - 18:23:46: EPOCH 26 - PROGRESS: at 88.14% examples, 449477 words/s, in_qsize 0, out_qsize 0
INFO - 18:23:46: EPOCH 26: training on 2491052 raw words (1029604 effective words) took 2.3s, 451690 effective words/s
INFO - 18:23:47: EPOCH 27 - PROGRESS: at 43.78% examples, 441550 words/s, in_qsize 0, out_qsize 0
INFO - 18:23:48: EPOCH 27 - PROGRESS: at 88.88% examples, 454447 words/s, in_qsize 0, out_qsize 0
INFO - 18:23:48: EPOCH 27: training on 2491052 raw words (1029831 effective words) took 2.3s, 447537 effective words/s
INFO - 18:23:49: EPOCH 28 - PROGRESS: at 42.55% example

time taken to train model: 1.14 mins


In [24]:
#saving the word2vec model to disk

In [8]:
# Write the trained model to disk. The context manager guarantees the file is
# flushed and closed as soon as the dump finishes (or fails), instead of
# leaving an open handle for the garbage collector to clean up.
# NOTE: pickle files can execute arbitrary code when loaded; only load
# "wv_model.pkl" from a trusted location.
with open("wv_model.pkl", "wb") as model_file:
    pickle.dump(w2v_model, model_file)