In [1]:
from os import listdir
from os.path import isfile, join

In [None]:
# Collect the regular files that sit directly inside the dataset directory.
# Sub-directories are skipped and nothing is searched recursively.
path_to_dataset = "../dataset"

# os.listdir() returns entries in arbitrary order; sorting makes positional
# access such as dataset_files[0] select the same file on every run.
dataset_files = sorted(
    candidate
    for candidate in (join(path_to_dataset, name) for name in listdir(path_to_dataset))
    if isfile(candidate)
)

print(dataset_files)

In [3]:
from gensim.parsing.preprocessing import preprocess_string
from pymorphy2.utils import word_splits
from os.path import getsize
from math import floor
import pymorphy2
import re

class DatasetReader:
    """Stream lemmatised Ukrainian sentences from a plain-text dataset file.

    Iterating over an instance reads the file line by line, cuts the text at
    every '.' character, tokenises each piece with gensim's
    ``preprocess_string`` (default filters), keeps only tokens made purely of
    Ukrainian letters and replaces each token with the normal form of its
    first pymorphy2 parse.

    Every yielded sentence is a non-empty ``list`` of ``str``. Text that
    follows the last '.' in the file is discarded. Progress (whole percent of
    the file consumed) is printed whenever it changes.

    Args:
        datasetPath: Path of the text file to read.
        encoding: Text encoding used to decode the file. Undecodable bytes
            are replaced rather than aborting the read.
    """

    def __init__(self, datasetPath, encoding="utf-8"):
        self._datasetPath = datasetPath
        self._encoding = encoding
        self._currentText = ""       # text read so far that has not been split into sentences
        self._currentSentence = []   # result of the most recent successful _tryPopSentence()
        self._morph = pymorphy2.MorphAnalyzer(lang='uk')
        self._ukrLetters = re.compile("^[абвгґдеєжзиіїйклмнопрстуфхцчшщьюя]*$", re.IGNORECASE)

    def __iter__(self):
        """Yield one list of lemmas per non-empty sentence found in the file."""
        # Start from an empty buffer so the reader can be iterated repeatedly
        # without leftovers from a previous pass leaking into the first sentence.
        self._currentText = ""
        fileSize = getsize(self._datasetPath)
        prevProgress = 0

        # errors="replace" keeps reading past malformed bytes; the replacement
        # character is not a Ukrainian letter, so a damaged token is filtered out later.
        with open(self._datasetPath, "r", encoding=self._encoding, errors="replace") as file:
            # readline() is used instead of `for line in file` because tell()
            # is disabled while a text file is being iterated with next().
            for line in iter(file.readline, ""):
                progress = floor(100 * file.tell() / fileSize)
                if progress != prevProgress:
                    prevProgress = progress
                    print(progress)

                self._currentText += line
                # A single line may hold several sentences: drain all of them,
                # otherwise the buffer keeps growing and the backlog is never emitted.
                while self._tryPopSentence():
                    if self._currentSentence:
                        yield self._currentSentence

    def _tryPopSentence(self):
        """Move the text up to the next '.' from the buffer into ``_currentSentence``.

        Returns:
            True if a '.' was found (``_currentSentence`` then holds the list of
            lemmas, possibly empty); False if the buffer contains no '.'.
        """
        sentenceEnd = self._currentText.find('.')
        if sentenceEnd == -1:
            return False

        rawSentence = self._currentText[:sentenceEnd]
        self._currentText = self._currentText[sentenceEnd + 1:]

        ukrWords = [word for word in self._tokenize(rawSentence) if self._ukrLetters.search(word)]
        # Always a list, even for one word: a bare string would later be
        # consumed as a sequence of single characters.
        self._currentSentence = self._lemmatize(ukrWords)
        return True

    def _tokenize(self, text):
        """Drop ASCII apostrophes from ``text`` and split it into preprocessed tokens."""
        return preprocess_string(text.replace('\'', ''))

    def _lemmatize(self, words):
        """Return a list with the normal form of the first parse of every word."""
        return [self._morph.parse(word)[0].normal_form for word in words]

    def to_normal_form(self, words_list):
        """Return the normal form(s) of a word or of a list of words.

        Args:
            words_list: A single word (``str``) or an iterable of words.

        Returns:
            A ``str`` when exactly one word was supplied (including a
            one-element list), otherwise a ``list`` of ``str``.
        """
        if isinstance(words_list, str):
            words_list = [words_list, ]

        res = self._lemmatize(words_list)

        # Existing contract: a single result is unwrapped to a plain string.
        if len(res) == 1:
            res = res[0]

        return res
        

In [4]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [7]:
from gensim.models.callbacks import CallbackAny2Vec

class LossLogger(CallbackAny2Vec):
    """Print and record the training loss contributed by each epoch.

    The value returned by ``model.get_latest_training_loss()`` is treated as a
    running total, so the loss of one epoch is its difference from the total
    seen at the end of the previous epoch. The per-epoch values are kept in
    ``losses``.
    """

    def __init__(self):
        self.losses = []
        self.prevLoss = 0
        self.epoch = 1

    def on_epoch_begin(self, model):
        # Stay on the same output line; the loss is appended when the epoch ends.
        print(f'Epoch: {self.epoch}', end='\t')

    def on_epoch_end(self, model):
        runningTotal = model.get_latest_training_loss()
        epochLoss = runningTotal - self.prevLoss
        self.losses.append(epochLoss)
        print(f'  Loss: {epochLoss}')
        self.prevLoss = runningTotal
        self.epoch += 1

In [None]:
# Read the first dataset file and collect every sentence it yields into a list.
dataset = DatasetReader(dataset_files[0])
sentences = list(dataset)

In [None]:
# Train Word2Vec on the collected sentences; loss_logger prints and stores
# the loss of every epoch.
loss_logger = LossLogger()
model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    epochs=100,
    compute_loss=True,
    callbacks=[loss_logger],
)

In [12]:
# Write the trained model to a file in the current working directory.
model_path = "gensim_word2Vec.model"
model.save(model_path)

In [None]:
# Sanity check: show the vocabulary entries closest to a sample word.
word_vectors = model.wv
word_vectors.most_similar("ялинка")
