# Микродиахроническое исследование русских приставок методами дистрибутивной семантики
## Автор: Елизавета Клыкова, БКЛ181
### Часть 5: обучение word2vec-моделей для каждого периода
1. Выбрать из базы предложения нужных периодов
2. Записать их лемматизированные представления в соответствующие файлы, каждое предложение с новой строки
3. Обучить word2vec-эмбеддинги
4. Повторить для лемм с приклеенными частями речи

#### Импорт модулей

In [1]:
%load_ext pycodestyle_magic
%pycodestyle_on

In [2]:
import gensim
import pymongo
from tqdm.auto import tqdm
from collections import Counter
# import warnings
# warnings.filterwarnings("ignore")

**Подключение к базе**

In [3]:
# Connect to the MongoDB server on localhost:27017 and select the
# 'thesis' database.
client = pymongo.MongoClient('localhost', 27017)
db = client['thesis']
# fs = gridfs.GridFS(db)
# Bare expression: the notebook shows the Database object as the cell output.
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'thesis')

In [4]:
# Handles to three collections of the database selected above; only
# `sentences` is queried in this part of the notebook.
sentences = db.sentences
lemmas = db.lemmas
tokens = db.tokens

#### Запись лемматизированных предложений в файлы
Досоветский период (4,4 млн)

In [None]:
# Export the pre-Soviet sentences (period 0): one sentence per line, lemmas
# separated by single spaces. The training step reads this file through
# gensim's LineSentence, which splits every line on whitespace.
with open('word2vec_presov.txt', 'w', encoding='utf-8') as f_pr:
    for sentence in tqdm(sentences.find({'period': 0},
                                        {'token_info': True, '_id': False})):
        # Punctuation is dropped unless its surface form contains a space.
        # NOTE(review): the lemma_POS export further down drops every PUNCT
        # token; confirm the extra `' ' in token['token']` clause is wanted.
        lem_list = [token['lemma'] for token in sentence['token_info']
                    if token['pos'] != 'PUNCT' or ' ' in token['token']]
        # Join with a space: ''.join would glue the whole sentence into a
        # single "word", so no per-lemma vectors could be learned.
        lem_sent = ' '.join(lem_list)
        f_pr.write(f'{lem_sent}\n')

Советский период (7 млн)

In [None]:
# Export the Soviet sentences (period 1): one sentence per line, lemmas
# separated by single spaces. The training step reads this file through
# gensim's LineSentence, which splits every line on whitespace.
with open('word2vec_sov.txt', 'w', encoding='utf-8') as f_s:
    for sentence in tqdm(sentences.find({'period': 1},
                                        {'token_info': True, '_id': False})):
        # Punctuation is dropped unless its surface form contains a space.
        # NOTE(review): the lemma_POS export further down drops every PUNCT
        # token; confirm the extra `' ' in token['token']` clause is wanted.
        lem_list = [token['lemma'] for token in sentence['token_info']
                    if token['pos'] != 'PUNCT' or ' ' in token['token']]
        # Join with a space: ''.join would glue the whole sentence into a
        # single "word", so no per-lemma vectors could be learned.
        lem_sent = ' '.join(lem_list)
        f_s.write(f'{lem_sent}\n')

Постсоветский период (5,8 млн)

In [None]:
# Export the post-Soviet sentences (period 2): one sentence per line, lemmas
# separated by single spaces. The training step reads this file through
# gensim's LineSentence, which splits every line on whitespace.
with open('word2vec_postsov.txt', 'w', encoding='utf-8') as f_ps:
    for sentence in tqdm(sentences.find({'period': 2},
                                        {'token_info': True, '_id': False})):
        # Punctuation is dropped unless its surface form contains a space.
        # NOTE(review): the lemma_POS export further down drops every PUNCT
        # token; confirm the extra `' ' in token['token']` clause is wanted.
        lem_list = [token['lemma'] for token in sentence['token_info']
                    if token['pos'] != 'PUNCT' or ' ' in token['token']]
        # Join with a space: ''.join would glue the whole sentence into a
        # single "word", so no per-lemma vectors could be learned.
        lem_sent = ' '.join(lem_list)
        f_ps.write(f'{lem_sent}\n')

#### Обучение моделей

In [5]:
def train_word2vec_model(file_for_model, size=300, window=5,
                         min_count=5, iterations=50):
    """Train a word2vec model on a text corpus file.

    Args:
        file_for_model: Path of the corpus file; it is wrapped in gensim's
            ``LineSentence`` reader.
        size: Forwarded to ``Word2Vec`` as ``size``.
        window: Forwarded to ``Word2Vec`` as ``window``.
        min_count: Forwarded to ``Word2Vec`` as ``min_count``.
        iterations: Forwarded to ``Word2Vec`` as ``iter``.

    Returns:
        The ``gensim.models.Word2Vec`` object built from the corpus.
    """
    # NOTE(review): `size` and `iter` are the gensim 3.x keyword names.
    corpus = gensim.models.word2vec.LineSentence(file_for_model)
    return gensim.models.Word2Vec(
        corpus,
        size=size,
        window=window,
        min_count=min_count,
        iter=iterations,
    )

In [6]:
def save_word2vec_model(model, file_with_model, binary=True):
    """Export a trained model's word vectors in word2vec format.

    Args:
        model: Object exposing ``init_sims`` and a ``wv`` attribute, such as
            the value returned by ``train_word2vec_model``.
        file_with_model: Destination path of the exported vectors.
        binary: Forwarded to ``save_word2vec_format``.
    """
    # Per the gensim docs, replace=True keeps only the L2-normalised
    # vectors, so `model` itself is modified by this call.
    model.init_sims(replace=True)
    word_vectors = model.wv
    word_vectors.save_word2vec_format(file_with_model, binary=binary)

In [7]:
def load_word2vec_model(file_with_model, binary=True):
    """Load word vectors stored in word2vec format and print their count.

    Args:
        file_with_model: Path of the file holding the vectors.
        binary: Whether the file is in the binary word2vec format. The
            default is True so that files written by
            ``save_word2vec_model`` with its own default load back without
            extra arguments; pass False for text-format files.

    Returns:
        The loaded ``gensim.models.KeyedVectors`` object.
    """
    model = gensim.models.KeyedVectors.load_word2vec_format(file_with_model,
                                                            binary=binary)
    print('Размер словаря:', len(model.wv.vocab))
    return model

Досоветский период

In [None]:
# Train the pre-Soviet lemma model with the default hyperparameters.
presov_w2v_model = train_word2vec_model('word2vec_presov.txt')

In [None]:
# Export the pre-Soviet lemma vectors (binary word2vec format by default).
save_word2vec_model(presov_w2v_model, 'word2vec_presov_lemmas.bin')

In [None]:
# Vocabulary size of the pre-Soviet lemma model (shown as the cell output).
len(presov_w2v_model.wv.vocab)

Советский период

In [None]:
# Train the Soviet lemma model with the default hyperparameters.
sov_w2v_model = train_word2vec_model('word2vec_sov.txt')

In [None]:
# Export the Soviet lemma vectors (binary word2vec format by default).
save_word2vec_model(sov_w2v_model, 'word2vec_sov_lemmas.bin')

In [None]:
# Vocabulary size of the Soviet lemma model (shown as the cell output).
len(sov_w2v_model.wv.vocab)

Постсоветский период

In [None]:
# Train the post-Soviet lemma model with the default hyperparameters.
postsov_w2v_model = train_word2vec_model('word2vec_postsov.txt')

In [None]:
# Export the post-Soviet lemma vectors (binary word2vec format by default).
save_word2vec_model(postsov_w2v_model, 'word2vec_postsov_lemmas.bin')

In [None]:
# Vocabulary size of the post-Soviet lemma model (shown as the cell output).
len(postsov_w2v_model.wv.vocab)

#### То же для лемм + частей речи

In [8]:
# pre-Soviet period: one sentence per line, every token written as lemma_POS
with open('word2vec_presov_with_pos.txt', 'w', encoding='utf-8') as f_pr:
    for sentence in tqdm(sentences.find({'period': 0},
                                        {'token_info': True, '_id': False})):
        # Punctuation tokens are left out of the training corpus.
        lem_sent = ' '.join(
            '_'.join((token['lemma'], token['pos']))
            for token in sentence['token_info']
            if token['pos'] != 'PUNCT'
        )
        print(lem_sent, file=f_pr)

0it [00:00, ?it/s]

In [8]:
# Train the pre-Soviet lemma_POS model with the default hyperparameters.
presov_w2v_model_pos = train_word2vec_model('word2vec_presov_with_pos.txt')

In [9]:
# Export the pre-Soviet lemma_POS vectors (binary format by default).
save_word2vec_model(presov_w2v_model_pos, 'word2vec_presov_lemmas_pos.bin')

In [10]:
# Vocabulary size of the pre-Soviet lemma_POS model (cell output).
len(presov_w2v_model_pos.wv.vocab)

123804

In [9]:
# Soviet period: one sentence per line, every token written as lemma_POS
with open('word2vec_sov_with_pos.txt', 'w', encoding='utf-8') as f_s:
    for sentence in tqdm(sentences.find({'period': 1},
                                        {'token_info': True, '_id': False})):
        # Punctuation tokens are left out of the training corpus.
        lem_sent = ' '.join(
            '_'.join((token['lemma'], token['pos']))
            for token in sentence['token_info']
            if token['pos'] != 'PUNCT'
        )
        print(lem_sent, file=f_s)

0it [00:00, ?it/s]

In [11]:
# Train the Soviet lemma_POS model with the default hyperparameters.
sov_w2v_model_pos = train_word2vec_model('word2vec_sov_with_pos.txt')

In [12]:
# Export the Soviet lemma_POS vectors (binary format by default).
save_word2vec_model(sov_w2v_model_pos, 'word2vec_sov_lemmas_pos.bin')

In [13]:
# Vocabulary size of the Soviet lemma_POS model (cell output).
len(sov_w2v_model_pos.wv.vocab)

147146

In [10]:
# post-Soviet period: one sentence per line, every token written as lemma_POS
with open('word2vec_postsov_with_pos.txt', 'w', encoding='utf-8') as f_ps:
    for sentence in tqdm(sentences.find({'period': 2},
                                        {'token_info': True, '_id': False})):
        # Punctuation tokens are left out of the training corpus.
        lem_sent = ' '.join(
            '_'.join((token['lemma'], token['pos']))
            for token in sentence['token_info']
            if token['pos'] != 'PUNCT'
        )
        print(lem_sent, file=f_ps)

0it [00:00, ?it/s]

In [14]:
# Train the post-Soviet lemma_POS model with the default hyperparameters.
postsov_w2v_model_pos = train_word2vec_model('word2vec_postsov_with_pos.txt')

In [15]:
# Export the post-Soviet lemma_POS vectors (binary format by default).
save_word2vec_model(postsov_w2v_model_pos, 'word2vec_postsov_lemmas_pos.bin')

In [16]:
# Vocabulary size of the post-Soviet lemma_POS model (cell output).
len(postsov_w2v_model_pos.wv.vocab)

144650