Import all the libraries we need

In [1]:
import gensim
import logging
import numpy as np
import re
from conllu import parse
import os
from sympy import Point, Line, Segment
from scipy.spatial import distance
from pymorphy2 import MorphAnalyzer
from nltk.tokenize import sent_tokenize, wordpunct_tokenize, word_tokenize
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

In [None]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
path = os.getcwd()
conllu_file = path + '/UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu'

Parse the CoNLL-U file

In [3]:
with open(conllu_file, "r", encoding="utf-8") as data_file:
    data = data_file.read()
sentences = parse(data)

Lemmatize texts and put part-of-speech tags for each word in them

In [None]:
with open('result.txt', 'w', encoding='utf-8') as result:
    for sentence in sentences:
        line = ' '.join([word['lemma'] + '_' + word['upos'] for word in sentence if word['upos'] != 'PUNCT' and word['upos'] != 'NUM'])
        result.write(line + '\n')

In [4]:
f = 'result.txt'
data = gensim.models.word2vec.LineSentence(f)

Compile a dictionary only from words longer than two characters and not included in the list of stop words for the Russian language

In [7]:
stops = set(stopwords.words('russian'))
words = [word['lemma'] + '_' + word['upos'] for sentence in sentences for word in sentence
         if len(word) > 2
         and word['lemma'] not in stops]

Calculate the association measure for each bigram obtained from the texts

In [8]:
finder = BigramCollocationFinder.from_words(words)
bgm = BigramAssocMeasures()
score = bgm.mi_like
collocations = {' '.join(bigram): pmi for bigram, pmi in finder.score_ngrams(score)}

Leave only phrases consisting of an adjective and a noun

In [9]:
colloc = [key for key, value in collocations.items() if value > 1 and re.match(r'[а-я]+_ADJ [а-я]+_NOUN', key)]

In the collocations highlighted by the MI measure, we connect the words with an underscore so that the model perceives them as a single whole

In [None]:
%time
with open('result.txt', 'r', encoding='utf-8') as file:
    text = file.read()
for c in colloc:
    repl = c.replace(' ', '_')
    text = text.replace(c, repl)
with open('result_new.txt', 'w', encoding='utf-8') as result:
    result.write(text)

Train the Word2Vec model 

In [None]:
%time model = gensim.models.Word2Vec(data, vector_size=300, window=5, min_count=5, epochs=50)

In [None]:
model.save('w2v.model')

In [6]:
model = gensim.models.Word2Vec.load('w2v.model')

Create a function that calculates the mean vector for two words in a phrase

In [10]:
def get_mean_vector(word2vec_model, words):
    vectors = [model.wv.get_vector(word, norm=True) for word in words if word in word2vec_model.wv]
    if len(words) >= 1:
        return gensim.matutils.unitvec(np.mean(vectors, axis=0)).astype(float)
    else:
        return []

Calculate for each phrase the distance from the midpoint to the line connecting the words, and sort the phrases in descending order of the calculated distance

In [11]:
ud_collocations = {}
for c in colloc:
    adj, noun = c.split()
    if adj in model.wv and noun in model.wv:
        p1, p2, p3 = Point(model.wv[adj], evaluate=False), Point(model.wv[noun], evaluate=False), Point(get_mean_vector(model, [adj, noun]), evaluate=False)
        l1 = Line(p1, p2)
        dist = distance.euclidean(np.array(p3).astype(float), np.array(l1.projection(p3)).astype(float))
        ud_collocations[c] = dist

In [12]:
ud_collocations = {i: ud_collocations[i] for i in sorted(ud_collocations, key=ud_collocations.get, reverse=True)}
ud_collocations

{'самый_ADJ дело_NOUN': 15.158156330852007,
 'экономический_ADJ рост_NOUN': 14.33828849291259,
 'русский_ADJ язык_NOUN': 14.111519883686263,
 'первый_ADJ очередь_NOUN': 12.957828971144938,
 'больший_ADJ часть_NOUN': 12.938151326550942,
 'ядерный_ADJ оружие_NOUN': 12.764024274189852,
 'значительный_ADJ часть_NOUN': 12.443441886875654,
 'высший_ADJ образование_NOUN': 12.361910296405629,
 'высший_ADJ школа_NOUN': 11.809780359480094,
 'средний_ADJ класс_NOUN': 11.703104855735296,
 'настоящий_ADJ время_NOUN': 11.555916390021013,
 'прошлый_ADJ год_NOUN': 11.515941841794097,
 'пенсионный_ADJ возраст_NOUN': 9.637137339877176,
 'железный_ADJ дорога_NOUN': 9.386149038196775,
 'конечный_ADJ счет_NOUN': 9.36536199876118,
 'головной_ADJ мозг_NOUN': 8.90222203926927,
 'крайний_ADJ мера_NOUN': 8.843125443730608,
 'жидкий_ADJ мембрана_NOUN': 8.348501462723748,
 'золотой_ADJ медаль_NOUN': 7.773153748232594,
 'розничный_ADJ сеть_NOUN': 7.588308673844453,
 'сельский_ADJ хозяйство_NOUN': 7.514950792993694

Let's repeat these steps for texts in chemistry

In [None]:
morph = MorphAnalyzer()

In [None]:
def pos(word):
    return morph.parse(word)[0].tag.POS

func_pos = {'INTJ', 'PRCL', 'CONJ', 'PREP', 'NPRO', 'NUMR', None}

In [None]:
with open('all_cyberleninka_chemystry2.txt', "r", encoding="utf-8") as txt_file:
    texts = txt_file.read()

In [None]:
s_tokenized = sent_tokenize(texts)

In [None]:
tokens = [[token for token in wordpunct_tokenize(s)
           if pos(token) not in func_pos and token not in stops]
          for s in s_tokenized]

In [None]:
with open('chemistry.txt', 'w', encoding='utf-8') as file:
    for s in tokens:
        if s:
            s_lemmatized = ' '.join([morph.parse(token)[0].normal_form + '_' + pos(token) for token in s
                                     if str(morph.parse(token)[0].tag) != 'PNCT'])
            file.write(s_lemmatized + '\n')

In [13]:
f = 'chemistry.txt'
data = gensim.models.word2vec.LineSentence(f)

In [14]:
lines = [line.strip() for line in open(f, encoding='utf-8')]

In [15]:
words = [word for line in lines for word in line.split()]

In [16]:
finder = BigramCollocationFinder.from_words(words)
bgm = BigramAssocMeasures()
score = bgm.mi_like
collocations = {' '.join(bigram): pmi for bigram, pmi in finder.score_ngrams(score)}

In [17]:
colloc = [key for key, value in collocations.items() if value > 1 and re.match(r'[а-я]+_ADJF [а-я]+_NOUN', key)]

In [None]:
with open('chemistry.txt', 'r', encoding='utf-8') as file:
    text = file.read()
for c in colloc:
    repl = c.replace(' ', '_')
    text = text.replace(c, repl)
with open('chemistry_new.txt', 'w', encoding='utf-8') as result:
    result.write(text)

In [None]:
%time model = gensim.models.Word2Vec(data, vector_size=300, window=5, min_count=5, epochs=50)

In [None]:
model.save('w2v_chem.model')

In [18]:
model = gensim.models.Word2Vec.load('w2v_chem.model')

In [19]:
c_dict = {}
for c in colloc:
    adj, noun = c.split()
    if adj in model.wv and noun in model.wv:
        p1, p2, p3 = Point(model.wv[adj], evaluate=False), Point(model.wv[noun], evaluate=False), Point(get_mean_vector(model, [adj, noun]), evaluate=False)
        l1 = Line(p1, p2)
        dist = distance.euclidean(np.array(p3).astype(float), np.array(l1.projection(p3)).astype(float))
        c_dict[c] = dist

In [20]:
sorted_colls = {i: c_dict[i] for i in sorted(c_dict, key=c_dict.get, reverse=True)}
sorted_colls

{'государственный_ADJF университет_NOUN': 24.750774255764522,
 'прочностный_ADJF свойство_NOUN': 24.08047151042016,
 'амперометрический_ADJF биосенсор_NOUN': 23.70061566999836,
 'адгезионный_ADJF прочность_NOUN': 23.671127700865718,
 'компьютерный_ADJF программа_NOUN': 23.656466316097184,
 'реологический_ADJF свойство_NOUN': 23.435833916397655,
 'актуальный_ADJF проблема_NOUN': 23.336830819606657,
 'ковалентный_ADJF связь_NOUN': 23.258859150862737,
 'нуклеофильный_ADJF атака_NOUN': 23.255462267856043,
 'инверсионный_ADJF вольтамперометрия_NOUN': 23.13523353557722,
 'рентгеновский_ADJF дифрактометр_NOUN': 23.092514801804764,
 'важный_ADJF задача_NOUN': 23.087013445579146,
 'исследовательский_ADJF институт_NOUN': 23.08655971799994,
 'экологический_ADJF безопасность_NOUN': 23.058169668008862,
 'актуальный_ADJF задача_NOUN': 23.02834005709894,
 'надмолекулярный_ADJF структура_NOUN': 23.01560913530108,
 'мировой_ADJF рынок_NOUN': 23.00483674354127,
 'научный_ADJF школа_NOUN': 22.86030681023

In [21]:
desc_colls = {i: c_dict[i] for i in sorted(c_dict, key=c_dict.get, reverse=False)}
desc_colls

{'ациламинопроизводный_ADJF арилоксиантрахинон_NOUN': 2.3457526452641906,
 'редакционный_ADJF коллегия_NOUN': 2.4688549140622427,
 'борнокислый_ADJF аиилинуксуонай_NOUN': 2.5306406580025156,
 'метоксибензилиденамин_ADJF метилвалерианов_NOUN': 2.541294557633888,
 'себряковский_ADJF филиал_NOUN': 2.626221187795479,
 'ревматический_ADJF лихорадка_NOUN': 2.6322965303179875,
 'полихромный_ADJF расцветка_NOUN': 2.670950189525924,
 'остромысленский_ADJF жоба_NOUN': 2.698192845490106,
 'чутливый_ADJF елемент_NOUN': 2.700681833091412,
 'монохромный_ADJF расцветка_NOUN': 2.7344799666885,
 'севастопольский_ADJF бухта_NOUN': 2.9443958720934464,
 'асбестоцементный_ADJF шифер_NOUN': 2.9521406385342224,
 'щековый_ADJF дробилка_NOUN': 2.9753626484445768,
 'кедринский_ADJF дмитренко_NOUN': 3.1658550441324533,
 'роговой_ADJF обманка_NOUN': 3.1818859150846657,
 'метилнитроамин_ADJF тринитропиридин_NOUN': 3.186039960006879,
 'упсв_ADJF пашня_NOUN': 3.208998855146158,
 'аскадский_ADJF кондращенко_NOUN': 3.