In [1]:
from pathlib import Path
import pandas as pd

# Install pke with:
#   pip install git+https://github.com/boudinfl/pke.git
# pke relies on spaCy (>= 3.2.3) for text processing and requires the language
# model to be installed. Download the Russian model with:
#   python -m spacy download ru_core_news_lg
import pke
import string
import ru_core_news_lg
import spacy

In [None]:
# Install the Russian spaCy model into the environment of the running kernel.
# A bare `!python` is resolved through the shell's PATH and may belong to a
# different interpreter than the one executing this notebook.
# On a fresh environment run this cell first: the notebook's
# `import ru_core_news_lg` fails until the model package is installed.
import spacy.cli

spacy.cli.download("ru_core_news_lg")

In [2]:
# Load the Russian spaCy pipeline; it is applied to every review text below.
spacy_model = spacy.load("ru_core_news_lg")

In [3]:
# Read the tab-separated, UTF-8 encoded dataset into a DataFrame.
all_df = pd.read_csv("data/all_dataset.csv", sep="\t", encoding="utf-8")

In [4]:
# Keep only the rows whose `rubrics` value contains the target tag, drop rows
# with any missing value, cast every column to str and renumber the rows.
#
# * na=False    -> rows with a missing `rubrics` value are treated as
#                  "no match" instead of putting NaN into the boolean mask
#                  (which makes the row selection raise).
# * regex=False -> the tag is matched as a literal substring.
#
# NOTE: reset_index() stores the previous index in an `index` column, so
# re-running this cell without re-loading `all_df` fails because that column
# already exists.
all_df = (
    all_df
    .loc[lambda df: df["rubrics"].str.contains('образование_отзывус', regex=False, na=False)]
    .dropna()
    .astype("str")
    .reset_index()
)

In [37]:
# Build two parallel corpora, one entry per review:
#   text            -> [(token text, POS tag), ...] for every token
#   text_normalized -> [(lemma, POS tag), ...] with punctuation tokens removed
#
# Each review is run through the spaCy pipeline only once and both
# representations are derived from the same parsed document (the pipeline is
# by far the most expensive step here).
text = []
text_normalized = []
for review in all_df['text']:
    doc = spacy_model(review)
    text.append([(token.text, token.pos_) for token in doc])
    text_normalized.append([
        (token.lemma_, token.pos_)
        for token in doc
        if token.pos_ != "PUNCT"
    ])

In [41]:
# Show the first document as (token, POS) pairs, then as (lemma, POS) pairs
# with punctuation tokens removed.
print(text[0], '\n')
print(text_normalized[0])

[('Был', 'AUX'), ('скучноват', 'ADJ'), ('.', 'PUNCT'), ('Много', 'ADV'), ('практики', 'NOUN'), (',', 'PUNCT'), ('решение', 'NOUN'), ('задач', 'NOUN'), ('про', 'ADP'), ('заработную', 'ADJ'), ('плату', 'NOUN'), ('.', 'PUNCT'), ('Довольно', 'ADV'), ('сложно', 'ADJ'), ('зарабатывать', 'VERB'), ('баллы', 'NOUN'), (',', 'PUNCT'), ('нужно', 'ADJ'), ('теорию', 'NOUN'), ('учить', 'VERB'), ('и', 'CCONJ'), ('местами', 'NOUN'), ('её', 'PRON'), ('рассказывать', 'VERB'), (',', 'PUNCT'), ('решать', 'VERB'), ('постоянно', 'ADV'), ('задачи', 'NOUN'), (',', 'PUNCT'), ('выходить', 'VERB'), ('к', 'ADP'), ('доске', 'NOUN'), ('.', 'PUNCT'), ('Когда', 'SCONJ'), ('баллы', 'NOUN'), ('набраны', 'VERB'), ('на', 'ADP'), ('зачёт', 'NOUN'), (',', 'PUNCT'), ('все', 'ADV'), ('равно', 'ADJ'), ('нужно', 'ADJ'), ('ходить', 'VERB'), (',', 'PUNCT'), ('иначе', 'CCONJ'), ('она', 'PRON'), ('их', 'PRON'), ('уберёт', 'VERB'), ('(', 'PUNCT'), ('хотя', 'SCONJ'), (',', 'PUNCT'), ('она', 'PRON'), ('просто', 'PART'), ('так', 'ADV')

# TextRank

In [44]:
%%time

# define the set of valid Part-of-Speeches
# pos = {'NOUN', 'PROPN', 'ADJ'}
pos = {'NOUN', 'PROPN'}

# 1. create a TextRank extractor.
extractor_textrank = pke.unsupervised.TextRank()

# 2. load the content of the document.
extractor_textrank.load_document(input=text,
                        language='ru',
                        #spacy_model=spacy_model,
                        normalization=None)

extractor_textrank.ngram_selection(n=1)

# 3. build the graph representation of the document and rank the words.
#    Keyphrase candidates are composed from the 33-percent
#    highest-ranked words.
extractor_textrank.candidate_weighting(window=3,
                              pos=pos,
                              top_percent=0.33)




CPU times: user 1.74 s, sys: 8.01 ms, total: 1.74 s
Wall time: 1.74 s


In [45]:
# 4. get the 100 highest-scored candidates as (keyphrase, score) tuples
keyphrases_textrank = extractor_textrank.get_n_best(n=100)
keyphrases_textrank

[('электив преподаватель -', 0.04649929603272352),
 ('составление интеллект - карта подготовка презентация написание эссе электив',
  0.040957931602645714),
 ('преподавательница - елена эрикович -', 0.040466090832117455),
 ('онлайн - прокторинг - сдача экзамен', 0.03756406636137394),
 ('- студент электив', 0.03709786719555557),
 ('навык администрирование сеть преподаватель -', 0.03581824213985302),
 ('преподаватель елена александрович -', 0.03503414938095692),
 ('преподаватель светлана геннадиевич - очень', 0.034388556767959814),
 ('электив - это дело предпочтение', 0.03427154243486049),
 ('электив подача преподаватель полезность материал', 0.03394326990089521),
 ('преподаватель - петров владимир владимирович', 0.03393038507751399),
 ('по преподаватель ирина иванович -', 0.03391581185378042),
 ('проблема преподаватель - очень', 0.033774902605974856),
 ('преподаватель - ирина иванович', 0.033582546973448237),
 ('преподаватель - оксана александрович', 0.03339168531362244),
 ('преподавате

# TopicRank

In [None]:
%%time

# initialize keyphrase extraction model, here TopicRank
extractor_topicrank = pke.unsupervised.TopicRank()

# 2. load the content of the document.
stoplist = list(string.punctuation)
stoplist += pke.lang.stopwords.get('ru')

# load the content of the document, here document is expected to be a simple
# test string and preprocessing is carried out using spacy
extractor_topicrank.load_document(input=text, language='ru', stoplist=stoplist)

# keyphrase candidate selection, in the case of TopicRank: sequences of nouns
# and adjectives (i.e. `(Noun|Adj)*`)
pos = {'NOUN', 'PROPN'}
extractor_topicrank.candidate_selection(pos=pos)

# candidate weighting, in the case of TopicRank: using a random walk algorithm
extractor_topicrank.candidate_weighting(threshold=0.74, method='average')


In [None]:
# N-best selection, keyphrases contains the 100 highest scored candidates as
# (keyphrase, score) tuples
keyphrases_topicrank = extractor_topicrank.get_n_best(n=100)
keyphrases_topicrank

# SingleRank

In [None]:
%%time

# 1. create a SingleRank extractor.
extractor_singlerank = pke.unsupervised.SingleRank()

# 2. load the content of the document.
extractor_singlerank.load_document(input=text,
                        language='ru',
                        normalization=None)

# 3. select the longest sequences of nouns and adjectives as candidates.
# define the set of valid Part-of-Speeches
pos = {'NOUN', 'PROPN'}
extractor_singlerank.candidate_selection(pos=pos)

# 4. weight the candidates using the sum of their word's scores that are
#    computed using random walk. In the graph, nodes are words of
#    certain part-of-speech (nouns and adjectives) that are connected if
#    they occur in a window of 10 words.
extractor_singlerank.candidate_weighting(window=10,
                                         pos=pos)


CPU times: user 5.34 s, sys: 55.9 ms, total: 5.4 s
Wall time: 5.39 s


In [None]:
# 5. get the 20 highest-scored candidates as keyphrases
keyphrases_singlerank = extractor_singlerank.get_n_best(n=20)
keyphrases_singlerank

[('теории - электив', 0.03974693264373249),
 ('электив -', 0.038359945245145514),
 ('целом электив', 0.03206445731005243),
 ('проект электив', 0.031316604092436445),
 ('студентов электив', 0.03126727287287185),
 ('электив людей', 0.030983823363713463),
 ('моок электив', 0.029684082022215064),
 ('электив год', 0.02957551658380631),
 ('сенцонуицнауа электив', 0.029508266432170645),
 ('сдачу электив', 0.029426588750769495),
 ('закрытии электив', 0.02942432670419585),
 ('подспорьем электив', 0.029354512263770668),
 ('электив', 0.029108646766357597),
 ('преподаватель - душка', 0.026389286182699676),
 ('преподаватель - психолог', 0.02627851495222831),
 ('- преподаватель', 0.026049244879996946),
 ('задания преподаватель', 0.025176469271029477),
 ('начале курса преподаватель', 0.023311636800214357),
 ('преподаватель идея электива', 0.021544146211280944),
 ('половине семестра преподаватель', 0.020483028546499922)]

# PositionRank

In [None]:
%%time

# define the valid Part-of-Speeches to occur in the graph
pos = {'NOUN', 'PROPN'}

# define the grammar for selecting the keyphrase candidates
grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"
# grammar = "NP: {<NOUN|PROPN>+}"

# 1. create a PositionRank extractor.
extractor_positionrank = pke.unsupervised.PositionRank()

# 2. load the content of the document.
extractor_positionrank.load_document(input=text,
                                     language='ru',
                                     normalization=None)

# 3. select the noun phrases up to 3 words as keyphrase candidates.
extractor_positionrank.candidate_selection(grammar=grammar,
                                           maximum_word_number=3)

# 4. weight the candidates using the sum of their word's scores that are
#    computed using random walk biaised with the position of the words
#    in the document. In the graph, nodes are words (nouns and
#    adjectives only) that are connected if they occur in a window of
#    10 words.
extractor_positionrank.candidate_weighting(window=10,
                                           pos=pos)


CPU times: user 5.72 s, sys: 172 ms, total: 5.89 s
Wall time: 5.89 s


In [None]:
# 5. get the 20 highest-scored candidates as keyphrases
keyphrases_positionrank = extractor_positionrank.get_n_best(n=20)
keyphrases_positionrank

[('милое местечко', 0.03426966960482957),
 ('атмосферное местечко', 0.03426966960482957),
 ('самое вкусное местечко', 0.03426966960482957),
 ('данное местечко', 0.03426966960482957),
 ('интересно местечко', 0.03426966960482957),
 ('убогое местечко', 0.03426966960482957),
 ('уютное местечко', 0.03426966960482957),
 ('местечко', 0.03426966960482957),
 ('сыров -', 0.03407217417400235),
 ('- меню', 0.032872340615539855),
 ('пиво - класс', 0.03127058172060554),
 ('- пиво', 0.030817591889289106),
 ('- вкусное пиво', 0.030817591889289106),
 ('атмосфера - кайф', 0.030586651514968793),
 ('раз -', 0.028809622305363047),
 ('- официант ярослав', 0.027532756575211478),
 ('официанты - лапочки', 0.027306570625952015),
 ('- лучший бар', 0.02682962766585958),
 ('- подвальный бар', 0.02682962766585958),
 ('основных блюд -', 0.026178607679303034)]

# MultipartiteRank

In [10]:
%%time

# 1. create a MultipartiteRank extractor.
extractor_multipartiterank = pke.unsupervised.MultipartiteRank()

stoplist = list(string.punctuation)
stoplist += pke.lang.stopwords.get('ru')

# 2. load the content of the document.
extractor_multipartiterank.load_document(input=text,
                                         language="ru",
                                         stoplist=stoplist)

# 3. select the longest sequences of nouns and adjectives, that do
#    not contain punctuation marks or stopwords as candidates.
pos = {'NOUN', 'PROPN'}
extractor_multipartiterank.candidate_selection(pos=pos)

# 4. build the Multipartite graph and rank candidates using random
#    walk, alpha controls the weight adjustment mechanism, see
#    TopicRank for threshold/method parameters.
extractor_multipartiterank.candidate_weighting(alpha=1.5,
                                               threshold=0.25,
                                               method='average')


CPU times: user 3min 3s, sys: 2.8 s, total: 3min 6s
Wall time: 3min 6s


In [13]:
# 5. get the 100 highest-scored candidates as keyphrases
keyphrases_multipartiterank = extractor_multipartiterank.get_n_best(n=100)
keyphrases_multipartiterank

[('электив', 0.04366968008791535),
 ('преподаватель', 0.029052902571779293),
 ('пары', 0.026348506959864387),
 ('задания', 0.01882588699619859),
 ('лекции', 0.01401080222374473),
 ('баллы', 0.01267645705351588),
 ('работы', 0.011460256698121481),
 ('тему', 0.011015522535422546),
 ('элективом', 0.010614899230453605),
 ('занятия', 0.010054305019815116),
 ('практики', 0.00932047923995928),
 ('курса', 0.008613943996838519),
 ('студентам', 0.00849804935722269),
 ('информации', 0.008231521788113559),
 ('презентации', 0.008070599446386283),
 ('зачёт', 0.007795361237515503),
 ('тест', 0.007423200436219892),
 ('вопросы', 0.007116353445979112),
 ('материал', 0.006758269606736994),
 ('знаний', 0.006449717215196795),
 ('предмета', 0.005557074458146722),
 ('целом', 0.005529209491101293),
 ('языке', 0.004640493369427495),
 ('группы', 0.004626151577285963),
 ('проект', 0.004623502161208501),
 ('время', 0.004042613657103062),
 ('фильмы', 0.003950389819192411),
 ('преподавательница', 0.0038461290267977