In [None]:
from pathlib import Path
import pandas as pd

# Install pke:
#   pip install git+https://github.com/boudinfl/pke.git
# pke relies on spaCy (>= 3.2.3) for text processing and needs the language model installed.
# Download the Russian model:
#   python -m spacy download ru_core_news_lg
import pke
import string

In [None]:
# Load the tab-separated dataset; later cells read its `rubrics` and `text` columns.
all_df = pd.read_csv(
    Path("dest", "all_dataset.csv"),
    encoding="utf-8",
    sep="\t",
)

In [None]:
# Keep only the rows whose `rubrics` value contains the chosen rubric substring.
# Other rubric values that have been used with this notebook:
#   'образование_онлайн_курсы', 'образование_отзывус'
# NOTE: this overwrites `all_df`, so re-run the loading cell before switching
# RUBRIC, and then re-run every extractor cell below. The saved outputs of the
# TextRank/TopicRank/SingleRank cells talk about elective courses rather than
# bars, i.e. they were produced with a different rubric and are stale.
RUBRIC = 'Бар'
# na=False: rows with a missing rubric are excluded instead of putting NaN into
# the boolean mask (which makes the indexing fail).
# regex=False: the rubric is matched as a literal substring, not as a pattern.
all_df = all_df[all_df["rubrics"].str.contains(RUBRIC, na=False, regex=False)]

In [None]:
# Build the single document that every extractor below consumes: all non-null
# `text` values joined with spaces and cut to at most MAX_CHARS characters.
# Rows are shuffled before joining, so the truncation keeps a random subset of
# rows rather than just the first ones in file order. To keep the original row
# order instead, drop the `.sample(...)` call.
# The shuffle is seeded so that Restart & Run All reproduces the same document
# (and therefore the same keyphrases).
SEED = 42
MAX_CHARS = 100_000  # cap on the document length, in characters
text = " ".join(
    all_df["text"].dropna().sample(frac=1, random_state=SEED).astype("str")
)[:MAX_CHARS]

# TextRank

In [None]:
%%time

# Part-of-speech tags admitted into the word graph: nouns and proper nouns only
# (switch to {'NOUN', 'PROPN', 'ADJ'} to admit adjectives as well).
pos = {'NOUN', 'PROPN'}

# Step 1 - instantiate the TextRank model.
extractor_textrank = pke.unsupervised.TextRank()

# Step 2 - feed it the Russian document, with normalization explicitly disabled.
extractor_textrank.load_document(input=text, language='ru', normalization=None)

# Step 3 - build the word graph (window=4) and rank the words; keyphrase
#          candidates are composed from the 33-percent highest-ranked words.
extractor_textrank.candidate_weighting(window=4, pos=pos, top_percent=0.33)




CPU times: user 5.64 s, sys: 67.8 ms, total: 5.71 s
Wall time: 5.71 s


In [None]:
# 4. get the 20 highest-scored candidates as keyphrases; the result is
#    displayed below as a list of (keyphrase, score) tuples
keyphrases_textrank = extractor_textrank.get_n_best(n=20)
keyphrases_textrank

[('теории - электив', 0.03129145157117268),
 ('электив -', 0.029875414680006533),
 ('- электив', 0.029843974680006533),
 ('- преподаватель', 0.02502374434325645),
 ('преподаватель -', 0.02494784434325645),
 ('студентов электив', 0.021192362048651266),
 ('электив тем студентам', 0.020646958445456314),
 ('электив людей', 0.020460176820970587),
 ('начале курса преподаватель', 0.02036677000340112),
 ('задания преподаватель', 0.020245971268097948),
 ('целом электив', 0.020189198614876464),
 ('проект электив', 0.020182275280965733),
 ('это электив', 0.019729350126032327),
 ('электив тем', 0.019655144620199426),
 ('электив', 0.018826107107431007),
 ('итогам курса - программа', 0.018122629999391272),
 ('задания -', 0.017404302069992553),
 ('вопросов - доп баллы', 0.017380479761658497),
 ('семестра преподаватель', 0.016911013239008454),
 ('преподаватель практики', 0.01669096757404685)]

# TopicRank

In [None]:
%%time

# Stoplist handed to the extractor: every ASCII punctuation character followed
# by pke's Russian stopwords.
stoplist = [*string.punctuation, *pke.lang.stopwords.get('ru')]

# 1. create the keyphrase extraction model, here TopicRank.
extractor_topicrank = pke.unsupervised.TopicRank()

# 2. load the document; the input is a plain text string and preprocessing is
#    carried out using spacy.
extractor_topicrank.load_document(
    input=text,
    language='ru',
    stoplist=stoplist,
)

# 3. keyphrase candidate selection: only nouns and proper nouns are allowed
#    inside a candidate (no adjectives).
pos = {'NOUN', 'PROPN'}
extractor_topicrank.candidate_selection(pos=pos)

# 4. candidate weighting, in the case of TopicRank: using a random walk algorithm.
extractor_topicrank.candidate_weighting(
    threshold=0.74,
    method='average',
)


CPU times: user 16.5 s, sys: 95.7 ms, total: 16.6 s
Wall time: 16.6 s


In [None]:
# N-best selection: keyphrases_topicrank holds the 20 highest-scored candidates
# as (keyphrase, score) tuples
keyphrases_topicrank = extractor_topicrank.get_n_best(n=20)
keyphrases_topicrank

[('электив', 0.04374085853602636),
 ('парах', 0.02738448170817668),
 ('преподаватель', 0.0271791115190622),
 ('задания', 0.018739328662491613),
 ('лекции', 0.01577885389484904),
 ('теме', 0.013366875628217765),
 ('работа зала', 0.012636904871537695),
 ('практике', 0.01253481064271973),
 ('балла', 0.012109779742456576),
 ('занятиях', 0.01170286158376834),
 ('курс', 0.010894734639320822),
 ('материал', 0.010651983244162817),
 ('презентации', 0.009616472898400822),
 ('элективе', 0.008807505141742885),
 ('информации ноль', 0.008730331039044161),
 ('студент', 0.008426415740849727),
 ('знаниями', 0.007771797167432178),
 ('вопросы', 0.007593502197698881),
 ('зачёт', 0.006902731148059688),
 ('группе', 0.006488117779589755)]

# SingleRank

In [None]:
%%time

# Step 1 - instantiate the SingleRank model.
extractor_singlerank = pke.unsupervised.SingleRank()

# Step 2 - feed it the Russian document, with normalization explicitly disabled.
extractor_singlerank.load_document(input=text, language='ru', normalization=None)

# Step 3 - candidates are the longest sequences of words whose part-of-speech
#          is in `pos`: nouns and proper nouns (adjectives are not included).
pos = {'NOUN', 'PROPN'}
extractor_singlerank.candidate_selection(pos=pos)

# Step 4 - score each candidate as the sum of its words' scores, which come
#          from a random walk over the word graph. Graph nodes are the words
#          with a part-of-speech in `pos`; two nodes are connected if the
#          words occur within a window of 10 words.
extractor_singlerank.candidate_weighting(window=10, pos=pos)


CPU times: user 5.34 s, sys: 55.9 ms, total: 5.4 s
Wall time: 5.39 s


In [None]:
# 5. get the 20 highest-scored candidates as keyphrases; the result is
#    displayed below as a list of (keyphrase, score) tuples
keyphrases_singlerank = extractor_singlerank.get_n_best(n=20)
keyphrases_singlerank

[('теории - электив', 0.03974693264373249),
 ('электив -', 0.038359945245145514),
 ('целом электив', 0.03206445731005243),
 ('проект электив', 0.031316604092436445),
 ('студентов электив', 0.03126727287287185),
 ('электив людей', 0.030983823363713463),
 ('моок электив', 0.029684082022215064),
 ('электив год', 0.02957551658380631),
 ('сенцонуицнауа электив', 0.029508266432170645),
 ('сдачу электив', 0.029426588750769495),
 ('закрытии электив', 0.02942432670419585),
 ('подспорьем электив', 0.029354512263770668),
 ('электив', 0.029108646766357597),
 ('преподаватель - душка', 0.026389286182699676),
 ('преподаватель - психолог', 0.02627851495222831),
 ('- преподаватель', 0.026049244879996946),
 ('задания преподаватель', 0.025176469271029477),
 ('начале курса преподаватель', 0.023311636800214357),
 ('преподаватель идея электива', 0.021544146211280944),
 ('половине семестра преподаватель', 0.020483028546499922)]

# PositionRank

In [None]:
%%time

# Part-of-speech tags allowed as nodes of the word graph.
# 'ADJ' is included because the candidate grammar below admits adjectives:
# with a noun-only graph the adjectives carried no score, so every
# "<adjective> + noun" candidate tied with the bare noun (visible in the saved
# output, where eight "... местечко" variants share one identical score).
pos = {'NOUN', 'PROPN', 'ADJ'}

# Grammar for selecting the keyphrase candidates: optional adjectives followed
# by one or more nouns / proper nouns.
# For noun-only candidates use "NP: {<NOUN|PROPN>+}" and remove 'ADJ' from
# `pos` above so that the grammar and the graph stay consistent.
grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"

# 1. create a PositionRank extractor.
extractor_positionrank = pke.unsupervised.PositionRank()

# 2. load the content of the document.
extractor_positionrank.load_document(input=text,
                                     language='ru',
                                     normalization=None)

# 3. select the noun phrases of up to 3 words as keyphrase candidates.
extractor_positionrank.candidate_selection(grammar=grammar,
                                           maximum_word_number=3)

# 4. weight the candidates using the sum of their words' scores, computed with
#    a random walk biased by the position of the words in the document.
#    Graph nodes are the words with a part-of-speech in `pos`; two nodes are
#    connected if the words occur within a window of 10 words.
# NOTE(review): the saved output also contains candidates with a bare "-"
# token (e.g. '- меню'); presumably the tagger labels some dashes as nouns.
# Consider filtering such candidates - confirm the right pke call first.
extractor_positionrank.candidate_weighting(window=10,
                                           pos=pos)


CPU times: user 5.72 s, sys: 172 ms, total: 5.89 s
Wall time: 5.89 s


In [None]:
# 5. get the 20 highest-scored candidates as keyphrases; the result is
#    displayed below as a list of (keyphrase, score) tuples
keyphrases_positionrank = extractor_positionrank.get_n_best(n=20)
keyphrases_positionrank

[('милое местечко', 0.03426966960482957),
 ('атмосферное местечко', 0.03426966960482957),
 ('самое вкусное местечко', 0.03426966960482957),
 ('данное местечко', 0.03426966960482957),
 ('интересно местечко', 0.03426966960482957),
 ('убогое местечко', 0.03426966960482957),
 ('уютное местечко', 0.03426966960482957),
 ('местечко', 0.03426966960482957),
 ('сыров -', 0.03407217417400235),
 ('- меню', 0.032872340615539855),
 ('пиво - класс', 0.03127058172060554),
 ('- пиво', 0.030817591889289106),
 ('- вкусное пиво', 0.030817591889289106),
 ('атмосфера - кайф', 0.030586651514968793),
 ('раз -', 0.028809622305363047),
 ('- официант ярослав', 0.027532756575211478),
 ('официанты - лапочки', 0.027306570625952015),
 ('- лучший бар', 0.02682962766585958),
 ('- подвальный бар', 0.02682962766585958),
 ('основных блюд -', 0.026178607679303034)]

# MultipartiteRank

In [None]:
%%time

# Stoplist handed to the extractor: every ASCII punctuation character followed
# by pke's Russian stopwords.
stoplist = [*string.punctuation, *pke.lang.stopwords.get('ru')]

# Step 1 - instantiate the MultipartiteRank model.
extractor_multipartiterank = pke.unsupervised.MultipartiteRank()

# Step 2 - feed it the Russian document together with the stoplist.
extractor_multipartiterank.load_document(
    input=text,
    language="ru",
    stoplist=stoplist,
)

# Step 3 - candidates are the longest sequences of nouns and proper nouns
#          (adjectives are not included) that do not contain punctuation
#          marks or stopwords.
pos = {'NOUN', 'PROPN'}
extractor_multipartiterank.candidate_selection(pos=pos)

# Step 4 - build the Multipartite graph and rank candidates using a random
#          walk; alpha controls the weight adjustment mechanism, see
#          TopicRank for the threshold/method parameters.
extractor_multipartiterank.candidate_weighting(
    alpha=1.5,
    threshold=0.25,
    method='average',
)


CPU times: user 21.9 s, sys: 400 ms, total: 22.3 s
Wall time: 22.3 s


In [None]:
# 5. get the 20 highest-scored candidates as keyphrases; the result is
#    displayed below as a list of (keyphrase, score) tuples
keyphrases_multipartiterank = extractor_multipartiterank.get_n_best(n=20)
keyphrases_multipartiterank

[('место', 0.02895057012781344),
 ('персонал', 0.01641370245076043),
 ('заведение', 0.01541817042853683),
 ('пива', 0.014374028398306085),
 ('кухню', 0.014062170684994992),
 ('еде', 0.013398316598900896),
 ('атмосфера', 0.01152498747316213),
 ('обслуживание', 0.011246302932412116),
 ('цены', 0.010620380838374473),
 ('официанты', 0.010556204678451612),
 ('меню', 0.009782930862203755),
 ('музыка', 0.007933471650336496),
 ('блюда', 0.0077810815273155745),
 ('стол', 0.007499518506603814),
 ('раз', 0.007211008567100512),
 ('бар', 0.007210308239260101),
 ('интерьер', 0.0071106000636529335),
 ('спасибо', 0.006996946852392568),
 ('коктейли', 0.006380744174745164),
 ('заказе', 0.005141419376638484)]