In [1]:
from pathlib import Path
import pandas as pd

# pip install git+https://github.com/boudinfl/pke.git
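# pke relies on spaCy (>= 3.2.3) for text processing and requires a spaCy model for each language used:
# download the English model
# python -m spacy download en_core_web_sm
# NOTE: this notebook loads its document with language='ru', so the Russian model is
# presumably required as well (verify the model name against the pke/spaCy docs):
# python -m spacy download ru_core_news_sm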
import pke

In [8]:
# Load the full dataset (semicolon-separated CSV) and keep only the rows whose
# "rubrics" value contains the rubric name 'Образование_онлайн_курсы'.
all_df = pd.read_csv("dest/all_dataset.csv", sep=";")
# na=False: a row with a missing "rubrics" value counts as "no match" instead of
#           putting NaN into the boolean mask, which pandas refuses to index with.
# regex=False: the rubric name is matched as a literal substring, not as a regex.
all_df = all_df[
    all_df["rubrics"].str.contains('Образование_онлайн_курсы', na=False, regex=False)
]

In [9]:
# Join every non-null entry of the "text" column (cast to str) into one
# space-separated document, then keep only the first 1,000,000 characters.
# NOTE(review): the cap presumably keeps the input within spaCy's default
# maximum document length — confirm.
non_null_texts = all_df['text'].dropna().astype("str")
text = " ".join(non_null_texts)[:1_000_000]

In [10]:

# Part-of-speech tags accepted both for candidate selection and for the
# nodes of the word graph: nouns, proper nouns and adjectives.
pos = {'NOUN', 'PROPN', 'ADJ'}

# Step 1-2: build a SingleRank extractor and hand it the raw text, declared
# as Russian, with no normalization requested (normalization=None).
extractor = pke.unsupervised.SingleRank()
extractor.load_document(input=text, language='ru', normalization=None)

# Step 3: candidates are the longest runs of words carrying the tags above.
extractor.candidate_selection(pos=pos)

# Step 4: each candidate is scored as the sum of its words' scores, which
# come from a random walk over a graph whose nodes are words with the
# accepted tags, connected when they occur within a window of 10 words.
extractor.candidate_weighting(window=10, pos=pos)

# Step 5: keep the 10 highest-scored candidates as the keyphrases.
keyphrases = extractor.get_n_best(n=10)


In [11]:
# Display the extracted keyphrases; the output below shows (phrase, score) pairs.
keyphrases

[('данный электив интересный электив', 0.02662816459361012),
 ('электив сам электив', 0.023828254952605966),
 ('электив замечательный преподаватель рамзия ахмаровна', 0.01862670117034687),
 ('целом интересный курс', 0.0186219256458276),
 ('курс нужно', 0.018582123548682328),
 ('курс обучения', 0.01839208988982352),
 ('целом курс хороший', 0.018325158152650546),
 ('электив - хорошая возможность', 0.01821776993474377),
 ('целом данный курс', 0.018080183684864783),
 ('объемные задания интересный электив', 0.0180497803651325)]