In [1]:
%pip install yake

Note: you may need to restart the kernel to use updated packages.


In [2]:
import json
from yake import KeywordExtractor
from collections import Counter

## Read preprocessed text corpus

In [3]:
with open("data/o4u_preprocessed_messages_Jun_07_2025.json", "r") as f:
    texts = json.load(f)

print(texts[:5])

['Dear students, This channel advertises minor extracurricular activities, internal and external events, hackathons, competitions, campaigns and other potentially interesting happenings. All mentioned is supposed to help you to keep informed about additional opportunities for own personal and professional development. Keep in touch!', 'Hi there! Student Affairs is urgently looking for 3 volunteers to help with administrative work today from 15:30 until 18:00. Your efforts will be compensated with: - innopoints - tea cookies, if you like - friendly 319 team - amazing reputation in the future! If you may help please message andrejsblakunovs', 'Hi there! Want any of these? Student Affairs are looking for volunteers to help with administrative work - today 15:00-17:00 or - tomorrow in 319 from 14:00 to 16:00. Your efforts will be compensated with: - IBC 2019 T-shirt - tea cookies, if you like - friendliness of 319 team! If you may help please message andrejsblakunovs', "Bonjour! Ça va? С'e

## Extract potential keywords

In [4]:
MAX_NUM_NGRAM = 2
MAX_NUM_KEYWORDS = 10

custom_stopwords = {
    "a", "an", "and", "the", "in", "on", "for", "to", "with", "of", "is", "are", 
    "was", "were", "it", "that", "this", "you", "he", "she", "they", "we", "i",
    "or", "but", "at", "by", "from", "be", "as", "about", "your", "have", "has", "had"
    "innopolis", "university", "iu",
    "dear", "students", "join", "invite", "invites", "room", "hall",
    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday",
    "sunday", "today", "tomorrow", "yesterday",
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december"
}

kw_extractor = KeywordExtractor(
    n=MAX_NUM_NGRAM,
    top=MAX_NUM_KEYWORDS,
    windowsSize=2,
    stopwords=custom_stopwords
)



In [5]:
sorted(kw_extractor.extract_keywords(texts[1]), key=lambda x: x[1])

[('Student Affairs', np.float64(0.13852360513076734)),
 ('there', np.float64(0.16526509993669602)),
 ('until 18:00', np.float64(0.2888479059923554)),
 ('Affairs', np.float64(0.2986136531614795)),
 ('innopoints', np.float64(0.30807788090138477)),
 ('help', np.float64(0.3335343196299429)),
 ('tea cookies', np.float64(0.3341383705148965)),
 ('Student', np.float64(0.40744790190646923)),
 ('18:00', np.float64(0.40744790190646923)),
 ('urgently looking', np.float64(0.433785625834595))]

In [6]:
extracted_keywords = []
for text in texts:
    keywords_scores = sorted(kw_extractor.extract_keywords(text), key=lambda x: x[1])
    if keywords_scores:
        keywords = [kw[0].lower() for kw in keywords_scores]
        extracted_keywords.extend(keywords[:5])

print(len(extracted_keywords))

14225


In [7]:
keyword_counter = Counter(extracted_keywords)

In [12]:
popular_keywords = sorted(
    [(kw, cnt) for kw, cnt in keyword_counter.items() if cnt > 10],
    key=lambda x: x[1],
    reverse=True
)

In [13]:
for kw, cnt in popular_keywords:
    print(repr(kw), cnt)

'visiting lecturer' 125
'reminder' 115
'lecturer candidate' 112
'innopolis' 103
'new visiting' 100
'candidate talk' 82
'inno stand' 79
'friendly reminder' 67
'club' 59
'take part' 54
'new year' 52
'international fest' 45
'candidate lecture' 45
'friendly' 45
'day' 41
'got talent' 41
'artificial intelligence' 40
'inno got' 37
'stand' 35
'year' 35
'russian' 31
'new faculty' 31
'will' 30
'new' 29
'russian federation' 29
'international' 29
'fest' 25
'master class' 24
'iustudentnews good' 24
'friends' 23
'summer school' 23
'spring ball' 22
'faculty candidate' 22
'ball' 21
'information security' 21
'those who' 19
'sport complex' 19
'iustudentnews hello' 19
'lounge zone' 18
'inno' 18
'good evening' 18
'student' 17
'what' 17
'rage club' 17
'quiz' 17
'spring' 16
'olympiad' 16
'твой ход' 16
'lecture seminar' 16
'candidate class' 16
'good' 16
'art' 15
'volunteering opportunity' 15
'club fest' 15
'tatarstan' 15
'lecture' 15
'dance day' 15
'our' 14
'volunteers wanted' 14
'kazan digital' 14
'applicat