In [2]:
%pip install yake spacy sentence-transformers scikit-learn pandas numpy ipywidgets tqdm
!python -m spacy download en_core_web_sm

Collecting yake
  Downloading yake-0.4.8-py2.py3-none-any.whl.metadata (4.0 kB)
Collecting segtok (from yake)
  Downloading segtok-1.5.11-py3-none-any.whl.metadata (9.0 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cusolver

# Read preprocessed text corpus

In [3]:
import json
import pandas as pd
import torch
from yake import KeywordExtractor
from collections import Counter
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy import load

2025-06-11 12:42:37.889805: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749645758.109794      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749645758.176667      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
with open("/kaggle/input/o4u-messages/o4u_preprocessed_messages_Jun_07_2025.json") as f:
    texts = json.load(f)

print(texts[:5])

['Dear students, This channel advertises minor extracurricular activities, internal and external events, hackathons, competitions, campaigns and other potentially interesting happenings. All mentioned is supposed to help you to keep informed about additional opportunities for own personal and professional development. Keep in touch!', 'Hi there! Student Affairs is urgently looking for 3 volunteers to help with administrative work today from 15:30 until 18:00. Your efforts will be compensated with: - innopoints - tea cookies, if you like - friendly 319 team - amazing reputation in the future! If you may help please message andrejsblakunovs', 'Hi there! Want any of these? Student Affairs are looking for volunteers to help with administrative work - today 15:00-17:00 or - tomorrow in 319 from 14:00 to 16:00. Your efforts will be compensated with: - IBC 2019 T-shirt - tea cookies, if you like - friendliness of 319 team! If you may help please message andrejsblakunovs', "Bonjour! Ça va? С'e

# Keyword extraction

In [5]:
MAX_NUM_NGRAM = 2
MAX_NUM_KEYWORDS = 10

custom_stopwords = {
    "a", "an", "and", "the", "in", "on", "for", "to", "with", "of", "is", "are", 
    "was", "were", "it", "that", "this", "you", "he", "she", "they", "we", "i",
    "or", "but", "at", "by", "from", "be", "as", "about", "your", "have", "has", "had"
    "innopolis", "university", "iu",
    "dear", "students", "join", "invite", "invites", "room", "hall",
    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday",
    "sunday", "today", "tomorrow", "yesterday",
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december"
}

kw_extractor = KeywordExtractor(
    n=MAX_NUM_NGRAM,
    top=MAX_NUM_KEYWORDS,
    windowsSize=2,
    stopwords=custom_stopwords
)

In [6]:
sorted(kw_extractor.extract_keywords(texts[1]), key=lambda x: x[1])

[('Student Affairs', 0.13852360513076734),
 ('there', 0.16526509993669602),
 ('until 18:00', 0.2888479059923554),
 ('Affairs', 0.2986136531614795),
 ('innopoints', 0.30807788090138477),
 ('help', 0.3335343196299429),
 ('tea cookies', 0.3341383705148965),
 ('Student', 0.40744790190646923),
 ('18:00', 0.40744790190646923),
 ('urgently looking', 0.433785625834595)]

In [7]:
extracted_keywords = []
for text in texts:
    keywords_scores = sorted(kw_extractor.extract_keywords(text), key=lambda x: x[1])
    if keywords_scores:
        keywords = [kw[0].lower() for kw in keywords_scores]
        extracted_keywords.extend(keywords[:5])

print(len(extracted_keywords))

14225


In [8]:
keyword_counter = Counter(extracted_keywords)

In [9]:
popular_keywords = sorted(
    [(kw, cnt) for kw, cnt in keyword_counter.items() if cnt > 10],
    key=lambda x: x[1],
    reverse=True
)

In [10]:
keywords_candidates = []
for kw, cnt in popular_keywords:
    print(repr(kw), cnt)
    keywords_candidates.append(kw)

'visiting lecturer' 125
'reminder' 115
'lecturer candidate' 112
'innopolis' 103
'new visiting' 100
'candidate talk' 82
'inno stand' 79
'friendly reminder' 67
'club' 59
'take part' 54
'new year' 52
'international fest' 45
'candidate lecture' 45
'friendly' 45
'day' 41
'got talent' 41
'artificial intelligence' 40
'inno got' 37
'stand' 35
'year' 35
'russian' 31
'new faculty' 31
'will' 30
'new' 29
'russian federation' 29
'international' 29
'fest' 25
'master class' 24
'iustudentnews good' 24
'friends' 23
'summer school' 23
'spring ball' 22
'faculty candidate' 22
'ball' 21
'information security' 21
'those who' 19
'sport complex' 19
'iustudentnews hello' 19
'lounge zone' 18
'inno' 18
'good evening' 18
'student' 17
'what' 17
'rage club' 17
'quiz' 17
'spring' 16
'olympiad' 16
'твой ход' 16
'lecture seminar' 16
'candidate class' 16
'good' 16
'art' 15
'volunteering opportunity' 15
'club fest' 15
'tatarstan' 15
'lecture' 15
'dance day' 15
'our' 14
'volunteers wanted' 14
'kazan digital' 14
'applicat

# Noun Phrase Chunking

In [11]:
nlp = load("en_core_web_sm")

In [12]:
extracted_noun_phrases = []
for doc in nlp.pipe(texts):
    for chunk in doc.noun_chunks:
        extracted_noun_phrases.append(chunk.text.lower())

print(len(extracted_noun_phrases))

66350


In [13]:
phrase_counter = Counter(extracted_noun_phrases)

In [14]:
popular_phrases = sorted(
    [(n, cnt) for n, cnt in phrase_counter.items() if cnt > 30 and n not in custom_stopwords],
    key=lambda x: x[1],
    reverse=True
)

In [15]:
phrases_candidates = []
for ph, cnt in popular_phrases:
    print(repr(ph), cnt)
    phrases_candidates.append(ph)

'part' 379
'who' 339
'the link' 339
'innopolis' 277
'us' 268
'what' 264
'russia' 202
'more information' 195
'which' 190
'participants' 189
'the event' 171
'more info' 170
'education' 170
'innopolis university' 163
'registration' 150
'everyone' 149
'the competition' 142
'questions' 140
'a team' 140
'the form' 139
'our university' 137
'the field' 132
'reminder' 123
'research' 123
'russian' 119
'them' 118
'tatarstan' 117
'participation' 117
'the program' 114
'the world' 113
'science' 112
"'s" 107
'the best faculty candidates' 104
'the opportunity' 103
'the year' 100
'kazan' 93
'registration deadline' 93
'language' 92
'the winners' 91
'the forum' 89
'teams' 86
'experience' 85
'volunteers' 84
'time' 84
'the meeting' 84
'club' 84
'yourself' 83
'the development' 80
'register' 80
'english' 80
'the future' 79
'experts' 79
'artificial intelligence' 79
'deadline' 77
'-' 77
'new visiting lecturer candidate talk' 77
'technology' 76
'the republic' 75
'the university' 75
'development' 75
'artspace' 7

# Semantic clustering based topic extraction

In [16]:
model = SentenceTransformer(
    model_name_or_path="sentence-transformers/all-MiniLM-L12-v2",
    device="cuda:0" if torch.cuda.is_available() else "cpu",
    model_kwargs={"torch_dtype": torch.bfloat16}
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [17]:
document_embeddings = model.encode(texts, batch_size=64)
document_embeddings

Batches:   0%|          | 0/45 [00:00<?, ?it/s]

array([[ 0.0402832 , -0.03540039,  0.08447266, ...,  0.07568359,
        -0.0324707 ,  0.04516602],
       [-0.02783203, -0.04174805,  0.12451172, ...,  0.03173828,
        -0.04760742, -0.02258301],
       [-0.01843262, -0.02185059,  0.10107422, ..., -0.02661133,
        -0.06176758, -0.02526855],
       ...,
       [ 0.01324463, -0.03112793,  0.00037003, ...,  0.00061798,
        -0.03710938, -0.01391602],
       [-0.0135498 ,  0.0112915 ,  0.05786133, ..., -0.06347656,
        -0.04077148, -0.00346375],
       [ 0.01397705, -0.03344727,  0.02050781, ...,  0.06689453,
         0.02941895, -0.03295898]], dtype=float32)

In [18]:
NUM_CLUSTERS = 30
kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=42, n_init="auto")
kmeans.fit(document_embeddings)

In [19]:
cluster_assignments = kmeans.labels_
cluster_assignments

array([13, 25, 25, ..., 18,  3,  5], dtype=int32)

In [20]:
docs_df = pd.DataFrame({'doc': texts, 'cluster': cluster_assignments})
docs_df

Unnamed: 0,doc,cluster
0,"Dear students, This channel advertises minor e...",13
1,Hi there! Student Affairs is urgently looking ...,25
2,Hi there! Want any of these? Student Affairs a...,25
3,Bonjour! Ça va? С'est la vie.. Croissant. If t...,14
4,"On December 14, comedian Vladimir Marconi arri...",18
...,...,...
2845,"""Slippers of the Year"" Contest! Your slippers ...",22
2846,ECO ACTION for World Environment Day! 5 June i...,2
2847,"PreParty of the City Day for homies ""Super Inn...",18
2848,International Acceleration Program 2025 The pr...,3


In [21]:
grouped_docs = docs_df.groupby(["cluster"], as_index=False).agg({"doc": " ".join})
grouped_docs

Unnamed: 0,cluster,doc
0,0,Want to upgrade your programming skills? We ar...
1,1,"Inno Stand Up - every Thursday, 19:00 If you h..."
2,2,Student Union are looking for amazing VOLUNTEE...
3,3,APPLY for the Robotic projects until December ...
4,4,Japanese language courses at Anime Club! The f...
5,5,"""Me and my personal habits. How to use time in..."
6,6,"""The Profession - Business Analyst"" book prese..."
7,7,We invite you to become an IU Ambassador and t...
8,8,ART- Therapy course! Dear students! We continu...
9,9,Ho-Ho-Ho! SECRET SANTA IS COMING SOON! Secret ...


In [22]:
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 3))
tfidf_matrix = vectorizer.fit_transform(grouped_docs['doc'])

In [23]:
feature_names = vectorizer.get_feature_names_out()
print(feature_names)

['00' '00 00' '00 00 00' ... 'ثنائي' 'ثنائي duet'
 'ثنائي duet instrumental']


In [24]:
topics_candidates = []
for i in range(NUM_CLUSTERS):
    row = tfidf_matrix[i].toarray().flatten()
    top_indices = row.argsort()[-5:]
    top_keywords = [feature_names[j] for j in reversed(top_indices)]
    print(f"Topic {i}: {', '.join(top_keywords)}")
    topics_candidates.extend(top_keywords)

Topic 0: innopolis, university, students, innopolis university, programming
Topic 1: inno stand, inno stand club, stand club, stand, inno
Topic 2: wegs, 00, halloween, cook, photo contest
Topic 3: project, technologies, competition, information, projects
Topic 4: club, 00, team, tournament, game
Topic 5: mathematical, lecture, mathematics, russian, university
Topic 6: click link, new visiting, lecturer candidate, just click link, just click
Topic 7: iu, students, university, team, student
Topic 8: art therapy, therapy, meeting, 00, language meeting russian
Topic 9: ho, new year, christmas, ho ho, new
Topic 10: candidate talk, lecturer candidate talk, visiting lecturer, lecturer candidate, new visiting lecturer
Topic 11: russian, competition, scientific, technologies, project
Topic 12: talent, got talent, inno got, inno got talent, got
Topic 13: 00, join, talk space, talk, room
Topic 14: russian, 00, festival, international, language
Topic 15: contest, bee, competition, integration bee,

# Final tag list candidates

In [27]:
tags_candidates = []
tags_candidates.extend(keywords_candidates)
tags_candidates.extend(phrases_candidates)
tags_candidates.extend(topics_candidates)
tags_candidates = set(tags_candidates)

print(len(tags_candidates))
print(tags_candidates)

308
{'ms teams candidates', 'startups', 'bar', 'the winners', 'hip hop', 'the future', 'prizes', 'who want', 'inno stand', 'mathematics', 'university', 'sports programming', 'news like share', 'quiz', 'date', 'lecture', 'secret santa', 'iustudentnews hello', 'part', 'sciences', 'football club', 'volunteers', 'kazan digital', 'the end', 'time', 'news', 'faculty candidate', 'club', 'dance', 'the opportunity', 'head', 'open mic', 'what', 'the framework', 'got', 'job fair', 'ball', 'digital transformation', 'science', 'meeting', 'knowledge', 'therapy', 'application deadline', 'candidate talk', 'day', 'mipt', 'russian venture', 'digital week', 'support', 'the russian federation', 'registration deadline', 'ho ho', 'olympiad', 'contest', 'volunteering', 'all', 'get ready', 'event', 'sport complex', 'hybrid mode', 'international', 'prize fund', 'queen', 'youth forum', 'innostreetdance', 'registration', 'the field', 'the link', 'dance day', 'halloween', 'our students', 'information', 'summer sc

In [None]:
final_tags = {
    # --- Primary Event & Opportunity Types ---
    "Workshop",
    "Lecture",
    "Seminar",
    "Talk",
    "Conference",
    "Forum",
    "Hackathon",
    "Olympiad",
    "Contest",
    "Festival",
    "Job Fair",
    "Master Class",
    "Club Meeting",
    "Ball",
    "Concert",
    "Party",
    "Quiz",
    "Game",
    "Internship",
    "Volunteering",
    # --- Common Topics ---
    "Programming",
    "Artificial Intelligence",
    "Computer Science",
    "Machine Learning",
    "Data Science",
    "Cybersecurity",
    "Robotics",
    "Science",
    "Mathematics",
    "Physics",
    "Business",
    "Startups",
    "Design",
    "Art",
    "Music",
    "Dance",
    "Sports",
    "Language Learning"
}

In [26]:
print(len(final_tags))

37
