## load data

In [23]:
import pandas as pd
from bs4 import BeautifulSoup
import emoji
import re


def preprocess_text(text: str) -> str:
  soup = BeautifulSoup(text, features="html.parser")
  text = soup.get_text()
  text = text.replace("\n", ". ")
  return text


def remove_url(text: str) -> str:
  return re.sub(r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)', ' ', text)


def preprocess_entity(text: str) -> str:
  text = text.replace("\n", " ")
  return re.sub(r'[@][^\ ]*', '', text)

In [8]:
news_df = pd.read_csv("post10k.csv", encoding="utf-8")
news_df = news_df.dropna(subset=['text'])
news_df = news_df.reset_index(drop=True)
news_df.text = news_df.text.apply(lambda x: preprocess_text(x))
news_df.text = news_df.text.apply(lambda x: remove_url(x))
news_df['text'] = news_df["text"].apply(lambda x: emoji.replace_emoji(x))

In [9]:
news_df

Unnamed: 0.1,Unnamed: 0,msg_id,entity_id,text,content_type,keyboard,silent,has_media_spoiler,posted
0,0,5301,1002094480,–Ø—Ä–∫–∏–µ —Ñ–æ—Ç–æ–∫–∞—Ä—Ç–æ—á–∫–∏ –ø–µ—Ä–≤–æ–π –ø–æ–±–µ–¥—ã –≤ –≥–æ–¥—É —É–∂–µ –∂...,media_group,,0,0,2023-01-09 17:10:06
1,1,5302,1002094480,–ö—Ç–æ —Å—Ç–∞–ª –æ–±–ª–∞–¥–∞—Ç–µ–ª–µ–º –∏–≥—Ä–æ–≤–æ–π –º–∞–π–∫–∏ –∏ –±—Ä–µ–Ω–¥–∏—Ä–æ...,text,,0,0,2023-01-10 14:00:11
2,2,5304,1002094480,–¢—Ä–µ–Ω–∏—Ä—É–µ–º—Å—è –≤ —Ç—Ä–∞–¥–∏—Ü–∏–æ–Ω–Ω–æ–º —Ä–µ–∂–∏–º–µ . . –ì–æ—Ç–æ–≤–∏–º—Å...,video,,0,0,2023-01-10 14:40:25
3,3,5305,1002094480,–ü—Ä–æ–≤–µ–¥–∏ –≥–æ–¥ –≤–º–µ—Å—Ç–µ —Å –î–∏–Ω–∞–º–æ-–ê–∫ –ë–∞—Ä—Å . . –ü—Ä–µ–¥—Å—Ç...,video,,0,0,2023-01-11 12:30:23
4,4,5306,1002094480,–î–∏–Ω–∞–º–æ-–ê–∫ –ë–∞—Ä—Å –ú–∏–Ω—á–∞–Ω–∫–∞. . –ö—Ç–æ –æ–∫–∞–∂–µ—Ç—Å—è —Å–∏–ª—å–Ω...,photo,,0,0,2023-01-11 16:00:06
...,...,...,...,...,...,...,...,...,...
11078,2217,5732,1150141694,–ß–∏—Å—Ç–∏–ª–∏ –¥–æ—Ä–æ–≥—É-—Å–∫–æ–≤—ã—Ä–Ω—É–ª–∏ —Ç–∞–∫—Å–æ—Ñ–æ–Ω.. . @montaj...,photo,,0,0,2023-03-27 09:01:43
11079,2218,5733,1150141694,–ù–∞—Ç–∫–Ω—É–ª—Å—è –Ω–∞ –ø–æ–ª–µ–∑–Ω—ã–π –∫–∞–Ω–∞–ª –æ—Ç –¢–∏–º—É—Ä–∞ –ï–≤–≥–∞–∂—É–∫–æ...,photo,,0,0,2023-03-27 09:21:34
11080,2219,5735,1150141694,@montajniklvs,media_group,,0,0,2023-03-27 14:00:54
11081,2220,5736,1150141694,–ö–æ–º–ø–∞–Ω–∏—è X-–°om —Å–æ–≤–º–µ—Å—Ç–Ω–æ c Hyperline –ø—Ä–∏–≥–ª–∞—à–∞–µ...,photo,,0,0,2023-03-27 14:09:43


In [24]:
ent_df = pd.read_csv("entity10k.csv", encoding="utf-8")

ent_df = ent_df.dropna(subset=['about', 'name'])
ent_df = ent_df.reset_index(drop=True)
ent_df.about = ent_df.about.apply(lambda x: preprocess_entity(x))

ent_df

Unnamed: 0.1,Unnamed: 0,entity_id,created_with,entity_type,photo_id,photo_url,name,username,access_hash,about,...,cheat_created,ios_ban,scam,verified,fake,hidden,blocked,language_code,birthdate,created
0,0,1610995496,,public_channel,5.472302e+18,https://telegra.ph/file/8bfdab6f783b8dc653ee5.jpg,PanStandUp,panstandup,-3962254009726019372,–ö–æ–º–∏–∫–∏ –∏–∑ –ë–µ–ª–∞—Ä—É—Å–∏ –≤ –í–∞—Ä—à–∞–≤–µ (–ü–æ–ª—å—à–∞). –î–µ–ª–∞–µ–º ...,...,,False,False,False,False,False,False,be,2022-08-08 15:50:07.000000,2023-01-26 20:20:56.994689
1,1,1789104687,,public_channel,5.256092e+18,https://telegra.ph/file/8e59a247c7bf2fefa3623.jpg,Perusha,perushag,7057200524768072727,–ü—Ä–æ–¥–∞—é –ª—Å:,...,,False,False,False,False,False,False,ru,2023-02-16 13:03:54.000000,2023-02-18 08:30:23.964101
2,4,1436274540,,public_channel,5.382082e+18,https://telegra.ph/file/6b5a56d9dcecc1a6512b0.jpg,–í–æ–ª–æ–¥–∏–º–∏—Ä –ù–∞–∑–∞—Ä–µ–Ω–∫–æ,nazarenkovolodymyr,-4403316951522258700,,...,,False,False,False,False,False,False,uk,2020-01-14 12:24:58.000000,2023-01-28 21:30:48.070326
3,5,1164503012,,public_channel,,,Tania,tania,-3016471089131185998,"To acquire the username, contact:",...,,False,False,False,False,False,False,,2018-08-10 15:18:21.000000,2023-03-23 16:15:02.398859
4,9,1652917150,,public_channel,5.224346e+18,https://telegra.ph/file/8e21c229d0855e30cad86.jpg,Astrolog4U |–ê—Å—Ç—Ä–æ–ª–æ–≥–∏—è|–ì–æ—Ä–æ—Å–∫–æ–ø,astrolog4y,-1395830427450260891,–¢–≤–æ–π –ª–∏—á–Ω—ã–π –ê—Å—Ç—Ä–æ–ª–æ–≥ü´Ç –†–µ–∫–ª–∞–º–∞: –ó–∞–∫—É–ø –∏ –í–ü:...,...,,False,False,False,False,False,False,ru,2022-08-10 09:33:13.000000,2023-01-26 13:36:00.010200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8702,9994,1685138688,,public_channel,5.224527e+18,https://telegra.ph/file/ee3480c2badd90cdd285d.jpg,–ü—É–Ω–∫—Ç –í—ã–¥–∞—á–∏ –î–ù–†,ozon_wb_dpr,5698232276355410541,–ù–æ–≤–æ—Å—Ç–∏ –ü–í–ó,...,,False,False,False,False,False,False,ru,2023-03-14 13:14:24.000000,2023-03-25 15:22:48.378101
8703,9995,1192459940,,public_channel,5.393149e+18,https://telegra.ph/file/cc58e72ffad36e7f70da9.jpg,–ö–û–†–ü–û–†–ê–¢–ò–í–ù–´–ô –°–ï–ö–†–ï–¢–ê–†–¨,nokc1,-7612141659753682724,"–ò–Ω—Ñ–æ—Ä–º–∞—Ü–∏–æ–Ω–Ω—ã–π –∫–∞–Ω–∞–ª –¥–ª—è —á–ª–µ–Ω–æ–≤ –ù–û–ö–°, –ø—Ä–æ—Ñ–µ—Å—Å–∏...",...,,False,False,False,False,False,False,ru,2018-02-07 19:47:08.000000,2023-01-26 23:21:40.681633
8704,9997,1185527198,,public_channel,5.294033e+18,https://telegra.ph/file/9f970b5bb0533a8b5d31c.jpg,—è–∑—ã—á–∫–∏ –∏ —Ç–µ–º—ã —Ç–≥,yazyki_dlya,2539500734111985922,–ø–æ–¥–±–æ—Ä–∫–∏ —Ç–µ–º–æ–∫ –∏ —è–∑—ã–∫–æ–≤ –ø–æ —Ä–µ–∫–ª–∞–º–µ: –†–µ–∫–ª–∞–º–∞...,...,,False,False,False,False,False,False,ru,2021-09-01 10:53:30.000000,2023-03-05 14:43:58.608290
8705,9998,1605489168,,public_channel,5.278642e+18,https://telegra.ph/file/faa9d704bbea0ca4f9328.jpg,IT channels | –∫–∞—Ç–∞–ª–æ–≥ IT –∫–∞–Ω–∞–ª–æ–≤,itchannels_telegram,-3072196939198662943,–ø–æ –≤—Å–µ–º –≤–æ–ø—Ä–æ—Å–∞–º ...,...,,False,False,False,False,False,False,,2022-03-08 14:30:16.000000,2023-01-26 13:02:41.959799


## BERTopic

https://www.kaggle.com/code/falloutbabe/russian-invasion-of-ukraine-news-topic-modeling/notebook

In [26]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import os
from nltk.corpus import stopwords
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

import torch

!pip install umap-learn
!pip install hdbscan
!pip install bertopic
!pip install sentence-transformers
!pip install emoji==1.7.0

from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer



In [27]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [62]:
docs = news_df.text.to_list()
ents = ent_df.about.to_list()

In [63]:
umap_model = UMAP(n_neighbors=10, n_components=5, metric='cosine', low_memory=False)
vectorizer_model = CountVectorizer(stop_words=stopwords.words('russian') + stopwords.words('english'))
hdbscan_model = HDBSCAN(min_cluster_size=35, min_samples=10, metric='euclidean', prediction_data=True)

topic_model_ent = BERTopic(umap_model=umap_model,
                       vectorizer_model=vectorizer_model,
                       hdbscan_model=hdbscan_model,
                       nr_topics=40, top_n_words=10, language='multilingual', verbose=True).fit(ents)

Batches:   0%|          | 0/273 [00:00<?, ?it/s]

2023-10-11 12:44:21,307 - BERTopic - Transformed documents to Embeddings
2023-10-11 12:44:46,349 - BERTopic - Reduced dimensionality
2023-10-11 12:44:46,853 - BERTopic - Clustered reduced embeddings
2023-10-11 12:44:47,947 - BERTopic - Reduced number of topics from 42 to 40


In [53]:
umap_model = UMAP(n_neighbors=15, n_components=5, metric='cosine', low_memory=False)
vectorizer_model = CountVectorizer(stop_words=stopwords.words('russian') + stopwords.words('english'))
hdbscan_model = HDBSCAN(min_cluster_size=30, min_samples=5, metric='euclidean', prediction_data=True)

topic_model_post = BERTopic(umap_model=umap_model,
                       vectorizer_model=vectorizer_model,
                       hdbscan_model=hdbscan_model,
                       nr_topics=100, top_n_words=10, language='multilingual', verbose=True).fit(docs)

Batches:   0%|          | 0/347 [00:00<?, ?it/s]

2023-10-11 12:16:47,226 - BERTopic - Transformed documents to Embeddings
2023-10-11 12:17:01,597 - BERTopic - Reduced dimensionality
2023-10-11 12:17:02,104 - BERTopic - Clustered reduced embeddings
2023-10-11 12:17:04,504 - BERTopic - Reduced number of topics from 70 to 70


In [67]:
topic_model_post.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3865,-1_—ç—Ç–æ_—Ä–æ—Å—Å–∏–∏_–≥–æ–¥–∞_—Ä—Ñ,"[—ç—Ç–æ, —Ä–æ—Å—Å–∏–∏, –≥–æ–¥–∞, —Ä—Ñ, –æ–±–ª–∞—Å—Ç–∏, –≥–æ–¥—É, —Ç–∞–∫–∂–µ, ...",[–¢–∞–ª–æ–Ω—ã –≤ –ú–ú–¶ –≤ –°–∞—Ö–∞—Ä–æ–≤–æ. 12 —Ñ–µ–≤—Ä–∞–ª—è 2023 –≥. –ø...
1,0,72,0_–ø—Ä–µ–¥–æ–ø–ª–∞—Ç—É_–∫–∏–¥–∞–ª–∞_–º–æ—à–µ–Ω–Ω–∏–∫–∏_–º–æ—à–µ–Ω–Ω–∏—á–µ—Å—Ç–≤–∞,"[–ø—Ä–µ–¥–æ–ø–ª–∞—Ç—É, –∫–∏–¥–∞–ª–∞, –º–æ—à–µ–Ω–Ω–∏–∫–∏, –º–æ—à–µ–Ω–Ω–∏—á–µ—Å—Ç–≤–∞,...",[‚Äã–í–ù–ò–ú–ê–ù–ò–ï - –ú–û–®–ï–ù–ù–ò–ö–ò. . –£—á–∞—Å—Ç–∏–ª–∏—Å—å —Å–ª—É—á–∞–∏ –º–æ...
2,1,49,1_—Ç–∞–±–ª–µ—Ç—É—à–∫–∏___,"[—Ç–∞–±–ª–µ—Ç—É—à–∫–∏, , , , , , , , , ]","[ , –¢–∞–±–ª–µ—Ç—É—à–∫–∏, ]"
3,2,53,2_—Ñ—Ä–∞–Ω—Ü–∏–∏_–ø–µ–Ω—Å–∏–æ–Ω–Ω–æ–π_—Ä–µ—Ñ–æ—Ä–º—ã_–ø—Ä–æ—Ç–∏–≤,"[—Ñ—Ä–∞–Ω—Ü–∏–∏, –ø–µ–Ω—Å–∏–æ–Ω–Ω–æ–π, —Ä–µ—Ñ–æ—Ä–º—ã, –ø—Ä–æ—Ç–∏–≤, –º–∞–∫—Ä–æ–Ω,...",[–°–µ–Ω–∞—Ç –§—Ä–∞–Ω—Ü–∏–∏ –ø—Ä–∏–Ω—è–ª –∑–∞–∫–æ–Ω –æ –ø–µ–Ω—Å–∏–æ–Ω–Ω–æ–π —Ä–µ—Ñ–æ—Ä...
4,3,852,3_—Ç–±–∏–ª–∏—Å–∏_–æ–±—ä—è–≤–ª–µ–Ω–∏–π_—Ü–µ–Ω–∞_–∂–∏–≤–æ—Ç–Ω—ã–µ,"[—Ç–±–∏–ª–∏—Å–∏, –æ–±—ä—è–≤–ª–µ–Ω–∏–π, —Ü–µ–Ω–∞, –∂–∏–≤–æ—Ç–Ω—ã–µ, krayzeml...",[–Ø: #–°–¥–∞—é–ñ–∏–ª—å—ë . –í: #–¢–±–∏–ª–∏—Å–∏. –û–ø–∏—Å–∞–Ω–∏–µ: –ê–†–ï–ù–î–ê...
...,...,...,...,...,...
65,64,54,64_—Ç–∞—à–∫–µ–Ω—Ç_tas_–∞–≤–∏–∞–∫–æ–º–ø–∞–Ω–∏–∏_uzbekistan,"[—Ç–∞—à–∫–µ–Ω—Ç, tas, –∞–≤–∏–∞–∫–æ–º–ø–∞–Ω–∏–∏, uzbekistan, –≤—ã–ª–µ—Ç...",[–ü–æ—Å–∞–¥–∫–∞ –≤ –º–µ–∂–¥—É–Ω–∞—Ä–æ–¥–Ω–æ–º –∞—ç—Ä–æ–ø–æ—Ä—Ç—É –∏–º–µ–Ω–∏ –ò—Å–ª–∞–º...
66,65,127,65_mq_—Å—à–∞_–º–æ—Ä–µ–º_—Å—É,"[mq, —Å—à–∞, –º–æ—Ä–µ–º, —Å—É, –±–µ—Å–ø–∏–ª–æ—Ç–Ω–∏–∫, –±–µ—Å–ø–∏–ª–æ—Ç–Ω–∏–∫–∞...",[–ù–∞–º–µ—Ä–µ–Ω–Ω–æ–µ –Ω–∞–ø–∞–¥–µ–Ω–∏–µ –°–®–ê –Ω–∞ —Ä–æ—Å—Å–∏–π—Å–∫–∏–π —Å–∞–º–æ–ª–µ...
67,66,97,66_–≤—Å—É_—É–Ω–∏—á—Ç–æ–∂–∏–ª–∏_–Ω–∞–ø—Ä–∞–≤–ª–µ–Ω–∏–∏_–≤–æ–µ–Ω–Ω—ã—Ö,"[–≤—Å—É, —É–Ω–∏—á—Ç–æ–∂–∏–ª–∏, –Ω–∞–ø—Ä–∞–≤–ª–µ–Ω–∏–∏, –≤–æ–µ–Ω–Ω—ã—Ö, –≤—Å, —Å—É...",[–ù–æ–≤—ã–π –±—Ä–∏—Ñ–∏–Ω–≥ –ú–û –†–§. –ì–ª–∞–≤–Ω–æ–µ:. . –ü–æ—Ç–µ—Ä–∏ –í–°–£ –Ω...
68,67,90,67_–≥–µ—Ä–º–∞–Ω–∏–∏_dwrussian_–µ–≤—Ä–æ_—Ñ—Ä–≥,"[–≥–µ—Ä–º–∞–Ω–∏–∏, dwrussian, –µ–≤—Ä–æ, —Ñ—Ä–≥, –Ω–µ–º–µ—Ü–∫–∏–µ, deu...",[–ß—Ç–æ –º–µ—à–∞–µ—Ç –∏–Ω–æ—Å—Ç—Ä–∞–Ω—Ü—É –Ω–∞–π—Ç–∏ —Ä–∞–±–æ—Ç—É –≤ –ì–µ—Ä–º–∞–Ω–∏–∏...


In [68]:
topic_model_ent.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3751,-1_https_–∫–∞–Ω–∞–ª_com_ru,"[https, –∫–∞–Ω–∞–ª, com, ru, va, uchun, –≤–æ–ø—Ä–æ—Å–∞–º, —Ä...",[–ù–∞—à —Å–∞–π—Ç weekportal.ru —Ä–µ–∫–ª–∞–º–∞ –ü–æ–¥–±–æ—Ä–∫–∏ üåà ...
1,0,670,0_—Å–≤—è–∑—å_–≤—Å–µ–º_–≤–æ–ø—Ä–æ—Å–∞–º_—ç—Ç–æ,"[—Å–≤—è–∑—å, –≤—Å–µ–º, –≤–æ–ø—Ä–æ—Å–∞–º, —ç—Ç–æ, –≤–ø, –º–Ω–æ–π, —Ç–µ–±–µ, —Å...","[–°–≤—è–∑—å —Å–æ –º–Ω–æ–π : , —Å–≤—è–∑—å —Å–æ –º–Ω–æ–π , –ø–æ –≤—Å–µ–º –≤–æ–ø..."
2,1,493,1_https_com_instagram_youtube,"[https, com, instagram, youtube, www, twitter,...",[üíö –°–æ—Ç—Ä—É–¥–Ω–∏—á–µ—Å—Ç–≤–æ - wx7eacooperation / ü•ï Lik...
3,2,476,2_00_–≤–∞_–Ω·¥Ä_–¥–æ—Å—Ç–∞–≤–∫–∞,"[00, –≤–∞, –Ω·¥Ä, –¥–æ—Å—Ç–∞–≤–∫–∞, –∫–∞–Ω–∞–ª, –∞–¥–º–∏–Ω, –≤–æ–ø—Ä–æ—Å–∞–º,...",[–û–ø—Ç–æ–º —ç–ª–µ–∫—Ç—Ä–æ–Ω–Ω—ã–µ –∏—Å–ø–∞—Ä–∏—Ç–µ–ª–∏ 18+üí• –ù–∞—Ö–æ–¥–∏–º—Å—è –ø...
4,3,431,3_—Ä–æ—Å—Å–∏–∏_—Ä—Ñ_–º–æ—Å–∫–≤—ã_ru,"[—Ä–æ—Å—Å–∏–∏, —Ä—Ñ, –º–æ—Å–∫–≤—ã, ru, –Ω–æ–≤–æ—Å—Ç–∏, –∫–∞–Ω–∞–ª, –º–æ—Å–∫–≤...","[–°–∞–º—ã–µ –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã–µ –Ω–æ–≤–æ—Å—Ç–∏ –ò—Ä–∫—É—Ç—Å–∫–æ–π –æ–±–ª–∞—Å—Ç–∏, –†..."
5,4,224,4_—Ä–µ–∫–ª–∞–º—ã_—Ä–µ–∫–ª–∞–º–∞_—Ä–µ–∫–ª–∞–º–µ_–≤–æ–ø—Ä–æ—Å–∞–º,"[—Ä–µ–∫–ª–∞–º—ã, —Ä–µ–∫–ª–∞–º–∞, —Ä–µ–∫–ª–∞–º–µ, –≤–æ–ø—Ä–æ—Å–∞–º, —Ä–µ–∫–ª–∞–º—É,...","[–ü–æ –≤–æ–ø—Ä–æ—Å–∞–º —Ä–µ–∫–ª–∞–º—ã:üëá , –ü–æ –≤–æ–ø—Ä–æ—Å–∞–º —Ä–µ–∫–ª–∞–º—ã ..."
6,5,164,5_–Ω–æ–≤–æ—Å—Ç–∏_—Å–º–∏_—Å–∞–º—ã–µ_–Ω–æ–≤–æ—Å—Ç–µ–π,"[–Ω–æ–≤–æ—Å—Ç–∏, —Å–º–∏, —Å–∞–º—ã–µ, –Ω–æ–≤–æ—Å—Ç–µ–π, –∞–∫—Ç—É–∞–ª—å–Ω—ã–µ, ne...","[–í—Å–µ —Å–∞–º—ã–µ –∞–∫—Ç—É–∞–ª—å–Ω—ã–µ –Ω–æ–≤–æ—Å—Ç–∏, –ú—ã –∑–¥–µ—Å—å –Ω–æ–≤–æ—Å—Ç..."
7,6,161,6_–æ–¥–µ–∂–¥—ã_–æ–¥–µ–∂–¥–∞_–º–∞–≥–∞–∑–∏–Ω_–±—Ä–µ–Ω–¥,"[–æ–¥–µ–∂–¥—ã, –æ–¥–µ–∂–¥–∞, –º–∞–≥–∞–∑–∏–Ω, –±—Ä–µ–Ω–¥, –¥–æ—Å—Ç–∞–≤–∫–∞, –∏–Ω—Ç...",[üõç–û–ù–õ–ê–ô–ù-–ú–ê–ì–ê–ó–ò–ù ¬´LUX BRAND¬ª –ë—Ä–µ–Ω–¥–æ–≤–∞—è –æ–¥–µ–∂–¥–∞...
8,7,157,7_–≥–ª–∞–≤–∞_–æ–±–ª–∞—Å—Ç–∏_—Ä–∞–π–æ–Ω–∞_—Ä–µ—Å–ø—É–±–ª–∏–∫–∏,"[–≥–ª–∞–≤–∞, –æ–±–ª–∞—Å—Ç–∏, —Ä–∞–π–æ–Ω–∞, —Ä–µ—Å–ø—É–±–ª–∏–∫–∏, –≥—É–±–µ—Ä–Ω–∞—Ç–æ...","[–ì–ª–∞–≤–∞ –ü–µ—Ç—Ä–æ–≤—Å–∫–æ–≥–æ –º—É–Ω–∏—Ü–∏–ø–∞–ª—å–Ω–æ–≥–æ —Ä–∞–π–æ–Ω–∞, –ì–ª–∞–≤..."
9,8,129,8_crypto_nft_–∫—Ä–∏–ø—Ç–µ_–∫—Ä–∏–ø—Ç–æ,"[crypto, nft, –∫—Ä–∏–ø—Ç–µ, –∫—Ä–∏–ø—Ç–æ, https, –∫—Ä–∏–ø—Ç–æ–≤–∞–ª...",[ü§ñ feedback | –≤—ñ–¥–ø—Ä–∞–≤–∏—Ç–∏ –µ–ë—É–∫ ‚Äî „Ä∞Ô∏è„Ä∞Ô∏è„Ä∞Ô∏è„Ä∞Ô∏è„Ä∞Ô∏è„Ä∞Ô∏è...


In [69]:
topic_model_post.visualize_hierarchy()

In [70]:
topic_model_ent.visualize_hierarchy()

In [71]:
topic_model_post.visualize_topics()

In [72]:
topic_model_ent.visualize_topics()

In [77]:
text = "–ö–∞–Ω–∞–ª —Å —Ä–µ—Ü–µ–ø—Ç–∞–º–∏"
topic_model_ent.get_topic_info(topic_model_ent.transform(text)[0][0])


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2023-10-11 13:08:43,519 - BERTopic - Reduced dimensionality
2023-10-11 13:08:43,522 - BERTopic - Predicted clusters


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,9,126,9_—Ä–µ—Ü–µ–ø—Ç—ã_wildberries_–≤–∫—É—Å–Ω—ã–µ_–¥–µ—Å–µ—Ä—Ç—ã,"[—Ä–µ—Ü–µ–ø—Ç—ã, wildberries, –≤–∫—É—Å–Ω—ã–µ, –¥–µ—Å–µ—Ä—Ç—ã, –ø–ø, —Ä...",[üìå–°–∞–º—ã–µ –≤–∫—É—Å–Ω—ã–µ —Ä–µ—Ü–µ–ø—Ç—ã –ø–æ —Å–∏—Å—Ç–µ–º–µ –º–∏–Ω—É—Å 60: ‚ñ™...


In [78]:
text = "–°–¥–∞—é –∂–∏–ª—å–µ –≤ –¢–±–∏–ª–∏—Å–∏"
topic_model_post.get_topic_info(topic_model_post.transform(text)[0][0])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2023-10-11 13:09:04,399 - BERTopic - Reduced dimensionality
2023-10-11 13:09:04,402 - BERTopic - Predicted clusters


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,3,852,3_—Ç–±–∏–ª–∏—Å–∏_–æ–±—ä—è–≤–ª–µ–Ω–∏–π_—Ü–µ–Ω–∞_–∂–∏–≤–æ—Ç–Ω—ã–µ,"[—Ç–±–∏–ª–∏—Å–∏, –æ–±—ä—è–≤–ª–µ–Ω–∏–π, —Ü–µ–Ω–∞, –∂–∏–≤–æ—Ç–Ω—ã–µ, krayzeml...",[–Ø: #–°–¥–∞—é–ñ–∏–ª—å—ë . –í: #–¢–±–∏–ª–∏—Å–∏. –û–ø–∏—Å–∞–Ω–∏–µ: –ê–†–ï–ù–î–ê...


## keyBERT
https://github.com/MaartenGr/KeyBERT

In [79]:
!pip install keybert

Collecting keybert
  Downloading keybert-0.8.3.tar.gz (29 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: keybert
  Building wheel for keybert (setup.py) ... [?25l[?25hdone
  Created wheel for keybert: filename=keybert-0.8.3-py3-none-any.whl size=39124 sha256=473c994ec61249bc0e1c05639565ad3d4d5438bd479f53f032a85960785f9b14
  Stored in directory: /root/.cache/pip/wheels/70/88/07/1a3bc11fd1dd5f89924a02dcbca89a3015e25e8faa31f904dc
Successfully built keybert
Installing collected packages: keybert
Successfully installed keybert-0.8.3


In [80]:
pip install -U sentence-transformers



In [81]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

In [82]:
sentence_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
kw_model = KeyBERT(model=sentence_model)

In [83]:
news_df

Unnamed: 0.1,Unnamed: 0,msg_id,entity_id,text,content_type,keyboard,silent,has_media_spoiler,posted
0,0,5301,1002094480,–Ø—Ä–∫–∏–µ —Ñ–æ—Ç–æ–∫–∞—Ä—Ç–æ—á–∫–∏ –ø–µ—Ä–≤–æ–π –ø–æ–±–µ–¥—ã –≤ –≥–æ–¥—É —É–∂–µ –∂...,media_group,,0,0,2023-01-09 17:10:06
1,1,5302,1002094480,–ö—Ç–æ —Å—Ç–∞–ª –æ–±–ª–∞–¥–∞—Ç–µ–ª–µ–º –∏–≥—Ä–æ–≤–æ–π –º–∞–π–∫–∏ –∏ –±—Ä–µ–Ω–¥–∏—Ä–æ...,text,,0,0,2023-01-10 14:00:11
2,2,5304,1002094480,–¢—Ä–µ–Ω–∏—Ä—É–µ–º—Å—è –≤ —Ç—Ä–∞–¥–∏—Ü–∏–æ–Ω–Ω–æ–º —Ä–µ–∂–∏–º–µ . . –ì–æ—Ç–æ–≤–∏–º—Å...,video,,0,0,2023-01-10 14:40:25
3,3,5305,1002094480,–ü—Ä–æ–≤–µ–¥–∏ –≥–æ–¥ –≤–º–µ—Å—Ç–µ —Å –î–∏–Ω–∞–º–æ-–ê–∫ –ë–∞—Ä—Å . . –ü—Ä–µ–¥—Å—Ç...,video,,0,0,2023-01-11 12:30:23
4,4,5306,1002094480,–î–∏–Ω–∞–º–æ-–ê–∫ –ë–∞—Ä—Å –ú–∏–Ω—á–∞–Ω–∫–∞. . –ö—Ç–æ –æ–∫–∞–∂–µ—Ç—Å—è —Å–∏–ª—å–Ω...,photo,,0,0,2023-01-11 16:00:06
...,...,...,...,...,...,...,...,...,...
11078,2217,5732,1150141694,–ß–∏—Å—Ç–∏–ª–∏ –¥–æ—Ä–æ–≥—É-—Å–∫–æ–≤—ã—Ä–Ω—É–ª–∏ —Ç–∞–∫—Å–æ—Ñ–æ–Ω.. . @montaj...,photo,,0,0,2023-03-27 09:01:43
11079,2218,5733,1150141694,–ù–∞—Ç–∫–Ω—É–ª—Å—è –Ω–∞ –ø–æ–ª–µ–∑–Ω—ã–π –∫–∞–Ω–∞–ª –æ—Ç –¢–∏–º—É—Ä–∞ –ï–≤–≥–∞–∂—É–∫–æ...,photo,,0,0,2023-03-27 09:21:34
11080,2219,5735,1150141694,@montajniklvs,media_group,,0,0,2023-03-27 14:00:54
11081,2220,5736,1150141694,–ö–æ–º–ø–∞–Ω–∏—è X-–°om —Å–æ–≤–º–µ—Å—Ç–Ω–æ c Hyperline –ø—Ä–∏–≥–ª–∞—à–∞–µ...,photo,,0,0,2023-03-27 14:09:43


In [84]:
news_df2 = news_df.copy()
news_df2 = news_df2[news_df2.text.str.len() > 200]

In [85]:
news_df2 = news_df2.reset_index(drop=True)
news_df2

Unnamed: 0.1,Unnamed: 0,msg_id,entity_id,text,content_type,keyboard,silent,has_media_spoiler,posted
0,1,5302,1002094480,–ö—Ç–æ —Å—Ç–∞–ª –æ–±–ª–∞–¥–∞—Ç–µ–ª–µ–º –∏–≥—Ä–æ–≤–æ–π –º–∞–π–∫–∏ –∏ –±—Ä–µ–Ω–¥–∏—Ä–æ...,text,,0,0,2023-01-10 14:00:11
1,3,5305,1002094480,–ü—Ä–æ–≤–µ–¥–∏ –≥–æ–¥ –≤–º–µ—Å—Ç–µ —Å –î–∏–Ω–∞–º–æ-–ê–∫ –ë–∞—Ä—Å . . –ü—Ä–µ–¥—Å—Ç...,video,,0,0,2023-01-11 12:30:23
2,4,5306,1002094480,–î–∏–Ω–∞–º–æ-–ê–∫ –ë–∞—Ä—Å –ú–∏–Ω—á–∞–Ω–∫–∞. . –ö—Ç–æ –æ–∫–∞–∂–µ—Ç—Å—è —Å–∏–ª—å–Ω...,photo,,0,0,2023-01-11 16:00:06
3,19,5323,1002094480,–û–¥–µ—Ä–∂–∏–≤–∞–µ–º –ø–æ–±–µ–¥—É —Å–µ–≥–æ–¥–Ω—è! . . –ñ–¥—ë–º –≤–∞—Å –Ω–∞ –Ω–∞—à...,photo,,0,0,2023-01-13 20:31:24
4,25,5337,1002094480,–í—Ä–µ–º—è –±—Ä–∞—Ç—å —Ä–µ–≤–∞–Ω—à! . . –û—Ç–ø—Ä–∞–≤–ª—è–µ–º—Å—è –≤ –ù–∏–∂–Ω–∏–π ...,video,,0,0,2023-01-16 11:20:25
...,...,...,...,...,...,...,...,...,...
8286,2174,5642,1150141694,"–í–∏–¥–µ–æ–Ω–∞–±–ª–Ø–¥–µ–Ω–∏–µ –Ω–∞ –∞–≤—Ç–æ–±–∞–∑–µ.. –°–æ–±—Å—Ç–≤–µ–Ω–Ω–æ, —á—ë—Ä–Ω...",media_group,,0,0,2023-03-17 09:01:05
8287,2208,5711,1150141694,"–ù–µ—É–º–æ–ª–∏–º–∞—è —è—Ä–æ—Å—Ç—å —Å—Ç–∏—Ö–∏–∏, —Å–¥–æ–±—Ä–µ–Ω–Ω–∞—è –Ω–∞—Ä—É—à–µ–Ω–∏–µ...",media_group,,0,0,2023-03-25 09:01:53
8288,2214,5727,1150141694,–ú–µ–∂—Å–µ—Ç–µ–≤–æ–µ –≤–∑–∞–∏–º–æ–¥–µ–π—Å—Ç–≤–∏–µ –≤ —Å–µ—Ç—è—Ö –Ω–∞ –±–∞–∑–µ TCP-...,web_preview,,0,0,2023-03-26 20:26:30
8289,2218,5733,1150141694,–ù–∞—Ç–∫–Ω—É–ª—Å—è –Ω–∞ –ø–æ–ª–µ–∑–Ω—ã–π –∫–∞–Ω–∞–ª –æ—Ç –¢–∏–º—É—Ä–∞ –ï–≤–≥–∞–∂—É–∫–æ...,photo,,0,0,2023-03-27 09:21:34


In [86]:
kw_model.extract_keywords(news_df2.loc[38].text, keyphrase_ngram_range=(1, 1), stop_words=None)

[('—Å–ø–µ–∫—Ç—Ä–∞', 0.5545),
 ('—Å–ø–µ–∫—Ç—Ä–µ', 0.5515),
 ('—á–∞—Å—Ç–æ—Ç–∞–º', 0.4867),
 ('–ø–∏–∫–æ–º–µ—Ç—Ä', 0.4634),
 ('–≥–∏–≥–∞–º–µ—Ç—Ä', 0.441)]

In [87]:
news_df2.loc[38].text

'–î–∏–∞–ø–∞–∑–æ–Ω –≤–∏–¥–∏–º–æ–≥–æ —Å–≤–µ—Ç–∞ –Ω–∞ —ç–ª–µ–∫—Ç—Ä–æ–º–∞–≥–Ω–∏—Ç–Ω–æ–º —Å–ø–µ–∫—Ç—Ä–µ (Reddit). . –ó–∞ –Ω–∏–∂–Ω–∏–π –ø–æ—Ä–æ–≥ —Å–ø–µ–∫—Ç—Ä–∞ –≤–∑—è—Ç–∞ –¥–ª–∏–Ω–∞ –≤–æ–ª–Ω—ã –≤ 1 –ø–∏–∫–æ–º–µ—Ç—Ä (10‚Åª¬π¬≤ –º), –∞ –∑–∞ –≤–µ—Ä—Ö–Ω–∏–π ‚Äî 1 –≥–∏–≥–∞–º–µ—Ç—Ä (10‚Åπ –º). –í–∏–¥–∏–º—ã–π —Å–≤–µ—Ç –ø–æ —Ä–∞–∑–Ω—ã–º –æ—Ü–µ–Ω–∫–∞–º —Ä–∞—Å–ø–æ–ª–∞–≥–∞–µ—Ç—Å—è –≤ –¥–∏–∞–ø–∞–∑–æ–Ω–µ –æ—Ç 380 –¥–æ 780 –Ω–∞–Ω–æ–º–µ—Ç—Ä–æ–≤ (10‚Åª‚Åπ –º). . –í –Ω–∞—à–µ–º —Å—Ç–∞—Ä–æ–º –ø–æ—Å—Ç–µ –º–æ–∂–Ω–æ –ø–æ—Å–º–æ—Ç—Ä–µ—Ç—å –Ω–∞–≥–ª—è–¥–Ω–æ–µ —Ä–∞—Å–ø–æ–ª–æ–∂–µ–Ω–∏–µ —ç–ª–µ–∫—Ç—Ä–æ–º–∞–≥–Ω–∏—Ç–Ω–æ–≥–æ —Å–ø–µ–∫—Ç—Ä–∞ –ø–æ —á–∞—Å—Ç–æ—Ç–∞–º'