# Sentiment & Topics Analysis from Telegram HTML Export

This notebook analyses a Telegram chat export (HTML) using three Hugging Face models:
- Emotion: https://huggingface.co/Aniemore/rubert-tiny2-russian-emotion-detection
- Toxicity: https://huggingface.co/s-nlp/russian_toxicity_classifier
- Sensitive topics: https://huggingface.co/apanc/russian-sensitive-topics

It also segments the chat into conversations by time gap (a new conversation starts after a silence longer than `GAP_MINUTES`, set to 60 minutes in this run) and aggregates model outputs per conversation.

In [3]:
# Install the Hugging Face runtime dependencies quietly (-q) via the %pip magic.
# NOTE(review): versions are unpinned and --upgrade pulls whatever is newest; pin them if results must be reproducible.
%pip install -q transformers accelerate torch --upgrade


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Optional: install dependencies in this environment
# If transformers/torch aren't installed, uncomment and run:
# %pip install -q transformers accelerate torch --upgrade

import math
from datetime import timedelta
from pathlib import Path
from typing import List, Dict

import pandas as pd
from tqdm.auto import tqdm
from transformers import pipeline

from message_analyser.retriever.telegram_html import get_mymessages_from_html

# Configure paths and names
# export_path: Telegram HTML export location handed to get_mymessages_from_html below.
# NOTE(review): absolute, machine-specific path — edit before running elsewhere.
export_path = "/Users/danuhaha/Downloads/Telegram Lite/ChatExport_2025-09-20"  # e.g., '/Users/me/Downloads/ChatExport_2025-09-21'
# your_name / target_name are passed to the export parser; presumably the display
# names of the two chat participants as they appear in the export — TODO confirm.
your_name = "Даня Горин"
target_name = "Ваня"

# Conversation segmentation threshold (in minutes)
# A gap strictly longer than this between consecutive messages starts a new conversation.
GAP_MINUTES = 60

# Fail fast with a readable message if the export location does not exist.
assert Path(export_path).exists(), f'Export path not found: {export_path}'


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load messages
# The cells below rely on each returned message exposing `.date` and `.text`.
msgs = get_mymessages_from_html(export_path, your_name, target_name)
# Sanity check: message count plus the dates of the first and last message.
# NOTE(review): indexing msgs[0] / msgs[-1] fails if nothing was parsed.
len(msgs), msgs[0].date, msgs[-1].date


2025-09-23 17:41:57,173 - 38688 messages parsed from Telegram HTML export at /Users/danuhaha/Downloads/Telegram Lite/ChatExport_2025-09-20



(38688,
 datetime.datetime(2024, 9, 16, 20, 56, 54),
 datetime.datetime(2025, 9, 20, 22, 41, 8))

In [3]:
# Segment into conversations by time gap
def split_into_conversations(messages, gap_minutes=30):
    """Group consecutive messages into conversations separated by silences.

    Args:
        messages: Sequence of objects with a ``.date`` attribute, walked in
            the order given.
        gap_minutes: Silence length in minutes. A message whose date is
            strictly more than this after the previous message opens a new
            conversation.

    Returns:
        A list of conversations, each a non-empty list of messages. An empty
        input yields an empty list.
    """
    if not messages:
        return []
    threads = [[messages[0]]]
    max_silence = timedelta(minutes=gap_minutes)
    for msg in messages[1:]:
        silence = msg.date - threads[-1][-1].date
        if silence > max_silence:
            # Too long since the previous message: open a new conversation.
            threads.append([msg])
        else:
            threads[-1].append(msg)
    return threads

# Split the full message list using the notebook-wide GAP_MINUTES threshold.
conversations = split_into_conversations(msgs, GAP_MINUTES)
# Sanity check: number of conversations and the time span of the first one.
len(conversations), conversations[0][0].date, conversations[0][-1].date


(1939,
 datetime.datetime(2024, 9, 16, 20, 56, 54),
 datetime.datetime(2024, 9, 16, 22, 39, 25))

In [4]:
# Prepare and load the three text-classification pipelines
device = -1  # -1 runs on CPU (the load log below reports "Device set to use cpu")

# All three pipelines share the same settings; top_k=None is kept so that the
# helpers below can handle a full per-label score list for every text.
emo_pipe, tox_pipe, sens_pipe = (
    pipeline('text-classification', model=model_id, device=device, top_k=None)
    for model_id in (
        'Aniemore/rubert-tiny2-russian-emotion-detection',
        's-nlp/russian_toxicity_classifier',
        'apanc/russian-sensitive-topics',
    )
)
'models ready'


Device set to use cpu
Device set to use cpu
Device set to use cpu


'models ready'

In [7]:
# Helpers to run pipelines safely on many short texts
def batched(iterable, n=64):
    """Yield consecutive lists of items drawn from ``iterable``.

    Each yielded list is emitted as soon as it holds at least ``n`` items;
    whatever is left over at the end is yielded as a final, shorter list.
    Nothing is yielded for an empty iterable.
    """
    pending = []
    for item in iterable:
        pending.append(item)
        if len(pending) < n:
            continue
        yield pending
        pending = []
    if len(pending) > 0:
        yield pending

def run_pipe_avg_scores(pipe, texts: List[str]) -> Dict[str, float]:
    """Average each label's score over all classified texts.

    Args:
        pipe: Callable invoked as ``pipe(batch, truncation=True, max_length=256)``
            that returns one result per text: either a list of
            ``{'label', 'score'}`` dicts or a single such dict.
        texts: Texts to classify; they are sent to ``pipe`` in batches of 32.

    Returns:
        Mapping of label to (sum of that label's scores) / (number of texts).
        Empty dict when no results were produced.
    """
    totals: Dict[str, float] = {}
    n_results = 0
    for chunk in batched(texts, 32):
        for result in pipe(chunk, truncation=True, max_length=256):
            n_results += 1
            # Treat a single dict as a one-element list; ignore any other shape.
            if isinstance(result, list):
                entries = result
            elif isinstance(result, dict):
                entries = [result]
            else:
                entries = []
            for entry in entries:
                label = entry['label']
                totals[label] = totals.get(label, 0.0) + float(entry['score'])
    if not n_results:
        return {}
    return {label: total / n_results for label, total in totals.items()}

def run_pipe_frac_above(pipe, texts: List[str], positive_labels=None, threshold=0.5) -> float:
    """Return the fraction of ``texts`` that the classifier flags as positive.

    A text counts as positive when:

    * ``positive_labels`` is given and any of those labels scores at least
      ``threshold`` for the text. The positive label's own score is checked,
      so a threshold below the winning label's score is honoured instead of
      being masked by the argmax label.
    * ``positive_labels`` is ``None`` and the highest score of any label is at
      least ``threshold``.

    Args:
        pipe: Callable invoked as ``pipe(batch, truncation=True, max_length=256)``
            that returns one result per text: either a list of
            ``{'label', 'score'}`` dicts or a single such dict.
        texts: Texts to classify; they are sent to ``pipe`` in batches of 64.
        positive_labels: Optional iterable of label names treated as positive
            (matched exactly, case-sensitively).
        threshold: Minimum score for a text to count as positive.

    Returns:
        ``positives / len(results)``, or ``0.0`` when there are no results.
    """
    if positive_labels is not None:
        positive_labels = set(positive_labels)
    pos = 0
    total = 0
    for batch in batched(texts, 64):
        outs = pipe(batch, truncation=True, max_length=256)
        for res in outs:
            total += 1
            # Normalise both output shapes (all-label list / single dict) to a list.
            entries = res if isinstance(res, list) else [res]
            if positive_labels is None:
                scores = [float(d['score']) for d in entries]
            else:
                scores = [float(d['score']) for d in entries if d['label'] in positive_labels]
            # An empty `scores` (no matching label, or an empty result) is simply
            # not positive rather than an error.
            if scores and max(scores) >= threshold:
                pos += 1
    return (pos / total) if total else 0.0

def normalize_texts(msgs, min_len=5):
    """Collect the stripped text of every message that has enough content.

    Messages whose ``.text`` is falsy (``None`` or empty) are skipped, as are
    those whose whitespace-stripped text is shorter than ``min_len`` characters.
    Order is preserved.
    """
    kept = []
    for message in msgs:
        if not message.text:
            continue
        stripped = message.text.strip()
        if len(stripped) >= min_len:
            kept.append(stripped)
    return kept

def sample_texts(texts: List[str], max_per_convo=200):
    """Down-sample ``texts`` to at most ``max_per_convo`` evenly spaced items.

    Args:
        texts: Texts in their original order.
        max_per_convo: Maximum number of texts to keep.

    Returns:
        ``texts`` itself (same object) when it already fits within the cap;
        otherwise a new list of exactly ``max_per_convo`` items picked at a
        uniform stride across the whole span, starting with the first item.
        A cap of zero or less yields an empty list instead of dividing by zero.
    """
    n_texts = len(texts)
    if n_texts <= max_per_convo:
        return texts
    if max_per_convo <= 0:
        return []
    # Integer arithmetic keeps the picked indices exact (no float rounding) and
    # always below n_texts because i < max_per_convo.
    return [texts[(i * n_texts) // max_per_convo] for i in range(max_per_convo)]

'helpers ready'


'helpers ready'

In [8]:
# Analyse per conversation
# Builds one summary row per conversation: timing, message counts, toxicity
# rate, and the mean score of every emotion / sensitive-topic label.
# The recorded run of this cell took about 19 minutes on CPU (see progress bar output).
rows = []
for idx, conv in enumerate(tqdm(conversations, desc='Conversations')):
    texts_all = normalize_texts(conv)
    # Conversations with no usable text are skipped, so conversation_id values
    # in the table can have gaps.
    if not texts_all:
        continue
    # Cap the work per conversation; long conversations are uniformly down-sampled.
    texts = sample_texts(texts_all, max_per_convo=200)

    emo_scores = run_pipe_avg_scores(emo_pipe, texts)
    # Several spellings of the positive label are passed; presumably to cover
    # label-name variants — TODO confirm against the toxicity model's label names.
    tox_frac = run_pipe_frac_above(tox_pipe, texts, positive_labels={'toxic', 'toxicity', 'TOXIC'})
    sens_scores = run_pipe_avg_scores(sens_pipe, texts)

    start, end = conv[0].date, conv[-1].date
    rows.append({
        'conversation_id': idx,
        'start': start,
        'end': end,
        'duration_min': (end - start).total_seconds() / 60.0,
        # num_messages counts every message in the conversation; num_texts_used
        # counts only the texts left after filtering and down-sampling.
        'num_messages': len(conv),
        'num_texts_used': len(texts),
        'toxicity_rate': tox_frac,
        # One column per model label, prefixed so the two models cannot collide.
        **{f'emo_{k}': v for k, v in emo_scores.items()},
        **{f'topic_{k}': v for k, v in sens_scores.items()},
    })

# NOTE(review): if `rows` is empty the frame has no 'start' column and
# sort_values raises KeyError — confirm whether an empty export must be supported.
df = pd.DataFrame(rows).sort_values(['start']).reset_index(drop=True)
df.head()


Conversations: 100%|██████████| 1939/1939 [18:56<00:00,  1.71it/s] 


Unnamed: 0,conversation_id,start,end,duration_min,num_messages,num_texts_used,toxicity_rate,emo_happiness,emo_sadness,emo_neutral,...,topic_LABEL_340,topic_LABEL_251,topic_LABEL_235,topic_LABEL_176,topic_LABEL_205,topic_LABEL_347,topic_LABEL_282,topic_LABEL_171,topic_LABEL_195,topic_LABEL_229
0,0,2024-09-16 20:56:54,2024-09-16 22:39:25,102.516667,119,103,0.116505,0.249543,0.097161,0.303723,...,1e-05,9e-06,1.2e-05,8e-06,7e-06,7e-06,9e-06,1e-05,7e-06,6e-06
1,1,2024-09-17 08:39:22,2024-09-17 14:07:55,328.55,187,173,0.069364,0.261177,0.093819,0.356412,...,1.5e-05,2.1e-05,2.6e-05,2e-05,1.3e-05,1e-05,1.2e-05,2.2e-05,1.8e-05,1.2e-05
2,2,2024-09-17 15:09:48,2024-09-17 16:19:18,69.5,72,55,0.145455,0.209552,0.073617,0.356981,...,4.7e-05,7.4e-05,6e-05,6.6e-05,4e-05,4.3e-05,6.3e-05,5.3e-05,4.9e-05,4.3e-05
3,3,2024-09-17 17:27:19,2024-09-17 23:20:13,352.9,336,200,0.095,0.254439,0.102971,0.368406,...,8e-06,1.2e-05,1.4e-05,1e-05,8e-06,8e-06,1.1e-05,1.3e-05,9e-06,7e-06
4,4,2024-09-18 06:40:56,2024-09-18 09:59:28,198.533333,146,124,0.145161,0.312804,0.09499,0.267474,...,3.1e-05,4.7e-05,4.5e-05,3.5e-05,4.3e-05,2.3e-05,2.5e-05,6.2e-05,4.3e-05,3.1e-05


In [9]:
# Derive convenient summaries
# Per-label score columns are recognised by the prefixes used when the rows were built.
emo_cols = [c for c in df.columns if c.startswith('emo_')]
topic_cols = [c for c in df.columns if c.startswith('topic_')]

def topk(series: pd.Series, k=3):
    """Format the ``k`` largest entries of ``series`` as ``"name:value"`` pairs.

    Entries are ordered from largest to smallest, values are shown with two
    decimals, and pairs are joined with ``", "``.
    """
    leaders = series.nlargest(k)
    pairs = []
    for name in leaders.index:
        pairs.append(f"{name}:{series[name]:.2f}")
    return ', '.join(pairs)

# Row-wise (axis=1) summary strings of the strongest labels; falls back to an
# empty string when no matching score columns exist.
df['top_emotions'] = df[emo_cols].apply(topk, axis=1) if emo_cols else ''
df['top_topics'] = df[topic_cols].apply(topk, axis=1) if topic_cols else ''

# Preview the key columns for the first ten conversations.
df[['conversation_id','start','end','num_messages','toxicity_rate','top_emotions','top_topics']].head(10)


Unnamed: 0,conversation_id,start,end,num_messages,toxicity_rate,top_emotions,top_topics
0,0,2024-09-16 20:56:54,2024-09-16 22:39:25,119,0.116505,"emo_neutral:0.30, emo_happiness:0.25, emo_enth...","topic_LABEL_0:0.92, topic_LABEL_5:0.02, topic_..."
1,1,2024-09-17 08:39:22,2024-09-17 14:07:55,187,0.069364,"emo_neutral:0.36, emo_happiness:0.26, emo_enth...","topic_LABEL_0:0.86, topic_LABEL_12:0.03, topic..."
2,2,2024-09-17 15:09:48,2024-09-17 16:19:18,72,0.145455,"emo_neutral:0.36, emo_happiness:0.21, emo_ange...","topic_LABEL_0:0.71, topic_LABEL_3:0.08, topic_..."
3,3,2024-09-17 17:27:19,2024-09-17 23:20:13,336,0.095,"emo_neutral:0.37, emo_happiness:0.25, emo_enth...","topic_LABEL_0:0.84, topic_LABEL_5:0.05, topic_..."
4,4,2024-09-18 06:40:56,2024-09-18 09:59:28,146,0.145161,"emo_happiness:0.31, emo_neutral:0.27, emo_enth...","topic_LABEL_0:0.79, topic_LABEL_13:0.04, topic..."
5,5,2024-09-18 11:21:15,2024-09-18 11:45:12,44,0.111111,"emo_neutral:0.36, emo_happiness:0.19, emo_ange...","topic_LABEL_0:0.88, topic_LABEL_3:0.03, topic_..."
6,6,2024-09-18 13:14:48,2024-09-18 20:26:00,237,0.125,"emo_neutral:0.33, emo_happiness:0.22, emo_sadn...","topic_LABEL_0:0.86, topic_LABEL_1:0.02, topic_..."
7,7,2024-09-19 00:49:08,2024-09-19 00:52:13,4,0.333333,"emo_happiness:0.36, emo_fear:0.31, emo_sadness...","topic_LABEL_0:1.00, topic_LABEL_2:0.00, topic_..."
8,8,2024-09-19 07:04:11,2024-09-19 08:50:21,64,0.066667,"emo_happiness:0.31, emo_neutral:0.27, emo_enth...","topic_LABEL_0:0.89, topic_LABEL_5:0.05, topic_..."
9,9,2024-09-19 10:46:25,2024-09-19 10:59:59,26,0.125,"emo_neutral:0.27, emo_sadness:0.23, emo_happin...","topic_LABEL_0:0.81, topic_LABEL_1:0.08, topic_..."


In [10]:
# Save the per-conversation table as CSV in an analysis_outputs/ folder inside the export directory
out_dir = Path(export_path)/'analysis_outputs'
out_dir.mkdir(parents=True, exist_ok=True)  # no error if the folder already exists
out_csv = out_dir/'sentiment_topics_by_conversation.csv'
df.to_csv(out_csv, index=False)  # the index was reset to a plain counter, so it is not written
out_csv


PosixPath('/Users/danuhaha/Downloads/Telegram Lite/ChatExport_2025-09-20/analysis_outputs/sentiment_topics_by_conversation.csv')

## Whole chat rollups
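You can also aggregate across the whole chat to get overall emotion/topic averages and the toxicity rate. For speed, the cell below uniformly down-samples the chat to at most 2,000 messages rather than scoring every message.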

In [11]:
# Whole-chat rollup. Reuse the shared text filter (normalize_texts, default
# min_len=5) instead of repeating it inline, so both passes always agree on
# which messages count as usable text.
texts_all = normalize_texts(msgs)
texts_all = sample_texts(texts_all, max_per_convo=2000)  # cap for speed
emo_all = run_pipe_avg_scores(emo_pipe, texts_all)
tox_all = run_pipe_frac_above(tox_pipe, texts_all, positive_labels={'toxic','toxicity','TOXIC'})
sens_all = run_pipe_avg_scores(sens_pipe, texts_all)

# Display: emotion means (descending), overall toxicity rate, ten strongest topic labels.
pd.Series(emo_all).sort_values(ascending=False), tox_all, pd.Series(sens_all).sort_values(ascending=False)[:10]


(neutral       0.363574
 happiness     0.234025
 enthusiasm    0.142145
 anger         0.106395
 sadness       0.090913
 fear          0.035365
 disgust       0.008148
 dtype: float64,
 0.065,
 LABEL_0     0.845862
 LABEL_5     0.021906
 LABEL_12    0.019068
 LABEL_18    0.015486
 LABEL_11    0.010701
 LABEL_4     0.010244
 LABEL_13    0.009779
 LABEL_1     0.008752
 LABEL_2     0.008642
 LABEL_3     0.008360
 dtype: float64)