# In [None]:  (notebook cell marker left over from the export)
from razdel import sentenize, tokenize
import pymorphy3
import re
import pandas as pd
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

# Load the whole corpus into memory as one string.
with open("/content/DOVLATOV_SOBR_SOCH.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Sentence strings in document order, as segmented by razdel.
sentences = [s.text for s in sentenize(text)]

morph = pymorphy3.MorphAnalyzer()


def _lemma_set(words):
    """Return the set of normal forms taken from the first parse of each word."""
    return {morph.parse(w)[0].normal_form for w in words}


# The seed lists below must stay valid UTF-8 Cyrillic: their lemmas are
# compared against lemmas of tokens taken from the Russian corpus.

# Words whose lemma marks a sentence as alcohol-related.
alcohol_words = ['выпил', 'выпить', 'водка', 'пьяный', 'опохмел', 'запой', 'бутылка']
alcohol_lemmas = _lemma_set(alcohol_words)

# Perception verbs; descriptor words are only collected around these.
percept_verbs_raw = ['увидеть', 'почувствовать', 'услышать', 'ощутить', 'заметить', 'различить', 'ощущать']
percept_lemmas = _lemma_set(percept_verbs_raw)

# Descriptor vocabularies, one per perception category.
visual_words = {
    'размытый', 'расплывчатый', 'туманный', 'мутный', 'смазанный',
    'нежный', 'мягкий', 'тёплый', 'золотистый', 'сияющий',
    'чёрный', 'серый', 'мрачный', 'яркий', 'светлый'
}
auditory_words = {
    'гул', 'шум', 'звон', 'жужжание', 'мерный', 'тихий', 'громкий',
    'приглушённый', 'далёкий', 'резкий', 'монотонный'
}
emotional_words = {
    'гармония', 'примирение', 'спокойствие', 'тоска', 'радость',
    'грусть', 'тревога', 'облегчение', 'восторг', 'апатия',
    'умиротворение', 'раздражение', 'любовь', 'ненависть'
}

# Each vocabulary is reduced to lemmas (normal_form) exactly once.
visual_lemmas = _lemma_set(visual_words)
auditory_lemmas = _lemma_set(auditory_words)
emotional_lemmas = _lemma_set(emotional_words)

def get_context_windows(sentences, condition_func, window=3):
    """Collect every matching sentence together with its neighbouring text.

    Args:
        sentences: Sliceable sequence of sentence strings in document order.
        condition_func: Callable taking one sentence; a truthy result marks
            that sentence as a hit.
        window: Number of sentences taken on each side of a hit.

    Returns:
        A list of ``(sentence, before, after)`` tuples, one per hit, in
        document order.  ``before`` and ``after`` are the neighbouring
        sentences joined with single spaces; either is an empty string when
        there are no neighbours on that side.
    """
    def surrounding(idx):
        # Clamp the left edge so a hit near the start never wraps around.
        first = max(0, idx - window)
        preceding = " ".join(sentences[first:idx])
        following = " ".join(sentences[idx + 1:idx + 1 + window])
        return preceding, following

    return [
        (hit, *surrounding(idx))
        for idx, hit in enumerate(sentences)
        if condition_func(hit)
    ]

def is_alcohol_sentence(sent):
    """Tell whether a sentence contains at least one alcohol-related lemma.

    Purely alphabetic tokens are lower-cased, reduced to the normal form of
    their first pymorphy3 parse and looked up in ``alcohol_lemmas``.

    Args:
        sent: Sentence text.

    Returns:
        ``True`` if any token lemma is in ``alcohol_lemmas``, else ``False``.
    """
    lowered = (tok.text.lower() for tok in tokenize(sent) if tok.text.isalpha())
    normal_forms = [morph.parse(word)[0].normal_form for word in lowered]
    for form in normal_forms:
        if form in alcohol_lemmas:
            return True
    return False

# Every alcohol-related sentence, each paired with the joined text of up to
# three sentences before it and up to three sentences after it.
alcohol_episodes = get_context_windows(sentences, is_alcohol_sentence, window=3)

# The message is written as real Cyrillic; it was stored as mis-decoded bytes.
print(f" Найдено {len(alcohol_episodes)} эпизодов с алкоголем.")

def extract_percept_descriptions(context_text, window=5):
    """Collect descriptor lemmas that occur close to perception verbs.

    The text is tokenized, non-alphabetic tokens are dropped, and the rest
    are lower-cased and reduced to the normal form of their first pymorphy3
    parse.  For every lemma found in ``percept_lemmas`` the lemmas up to
    ``window`` positions on either side are checked against the visual,
    auditory and emotional vocabularies.

    Args:
        context_text: Text to scan.
        window: Number of lemmas inspected on each side of a perception verb.

    Returns:
        A dict with the keys ``'visual'``, ``'auditory'`` and ``'emotional'``,
        each mapping to the list of matching lemmas in order of discovery.
        A descriptor lying within reach of several perception verbs is
        recorded once per verb.
    """
    tokens = [t.text.lower() for t in tokenize(context_text) if t.text.isalpha()]
    lemmas = [morph.parse(w)[0].normal_form for w in tokens]

    descriptions = {'visual': [], 'auditory': [], 'emotional': []}
    # Checked in this order; a lemma is filed under the first vocabulary that
    # contains it and under no other.
    lexicons = (
        ('visual', visual_lemmas),
        ('auditory', auditory_lemmas),
        ('emotional', emotional_lemmas),
    )

    for pos, lemma in enumerate(lemmas):
        if lemma not in percept_lemmas:
            continue
        # Clamp on the left so a negative index never wraps around; slicing
        # already clamps on the right.
        start = max(0, pos - window)
        for nearby in lemmas[start:pos + window + 1]:
            for name, vocabulary in lexicons:
                if nearby in vocabulary:
                    descriptions[name].append(nearby)
                    break

    return descriptions

# Descriptor lemmas pooled over all episodes, kept separately for the text
# preceding and the text following each alcohol sentence.
all_descriptions_after = {'visual': [], 'auditory': [], 'emotional': []}
all_descriptions_before = {'visual': [], 'auditory': [], 'emotional': []}

for episode in alcohol_episodes:
    # Each episode is (sentence, text before, text after); the sentence
    # itself is not analysed here.
    preceding_text, following_text = episode[1], episode[2]
    found_before = extract_percept_descriptions(preceding_text)
    found_after = extract_percept_descriptions(following_text)

    for category, pooled in all_descriptions_after.items():
        pooled.extend(found_after[category])
    for category, pooled in all_descriptions_before.items():
        pooled.extend(found_before[category])

def compare_frequencies(before, after, top_n=10):
    """Tabulate how often each word occurs before versus after.

    Args:
        before: Iterable of hashable words observed before the event.
        after: Iterable of hashable words observed after the event.
        top_n: Maximum number of rows to return.

    Returns:
        A ``pandas.DataFrame`` with four columns (word, count before, count
        after, after-minus-before difference), sorted by the difference in
        descending order and cut to ``top_n`` rows.  Rows with equal
        differences keep first-appearance order (words from ``before``
        first).  An empty, column-less DataFrame is returned when both
        inputs are empty.
    """
    cnt_before = Counter(before)
    cnt_after = Counter(after)
    # dict.fromkeys de-duplicates while keeping insertion order, so the row
    # order does not depend on set iteration order.
    all_words = list(dict.fromkeys([*cnt_before, *cnt_after]))

    if not all_words:
        return pd.DataFrame()

    # NOTE(review): these column labels look like mis-decoded UTF-8 Cyrillic.
    # They are kept byte-for-byte because the report code selects columns by
    # the same literals; fix both places together.
    data = [
        {
            '–°–ª–æ–≤–æ': word,
            '–î–æ': cnt_before[word],
            '–ü–æ—Å–ª–µ': cnt_after[word],
            '–†–∞–∑–Ω–∏—Ü–∞ (–ü–æ—Å–ª–µ ‚Äì –î–æ)': cnt_after[word] - cnt_before[word],
        }
        for word in all_words
    ]

    df = pd.DataFrame(data)
    # A stable sort keeps tied rows in the deterministic order built above.
    df = df.sort_values('–†–∞–∑–Ω–∏—Ü–∞ (–ü–æ—Å–ª–µ ‚Äì –î–æ)', ascending=False, kind='mergesort')
    return df.head(top_n)

categories = ['visual', 'auditory', 'emotional']
# Human-readable titles, written as real Cyrillic (they were stored as
# mis-decoded bytes).
titles = {
    'visual': 'Визуальное восприятие',
    'auditory': 'Аудиальное восприятие',
    'emotional': 'Эмоциональное состояние',
}

# One subplot per category: grouped bars of lemma counts before vs. after.
plt.figure(figsize=(15, 5))
for i, cat in enumerate(categories):
    plt.subplot(1, 3, i + 1)
    before = all_descriptions_before[cat]
    after = all_descriptions_after[cat]
    cnt_before = Counter(before)
    cnt_after = Counter(after)

    # Union of the 8 most frequent lemmas on each side.
    top_words = {w for w, _ in cnt_before.most_common(8)}
    top_words |= {w for w, _ in cnt_after.most_common(8)}
    # Sort by total frequency, then alphabetically, so the bar order is the
    # same on every run instead of following set iteration order.
    words = sorted(top_words, key=lambda w: (-(cnt_before[w] + cnt_after[w]), w))

    if not words:
        # Leave an empty, titled subplot so the figure layout stays intact.
        plt.title(titles[cat] + "\n(нет данных)")
        continue

    # Counter returns 0 for missing words, so one-sided words still get a bar pair.
    freq_before = [cnt_before[w] for w in words]
    freq_after = [cnt_after[w] for w in words]

    x = range(len(words))
    # Shift the two series by +/-0.2 so the bars sit side by side on each tick.
    plt.bar([xi - 0.2 for xi in x], freq_before, width=0.4, label='До', color='lightblue')
    plt.bar([xi + 0.2 for xi in x], freq_after, width=0.4, label='После', color='lightcoral')
    plt.xticks(x, words, rotation=45, ha='right')
    plt.title(titles[cat])
    plt.legend()

plt.tight_layout()
plt.show()

# Textual report: one before/after frequency table per category.  The
# messages are written as real Cyrillic (they were stored as mis-decoded bytes).
print("Топ-10 изменений в восприятии ПОСЛЕ употребления алкоголя:")

for cat in categories:
    before_list = all_descriptions_before[cat]
    after_list = all_descriptions_after[cat]

    # Nothing was collected on either side for this category.
    if not before_list and not after_list:
        print(f"\n🔹 {titles[cat]}: нет перцептивных описаний")
        continue

    df_diff = compare_frequencies(before_list, after_list)

    if df_diff.empty:
        print(f"\n🔹 {titles[cat]}: нет значимых слов для сравнения")
        continue

    print(f"\n🔹 {titles[cat]}:")

    # NOTE(review): these column keys look like mis-decoded UTF-8 Cyrillic.
    # They must stay byte-identical to the labels produced by
    # compare_frequencies; fix both places together.
    cols_to_show = ['–°–ª–æ–≤–æ', '–î–æ', '–ü–æ—Å–ª–µ']
    if '–†–∞–∑–Ω–∏—Ü–∞ (–ü–æ—Å–ª–µ ‚Äì –î–æ)' in df_diff.columns:
        cols_to_show.append('–†–∞–∑–Ω–∏—Ü–∞ (–ü–æ—Å–ª–µ ‚Äì –î–æ)')
    # `display` is not defined in this file; presumably the notebook-provided
    # IPython helper -- confirm before running this as a plain script.
    display(df_diff[cols_to_show].reset_index(drop=True))
