<a href="https://colab.research.google.com/github/componavt/sns4human/blob/main/src/vk/nlp/get_posts_tokens.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This script processes text data, tokenizes it, and saves it to a CSV file.
It filters out entries with empty 'tokens' fields and ensures 'tokens' are always quoted, while 'date' remains unquoted.

Этот скрипт обрабатывает текстовые данные, токенизирует их и сохраняет в CSV-файл.
Он отфильтровывает записи с пустыми полями «токены» и гарантирует, что «токены» всегда заключены в кавычки, а «дата» остается без кавычек.

In [16]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
!pip install -U pymorphy3
import pymorphy3
import requests
import csv

# Fetch the NLTK resources used below: the stop-word corpus and the
# tokenizer models that word_tokenize relies on.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Imported only after the corpus download above has run.
from nltk.corpus import stopwords


def _fetch_text(url, timeout=30):
    """Return the body of *url* as text, raising on an HTTP error status.

    Without the status check, the body of an error response (for example a
    404 page) would be split into "stop words" and silently pollute the
    filter list. The timeout keeps the cell from hanging on a stalled
    connection.

    Args:
        url: Address of the plain-text resource to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text


# Stop words: NLTK's Russian list extended with two lists kept in the
# project repository. The first file is split on newlines (one entry per
# line), the second on any whitespace.
stop_words = stopwords.words("russian")
stop_words += _fetch_text('https://raw.githubusercontent.com/componavt/sns4human/refs/heads/main/src/vk/nlp/RussianStopWords.txt').split('\n')
stop_words += _fetch_text('https://raw.githubusercontent.com/componavt/sns4human/refs/heads/main/src/vk/nlp/stopwords-ru.txt').split()
# Lower-case Russian letters plus the hyphen; tokens containing any other
# character are rejected by process_text. (Restored from a mis-encoded
# literal that contained no Cyrillic letters at all.)
alphabet = set('абвгдеёжзийклмнопрстуфхцчшщъыьэюя-')
morph = pymorphy3.MorphAnalyzer(lang='ru')



In [17]:
# Smallest groups, used for quick test runs.
domains = ['club221681617', 'concerto', 'club151359929', 'pravoslav_karelia']

# Download the posts of every group and stack them into a single frame,
# keeping only the two columns used later on.
df = pd.concat(
    [
        pd.read_csv(
            'https://raw.githubusercontent.com/componavt/sns4human/refs/heads/main/data/vk/posts/religion/' + domain + '.csv',
            usecols=['text', 'date'],
        )
        for domain in domains
    ],
    ignore_index=True,
)

# Keep only rows whose 'text' is present and is a real string, then renumber
# the rows from zero without keeping the old index as a column.
df = df[df['text'].notna() & df['text'].apply(lambda value: isinstance(value, str))]
df = df.reset_index(drop=True)

def get_text_window(words, index, window_size=3):
    """Join the words surrounding position *index* into one string.

    The window spans up to ``window_size`` items on each side of ``index``
    (the item at ``index`` included) and is clipped at both ends of
    ``words``.

    Args:
        words: Sequence of string tokens.
        index: Position of the central token.
        window_size: Number of neighbours to take on each side.

    Returns:
        The selected tokens separated by single spaces.
    """
    # Never start before the first element.
    lower = index - window_size if index > window_size else 0
    # Slicing already clips an upper bound that runs past the end.
    upper = index + window_size + 1
    neighbours = words[lower:upper]
    return ' '.join(neighbours)


def process_text(text):
    """Tokenize *text* and return its filtered, lemmatized words as a string.

    Tokens come from ``nltk.word_tokenize``. A token is dropped when it is:

    * a ``#`` marker, or the token immediately following one (a hashtag);
    * a single character;
    * tagged by the morphological analyzer as a first name, surname or
      patronymic (``Name`` / ``Surn`` / ``Patr``) -- such tokens are also
      reported with ``print`` together with their surrounding context;
    * not built solely from characters of ``alphabet``, or not purely
      alphabetic (so hyphenated tokens are dropped as well);
    * reduced to a normal form that is listed in ``stop_words``.

    Upper-case tokens of at most three letters (presumably abbreviations)
    are kept verbatim, without lemmatization or stop-word filtering. Every
    other surviving token is replaced by the normal form of its first parse.

    Args:
        text: Raw post text.

    Returns:
        The kept tokens joined by single spaces; an empty string when no
        token survives the filters.
    """
    kept = []
    after_hash = False
    words = nltk.word_tokenize(text)
    for i, w in enumerate(words):
        # The hashtag checks must run before the length filter: '#' is
        # itself a one-character token, so filtering by length first would
        # skip it and the hashtag word after it would never be dropped.
        if w == '#':
            after_hash = True
            continue
        if after_hash:
            after_hash = False
            continue
        if len(w) == 1:
            continue
        # The tag is taken from the token in its original case.
        w_tag = morph.parse(w.strip())[0].tag
        if 'Surn' in w_tag or 'Name' in w_tag or 'Patr' in w_tag:
            context = get_text_window(words, i)
            print(f"Filtered name/surname: {w} | Context: {context}")  # Debug output for context
            continue
        # isalpha() also rejects the hyphen that ``alphabet`` allows.
        if not (set(w.lower()).issubset(alphabet) and w.isalpha()):
            continue
        if w.isupper() and len(w) <= 3:
            kept.append(w)
            continue
        lemma = morph.parse(w.lower())[0].normal_form
        if lemma not in stop_words:
            kept.append(lemma)
    return ' '.join(kept)


# Tokenize every post; process_text returns the kept tokens as one string.
df['tokens'] = df['text'].apply(process_text)

# Keep only the two output columns and drop the rows whose 'tokens' string
# is empty or whitespace-only.
df_tokens = df[['tokens', 'date']]
df_tokens = df_tokens.loc[df_tokens['tokens'].str.strip() != '']

# csv.QUOTE_NONNUMERIC quotes every non-numeric field, so the 'tokens'
# strings are always quoted. NOTE(review): 'date' stays unquoted only if the
# column is numeric (presumably a Unix timestamp) -- confirm against the data.
df_tokens.to_csv('tokens.csv', sep=';', index=False, quoting=csv.QUOTE_NONNUMERIC)


Filtered name/surname: Т.Ушаковой | Context: фото из архива Т.Ушаковой или Фрейндлингов )
Filtered name/surname: о.Александр | Context: о.Александр Романушко - священник
Filtered name/surname: Александру | Context: , когда отцу Александру пришлось отпевать одного
Filtered name/surname: Василию | Context: командира партизан генерал-майора Василию Коржа [ 7
Filtered name/surname: Александру | Context: Корж предоставил отцу Александру право принимать решение
Filtered name/surname: Александр | Context: . Однако отец Александр не ограничился этим
Filtered name/surname: Александра | Context: Потрясённые речью отца Александра , полицейские даже
Filtered na