<a href="https://colab.research.google.com/github/componavt/sns4human/blob/main/src/vk/voyant_tools/get_posts_tokens.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This script processes text data, tokenizes it, and saves it to a CSV file.
For [Voyant Tools](https://voyant-tools.org/), periods in sentences are preserved.
It filters out entries with empty 'tokens' fields and ensures 'tokens' are always quoted, while 'date' remains unquoted.

Этот скрипт обрабатывает текстовые данные, токенизирует их и сохраняет в CSV-файл.
Для Voyant Tools сохраняются точки в предложениях.
Он отфильтровывает записи с пустыми полями «токены» и гарантирует, что «токены» всегда заключены в кавычки, а «дата» остается без кавычек.

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
!pip install -U pymorphy3
import pymorphy3
import requests
import csv

# NLTK resources used below: the Russian stop-word list and the sentence/word
# tokenizer models.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

from nltk.corpus import stopwords


def _fetch_text(url):
    """Download ``url`` and return the response body as text.

    Raises ``requests.HTTPError`` on an error status, so that an error page is
    never silently turned into a list of "stop words".
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.text


# Kept as a set: process_text tests membership once per token, which is a
# linear scan on a list.
stop_words = set(stopwords.words("russian"))

# This list is read one entry per line; surrounding whitespace (including a
# stray '\r') and blank lines are dropped so every entry can actually match.
stop_words.update(
    line.strip()
    for line in _fetch_text('https://raw.githubusercontent.com/componavt/sns4human/refs/heads/main/src/vk/nlp/RussianStopWords.txt').splitlines()
    if line.strip()
)
# This list is read as whitespace-separated words.
stop_words.update(
    _fetch_text('https://raw.githubusercontent.com/componavt/sns4human/refs/heads/main/src/vk/nlp/stopwords-ru.txt').split()
)

# Lower-case Russian letters, without and with the dash.
alphabet = set('абвгдеёжзийклмнопрстуфхцчшщъыьэюя')
alphabet_dash = alphabet | {'-'}

# Morphological analyzer used to reduce words to their normal form.
morph = pymorphy3.MorphAnalyzer(lang='ru')



In [None]:
# Smallest groups, used for test runs.
domains = ['club221681617', 'concerto', 'club151359929', 'pravoslav_karelia']

# Read the 'text' and 'date' columns of every group's CSV and stack them into
# a single frame with a fresh 0..n-1 index.
posts_url = 'https://raw.githubusercontent.com/componavt/sns4human/refs/heads/main/data/vk/posts/religion/'
frames = [
    pd.read_csv(posts_url + domain + '.csv', usecols=['text', 'date'])
    for domain in domains
]
df = pd.concat(frames, ignore_index=True)

# Keep only the rows whose 'text' is present and is a real string.
is_string = df['text'].apply(lambda value: isinstance(value, str))
df = df[df['text'].notna() & is_string]

# Renumber the surviving rows without keeping the old index as a column.
df = df.reset_index(drop=True)

def get_text_window(words, index, window_size=3):
    """Join the words lying within ``window_size`` positions of ``index``.

    The window is clipped to the bounds of ``words``; the selected words are
    returned as one space-separated string.
    """
    first = index - window_size
    if first < 0:
        first = 0
    stop = index + window_size + 1
    if stop > len(words):
        stop = len(words)
    return ' '.join(words[first:stop])

def contains_non_dash(s):
    """Return True when ``s`` has at least one character other than '-'.

    An empty string and a string made only of dashes both give False.
    """
    return any(char != '-' for char in s)

def process_text(text):
    """Turn a raw text into one space-separated string of lemmas.

    The text is split into sentences and then into word tokens. A token is
    kept when it either consists only of Russian letters and dashes (but is
    not dashes alone), or contains at least four Russian letters and no slash
    or backslash. Kept tokens are lower-cased, replaced by the normal form of
    their first ``morph.parse`` result, and dropped when that form is in
    ``stop_words``. One-character tokens and the token that follows a '#'
    (a hashtag word) are skipped. Every sentence that still has tokens gets a
    closing "." token unless its last token already ends in '.', '!' or '?'.

    Args:
        text: The text to process (a string).

    Returns:
        The processed sentences joined by single spaces; an empty string when
        no token survives the filters.
    """
    processed_sentences = []

    for sentence in sent_tokenize(text):
        skip_hashtag_word = False  # raised by a '#' token, consumed by the next token
        processed_parts = []

        for w in word_tokenize(sentence):
            # This test must come before the length filter below: '#' is a
            # one-character token, so filtering by length first discarded it
            # before the flag could be raised and hashtag words were never
            # skipped. Relies on the tokenizer emitting '#' as its own token.
            if w == '#':
                skip_hashtag_word = True
                continue
            if skip_hashtag_word:
                skip_hashtag_word = False
                continue
            if len(w) == 1:
                continue

            # NOTE: a filter dropping first names, surnames and patronymics
            # (pymorphy tags 'Name', 'Surn', 'Patr') used to be sketched here
            # as commented-out code; it is not active.

            lowered = w.lower()
            if set(lowered).issubset(alphabet_dash) and contains_non_dash(w):
                accepted = True
            else:
                # Mixed tokens with 4+ Russian letters are parsed as well
                # (e.g. блж.Фаддея о.Алексия г.Петрозаводске), except those
                # containing a slash or backslash, which are treated as
                # hyperlinks.
                cyrillic_count = sum(1 for char in lowered if char in alphabet)
                accepted = cyrillic_count >= 4 and '\\' not in w and '/' not in w
            if not accepted:
                continue

            res = morph.parse(lowered)[0].normal_form
            # The emptiness check applies to both kinds of token, so the
            # last-token lookup below can never index an empty string.
            if res and res not in stop_words:
                processed_parts.append(res)

        # Sentences with nothing left are dropped instead of contributing an
        # empty string (which would double the spaces in the joined result).
        if not processed_parts:
            continue

        if processed_parts[-1][-1] not in ".!?":
            processed_parts.append(".")  # mark the end of the sentence
        processed_sentences.append(" ".join(processed_parts))

    return " ".join(processed_sentences)

# Tokenize every post and keep only the two columns that go into the CSV.
df['tokens'] = df['text'].apply(process_text)
df_tokens = df[['tokens', 'date']]

# Drop the rows whose token string is empty or whitespace-only.
has_tokens = df_tokens['tokens'].str.strip().astype(bool)
df_tokens = df_tokens[has_tokens]

# The writer itself never quotes anything; the double quotes around 'tokens'
# are added by hand so that 'date' is written bare.
with open('tokens.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=';', quoting=csv.QUOTE_NONE, quotechar=None)
    writer.writerow(['tokens', 'date'])
    writer.writerows(
        [f'"{tokens}"', date]
        for tokens, date in zip(df_tokens['tokens'], df_tokens['date'])
    )