# Полная предобработка для всех источников

# 1. Установка зависимостей и загрузка моделей

In [None]:
!pip install pymorphy2 nltk tqdm

Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl.metadata (3.6 kB)
Collecting dawg-python>=0.7.1 (from pymorphy2)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl.metadata (7.0 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4 (from pymorphy2)
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl.metadata (2.1 kB)
Collecting docopt>=0.6 (from pymorphy2)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... done
Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 55.5/55.5 kB 1.9 MB/s eta 0:00:00
Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 8.2/8.2 MB 102.2 MB/s eta 0:00:00
Building wheels for collected packages: docopt
  Building wheel for docop

# 2. Импорт библиотек

In [None]:
import re
import os
import pandas as pd
from collections import defaultdict, Counter
import math
import nltk
import pymorphy2
from nltk.corpus import stopwords
from tqdm import tqdm
import csv

# Заплатка для совместимости pymorphy2 с Python 3.11
import inspect
from collections import namedtuple

if not hasattr(inspect, 'getargspec'):
    # This interpreter's inspect module has no getargspec, so recreate the
    # legacy four-field result on top of inspect.getfullargspec.
    ArgSpec = namedtuple('ArgSpec', ['args', 'varargs', 'keywords', 'defaults'])

    def getargspec(func):
        """Return the legacy ``ArgSpec`` tuple describing *func*'s signature."""
        full = inspect.getfullargspec(func)
        # getfullargspec names the **kwargs slot 'varkw'; the legacy tuple
        # exposes the same value as 'keywords'.
        return ArgSpec(full.args, full.varargs, full.varkw, full.defaults)

    setattr(inspect, 'getargspec', getargspec)

# 3. Монтирование Google Диска

In [None]:
# Mount Google Drive at /content/drive; every path used below
# (/content/drive/MyDrive/TextScope/...) lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 4. Загрузка моделей и стоп-слов

In [None]:
# Parts of speech treated as noise and dropped during lemmatisation.
# The set is compared against pymorphy2's `parsed.tag.POS`, which reports
# OpenCorpora grammemes (NPRO, ADVB, PREP, ...), not Universal Dependencies
# tags. The UD names are kept for backward compatibility, and the OpenCorpora
# equivalents are added so that the filter actually matches pymorphy2 output.
bad_pos = {
    # Universal Dependencies tags (original list)
    "PRON", "ADV", "AUX", "PART", "DET", "SCONJ", "CCONJ", "INTJ",
    "ADP", "NUM", "SYM", "X", "PUNCT", "SPACE",
    # OpenCorpora equivalents used by pymorphy2:
    # pronoun, adverb, particle, conjunction, preposition, numeral
    "NPRO", "ADVB", "PRCL", "CONJ", "PREP", "NUMR",
}

# Fetch the NLTK resources: the stop-word corpus read right below, plus the
# 'punkt_tab' tokenizer data (presumably required by nltk.word_tokenize,
# which the preprocessing functions call).
nltk.download('stopwords')
nltk.download('punkt_tab')
russian_stopwords = set(stopwords.words("russian"))

custom_stopwords_path = '/content/drive/MyDrive/TextScope/custom_stopwords.txt'
os.makedirs(os.path.dirname(custom_stopwords_path), exist_ok=True)

# On first run, seed the user-editable stop-word file with a header comment
# and a few example entries.
if not os.path.exists(custom_stopwords_path):
    with open(custom_stopwords_path, 'w', encoding='utf-8') as f:
        f.writelines([
            "# Добавьте сюда свои стоп-слова по одному на строку.\n",
            "привет\nздравствуйте\nстатья\nдоклад\n",
        ])

# Merge the custom entries (lower-cased) into the NLTK set, skipping blank
# lines and lines that start with '#'.
with open(custom_stopwords_path, 'r', encoding='utf-8') as f:
    entries = (raw.strip() for raw in f)
    russian_stopwords.update(
        entry.lower() for entry in entries if entry and not entry.startswith("#")
    )

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# 5. Вспомогательные функции для предобработки

In [None]:
# Initialise the pymorphy2 analyser once at module level; the preprocessing
# functions below all reuse this single `morph` instance.
morph = pymorphy2.MorphAnalyzer()

def full_preprocess(text):
    """Normalise raw text into a space-separated string of Russian lemmas.

    The text is lower-cased, every character that is neither a Cyrillic
    letter nor whitespace is replaced by a space, and the result is tokenised
    with ``nltk.word_tokenize``. Each token is lemmatised with the first
    pymorphy2 parse. A lemma is dropped when its part of speech is in
    ``bad_pos`` or when it is in ``russian_stopwords``.

    Args:
        text: Any value; it is converted with ``str()`` before processing.

    Returns:
        The surviving lemmas joined by single spaces, or an empty string if
        nothing survives.
    """
    # Substitute a space instead of deleting the character: otherwise words
    # separated only by punctuation, digits or Latin text ("привет,мир")
    # would be fused into a single bogus token ("приветмир").
    text = re.sub(r'[^а-яА-ЯёЁ\s]', ' ', str(text).lower())
    lemmas = []
    for token in nltk.word_tokenize(text):
        # Only the first (highest-ranked) parse is used.
        parsed = morph.parse(token)[0]
        lemma = parsed.normal_form
        if parsed.tag.POS not in bad_pos and lemma not in russian_stopwords:
            lemmas.append(lemma)
    return ' '.join(lemmas)

# 6. Функция для извлечения IDF-стоп-слов

In [None]:
def get_idf_stopwords_streamed(corpus, idf_threshold=6.0, max_df_ratio=0.98):
    """Return lemmas that occur in too many documents to be informative.

    Each document is tokenised and lemmatised (lemmas whose part of speech is
    in ``bad_pos`` are ignored), and the number of documents containing each
    lemma is counted. A lemma is reported when either

    * its smoothed IDF, ``ln((N + 1) / (df + 1)) + 1``, is below
      ``idf_threshold``, or
    * it occurs in more than ``max_df_ratio`` of all documents.

    NOTE(review): with the default ``idf_threshold=6.0`` the IDF test is true
    for every lemma when the corpus has 295 documents or fewer, since
    ``(N + 1) / 2 < e**5``. Confirm this is intended for small corpora.

    Args:
        corpus: Iterable of document strings; it is consumed exactly once.
        idf_threshold: Lemmas with smoothed IDF strictly below this value are
            reported.
        max_df_ratio: Lemmas present in a larger share of documents than this
            are reported.

    Returns:
        A set of lemma strings. It is empty for an empty corpus.
    """
    total_docs = 0
    doc_freq = defaultdict(int)
    for text in tqdm(corpus, desc="Вычисление IDF", leave=False):
        # A set, because document frequency counts each lemma at most once
        # per document.
        lemmas = set()
        for token in nltk.word_tokenize(text):
            parsed = morph.parse(token)[0]
            if parsed.tag.POS not in bad_pos:
                lemmas.add(parsed.normal_form)
        for lemma in lemmas:
            doc_freq[lemma] += 1
        total_docs += 1

    # doc_freq is empty when total_docs == 0, so the division is never
    # evaluated for an empty corpus.
    return {word for word, df in doc_freq.items()
            if math.log((total_docs + 1) / (df + 1)) + 1 < idf_threshold or df / total_docs > max_df_ratio}

# 7. Основная функция обработки CSV-файла

In [None]:
def is_fully_processed(input_path, output_path):
    """Report whether the completion marker ``<output_path>.done`` exists.

    ``input_path`` is accepted but not used by the check.
    """
    done_marker = f"{output_path}.done"
    return os.path.exists(done_marker)

def save_processed_info(chunk_count, output_path):
    """Overwrite ``<output_path>.progress`` with the number of chunks done."""
    progress_file = f"{output_path}.progress"
    payload = str(chunk_count)
    with open(progress_file, 'w') as handle:
        handle.write(payload)

def get_last_processed_chunk(output_path):
    """Return the chunk count stored in ``<output_path>.progress``.

    Returns 0 when the progress file does not exist. If the file exists but
    does not hold an integer, the ``ValueError`` from ``int()`` propagates.
    """
    progress_file = f"{output_path}.progress"
    if not os.path.exists(progress_file):
        return 0
    with open(progress_file, 'r') as handle:
        stored = handle.read()
    return int(stored.strip())

In [None]:
def preprocess_csv(file_path, output_path, text_column, chunksize=5000):
    """Preprocess one CSV source chunk by chunk and write the cleaned result.

    Steps:
      1. Return immediately if ``<output_path>.done`` exists.
      2. Read the input once to count rows and chunks for the progress bar.
      3. Stream the input again. Chunks recorded in ``<output_path>.progress``
         are skipped. For every other chunk, rows with a missing or duplicate
         ``text_column`` are dropped, the ``date``/``year`` columns are
         normalised, ``text_column`` is lemmatised with ``full_preprocess``,
         rows whose text ends up empty are dropped, and the chunk is appended
         to ``output_path``.
      4. Compute IDF stop words over the whole output file, add them to
         ``russian_stopwords`` and append them to the custom stop-word file.
         They are not removed from the output of the current file.
      5. Drop words that occur exactly once, and words shorter than four
         characters, from the output, then rewrite it with every field quoted.
      6. Save a ``word:count`` frequency dictionary (counts above 1, most
         frequent first) and create the ``.done`` marker.

    Args:
        file_path: Input CSV path. The file name also selects source-specific
            date handling ('gnews_data.csv', 'cyberleninka_data.csv').
        output_path: Output CSV path. ``.progress`` and ``.done`` side files
            are created next to it.
        text_column: Name of the column holding the raw text.
        chunksize: Number of rows per chunk passed to ``pd.read_csv``.

    Returns:
        None.
    """
    if is_fully_processed(file_path, output_path):
        print(f"Файл уже полностью обработан: {output_path} — пропущен.\n")
        return

    print(f"Обработка файла: {file_path}")
    # Write the header only when the output file is being created.
    is_first_chunk = not os.path.exists(output_path)

    # First pass: count rows and chunks so the progress bar has a total.
    total_rows = 0
    total_chunks = 0
    for chunk in pd.read_csv(file_path, chunksize=chunksize):
        total_chunks += 1
        total_rows += len(chunk)
    print(f"Всего строк: {total_rows}, всего чанков: {total_chunks}")

    processed_chunks = get_last_processed_chunk(output_path)
    print(f"Последний обработанный чанк: {processed_chunks}/{total_chunks}")

    reader = pd.read_csv(file_path, chunksize=chunksize)
    with tqdm(total=total_chunks, initial=processed_chunks, unit='chunk', desc=os.path.basename(file_path)) as pbar:
        for i, chunk in enumerate(reader):
            # Resume: the reader always restarts at chunk 0, so skip by the
            # raw chunk index. The bar already starts at `processed_chunks`,
            # so skipped chunks must not advance it again.
            if i < processed_chunks:
                continue

            if text_column not in chunk.columns or chunk.empty:
                pbar.update(1)
                continue
            chunk.dropna(subset=[text_column], inplace=True)
            chunk.drop_duplicates(subset=[text_column], inplace=True)

            # Source-specific normalisation of the date / year columns.
            if 'gnews_data.csv' in file_path and 'date' in chunk.columns:
                chunk['date'] = pd.to_datetime(chunk['date'], errors='coerce', utc=True)
                chunk['year'] = chunk['date'].dt.year
                chunk['date'] = chunk['date'].dt.strftime('%Y-%m-%d')
            elif 'cyberleninka_data.csv' in file_path and 'year' in chunk.columns:
                # Only a year is available; build a date of January 1st.
                chunk['date'] = pd.to_datetime(chunk['year'].astype(str) + '-01-01', errors='coerce')
                chunk['year'] = chunk['date'].dt.year
                chunk['date'] = chunk['date'].dt.strftime('%Y-%m-%d')
            elif 'date' in chunk.columns:
                chunk['date'] = pd.to_datetime(chunk['date'], errors='coerce', utc=True)
                chunk['year'] = chunk['date'].dt.year

            if 'cyberleninka_data.csv' in file_path and 'author' in chunk.columns:
                # Flatten multi-line author fields onto a single line.
                chunk['author'] = chunk['author'].astype(str).str.replace(r'\n', ' ', regex=True)

            chunk[text_column] = chunk[text_column].apply(full_preprocess)
            # Drop rows whose text became empty after preprocessing.
            chunk = chunk[chunk[text_column].str.strip().astype(bool)]

            chunk.to_csv(output_path, mode='a', header=is_first_chunk, index=False)
            is_first_chunk = False
            save_processed_info(i + 1, output_path)
            pbar.update(1)

    if not os.path.exists(output_path):
        # No chunk produced any output (for example, the text column is
        # missing), so there is nothing to post-process.
        print(f"Нет обработанных данных для файла: {file_path} — пропущен.\n")
        return

    # Derive corpus statistics from the complete output file rather than from
    # this run's chunks only, so a resumed run sees the same corpus as an
    # uninterrupted one.
    df = pd.read_csv(output_path)
    df[text_column] = df[text_column].fillna('').astype(str)
    all_docs = df[text_column].tolist()

    idf_stopwords = get_idf_stopwords_streamed(all_docs)
    russian_stopwords.update(idf_stopwords)
    with open(custom_stopwords_path, 'a', encoding='utf-8') as f:
        for w in sorted(idf_stopwords):
            f.write(w + "\n")

    freq_clean = Counter(lemma for doc in all_docs for lemma in doc.split())
    rare = {w for w, cnt in freq_clean.items() if cnt == 1}

    # Remove words that occur only once and words shorter than 4 characters.
    df[text_column] = df[text_column].apply(
        lambda t: ' '.join(w for w in t.split() if w not in rare and len(w) >= 4)
    )
    df.to_csv(output_path, index=False, quoting=csv.QUOTE_ALL, quotechar='"')

    base = os.path.splitext(os.path.basename(file_path))[0]
    freq_path = f"/content/drive/MyDrive/TextScope/{base}_freq_dict.txt"
    with open(freq_path, 'w', encoding='utf-8') as f:
        for word, cnt in sorted(freq_clean.items(), key=lambda x: x[1], reverse=True):
            if cnt > 1:
                f.write(f"{word}:{cnt}\n")
    print(f"Частотный словарь сохранен: {freq_path}")

    # The marker is written last, so an interrupted run is never treated as
    # complete.
    with open(f"{output_path}.done", 'w', encoding='utf-8') as f:
        f.write("Готово\n")
    print(f"Файл обработан: {output_path}\n")

# 8. Запуск обработки всех файлов

In [None]:
# CSV sources to process: input path, output path and the name of the text
# column for each one.
csv_files = [
    {
        'path': '/content/drive/MyDrive/TextScope/vk_data.csv',
        'output': '/content/drive/MyDrive/TextScope/processed_vk_data.csv',
        'text_column': 'text',
    },
    {
        'path': '/content/drive/MyDrive/TextScope/gnews_data.csv',
        'output': '/content/drive/MyDrive/TextScope/processed_gnews_data.csv',
        'text_column': 'text',
    },
    {
        'path': '/content/drive/MyDrive/TextScope/cyberleninka_data.csv',
        'output': '/content/drive/MyDrive/TextScope/processed_cyberleninka_data.csv',
        'text_column': 'text',
    },
]

# Sources are handled one after another, in list order.
for file_info in csv_files:
    preprocess_csv(
        file_info['path'],
        file_info['output'],
        file_info['text_column'],
    )

Обработка файла: /content/drive/MyDrive/TextScope/vk_data.csv
Всего строк: 42395, всего чанков: 9
Последний обработанный чанк: 0/9


vk_data.csv: 100%|██████████| 9/9 [27:43<00:00, 184.85s/chunk]


Частотный словарь сохранен: /content/drive/MyDrive/TextScope/vk_data_freq_dict.txt
Топ-50 частых слов сохранен: /content/drive/MyDrive/TextScope/vk_data_top_words.txt
Файл обработан: /content/drive/MyDrive/TextScope/processed_vk_data.csv

Обработка файла: /content/drive/MyDrive/TextScope/gnews_data.csv
Всего строк: 16504, всего чанков: 4
Последний обработанный чанк: 0/4


gnews_data.csv: 100%|██████████| 4/4 [20:19<00:00, 304.78s/chunk]


Частотный словарь сохранен: /content/drive/MyDrive/TextScope/gnews_data_freq_dict.txt
Топ-50 частых слов сохранен: /content/drive/MyDrive/TextScope/gnews_data_top_words.txt
Файл обработан: /content/drive/MyDrive/TextScope/processed_gnews_data.csv

Обработка файла: /content/drive/MyDrive/TextScope/cyberleninka_data.csv
Всего строк: 22639, всего чанков: 5
Последний обработанный чанк: 0/5


cyberleninka_data.csv: 100%|██████████| 5/5 [2:47:07<00:00, 2005.54s/chunk]


Частотный словарь сохранен: /content/drive/MyDrive/TextScope/cyberleninka_data_freq_dict.txt
Топ-50 частых слов сохранен: /content/drive/MyDrive/TextScope/cyberleninka_data_top_words.txt
Файл обработан: /content/drive/MyDrive/TextScope/processed_cyberleninka_data.csv

