# 1. Установка и загрузка библиотек

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Установка необходимых библиотек
!pip install --upgrade --force-reinstall numpy
!pip install -U gensim summa tqdm

Mounted at /content/drive
Collecting numpy
  Downloading numpy-2.3.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.1/62.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.3.0-cp311-cp311-manylinux_2_28_x86_64.whl (16.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.9/16.9 MB[0m [31m60.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.3.0 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.3.0 which is incompatible.
cup

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting summa
  Downloading summa-1.2.0.tar.gz (54 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/54.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.9/54.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m4.7 MB/s[0m eta [

# 2. Перезапуск среды выполнения

In [None]:
# Manual step (Colab menu): Runtime → Restart session.
# NOTE(review): presumably needed so the kernel loads the packages installed in the previous cell — confirm.

# 3. Импорт библиотек

In [None]:
import pandas as pd
from gensim import corpora, models
from summa import keywords as summa_keywords
from tqdm.notebook import tqdm
import os, gc, csv

# 4. Настройки и путь к файлу Cyberleninka

In [None]:
# Input CSV, located on the Google Drive mounted at /content/drive.
path = '/content/drive/MyDrive/TextScope/processed_cyberleninka_data.csv'
# Output file sits next to the input as "<name>_keywords<ext>".
# Splitting off the extension (rather than str.replace on '.csv') touches only
# the real suffix and guarantees out_path differs from path even when the input
# name does not end in '.csv', so results can never be appended to the source.
out_path = '{}_keywords{}'.format(*os.path.splitext(path))
chunk_size = 500      # number of CSV rows read per pandas chunk
text_column = 'text'  # name of the CSV column that holds the document text

# 5. Функции извлечения кандидатов

In [None]:
def tokenize(text):
    """Clean a text by lower-casing it and splitting it on whitespace.

    Args:
        text: the document as a ``str``.

    Returns:
        list[str]: the lower-cased whitespace-separated tokens
        (an empty list for an empty or whitespace-only string).
    """
    lowered = text.lower()
    return lowered.split()

def top_tfidf_gensim(doc_bow, tfidf_model, dictionary, top_n=30):
    """Return the ``top_n`` highest-weighted TF-IDF terms of one document.

    Args:
        doc_bow: bag-of-words vector of the document, passed to ``tfidf_model[...]``.
        tfidf_model: object whose ``[doc_bow]`` lookup yields ``(term_id, weight)`` pairs.
        dictionary: mapping from term id to the term string.
        top_n: maximum number of terms to keep.

    Returns:
        set[str]: the terms with the largest weights (at most ``top_n`` of them).
    """
    weighted = list(tfidf_model[doc_bow])
    # Stable descending sort by weight, so ties keep the model's original order.
    weighted.sort(key=lambda pair: pair[1], reverse=True)
    return {dictionary[term_id] for term_id, _ in weighted[:top_n]}

def top_textrank(text, top_n=30, language="english"):
    """Return up to ``top_n`` TextRank keywords of ``text`` (via summa).

    Args:
        text: raw document text handed to ``summa.keywords.keywords``.
        top_n: number of leading entries of summa's result list to keep.
        language: language summa uses for stop words and stemming. The default
            equals summa's own default, so existing calls behave exactly as
            before; pass e.g. ``"russian"`` for Russian-language documents.

    Returns:
        set[str]: the first ``top_n`` keywords returned by summa.
    """
    terms = summa_keywords.keywords(text, language=language, split=True, scores=False)
    return set(terms[:top_n])

def union_candidates(tfidf_set, tr_set, top_k=30):
    """Merge the TF-IDF and TextRank candidate sets into one keyword list.

    Args:
        tfidf_set: set of candidate terms from TF-IDF.
        tr_set: set of candidate terms from TextRank.
        top_k: maximum number of keywords to return.

    Returns:
        list[str]: at most ``top_k`` terms of the union, in sorted order.
    """
    # Set iteration order depends on string hashing, which varies between
    # interpreter runs; sorting before truncating makes both the selected
    # subset and its order reproducible.
    return sorted(tfidf_set | tr_set)[:top_k]

# 6. Главная функция извлечения ключевых слов

In [None]:
MAX_WORDS = 5000  # documents with more tokens than this are skipped


def iter_corpus(path, dictionary):
    """Yield the bag-of-words vector of every kept document in the CSV.

    Uses ``iter_texts`` for reading, tokenizing and length filtering so both
    generators always agree on which documents are included.

    Args:
        path: path of the CSV file to stream.
        dictionary: object providing ``doc2bow(tokens)``.

    Yields:
        The result of ``dictionary.doc2bow(tokens)`` for each kept document.
    """
    for tokens in iter_texts(path):
        yield dictionary.doc2bow(tokens)


def iter_texts(path):
    """Yield the token list of every document that is not too long.

    The CSV is read in chunks of ``chunk_size`` rows and only ``text_column``
    is loaded, which keeps memory use low. Each value is converted with
    ``str()`` before tokenizing; documents with more than ``MAX_WORDS`` tokens
    are skipped.

    Args:
        path: path of the CSV file to stream.

    Yields:
        list[str]: tokens of one document.
    """
    reader = pd.read_csv(path, chunksize=chunk_size, usecols=[text_column])
    for chunk in reader:
        # Tokenize lazily, one document at a time, instead of building the
        # token lists for the whole chunk up front.
        for raw_text in chunk[text_column]:
            tokens = tokenize(str(raw_text))
            if len(tokens) > MAX_WORDS:
                continue  # skip overly long texts
            yield tokens

# Pass 1 over the CSV: build the vocabulary from the streamed token lists.
print("Строим словарь...")
dictionary = corpora.Dictionary(iter_texts(path))
# NOTE(review): the recorded output reports exactly 100000 entries, which
# matches the default keep_n cap of gensim's filter_extremes — confirm that
# truncating the vocabulary to that size is intended.
dictionary.filter_extremes(no_below=2, no_above=0.85)
print(f"Размер словаря: {len(dictionary)}")

# Pass 2 over the CSV: fit the TF-IDF model on a streamed bag-of-words corpus.
# `corpus` is a generator, so it is created again below rather than reused.
print("Строим TF-IDF модель...")
corpus = iter_corpus(path, dictionary)
tfidf_model = models.TfidfModel(corpus, dictionary=dictionary)

# Pass 3 over the CSV: list(...) holds every bag-of-words vector in memory.
# NOTE(review): neither index_corpus nor tfidf_corpus is referenced again in
# the visible cells; if nothing else needs them, dropping these two lines
# would save one full read of the file and the memory held by the list.
print("Создаем индекс для TF-IDF...")
index_corpus = list(iter_corpus(path, dictionary))
tfidf_corpus = tfidf_model[index_corpus]

print("Извлечение ключевых слов...")

reader = pd.read_csv(path, chunksize=chunk_size)

# Count lines to size the progress bar. The context manager closes the handle
# (the bare open() call leaked it); binary mode avoids any decoding work.
# This is only an estimate: a quoted field containing newlines spans several
# physical lines, so the real number of rows can be smaller.
with open(path, 'rb') as source_file:
    total_rows = sum(1 for _ in source_file) - 1  # minus the header line
total_chunks = max(0, -(-total_rows // chunk_size))  # ceiling division

for chunk_num, chunk in enumerate(tqdm(reader, desc="Обработка чанков", total=total_chunks)):
    chunk_keywords = []

    # Iterating the column directly gives tqdm a length for the inner bar and
    # does not depend on the column name being a valid attribute name.
    for raw_text in tqdm(chunk[text_column], desc=f"Чанк {chunk_num + 1}", leave=False):
        text = str(raw_text)
        tokens = tokenize(text)
        if len(tokens) > MAX_WORDS:
            chunk_keywords.append([])  # too long: keep the row, but with no keywords
            continue

        bow = dictionary.doc2bow(tokens)

        # Candidates from both extractors, merged into the final keyword list.
        tfidf_cand = top_tfidf_gensim(bow, tfidf_model, dictionary)
        tr_cand = top_textrank(text)

        final_keywords = union_candidates(tfidf_cand, tr_cand)
        chunk_keywords.append(final_keywords)

    chunk['keywords'] = chunk_keywords
    # The first chunk starts a fresh file with a header, later chunks append.
    # Appending whenever the file already existed made every re-run of the
    # notebook add duplicate rows to the previous run's output.
    mode = 'w' if chunk_num == 0 else 'a'
    header = chunk_num == 0
    chunk.to_csv(out_path, mode=mode, index=False, header=header, quoting=csv.QUOTE_ALL, quotechar='"')
    gc.collect()

print(f"Обработка завершена. Результат сохранен в: {out_path}")

Строим словарь...




Размер словаря: 100000
Строим TF-IDF модель...
Создаем индекс для TF-IDF...
Извлечение ключевых слов...


Обработка чанков: 0it [00:00, ?it/s]

Чанк 1: 0it [00:00, ?it/s]

Чанк 2: 0it [00:00, ?it/s]

Чанк 3: 0it [00:00, ?it/s]

Чанк 4: 0it [00:00, ?it/s]

Чанк 5: 0it [00:00, ?it/s]

Чанк 6: 0it [00:00, ?it/s]

Чанк 7: 0it [00:00, ?it/s]

Чанк 8: 0it [00:00, ?it/s]

Чанк 9: 0it [00:00, ?it/s]

Чанк 10: 0it [00:00, ?it/s]

Чанк 11: 0it [00:00, ?it/s]

Чанк 12: 0it [00:00, ?it/s]

Чанк 13: 0it [00:00, ?it/s]

Чанк 14: 0it [00:00, ?it/s]

Чанк 15: 0it [00:00, ?it/s]

Чанк 16: 0it [00:00, ?it/s]

Чанк 17: 0it [00:00, ?it/s]

Чанк 18: 0it [00:00, ?it/s]

Чанк 19: 0it [00:00, ?it/s]

Чанк 20: 0it [00:00, ?it/s]

Чанк 21: 0it [00:00, ?it/s]

Чанк 22: 0it [00:00, ?it/s]

Чанк 23: 0it [00:00, ?it/s]

Чанк 24: 0it [00:00, ?it/s]

Чанк 25: 0it [00:00, ?it/s]

Чанк 26: 0it [00:00, ?it/s]

Чанк 27: 0it [00:00, ?it/s]

Чанк 28: 0it [00:00, ?it/s]

Чанк 29: 0it [00:00, ?it/s]

Чанк 30: 0it [00:00, ?it/s]

Чанк 31: 0it [00:00, ?it/s]

Чанк 32: 0it [00:00, ?it/s]

Чанк 33: 0it [00:00, ?it/s]

Чанк 34: 0it [00:00, ?it/s]

Чанк 35: 0it [00:00, ?it/s]

Чанк 36: 0it [00:00, ?it/s]

Чанк 37: 0it [00:00, ?it/s]

Чанк 38: 0it [00:00, ?it/s]

Чанк 39: 0it [00:00, ?it/s]

Чанк 40: 0it [00:00, ?it/s]

Чанк 41: 0it [00:00, ?it/s]

Чанк 42: 0it [00:00, ?it/s]

Чанк 43: 0it [00:00, ?it/s]

Чанк 44: 0it [00:00, ?it/s]

Чанк 45: 0it [00:00, ?it/s]

Чанк 46: 0it [00:00, ?it/s]

Обработка завершена. Результат сохранен в: /content/drive/MyDrive/TextScope/processed_cyberleninka_data_keywords.csv
