<a href="https://colab.research.google.com/github/componavt/sns4human/blob/main/src/vk/nlp/vikiperdia_markers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This script searches for articles in Russian Wikipedia by given keywords, extracts meaningful Russian words and phrases from their titles, and then outputs them as sorted labels.

Этот скрипт ищет статьи в русской Википедии по заданным ключевым словам  извлекает из их заголовков осмысленные русские слова и словосочетания, а затем выводит их в виде отсортированных меток.

In [5]:
!pip install -q wikipedia-api nltk
!pip install  -q pymorphy3
import requests
import re
from pymorphy3 import MorphAnalyzer

morph = MorphAnalyzer()

In [22]:
def get_russian_wikipedia_titles(query, limit=8):
    endpoint = "https://ru.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": query,
        "srlimit": limit,
        "srprop": "",
        "srwhat": "text",
        "srinfo": "",
    }

    try:
        response = requests.get(endpoint, params=params)
        response.raise_for_status()
        data = response.json()
        articles = data.get("query", {}).get("search", [])
        return [article["title"] for article in articles]
    except requests.exceptions.RequestException as e:
        print(f"Ошибка при запросе к Wikipedia API: {e}")
        return []

def extract_russian_phrases(titles, morph):
    phrases = set()
    pos_keep = {'NOUN', 'ADJF', 'ADJS', 'VERB', 'INFN', 'PRTF', 'PRTS', 'GRND'}

    for title in titles:
        clean_title = re.sub(r'\([^)]*\)', '', title)
        clean_title = re.sub(r'[^а-яёА-ЯЁ\s-]', '', clean_title)
        clean_title = re.sub(r'\s+', ' ', clean_title).strip()

        words = [w.strip('-').lower() for w in clean_title.split() if w.strip('-')]
        single_words = set()
        for word in words:
            if len(word) < 3 or not re.fullmatch(r'[а-яё]+', word):
                continue

            parsed = morph.parse(word)[0]
            if parsed.tag.POS in pos_keep:
                normal_form = parsed.normal_form
                single_words.add(normal_form)

        for i in range(len(words) - 1):
            word1, word2 = words[i], words[i+1]

            if (len(word1) < 3 or len(word2) < 3 or
                not re.fullmatch(r'[а-яё]+', word1) or
                not re.fullmatch(r'[а-яё]+', word2)):
                continue

            parsed1 = morph.parse(word1)[0]
            parsed2 = morph.parse(word2)[0]

            if (parsed1.tag.POS == 'ADJF' and parsed2.tag.POS == 'NOUN' and
                parsed1.tag.gender == parsed2.tag.gender and
                parsed1.tag.number == parsed2.tag.number):

                noun_norm = parsed2.inflect({'nomn'}).word if 'nomn' in parsed2.tag else parsed2.normal_form
                adj_norm = parsed1.normal_form
                phrases.add(f"{adj_norm} {noun_norm}")
                phrases.add(f"{word1} {word2}")

            elif (parsed1.tag.POS == 'NOUN' and parsed2.tag.POS == 'NOUN' and
                  parsed2.tag.case == 'gent'):

                first_noun = parsed1.inflect({'nomn'}).word if 'nomn' in parsed1.tag else parsed1.normal_form
                second_noun = parsed2.normal_form
                phrases.add(f"{first_noun} {second_noun}")
                phrases.add(f"{word1} {word2}")

        phrases.update(single_words)

    return sorted(phrases, key=lambda x: (-len(x.split()), x.lower()))


query = "море, петроглиф, озеро, река, берег, побережье"
search_query = [word.strip().lower() for word in query.split(",")]
search_query = ' '.join(search_query)
titles = get_russian_wikipedia_titles(search_query)

print("\nНайденные заголовки статей:")
for i, title in enumerate(titles, 1):
  print(f"{i}. {title}")

phrases = extract_russian_phrases(titles, morph)

print("\nМетки:")
for phrase in phrases:
  print(f"- {phrase}")


Найденные заголовки статей:
1. Байкал
2. Онежские петроглифы
3. Онежское озеро
4. Убсу-Нур
5. Остров Врангеля
6. Итуруп
7. Африка
8. Поной

Метки:
- онежский озеро
- онежское озеро
- остров врангель
- остров врангеля
- африка
- байкал
- врангель
- итуруп
- озеро
- онежский
- остров
- петроглиф
- поной
