In [None]:
!pip install requests



In [None]:
import requests
import json
from typing import Dict, List, Union
import re
import os
from datetime import datetime
import bs4
from hashlib import md5

In [None]:
def get_wikipedia_data(query: str, lang: str = "ru", related_depth: int = 1) -> Dict[str, Union[str, List[str]]]:
    """Fetch the intro text, URL, links and image URLs of a Wikipedia article.

    Args:
        query: Article title passed to the API as ``titles``.
        lang: Wikipedia language subdomain (``https://{lang}.wikipedia.org``).
        related_depth: How many levels of linked articles to fetch recursively.
            ``0`` fetches only the article itself; each level repeats the full
            lookup for every link, so the request count grows quickly.

    Returns:
        A dict with the keys ``title``, ``extract``, ``url``, ``links``,
        ``images`` and ``related_articles`` (a list of dicts of the same
        shape). If the API response contains no pages at all, a dict with a
        single ``error`` key is returned instead.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
    """
    api_url = f"https://{lang}.wikipedia.org/w/api.php"
    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'extracts|info|links|images',
        'exintro': True,  # intro section only
        'explaintext': True,  # plain text instead of HTML
        'titles': query,
        'inprop': 'url',
        # 'pllimit': 'max',  # maximum number of links
        'imlimit': 'max'   # maximum number of images
    }

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(api_url, params=params, timeout=30)
    response.raise_for_status()

    # A response without a "query"/"pages" section used to raise KeyError here.
    pages = response.json().get('query', {}).get('pages', {})
    if not pages:
        return {"error": f"Статья '{query}' не найдена в Википедии."}

    page = next(iter(pages.values()))
    result = {
        'title': page.get('title'),
        'extract': page.get('extract'),
        'url': page.get('fullurl'),
        'links': [],
        'images': [],
        'related_articles': []
    }

    links = page.get('links', [])
    base_url = f"https://{lang}.wikipedia.org/wiki/"
    result['links'] = [base_url + link['title'].replace(' ', '_') for link in links]

    # Only raster image files are resolved to a direct URL.
    image_suffixes = ('.jpg', '.jpeg', '.png', '.gif')
    for image in page.get('images', []):
        image_title = image['title']
        if not image_title.lower().endswith(image_suffixes):
            continue
        image_info_params = {
            'action': 'query',
            'format': 'json',
            'titles': image_title,
            'prop': 'imageinfo',
            'iiprop': 'url'
        }
        image_response = requests.get(api_url, params=image_info_params, timeout=30)
        image_response.raise_for_status()
        image_pages = image_response.json().get('query', {}).get('pages', {})
        image_page = next(iter(image_pages.values()), {})
        if image_page.get('imageinfo'):
            result['images'].append(image_page['imageinfo'][0]['url'])

    if related_depth > 0:
        # Pass the remaining depth down so values above 1 are honoured
        # (previously the recursion always stopped after one level).
        result['related_articles'] = [
            get_wikipedia_data(link['title'], lang=lang, related_depth=related_depth - 1)
            for link in links
        ]

    return result

In [None]:
def get_wikidata_data(query: str, lang: str = "ru", related_depth: int = 1) -> Dict[str, Union[str, Dict]]:
    """Look up the best-matching Wikidata entity and summarise it.

    The first ``wbsearchentities`` hit for ``query`` is taken, its full record
    is loaded from ``Special:EntityData`` and the label, description, P18
    image URLs and (optionally) the entities referenced by its claims are
    collected.

    Args:
        query: Search text (an entity id such as ``Q42`` is also sent as a
            search term; the recursive calls rely on this).
        lang: Language code used for the search, the label and the description.
        related_depth: How many levels of referenced entities to resolve
            recursively; ``0`` resolves none.

    Returns:
        A dict with the keys ``title``, ``description``, ``url``, ``images``
        and ``related_entities``. If the search yields nothing, or the entity
        record is empty, a dict with a single ``error`` key is returned.

    Raises:
        requests.HTTPError: If either request answers with an HTTP error status.
    """
    not_found = {"error": f"Сущность '{query}' не найдена в Викиданных."}

    search_params = {
        'action': 'wbsearchentities',
        'search': query,
        'language': lang,
        'format': 'json',
        'limit': 1
    }
    response = requests.get("https://www.wikidata.org/w/api.php", params=search_params, timeout=30)
    response.raise_for_status()

    # An empty result list used to raise IndexError on `[0]`.
    hits = response.json().get('search', [])
    if not hits:
        return not_found

    entity_id = hits[0]['id']
    entity_url = f"https://www.wikidata.org/wiki/Special:EntityData/{entity_id}.json"
    entity_response = requests.get(entity_url, timeout=30)
    entity_response.raise_for_status()

    entities = entity_response.json().get('entities', {})
    # NOTE(review): the payload may be keyed by a different id than requested
    # (presumably after a redirect) - fall back to the only entity returned.
    entity = entities.get(entity_id) or next(iter(entities.values()), None)
    if entity is None:
        return not_found

    labels = entity.get('labels', {})
    result = {
        'title': labels[lang]['value'] if lang in labels else "Название не найдено",
        'description': entity.get('descriptions', {}).get(lang, {}).get('value', "Описание не найдено"),
        'url': f"https://www.wikidata.org/wiki/{entity_id}",
        'images': [],
        'related_entities': []
    }

    claims = entity.get('claims', {})

    # P18 claims carry file names that are turned into Special:FilePath URLs.
    for img_claim in claims.get('P18', []):
        datavalue = img_claim.get('mainsnak', {}).get('datavalue')
        if datavalue is None:
            continue
        image_title = datavalue['value']
        image_url = f"https://commons.wikimedia.org/wiki/Special:FilePath/{image_title.replace(' ', '_')}"
        result['images'].append(image_url)

    if related_depth > 0:
        # The same entity is often referenced by several claims; fetch it once
        # and reuse the result so the output keeps one entry per reference.
        fetched = {}
        for prop_claims in claims.values():
            for claim in prop_claims:
                datavalue = claim.get('mainsnak', {}).get('datavalue')
                if not datavalue or datavalue.get('type') != 'wikibase-entityid':
                    continue
                related_entity_id = datavalue['value']['id']
                if related_entity_id not in fetched:
                    # Pass the remaining depth down so values above 1 are honoured.
                    fetched[related_entity_id] = get_wikidata_data(
                        related_entity_id, lang=lang, related_depth=related_depth - 1
                    )
                result['related_entities'].append(fetched[related_entity_id])

    return result

In [None]:
def parse_wikitext(content: str) -> Dict[str, Union[str, List[str]]]:
    """Extract the semantic sections of a Russian Wiktionary entry.

    Looks for the level-4 headings "Значение", "Синонимы", "Антонимы",
    "Гиперонимы" and "Гипонимы" and returns the ``#`` list items under each.

    A section ends at the next heading of any level (a line starting with
    ``==``) or at the end of the text, so the last section and sections
    followed by a ``===`` heading are captured, and an empty section no longer
    swallows the one after it. List markers are stripped from every item,
    including the first.

    Args:
        content: Raw wikitext of the page.

    Returns:
        A dict keyed by the lowercase section name; each value is a list of
        item strings (empty when the section is absent or has no items).
    """
    section_keys = ('значение', 'синонимы', 'антонимы', 'гиперонимы', 'гипонимы')
    result = {key: [] for key in section_keys}

    for key in section_keys:
        heading = key.capitalize()  # headings are the capitalised key
        pattern = re.compile(
            r'^====[ \t]*' + re.escape(heading) + r'[ \t]*====[ \t]*\n(.*?)(?=^==|\Z)',
            re.DOTALL | re.MULTILINE,
        )
        match = pattern.search(content)
        if not match:
            continue

        items = []
        for chunk in match.group(1).split('\n#'):
            # The first chunk still starts with its own '#': drop the marker.
            text = chunk.strip().lstrip('#').strip()
            if text:
                items.append(text)
        result[key] = items

    return result

def get_wiktionary_data(query: str, lang: str = "ru", related_depth: int = 1) -> Dict[str, Union[str, List[str]]]:
    """Fetch a Wiktionary entry, its parsed sections and its linked entries.

    The title is tried as given, lower-cased, capitalised and upper-cased
    (duplicates are requested only once); the first variant that exists wins.

    Args:
        query: Word or phrase to look up.
        lang: Wiktionary language subdomain (``https://{lang}.wiktionary.org``).
        related_depth: How many levels of linked entries to fetch recursively;
            ``0`` fetches only the entry itself.

    Returns:
        A dict with the keys ``title``, ``extract`` (the result of
        ``parse_wikitext`` or ``''`` when no wikitext is available), ``url``,
        ``links`` and ``related_articles``. When no variant exists, a dict with
        a single ``error`` key is returned.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
    """
    api_url = f"https://{lang}.wiktionary.org/w/api.php"

    def check_variants(variants):
        """Return the first existing page among ``variants``, or None."""
        for variant in variants:
            params = {
                'action': 'query',
                'format': 'json',
                'prop': 'revisions|links',
                'rvprop': 'content',
                'rvslots': '*',
                'titles': variant,
            }
            response = requests.get(api_url, params=params, timeout=30)
            response.raise_for_status()
            pages = response.json().get('query', {}).get('pages', {})
            page = next(iter(pages.values()), None)
            # The API flags absent/illegal titles by the mere presence of the
            # "missing"/"invalid" keys (their value is an empty string).
            if page is not None and 'missing' not in page and 'invalid' not in page:
                return page
        return None

    # dict.fromkeys drops repeated variants while keeping their order, so an
    # already lower-case query is not requested twice.
    variants = list(dict.fromkeys([query, query.lower(), query.capitalize(), query.upper()]))

    page = check_variants(variants)
    if page is None:
        return {"error": f"Статья '{query}' не найдена в Викисловаре."}

    result = {
        'title': page.get('title'),
        'extract': '',
        'url': f"https://{lang}.wiktionary.org/wiki/{page.get('title').replace(' ', '_')}",
        'links': [],
        'related_articles': []
    }

    if 'revisions' in page:
        revision_content = page['revisions'][0]['slots']['main'].get('*')
        if revision_content:
            result['extract'] = parse_wikitext(revision_content)

    links = page.get('links', [])
    base_url = f"https://{lang}.wiktionary.org/wiki/"
    result['links'] = [base_url + link['title'].replace(' ', '_') for link in links]

    if related_depth > 0:
        # Pass the remaining depth down so values above 1 are honoured.
        result['related_articles'] = [
            get_wiktionary_data(link['title'], lang=lang, related_depth=related_depth - 1)
            for link in links
        ]

    return result

In [None]:
def get_commons_data(query: str, lang: str = "commons") -> Dict[str, Union[str, List[Dict[str, str]]]]:
    """Search the file namespace of Wikimedia Commons for ``query``.

    Args:
        query: Full-text search string (``srsearch``).
        lang: Subdomain of ``wikimedia.org`` whose API is queried.

    Returns:
        A dict with ``title`` (the query itself) and ``images``, a list of
        dicts holding each hit's ``title``, its file-page ``url`` on Commons
        and ``description`` (the search snippet as returned by the API, or
        ``''`` when absent).

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
    """
    url = f"https://{lang}.wikimedia.org/w/api.php"
    params = {
        'action': 'query',
        'format': 'json',
        'list': 'search',
        'srsearch': query,
        'srnamespace': 6,  # restrict the search to namespace 6
    }

    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()
    # The raw payload is no longer printed: it was leftover debugging output
    # that flooded the notebook with the entire API response.

    result = {
        'title': query,
        'images': [],
    }

    for item in data.get('query', {}).get('search', []):
        image_title = item['title']
        result['images'].append({
            'title': image_title,
            'url': f"https://commons.wikimedia.org/wiki/{image_title.replace(' ', '_')}",
            'description': item.get('snippet', ''),
        })

    return result

In [None]:
def get_picture_of_the_day(lang: str = "en"):
    """Download today's Commons "Picture of the day" and its caption.

    Reads the template ``Template:Potd/<YYYY-MM-DD>`` for the image and
    ``Template:Potd/<YYYY-MM-DD> (<lang>)`` for the caption, downloads the
    image from upload.wikimedia.org and writes two files into the current
    working directory:

    * ``<YYYY-MM-DD>_<file name>`` - the image bytes;
    * ``<YYYY-MM-DD>_<file name without extension>.txt`` - title, description
      and image URL.

    The file names use the calendar date only. The previous full ISO
    timestamp contained colons, which are not valid in Windows file names,
    and the date is now computed once so all names and the template title
    agree.

    Args:
        lang: Language code of the caption template.

    Returns:
        None. Failures are reported with ``print`` and end the function early.
    """
    today = datetime.today()
    current_date = today.strftime('%Y-%m-%d')

    title = f"Template:Potd/{current_date}"

    url = "https://commons.wikimedia.org/w/api.php"

    parse_query_params = {
        "action": "parse",
        "format": "json",
        "prop": "text",
        "page": f"{title} ({lang})"
    }

    response = requests.get(url, params=parse_query_params, timeout=30)
    data = response.json()

    if 'parse' not in data:
        print(f"Не удалось найти Картинку дня для {current_date}.")
        return

    # Fall back to empty markup so the parser never receives None.
    potd_caption = data['parse']['text'].get('*') or ''

    potd_soup = bs4.BeautifulSoup(potd_caption, 'html.parser')
    description_div = potd_soup.find('div', class_='description')
    description = description_div.text.strip() if description_div else "Описание не найдено."

    image_query_params = {
        "action": "query",
        "format": "json",
        "prop": "images",
        "titles": title
    }

    image_response = requests.get(url, params=image_query_params, timeout=30)
    image_data = image_response.json()

    pages = image_data.get('query', {}).get('pages', {})
    first_page = next(iter(pages.values()), {})
    if not first_page.get('images'):
        print(f"Изображение не найдено для {current_date}.")
        return

    # Full page title with namespace, e.g. "File:Some_name.jpg".
    filename = first_page['images'][0]['title'].replace(' ', '_')
    # Bare file name without the namespace prefix.
    bare_name = filename.split(':', 1)[-1]

    # The upload path is built from the first one and two hex digits of the
    # MD5 of the file name.
    md5_hash = md5(bare_name.encode('utf-8')).hexdigest()
    image_url = f"https://upload.wikimedia.org/wikipedia/commons/{md5_hash[0]}/{md5_hash[0:2]}/{bare_name}"

    image_filename = f"{current_date}_{bare_name}"
    # splitext removes only the final extension, so names containing dots
    # ("St._Peter.jpg") are no longer cut at the first dot.
    text_filename = f"{current_date}_{os.path.splitext(bare_name)[0]}.txt"

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
        'Referer': f'https://commons.wikimedia.org/wiki/{filename}',  # the file page is sent as the Referer
    }

    image_response = requests.get(image_url, headers=headers, timeout=60)
    if image_response.status_code != 200:
        print(f"Не удалось скачать изображение: {image_url}")
        return

    with open(image_filename, 'wb') as img_file:
        img_file.write(image_response.content)

    with open(text_filename, 'w', encoding='utf-8') as text_file:
        text_file.write(f"Title: {filename}\n")
        text_file.write(f"Description: {description}\n")
        text_file.write(f"Image URL: {image_url}\n")

In [None]:
def collect_data(query: str) -> Dict[str, Union[str, List]]:
    """Query every source for ``query`` and gather the answers in one dict.

    The sources are asked in the order Wikipedia, Wikidata, Wiktionary,
    Commons, each with its default arguments. Afterwards the picture of the
    day is fetched for its side effects only; its return value is not used.

    Returns:
        A dict mapping ``'wikipedia'``, ``'wikidata'``, ``'wiktionary'`` and
        ``'commons'`` to whatever the respective getter returned.
    """
    sources = (
        ('wikipedia', get_wikipedia_data),
        ('wikidata', get_wikidata_data),
        ('wiktionary', get_wiktionary_data),
        ('commons', get_commons_data),
    )

    collected = {}
    for source_name, fetch in sources:
        collected[source_name] = fetch(query)

    get_picture_of_the_day()

    return collected

In [None]:
# Sample run: collect data from every source for one query and pretty-print
# the combined result as JSON (non-ASCII characters are kept readable).
query = "Чёрная металлургия"
data = collect_data(query)

print(
    json.dumps(
        data,
        indent=4,
        ensure_ascii=False,
    )
)

{'batchcomplete': '', 'continue': {'sroffset': 10, 'continue': '-||'}, 'query': {'searchinfo': {'totalhits': 57, 'suggestion': 'чёрной металлургии', 'suggestionsnippet': '<em>чёрной металлургии</em>'}, 'search': [{'ns': 6, 'title': 'File:Известия высших учебных заведений. Черная металлургия. 1998, выпуск 12 (6).png', 'pageid': 149204146, 'size': 2339, 'wordcount': 103, 'snippet': 'DescriptionИзвестия высших учебных заведений. <span class="searchmatch">Черная</span> <span class="searchmatch">металлургия</span>. 1998, выпуск 12 (6).png English: Physical chemistry and theory of metallurgical processes', 'timestamp': '2024-07-01T21:01:29Z'}, {'ns': 6, 'title': 'File:Известия высших учебных заведений. Черная металлургия. 1998, выпуск 17.png', 'pageid': 149204144, 'size': 2339, 'wordcount': 103, 'snippet': 'DescriptionИзвестия высших учебных заведений. <span class="searchmatch">Черная</span> <span class="searchmatch">металлургия</span>. 1998, выпуск 17.png English: Physical chemistry and the