In [4]:
import json
import os
from urllib.parse import quote

import httpx
import requests
from bs4 import BeautifulSoup

In [5]:
class WikiScraper:
    """Find the best-matching English Wikipedia article for a topic and scrape it.

    Constructing an instance performs two network requests: a search against
    the MediaWiki API (``find_article``) followed by a download of the article
    HTML (``scrape_article``).

    Attributes:
        topic: Search phrase passed to the constructor.
        timeout: Per-request timeout in seconds.
        title: Title of the first search hit.
        page_url: URL of the article page built from ``title``.
        response: Most recent HTTP response (the search, then the article page).
        soup: Parsed article HTML.
        full_text: Text of all non-blank paragraphs, concatenated.
        bs4_article_headers: All h3 tags followed by all h2 tags of the article body.
        str_article_headers: Text of each entry in ``bs4_article_headers``.
        big_headers: Not populated by this class; stays ``None``.
    """

    _HEADING_TAGS = ("h1", "h2", "h3", "h4", "h5", "h6")
    # Punctuation kept literal in page URLs so common titles stay readable;
    # '?', '#', '%' and non-ASCII characters are percent-encoded.
    _URL_SAFE_CHARS = ";@$!*(),/~:"

    def __init__(self, topic, timeout=10.0):
        """Search for ``topic`` and scrape the first matching article.

        Args:
            topic: Search phrase sent to the Wikipedia search API.
            timeout: Seconds to wait for each HTTP request before giving up.

        Raises:
            LookupError: If the search returns no results.
            RuntimeError: If the article page cannot be loaded or parsed.
        """
        self.response = None
        self.soup = None
        self.full_text = None
        self.big_headers = None
        self.title = None
        self.page_url = None
        # Initialised up front so the accessors fail predictably if scraping
        # is interrupted half-way.
        self.bs4_article_headers = []
        self.str_article_headers = []
        self.topic = topic
        self.timeout = timeout
        self.find_article()
        self.scrape_article()

    @staticmethod
    def _is_heading_wrapper(node):
        """Return True if ``node`` is a ``<div class="mw-heading ...">`` element."""
        if getattr(node, "name", None) != "div":
            return False
        return "mw-heading" in (node.get("class") or [])

    @staticmethod
    def _heading_level(node):
        """Return 1-6 for a heading tag or a heading wrapper div, else ``None``."""
        name = getattr(node, "name", None)
        if name in WikiScraper._HEADING_TAGS:
            return int(name[1])
        if WikiScraper._is_heading_wrapper(node):
            inner = node.find(list(WikiScraper._HEADING_TAGS))
            if inner is not None:
                return int(inner.name[1])
        return None

    def get_subsection(self, header):
        """Return the paragraph text of the section titled ``header``.

        Paragraphs are collected from the heading up to the next heading of
        the same or a higher level, so an h3 subsection does not run on into
        the following h2 section. Each paragraph is followed by a newline.

        Args:
            header: Exact heading text, as listed by ``get_headers``.

        Returns:
            The concatenated paragraph text ('' if the section has none).

        Raises:
            ValueError: If no heading with that text exists.
        """
        target = next(
            (h for h in self.bs4_article_headers if h.text == header), None
        )
        if target is None:
            raise ValueError(f"No section header named {header!r}")

        target_level = self._heading_level(target)

        # When the heading sits inside a "mw-heading" wrapper div, the section
        # body is a sibling of that wrapper rather than of the heading tag.
        start = target
        parent = getattr(target, "parent", None)
        if self._is_heading_wrapper(parent):
            start = parent

        chunks = []
        for sibling in start.next_siblings:
            name = getattr(sibling, "name", None)
            if name == 'p':
                chunks.append(sibling.text + '\n')
                continue
            if target_level is None:
                # Unknown heading kind: fall back to matching the tag name.
                reached_end = name == target.name
            else:
                level = self._heading_level(sibling)
                reached_end = level is not None and level <= target_level
            if reached_end:
                break
        return ''.join(chunks)

    def get_headers(self):
        """Return all collected heading texts, one per line."""
        return '\n'.join(self.str_article_headers)

    def find_article(self):
        """Search Wikipedia for ``self.topic`` and record the first hit.

        Sets ``self.response``, ``self.title`` and ``self.page_url``.

        Raises:
            httpx.HTTPStatusError: If the search API answers with an error status.
            LookupError: If the search returns no results.
        """
        self.response = httpx.get(
            "https://en.wikipedia.org/w/api.php",
            params={
                "action": "query",
                "list": "search",
                "srsearch": self.topic,
                "format": "json",
            },
            timeout=self.timeout,
        )
        # Surface HTTP failures directly instead of reporting "No results".
        self.response.raise_for_status()

        search_results = self.response.json().get("query", {}).get("search", [])
        if not search_results:
            raise LookupError("No results found.")

        self.title = search_results[0]["title"]
        # Percent-encode the title so characters such as '?' or '#' are not
        # interpreted as the start of a query string or fragment.
        slug = quote(self.title.replace(' ', '_'), safe=self._URL_SAFE_CHARS)
        self.page_url = f"https://en.wikipedia.org/wiki/{slug}"

    def scrape_article(self):
        """Download ``self.page_url`` and extract paragraphs and headings.

        Sets ``self.response``, ``self.soup``, ``self.full_text``,
        ``self.bs4_article_headers`` and ``self.str_article_headers``.

        Raises:
            RuntimeError: If the page does not return HTTP 200 or has no
                element with id ``mw-content-text``.
        """
        # Without an explicit timeout requests waits indefinitely.
        self.response = requests.get(self.page_url, timeout=self.timeout)
        if self.response.status_code != 200:
            raise RuntimeError(f"Failed to load page {self.page_url}")

        self.soup = BeautifulSoup(self.response.content, 'html.parser')

        content_div = self.soup.find(id='mw-content-text')
        if not content_div:
            raise RuntimeError("Failed to find main content of the article")

        paragraphs = content_div.find_all('p')
        self.full_text = ''.join(
            p.text for p in paragraphs if p.get_text().strip() != ""
        )

        # Kept in the established order: every h3 first, then every h2.
        self.bs4_article_headers = (
            list(content_div.find_all('h3')) + list(content_div.find_all('h2'))
        )
        self.str_article_headers = [h.text for h in self.bs4_article_headers]

In [6]:
# Build the scraper for "Dogs"; the constructor runs the search and downloads the article.
scraper = WikiScraper('Dogs')

In [90]:
# Print the paragraph text collected after the "Skeleton" heading.
print(scraper.get_subsection(header='Skeleton'))

All healthy dogs, regardless of their size and type, have an identical skeletal structure with the exception of the number of bones in the tail, although there is significant skeletal variation between dogs of different types.[22][23] The dog's skeleton is well adapted for running; the vertebrae on the neck and back have extensions for back muscles, consisting of epaxial muscles and hypaxial muscles, to connect to; the long ribs provide room for the heart and lungs; and the shoulders are unattached to the skeleton, allowing for flexibility.[22][23][24]

Compared to the dog's wolf-like ancestors, selective breeding since domestication has seen the dog's skeleton larger in size for larger types such as mastiffs and miniaturised for smaller types such as terriers; dwarfism has been selectively used for some types where short legs are advantageous, such as dachshunds and corgis.[23] Most dogs naturally have 26 vertebrae in their tails, but some with naturally short tails have as few as thr

In [7]:
# List the text of every h3 and h2 heading collected from the article body, one per line.
print(scraper.get_headers())

Domestication
Breeds
Skeleton
Senses
Coat
Dewclaw
Tail
Lifespan
Reproduction
Nursing
Intelligence
Communication
Population
Competitors and predators
Diet
Range
Pets
Workers
Shows and sports
Dogs as food
Health risks
Health benefits
Cultural importance
Taxonomy
Evolution
Anatomy
Health
Behavior
Ecology
Roles with humans
Terminology
See also
References
Bibliography
External links


In [51]:
# NOTE(review): WikiScraper sets big_headers to None in __init__ and never assigns it
# afterwards, so on a fresh kernel this prints None. The recorded output appears to come
# from an earlier, out-of-order run — confirm and re-run or remove this cell.
print(scraper.big_headers)

[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
