In [62]:
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable

import requests
from bs4 import BeautifulSoup, Tag



In [63]:
@dataclass
class QA:
    question: str
    answer: str

In [64]:

class NIHScraper:
    def __init__(self, url="https://seed.nih.gov/faqs"):
        self.qa: list[QA] = []
        NIHScraper._feed_qa_items(self.qa, url)

    @staticmethod
    def _feed_qa_items(qas: list[QA], url: str):
        print(f"Visiting {url} ...")

        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser")

        for q_item in NIHScraper._find_qa_sections(soup):
            qas.append(NIHScraper._extract_qa(q_item))

    @staticmethod
    def _find_qa_sections(soup: BeautifulSoup) -> Iterable[BeautifulSoup]:
        for item in soup.find_all("span", {"class": "faq-question"}):
            yield item

    @staticmethod
    def _extract_qa(item: Tag) -> QA:
        q = item.get_text(strip=True).strip()
        full_answer = NIHScraper._extract_answer(item)
        return QA(q, full_answer)

    @staticmethod
    def _extract_answer(tag: Tag) -> str:
        content = []
        answer_div = tag.find_next_sibling()
        assert answer_div.name == "div" and answer_div["class"] == ["faq-answer"]

        for p in answer_div.find_all("p"):
            answer = p.get_text().strip()
            if answer:
                content.append(answer + "." if not answer.endswith(".") else answer)

        return "\n".join(content)







In [72]:
import yaml


def save_scrape(scraper, dest_filename):
    kb_file = Path('/home/skfl/learning/nb/') / dest_filename
    with open(kb_file, "w") as f:
        items = []
        for i in scraper.qa:
            items.append(i.question)
            items.append(i.answer)
        yaml.dump(items, f)


if __name__ == "__main__":
    scrapers = {"NIH": NIHScraper}
    for name, scraper in scrapers.items():
        save_scrape(scraper(), f"{name}.yml")

Visiting https://seed.nih.gov/faqs ...


In [73]:

class ODScraper:
    def __init__(self, url="https://www.opendoor.com/w/faq"):
        self.qa: list[QA] = []
        ODScraper._feed_qa_items(self.qa, url)

    @staticmethod
    def _feed_qa_items(qas: list[QA], url: str):
        print(f"Visiting {url} ...")

        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser")

        for q_item in ODScraper._find_qa_sections(soup):
            qas.append(ODScraper._extract_qa(q_item))           

    @staticmethod
    def _find_qa_sections(soup: BeautifulSoup) -> Iterable[BeautifulSoup]:
        for item in soup.find_all("h6", {"class": "article-title"}):
            yield item

    @staticmethod
    def _extract_qa(item: Tag) -> QA:
        q = item.get_text(strip=True).strip()
        for item in soup.find_all("p", {"class": "article-"}):
            yield item
        full_answer = ODScraper._extract_answer(item)
        return QA(q, full_answer)

    @staticmethod
    def _extract_qa(soup: BeautifulSoup) -> Iterable[QA]:
        answer_divs = soup.find_all("div", class_="article-")
        for ans_div in answer_divs:
            if h2 := ans_div.find("h6"):
                question = h2.get_text(".", strip=True)
                if div_ans := ans_div.find("div", itemprop="acceptedAnswer"):
                    answer = "\n".join(
                        [p.get_text("\n", strip=True) for p in div_ans.find_all("p")]
                    )
                    yield QA(question, answer)


    @staticmethod
    def _extract_answer(tag: Tag) -> str:
        content = []
        answer_div = tag.find_next_sibling()
        assert answer_div.name == "div" and answer_div["class"] == ["entry-content"]

        for p in answer_div.find_all("p"):
            answer = p.get_text().strip()
            if answer:
                content.append(answer + "." if not answer.endswith(".") else answer)

        return "\n".join(content)

In [74]:
import yaml


def save_scrape(scraper, dest_filename):
    kb_file = Path('/home/skfl/learning/nb/')/ dest_filename
    with open(kb_file, "w") as f:
        items = []
        for i in scraper.qa:
            items.append(i.question)
            items.append(i.answer)
        yaml.dump(items, f)


if __name__ == "__main__":
    scrapers = {"OD": ODScraper}
    for name, scraper in scrapers.items():
        save_scrape(scraper(), f"{name}.yml")

Visiting https://www.opendoor.com/w/faq ...


AttributeError: 'generator' object has no attribute 'question'

In [None]:
import yaml


def save_scrape(scraper, dest_filename):
    kb_file = Path('/home/skfl/learning/nb/') / dest_filename
    with open(kb_file, "w") as f:
        items = []
        for i in scraper.qa:
            items.append(i.question)
            items.append(i.answer)
        yaml.dump(items, f)


if __name__ == "__main__":
    scrapers = {"OD": ODScraper}
    for name, scraper in scrapers.items():
        save_scrape(scraper(), f"{name}.yml")

In [41]:
class WwfScraper:
    def __init__(self, url="https://www.wwf.org.uk/faqs"):
        self.qa: list[QA] = []
        WwfScraper._feed_qa_items(self.qa, url)

    @staticmethod
    def _feed_qa_items(qas: list[QA], url: str):
        print(f"Visiting {url} ...")

        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser")

        for q_item in WwfScraper._find_qa_sections(soup):
            qas.append(WwfScraper._extract_qa(q_item))

    @staticmethod
    def _find_qa_sections(soup: BeautifulSoup) -> Iterable[BeautifulSoup]:
        for item in soup.find_all("h3", {"class": "faqfield-question"}):
            yield item

    @staticmethod
    def _extract_qa(item: Tag) -> QA:
        q = item.get_text(strip=True).strip()
        full_answer = WwfScraper._extract_answer(item)
        return QA(q, full_answer)

    @staticmethod
    def _extract_answer(tag: Tag) -> str:
        content = []
        answer_div = tag.find_next_sibling()
        assert answer_div.name == "div" and answer_div["class"] == ["faqfield-answer"]

        for p in answer_div.find_all("p"):
            answer = p.get_text().strip()
            if answer:
                content.append(answer + "." if not answer.endswith(".") else answer)

        return "\n".join(content)

In [46]:
import yaml


def save_scrape(scraper, dest_filename):
    kb_file = Path('/home/skfl/learning/nb/') / dest_filename
    with open(kb_file, "w") as f:
        items = []
        for i in scraper.qa:
            items.append(i.question)
            items.append(i.answer)
        yaml.dump(items, f)


if __name__ == "__main__":
    scrapers = {"wwf": WwfScraper}
    for name, scraper in scrapers.items():
        save_scrape(scraper(), f"{name}.yml")

Visiting https://www.wwf.org.uk/faqs ...


In [None]:
class EuropcarScraper:
    def __init__(self, url="https://faq.europcar.com/", max_depth=3):
        self.qa = []
        EuropcarScraper._feed_qa_items(self.qa, url, max_depth)

    @staticmethod
    def _feed_qa_items(qas: list[QA], url: str, remaining_depth: int):
        print(f"Visiting {url} ...")
        if remaining_depth == 0:
            return

        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser")

        for child_uri in EuropcarScraper._find_qa_page_links(soup):
            time.sleep(1)  # Pause avoiding to be rejected by website
            EuropcarScraper._feed_qa_items(qas, child_uri, remaining_depth - 1)

        qas.extend(EuropcarScraper._extract_qa(soup))

    @staticmethod
    def _find_qa_page_links(soup: BeautifulSoup) -> Iterable[str]:
        return list(EuropcarScraper._faq_big_icon_link(soup)) + list(
            EuropcarScraper._faq_big_simple_link(soup)
        )

    @staticmethod
    def _extract_qa(soup: BeautifulSoup) -> Iterable[QA]:
        answer_divs = soup.find_all("div", class_="dydu_answer")
        for ans_div in answer_divs:
            if h2 := ans_div.find("h2"):
                question = h2.get_text(".", strip=True)
                if div_ans := ans_div.find("div", itemprop="acceptedAnswer"):
                    answer = "\n".join(
                        [p.get_text("\n", strip=True) for p in div_ans.find_all("p")]
                    )
                    yield QA(question, answer)

    @staticmethod
    def _faq_big_icon_link(soup: BeautifulSoup) -> Iterable[str]:
        for div in soup.find_all("div", {"class": "dydu_thematic-icon"}):
            if par := div.parent:
                if par.name == "a":
                    yield par["href"]

    @staticmethod
    def _faq_big_simple_link(soup: BeautifulSoup) -> Iterable[str]:
        for li in soup.find_all("li", {"class": "dydu_knowledge"}):
            if a := li.a:
                yield a["href"]

In [34]:
import yaml


def save_scrape(scraper, dest_filename):
    kb_file = Path('/home/skfl/learning/nb/') / dest_filename
    with open(kb_file, "w") as f:
        items = []
        for i in scraper.qa:
            items.append(i.question)
            items.append(i.answer)
        yaml.dump(items, f)


if __name__ == "__main__":
    scrapers = {"europcar": EuropcarScraper}
    for name, scraper in scrapers.items():
        save_scrape(scraper(), f"{name}.yml")

Visiting https://faq.europcar.com/ ...
Visiting https://faq.europcar.com/after-rental/ ...
Visiting https://faq.europcar.com/after-rental/can-i-get-a-copy-of-my-invoice.html ...
Visiting https://faq.europcar.com/after-rental/do-i-need-to-return-the-car-with-a-full-tank-of-fuel.html ...
Visiting https://faq.europcar.com/after-rental/what-happens-in-case-of-dispute-related-to-my-rental.html ...
Visiting https://faq.europcar.com/after-rental/can-i-retrieve-property-left-in-the-vehicle.html ...
Visiting https://faq.europcar.com/bookings/ ...
Visiting https://faq.europcar.com/bookings/how-do-i-know-if-my-reservationbooking-is-confirmed.html ...
Visiting https://faq.europcar.com/bookings/is-it-possible-to-modifycancel-my-reservation-after-it-is.html ...
Visiting https://faq.europcar.com/bookings/is-it-possible-to-view-my-reservation-after-it-is-confirmed.html ...
Visiting https://faq.europcar.com/bookings/what-is-the-europcar-reservation-guarantee-policy.html ...
Visiting https://faq.europca

In [None]:
"https://www.pillsburybaking.com/frequently-asked-questions/