In [1]:
import yaml
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable

import requests
from bs4 import BeautifulSoup, Tag

In [8]:
@dataclass
class QA:
    question: str
    answer: str
    
STORAGE_DIR = '/Users/jlinho/Desktop/scraping'    

In [27]:
class EuropcarScraper:
    def __init__(self, url='https://faq.europcar.com/', max_depth=3):
        self.qa = []
        EuropcarScraper._feed_qa_items(self.qa, url, max_depth)

    @staticmethod
    def _feed_qa_items(qas: list[QA], url: str, remaining_depth: int):
        print(f'Visiting {url} ...')
        if remaining_depth == 0:
            return

        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')

        for child_uri in EuropcarScraper._find_qa_page_links(soup):
            time.sleep(0.5)  # Pause avoiding to be rejected by website
            EuropcarScraper._feed_qa_items(qas, child_uri, remaining_depth - 1)

        qas.extend(EuropcarScraper._extract_qa(soup))

    @staticmethod
    def _find_qa_page_links(soup: BeautifulSoup) -> Iterable[str]:
        return list(EuropcarScraper._faq_big_icon_link(soup)) + list(EuropcarScraper._faq_big_simple_link(soup))

    @staticmethod
    def _extract_qa(soup: BeautifulSoup) -> Iterable[QA]:
        answer_divs = soup.find_all('div', class_='dydu_answer')
        for ans_div in answer_divs:
            if h2 := ans_div.find('h2'):
                question = h2.get_text(".", strip=True)
                if div_ans := ans_div.find('div', itemprop='acceptedAnswer'):
                    answer = ".".join([p.get_text(".", strip=True) for p in div_ans.find_all('p')])
                    yield QA(question, answer)

    @staticmethod
    def _faq_big_icon_link(soup: BeautifulSoup) -> Iterable[str]:
        for div in soup.find_all('div', {'class': 'dydu_thematic-icon'}):
            if par := div.parent:
                if par.name == 'a':
                    yield par['href']

    @staticmethod
    def _faq_big_simple_link(soup: BeautifulSoup) -> Iterable[str]:
        for li in soup.find_all('li', {'class': 'dydu_knowledge'}):
            if a := li.a:
                yield a['href']

In [15]:

class FedoraScraper:
    def __init__(self, url='https://fedoraproject.org/wiki/FAQ#Getting_Started'):
        self.qa: list[QA] = []
        FedoraScraper._feed_qa_items(self.qa, url)

    @staticmethod
    def _feed_qa_items(qas: list[QA], url: str):
        print(f'Visiting {url} ...')

        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')

        for q_item in FedoraScraper._find_qa_sections(soup):
            qas.append(FedoraScraper._extract_qa(q_item))

    @staticmethod
    def _find_qa_sections(soup: BeautifulSoup) -> Iterable[BeautifulSoup]:
        for item in soup.find_all('h3'):
            if item.get_text(strip=True).strip().endswith('?'):
                yield item

    @staticmethod
    def _extract_qa(item: Tag) -> QA:
        q = item.get_text(strip=True).strip()
        full_answer = FedoraScraper._extract_answer(item)
        return QA(q, full_answer)

    @staticmethod
    def _extract_answer(tag: Tag) -> str:
        content = []
        next_p = tag
        while (next_p := next_p.find_next_sibling()) and FedoraScraper._is_paragraph(next_p):
            answer = next_p.get_text().strip()
            if answer:
                content.append(answer)
        full_answer = ". ".join(content)
        return full_answer

    @staticmethod
    def _is_paragraph(tag: Tag) -> bool:
        return tag is not None and tag.name == 'p'

In [25]:
import numpy as np

fedora_scraper = FedoraScraper()
rand_indices = np.random.randint(0, len(fedora_scraper.qa), 10)
for idx in rand_indices:
    print(fedora_scraper.qa[idx])

Visiting https://fedoraproject.org/wiki/FAQ#Getting_Started ...
QA(question='Will Red Hat provide formal technical support for The Fedora Project?', answer="No, no formal Web or phone support for The Fedora Project will be available from Red Hat. Red Hat's supported product line will be based in part on a recent release of Fedora, and our development will be done externally as part of The Fedora Project as much as possible.")
QA(question='Why should I help?', answer="Your name in lights, an online CV, and maybe a trip to a FUDCon.. First, contribute to Fedora and you may get your name in the distribution.  Hey, fair is fair.. Second, if you're a Fedora contributor, you don't need a fancy resume; you can just tell potential employers to 'Google' your name.. Third, top Fedora contributors can receive travel stipends to attend the Fedora Users and Developers Conference nearest them.  You will have the opportunity to meet some of the giants of the open source movement in person, as their p

In [28]:
europcar_scraper = EuropcarScraper()

Visiting https://faq.europcar.com/ ...
Visiting https://faq.europcar.com/after-rental/ ...
Visiting https://faq.europcar.com/after-rental/can-i-get-a-copy-of-my-invoice.html ...
Visiting https://faq.europcar.com/after-rental/do-i-need-to-return-the-car-with-a-full-tank-of-fuel.html ...
Visiting https://faq.europcar.com/after-rental/what-happens-in-case-of-dispute-related-to-my-rental.html ...
Visiting https://faq.europcar.com/after-rental/can-i-retrieve-property-left-in-the-vehicle.html ...
Visiting https://faq.europcar.com/bookings/ ...
Visiting https://faq.europcar.com/bookings/how-do-i-know-if-my-reservationbooking-is-confirmed.html ...
Visiting https://faq.europcar.com/bookings/is-it-possible-to-modifycancel-my-reservation-after-it-is.html ...
Visiting https://faq.europcar.com/bookings/is-it-possible-to-view-my-reservation-after-it-is-confirmed.html ...
Visiting https://faq.europcar.com/bookings/what-is-the-europcar-reservation-guarantee-policy.html ...
Visiting https://faq.europca

In [29]:
rand_indices = np.random.randint(0, len(europcar_scraper.qa), 20)
for idx in rand_indices:
    print(europcar_scraper.qa[idx])

QA(question='What should I do if the rental vehicle is stolen?', answer='In case of theft of the vehicle, you shall contact the Europcar pick up location. You shall as well provide Europcar with a copy of the report of theft filed before by the local police authorities within two (2) business days with the keys and official papers of the Vehicle if those have not been stolen. \xa0A replacement car will be provided to you to cover your rental length..Please feel free to tell us why you don’t find this answer helpful.')
QA(question='What are Privilege credits?', answer='Privilege credits are defined as a unit value, counted in credits, and generated by the number of completed rentals or rental days..For your tier level, Privilege credits are cummulated within the 24-month membership period, which starts on the date of enrolment, upgrade, renewel or downgrade..For rewards, privilege credits are cummulated in each calendar year: 1st January to 31st December..Please feel free to tell us why

In [24]:
def save_scrapes_to_yaml(scraper, dest_filename):
    kb_file = Path(STORAGE_DIR) / dest_filename
    with open(kb_file, "w") as f:
        items = []
        for i in scraper.qa:
            items.append(i.question)
            items.append(i.answer)
        yaml.dump(items, f)

In [13]:
save_scrapes_to_yaml(fedora_scraper, 'fedora.yaml')
save_scrapes_to_yaml(europcar_scraper, 'europcar.yaml')