In [56]:
from bs4 import BeautifulSoup
from selenium import webdriver 
from selenium.webdriver.chrome.service import Service
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options 
from selenium.webdriver.chrome.service import Service
import os 
import re
from tqdm import tqdm
import time
import csv

In [57]:
# --- Browser setup: headless Chrome, used so JavaScript-driven pages can be loaded ---

# Run Chrome without a visible window.
chrome_options = Options()
chrome_options.add_argument('--headless=new')

# The chromedriver binary is looked up in the current working directory.
chrome_driver = os.path.join(os.getcwd(), "chromedriver")
service = Service(chrome_driver)

# Browser session driven by the scraping code.
# NOTE(review): nothing in the visible cells calls driver.quit(); consider closing the
# session once scraping is finished.
driver = webdriver.Chrome(options=chrome_options, service=service)

In [58]:
# Search-results URL used as the starting point for scraping. The query string sets
# rows=10 and order_by=_oreilly_popularity; the page loop appends "&page=N" for pages > 1.
BASE_URL = "https://www.oreilly.com/search/skills/?rows=10&order_by=_oreilly_popularity"

In [None]:
# Walk the search-result pages and collect the title and URL of every result whose
# format label is "Book". `level` and `overview` are left empty here; they are filled
# in later from each book's own page.
MAX_PAGES = 50         # number of search-result pages to visit
PAGE_LOAD_DELAY = 2    # seconds to wait after navigation before reading the page source
STABLE_CARD_SELECTOR = "article[data-testid^='search-card-']"

all_records = []
seen_urls = set()      # URLs already recorded, so a book is only kept once

for page in tqdm(range(1, MAX_PAGES + 1), desc="Scraping Pages"):
    # The first page is the bare search URL; later pages add an explicit page parameter.
    page_url = BASE_URL if page <= 1 else BASE_URL + '&page=' + str(page)

    driver.get(page_url)
    time.sleep(PAGE_LOAD_DELAY)

    soup = BeautifulSoup(driver.page_source, "html.parser")

    for card in soup.select(STABLE_CARD_SELECTOR):
        # The format text is the node that directly follows the srOnly span inside
        # the format label. Any piece of that chain may be absent, so each step is
        # guarded instead of letting one odd card abort the whole scrape.
        label_tag = card.select_one('strong[data-testid^="format-label-"]')
        sr_span = label_tag.find('span', {'data-viz': 'srOnly'}) if label_tag is not None else None
        label_node = sr_span.next_sibling if sr_span is not None else None
        # Only a text node (a str subclass) can carry the label; a Tag or None cannot.
        content_type = label_node.strip() if isinstance(label_node, str) else ''

        if content_type != 'Book':
            continue

        link = card.select_one("h3 a")
        if link is None:
            continue

        title = link.get_text(strip=True)
        book_url = (link.get("href") or "").strip()

        # A record without a title or without a URL cannot be used downstream.
        if not title or not book_url:
            continue

        if book_url in seen_urls:
            continue

        seen_urls.add(book_url)

        all_records.append(
            {
                'title': title,
                'level': '',
                'overview': '',
                'url': book_url,
            }
        )

print(len(all_records))
print(all_records[:10])

Scraping Pages:   8%|▊         | 4/50 [00:11<02:13,  2.90s/it]

In [None]:
def clean_overview(text: str) -> str:
    """Normalize overview text into a single clean line.

    Bullet characters ("•" and "·") are removed and every run of whitespace
    (spaces, tabs, newlines) is collapsed into one space. The result carries
    no leading or trailing whitespace, including when the input starts or
    ends with a bullet.

    Args:
        text: Raw overview text; may be empty or None.

    Returns:
        The cleaned text, or "" when `text` is falsy.
    """
    if not text:
        return ""

    # Replace bullets first: each one becomes a space, so whitespace must be
    # collapsed and trimmed afterwards or a leading/trailing bullet would
    # leave a stray space behind.
    text = re.sub(r"[•·]+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()

    return text

In [None]:
# Visit every collected book page and fill in the `overview` and `level` fields of
# its record in `all_records`.
OVERVIEW_SELECTOR = '[data-testid="main-content"] [data-testid="titleDescription"]'
STATS_SELECTOR = '[data-testid="header-content"] [data-testid="header-stats"]'

for record in tqdm(all_records, desc="Scraping Records"):
    driver.get(record['url'])
    time.sleep(1)

    soup = BeautifulSoup(driver.page_source, "html.parser")

    # --- Overview: paragraph texts first, then list items prefixed with a bullet ---
    overview = []
    overview_container = soup.select_one(OVERVIEW_SELECTOR)

    if overview_container is not None:
        for p in overview_container.find_all("p"):
            text = p.get_text().strip()
            if text:
                overview.append(text)

        for li in overview_container.find_all("li"):
            # Test the item text itself: once the bullet prefix is added the string
            # is never empty, so the check has to happen first.
            text = li.get_text().strip()
            if text:
                overview.append("• " + text)

    # Computed for every record, also when the container is missing, so a page
    # without a description yields "" instead of reusing the previous book's text.
    full_overview = clean_overview(" ".join(overview))

    # --- Level: text of the first <span> inside the second <div> of the stats header ---
    level = ''
    level_container = soup.select_one(STATS_SELECTOR)
    if level_container is not None:
        stat_divs = level_container.find_all("div")
        if len(stat_divs) > 1:
            level_span = stat_divs[1].find("span")
            if level_span is not None:
                level = level_span.get_text(strip=True)

    record['overview'] = full_overview
    record['level'] = level

Scraping Records: 100%|██████████| 13/13 [00:31<00:00,  2.41s/it]


In [None]:
# Export the scraped records to CSV: a header row followed by one row per book.
OUTPUT_PATH = 'data/oreilly.csv'
CSV_FIELDS = ['title', 'level', 'overview', 'url']

# open() does not create missing directories, so make sure the target folder exists.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

with open(OUTPUT_PATH, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)

    # Header row
    writer.writerow(CSV_FIELDS)

    # Data rows, with columns in the same order as the header
    writer.writerows(
        [record[field] for field in CSV_FIELDS] for record in all_records
    )