In [8]:
import requests
import time
import json

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Root of the gb-kb.sage.com knowledge-base site.
BASE_URL = 'https://gb-kb.sage.com'

# Search-results URL template. The `{}` placeholder receives the page number
# (see the `kbpage` query parameter); results are sorted by view count.
SEARCH_URL = (
    BASE_URL
    + '/portal/ss/'
    '?searchaliases=custom_gb_en_erponethousandlinefivehundred'
    '&kbpage={}'
    '&kbsort=viewcount'
    '&tabid=2'
)

# Request headers sent when fetching individual document pages.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (compatible; SageScraper/1.0)',
}

def collect_links():
    """Collect document links from the paginated search results.

    Drives a headless Chrome session through ``SEARCH_URL``, starting at
    page 1 and advancing one page at a time, and gathers the ``href`` of
    every result title link. Paging stops at the first page on which no
    result links appear within 10 seconds, or on the first WebDriver error
    raised while reading a page's results.

    Returns:
        list: The unique hrefs collected, in arbitrary order.
    """
    # Imported here so this function does not rely on an extra file-level import.
    from selenium.common.exceptions import TimeoutException, WebDriverException

    result_selector = 'ul.solution-list li.view-link a.view-link__title'

    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(options=chrome_options)

    all_links = set()
    page = 1

    # try/finally guarantees the browser is shut down even when a page load
    # or any other call raises; otherwise the Chrome process would be leaked.
    try:
        while True:
            url = SEARCH_URL.format(page)
            print(f'Visiting search page {page}: {url}')
            driver.get(url)

            try:
                # Wait up to 10s for results to load. The wait returns the
                # located elements, which are never empty on success, so no
                # second lookup or emptiness check is needed.
                links = WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, result_selector))
                )
                hrefs = [link.get_attribute('href') for link in links]
            except TimeoutException:
                # No result links showed up in time: treat as the end of the results.
                print('No results found on this page, terminating.')
                break
            except WebDriverException as exc:
                # Keep what was collected so far, but say why paging stopped.
                print(f'WebDriver error on search page {page}, terminating: {exc}')
                break

            all_links.update(href for href in hrefs if href)

            page += 1
            time.sleep(1)  # polite delay
    finally:
        driver.quit()

    return list(all_links)


def _select_text(soup, selector, separator=''):
    """Return the stripped text of the first element matching *selector*, or ''."""
    node = soup.select_one(selector)
    return node.get_text(separator, strip=True) if node else ''


def parse_document(url, timeout=30):
    """Fetch a document page and extract its title, info, content and metadata.

    Args:
        url: Address of the document page to download.
        timeout: Seconds to wait for the server before giving up; passed to
            ``requests.get`` so a stalled connection cannot hang the run.

    Returns:
        dict: ``'url'`` (the input), ``'title'``, ``'info'`` and ``'content'``
        (text of the matching elements, ``''`` when an element is absent),
        and ``'metadata'`` (a dict built from the ``key: value`` lines found
        in the metadata aside, empty when there are none).

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response, so error pages are not stored as empty documents.
    """
    r = requests.get(url, headers=HEADERS, timeout=timeout)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    title = _select_text(soup, 'h1.solution-head__title')
    info = _select_text(soup, 'div.solution-head__info', ' ')
    content_text = _select_text(soup, 'article#solutionContent', ' ')

    metadata = {}
    aside = soup.select_one('aside#solution-metadata')
    if aside:
        for div in aside.select('div.solution-content__footer'):
            text = div.get_text(' ', strip=True)
            # Split on the first colon only, so values may contain colons.
            key, colon, value = text.partition(':')
            if colon:
                metadata[key.strip()] = value.strip()

    return {
        'url': url,
        'title': title,
        'info': info,
        'content': content_text,
        'metadata': metadata,
    }


def main():
    """Scrape the knowledge base and write the parsed documents to JSON.

    Collects links from the search results, adds a fixed list of extra
    document URLs, parses every unique link, and writes the parsed records
    to ``Data/sage_docs.json``. A link that fails to parse is reported and
    skipped rather than aborting the run.
    """
    # Imported here so this function does not rely on an extra file-level import.
    import os

    output_path = 'Data/sage_docs.json'
    # Create the output directory up front so a missing folder fails fast
    # instead of discarding the results after the whole scrape has finished.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    additional_links = [
        'https://gb-kb.sage.com/portal/app/portlets/results/view2.jsp?k2dockey=220110144821613',
        'https://gb-kb.sage.com/portal/app/portlets/results/view2.jsp?k2dockey=220215161758440',
        'https://gb-kb.sage.com/portal/app/portlets/results/view2.jsp?k2dockey=211116104421503',
        'https://gb-kb.sage.com/portal/app/portlets/results/view2.jsp?k2dockey=210609094109537',
        'https://gb-kb.sage.com/portal/app/portlets/results/view2.jsp?k2dockey=210922092537107',
        'https://gb-kb.sage.com/portal/app/portlets/results/view2.jsp?k2dockey=210712145551380',
        'https://gb-kb.sage.com/portal/app/portlets/results/view2.jsp?k2dockey=210420150731810',
        'https://gb-kb.sage.com/portal/app/portlets/results/view2.jsp?k2dockey=210407121531790',
        'https://gb-kb.sage.com/portal/app/portlets/results/view2.jsp?k2dockey=210225104150907',
        'https://gb-kb.sage.com/portal/app/portlets/results/viewsolution.jsp?solutionid=220107105739233',
    ]

    links = collect_links()
    print(f"Total links collected: {len(links)}")

    # De-duplicate, then sort so the output order is the same on every run.
    links = sorted(set(links) | set(additional_links))

    results = []
    for link in links:
        try:
            doc = parse_document(link)
        except Exception as e:
            # Best-effort: report the failure and carry on with the next link.
            print(f"Error parsing {link}: {e}")
        else:
            results.append(doc)
            print(f"Parsed: {doc['title']}")
        time.sleep(1)  # polite delay, applied after failures too

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


Visiting search page 1: https://gb-kb.sage.com/portal/ss/?searchaliases=custom_gb_en_erponethousandlinefivehundred&kbpage=1&kbsort=viewcount&tabid=2
Visiting search page 2: https://gb-kb.sage.com/portal/ss/?searchaliases=custom_gb_en_erponethousandlinefivehundred&kbpage=2&kbsort=viewcount&tabid=2
Visiting search page 3: https://gb-kb.sage.com/portal/ss/?searchaliases=custom_gb_en_erponethousandlinefivehundred&kbpage=3&kbsort=viewcount&tabid=2
Visiting search page 4: https://gb-kb.sage.com/portal/ss/?searchaliases=custom_gb_en_erponethousandlinefivehundred&kbpage=4&kbsort=viewcount&tabid=2
Visiting search page 5: https://gb-kb.sage.com/portal/ss/?searchaliases=custom_gb_en_erponethousandlinefivehundred&kbpage=5&kbsort=viewcount&tabid=2
Visiting search page 6: https://gb-kb.sage.com/portal/ss/?searchaliases=custom_gb_en_erponethousandlinefivehundred&kbpage=6&kbsort=viewcount&tabid=2
Visiting search page 7: https://gb-kb.sage.com/portal/ss/?searchaliases=custom_gb_en_erponethousandlinefiv