# Custom Crawler Tool
### This notebook uses Selenium to scrape the data behind each product link

In [None]:
# Install required packages
!pip install selenium
# Install Chrome/Chromium and ChromeDriver for the Linux environment
!apt update
!apt install chromium-chromedriver

Collecting selenium
  Downloading selenium-4.38.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio<1.0,>=0.31.0 (from selenium)
  Downloading trio-0.32.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket<1.0,>=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio<1.0,>=0.31.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket<1.0,>=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.38.0-py3-none-any.whl (9.7 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m9.7/9.7 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.32.0-py3-none-any.whl (512 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚î

## Scraping Links from Product Catalogue Page

In [None]:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import json
import time

# --- Setup ---
chrome_options = Options()
chrome_options.add_argument("--headless")  # run in background
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

service = Service()  # auto uses chromedriver if in PATH
driver = webdriver.Chrome(service=service, options=chrome_options)

base_url = "https://www.shl.com/products/product-catalog/?start={}&type=1&type=1"
# Values substituted into the `start` query parameter: 12, 24, ..., 372.
# NOTE(review): start=0 is not in this list, so the first catalogue page is
# never requested by this cell -- confirm it is collected elsewhere.
page_vars = list(range(12, 373, 12))
data = {}  # link text -> href; a repeated link text overwrites the earlier entry

try:
    for page_var in page_vars:
        url = base_url.format(page_var)
        driver.get(url)
        time.sleep(3)  # wait for page to load

        # Table rows 2..13 are probed on every page.
        for var in range(2, 14):
            xpath = f"/html/body/main/div[3]/div/div/div/div[2]/div/table/tbody/tr[{var}]/td[1]/a"
            try:
                element = driver.find_element(By.XPATH, xpath)
                link = element.get_attribute("href")
                text = element.text.strip()
            except NoSuchElementException:
                # Expected when the row does not exist on this page: skip quietly.
                continue
            except WebDriverException as exc:
                # Any other driver failure is still skipped (best effort), but it
                # is reported instead of being silently swallowed.
                print(f"Skipping row {var} on {url}: {exc}")
                continue

            if text and link:
                data[text] = link
                print(link)

finally:
    # Always release the browser, even if a page load raised.
    driver.quit()

# --- Save output to JSON file ---
output_file = "shl_product_links.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

print(f"✅ Data extraction completed. Saved to {output_file}")


https://www.shl.com/products/product-catalog/view/adobe-experience-manager-new/
https://www.shl.com/products/product-catalog/view/adobe-photoshop-cc/
https://www.shl.com/products/product-catalog/view/aeronautical-engineering-new/
https://www.shl.com/products/product-catalog/view/aerospace-engineering-new/
https://www.shl.com/products/product-catalog/view/agile-software-development/
https://www.shl.com/products/product-catalog/view/agile-testing-new/
https://www.shl.com/products/product-catalog/view/ai-skills/
https://www.shl.com/products/product-catalog/view/amazon-web-services-aws-development-new/
https://www.shl.com/products/product-catalog/view/android-development-new/
https://www.shl.com/products/product-catalog/view/angular-6-new/
https://www.shl.com/products/product-catalog/view/angularjs-new/
https://www.shl.com/products/product-catalog/view/apache-hadoop-new/
https://www.shl.com/products/product-catalog/view/apache-hadoop-extensions-new/
https://www.shl.com/products/product-cat

### Text preprocessing on Scraped Text

In [None]:
import re

def extract_duration(text: str) -> int:
    """
    Pull a duration (in minutes) out of free-form text.

    The value returned is the last run of digits found in `text`, so a range
    such as '25-35' yields its second number (35).

    Returns -1 when `text` is empty/falsy or contains no digits at all.
    """
    # Guard first: a falsy input is treated as "no digit runs found".
    digit_runs = re.findall(r'\d+', text) if text else []

    # The final digit run wins; this is what turns "25-35" into 35.
    return int(digit_runs[-1]) if digit_runs else -1

# Test cases: each sample text is paired (by position) with the duration that
# extract_duration is expected to return for it.
examples = [
    "Approximate Completion Time in minutes = 17",
    "Approximate Completion Time in minutes = 25-35",
    "Completion time: about 50 mins",
    "This test takes approximately half an hour",
    ""
]
expected_durations = [17, 35, 50, -1, -1]
assert len(examples) == len(expected_durations), "every example needs an expected value"

for e, expected in zip(examples, expected_durations):
    result = extract_duration(e)
    print(f"{e!r} → {result}")
    # Fail loudly instead of relying on someone eyeballing the printed output.
    assert result == expected, f"extract_duration({e!r}) returned {result}, expected {expected}"


'Approximate Completion Time in minutes = 17' → 17
'Approximate Completion Time in minutes = 25-35' → 35
'Completion time: about 50 mins' → 50
'This test takes approximately half an hour' → -1
'' → -1


## Scraping Data Related to Each Assessment

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import json, time


def _text_after_heading(soup, label):
    """Return the stripped text of the first <p> after the <h4> containing `label`.

    Returns None when no such <h4> exists or when no <p> follows it, so callers
    can tell "section missing" apart from "section present but empty".
    """
    heading = soup.find("h4", string=lambda s: s and label in s)
    if heading is None:
        return None
    paragraph = heading.find_next("p")
    if paragraph is None:
        return None
    return paragraph.get_text(strip=True)


def _split_comma_list(text):
    """Split comma-separated text into trimmed, non-empty items."""
    return [item.strip() for item in text.split(",") if item.strip()]


def _parse_assessment(combined_html, url, assessment_name):
    """Build the details dict for one assessment from its detail-panel HTML.

    Keys produced: url, name, description, joblevel, language, test_duration,
    test_type. Relies on `extract_duration`, defined in an earlier cell.
    """
    soup = BeautifulSoup(combined_html, "html.parser")
    details = {"url": url, "name": assessment_name}

    # --- 1. Description ---
    details["description"] = _text_after_heading(soup, "Description") or ""

    # --- 2. Job Levels ---
    job_text = _text_after_heading(soup, "Job level")
    details["joblevel"] = _split_comma_list(job_text) if job_text else []

    # --- 3. Language(s) ---
    lang_text = _text_after_heading(soup, "Language")
    if lang_text is not None:
        details["language"] = _split_comma_list(lang_text)
    else:
        # Fallback from the Downloads section. dict.fromkeys removes duplicates
        # while keeping first-seen order, so the saved JSON is deterministic.
        langs = [p.get_text(strip=True) for p in soup.select(".product-catalogue__download-language")]
        details["language"] = list(dict.fromkeys(langs))

    # --- 4. Test Duration ---
    duration_text = _text_after_heading(soup, "Assessment length")
    if duration_text is None:
        duration_text = ""
        if "minutes" in soup.get_text():
            # Fallback search: first paragraph that mentions "minute".
            for p in soup.find_all("p"):
                if "minute" in p.get_text():
                    duration_text = p.get_text(strip=True)
                    break
    details["test_duration"] = extract_duration(duration_text)

    # --- 5. Test Type ---
    details["test_type"] = [span.get_text(strip=True) for span in soup.select("span.product-catalogue__key")]

    return details


# ---------- Setup ----------
chrome_options = Options()
chrome_options.add_argument("--headless=new")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver = webdriver.Chrome(service=Service(), options=chrome_options)
wait = WebDriverWait(driver, 10)

# ---------- Load the product links ----------
# NOTE(review): the link-collection cell writes "shl_product_links.json"; no
# visible cell creates this "_start" file -- confirm where it comes from.
input_file = "shl_product_links_start.json"
with open(input_file, "r", encoding="utf-8") as f:
    assessments = json.load(f)

output_data = {}

# ---------- XPath Map ----------
combined_xpath = "/html/body/main/div[2]/div/div[2]/div[1]/div"
name_xpath = "/html/body/main/div[2]/div/div[1]/h1"  # not used in this cell

# ---------- Scrape each assessment ----------
try:
    for assessment_name, url in assessments.items():
        print(f"🔍 Scraping: {assessment_name} → {url}")

        # Navigation sits inside the handler so a single failing URL is skipped
        # like any other page that does not load, instead of ending the run.
        try:
            driver.get(url)
            container = wait.until(EC.presence_of_element_located((By.XPATH, combined_xpath)))
        except Exception as e:
            print(f"⚠️ Skipping {assessment_name}, page didn't load properly: {e}")
            continue

        try:
            combined_html = container.get_attribute("outerHTML")
        except Exception as e:
            print(f"⚠️ Could not get combined HTML: {e}")
            continue

        details = _parse_assessment(combined_html, url, assessment_name)
        output_data[assessment_name] = details

        print(f"✔️ Extracted: {assessment_name}")
        print(details)
        time.sleep(1.5)  # pause between requests
finally:
    # Always release the browser, even when the loop is interrupted by an error.
    driver.quit()

# ---------- Save to JSON ----------
output_file = "shl_assessment_extracted.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(output_data, f, indent=4, ensure_ascii=False)

print(f"\n✅ Extraction complete. Saved to {output_file}")


üîç Scraping: Global Skills Development Report ‚Üí https://www.shl.com/products/product-catalog/view/global-skills-development-report/
‚úîÔ∏è Extracted: Global Skills Development Report
{'url': 'https://www.shl.com/products/product-catalog/view/global-skills-development-report/', 'name': 'Global Skills Development Report', 'description': 'This report is designed to be given to individuals who have completed the Global Skills Assessment (GSA). With coverage across the Great 8 Domains, this measure of self-reported behaviors offers a complete overview of their current skills. Participants receive actionable tips on leveraging their top skill strengths and how they might develop their growth skills.', 'joblevel': ['Director', 'Entry-Level', 'Executive', 'General Population', 'Graduate', 'Manager', 'Mid-Professional', 'Front Line Manager', 'Supervisor'], 'language': ['English International', 'English (USA)'], 'test_duration': -1, 'test_type': ['A', 'E', 'B', 'C', 'D', 'P']}
üîç Scraping: