#### Import Libraries

In [4]:
import requests
from bs4 import BeautifulSoup
import json
import os

#### Define URLs to scrape

In [5]:
# Common prefix shared by every page listed below.
_SITE_ROOT = 'https://www.mobil.com/en/sap/'

# Topic name -> page paths (relative to _SITE_ROOT), in scrape order.
_PATHS_BY_TOPIC = {
    "mobil-1": [
        'our-products/mobil-1',
        'our-products/products/mobil-1-esp-5w30',
        'our-products/products/mobil-1-racing-4t-10w40',
        'our-products/products/mobil-1-esp-0w-30',
        'personal-vehicles/car/recommended-for-your-car',
    ],
    "mobil-super-product": [
        'our-products/mobil-super',
    ],
    "mobil-super-moto-products": [
        'our-products/mobil-super-moto',
        'our-products/products/mobil-super-moto-scooter-gear-oil',
        'our-products/products/mobil-super-moto-20w50',
        'our-products/products/mobil-super-moto-20w40',
        'our-products/products/mobil-super-moto-15w40',
        'our-products/products/mobil-super-moto-10w-40',
        'our-products/products/mobil-super-moto-10w30',
        'our-products/products/mobil-super-moto-scooter-10w30',
        'our-products/products/mobil-super-moto-scooter-10w-40',
        'personal-vehicles/bike-and-scooter/tips-to-change-motorbike-engine-oil',
        'personal-vehicles/bike-and-scooter/changing-motorbike-engine-oil',
        'personal-vehicles/bike-and-scooter/benchmarks-to-change-motorcycle-engine-oils',
    ],
    "mobil-delvac-product": [
        'our-products/mobil-delvac',
        'our-products/why-mobil-delvac',
    ],
    "why-mobil": [
        'our-products/why-mobil',
        'our-products/why-mobil/our-purpose-advancing-productivity',
        'our-products/why-mobil/driving-performance-and-protection',
    ],
}

# List of {"name": <topic>, "urls": [<absolute URL>, ...]} dicts, one per topic.
# Dict insertion order is preserved, so groups appear in the order written above.
url_groups = [
    {"name": topic, "urls": [_SITE_ROOT + path for path in paths]}
    for topic, paths in _PATHS_BY_TOPIC.items()
]

#### Define reusable functions for processing text

In [6]:
# Create the directory used to save JSON files
def make_dir(output_dir):
    """Ensure that ``output_dir`` exists, creating missing parents as needed.

    Calling this on a directory that already exists is a no-op.

    Args:
        output_dir: Path of the directory to create.

    Raises:
        FileExistsError: If ``output_dir`` exists but is not a directory.
        OSError: If the directory cannot be created (e.g. permissions).
    """
    # exist_ok=True avoids the check-then-create race of a separate
    # os.path.exists() test, and still fails loudly when the path is
    # occupied by a regular file instead of silently doing nothing.
    os.makedirs(output_dir, exist_ok=True)

# Function to chunk text into segments of up to `chunk_size` words (default 1000)
def chunk_text(text, chunk_size=1000):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.

    Words are whatever ``str.split()`` yields (runs of non-whitespace), and
    each chunk is those words re-joined with single spaces, so the original
    whitespace is normalised.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk; must be >= 1.

    Returns:
        A list of chunk strings in original order. Empty or whitespace-only
        input yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is less than 1.
    """
    # A negative step would make range() empty and silently drop all text;
    # reject it up front (zero already raised ValueError from range()).
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    words = text.split()
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

# Function to extract text from <p>, <div> and <li> tags without duplication
def extract_text(soup):
    """Return the text found inside ``<p>``, ``<div>`` and ``<li>`` elements.

    Only the outermost matching elements are read. A nested ``<p>``/``<div>``/
    ``<li>`` already contributes its text to the ancestor that contains it, so
    collecting every matching tag independently would repeat the same text once
    per nesting level.

    Args:
        soup: A parsed BeautifulSoup document (or any object exposing a
            compatible ``find_all``).

    Returns:
        The collected text as a single space-joined string, in document order.
        Returns an empty string when no matching element exists.
    """
    container_tags = ['p', 'div', 'li']

    texts = []
    for tag in soup.find_all(container_tags):
        # Skip tags nested inside another container: the ancestor's text
        # already includes theirs.
        if tag.find_parent(container_tags) is not None:
            continue
        # A space separator keeps words from adjacent child elements apart
        # (e.g. "<p>a</p><p>b</p>" -> "a b" rather than "ab").
        texts.append(tag.get_text(separator=' '))
    return ' '.join(texts)

#### Scrape each URL and split its text into documents of up to 1000 words

In [7]:
# Directory that receives one JSON file per text chunk.
output_dir = 'processed'
make_dir(output_dir)

# Seconds to wait for the server before giving up on a request; without a
# timeout, requests.get() can block forever on an unresponsive host.
REQUEST_TIMEOUT = 30

# Process each URL group
for group in url_groups:
    group_name = group['name']
    urls = group['urls']

    for i, url in enumerate(urls):
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
            # Treat 4xx/5xx as failures so error pages are never chunked and
            # saved as if they were real content.
            response.raise_for_status()
        except requests.RequestException as exc:
            # Report and move on: one bad URL should not abort the whole run.
            print(f"Skipped {group_name} URL {i+1}/{len(urls)}: {url} ({exc})")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        text = extract_text(soup)

        # Chunk the text
        chunks = chunk_text(text)

        # Save each chunk to its own JSON file, named by group, URL index and
        # chunk index (all 1-based).
        for j, chunk in enumerate(chunks):
            file_name = os.path.join(output_dir, f"{group_name}_url_{i+1}_chunk_{j+1}.json")
            with open(file_name, 'w', encoding='utf-8') as f:
                json.dump({"topic": group_name, "source": url, "content": chunk}, f, indent=4)

        print(f"Processed {group_name} URL {i+1}/{len(urls)}: {url}")

Processed mobil-1 URL 1/5: https://www.mobil.com/en/sap/our-products/mobil-1
Processed mobil-1 URL 2/5: https://www.mobil.com/en/sap/our-products/products/mobil-1-esp-5w30
Processed mobil-1 URL 3/5: https://www.mobil.com/en/sap/our-products/products/mobil-1-racing-4t-10w40
Processed mobil-1 URL 4/5: https://www.mobil.com/en/sap/our-products/products/mobil-1-esp-0w-30
Processed mobil-1 URL 5/5: https://www.mobil.com/en/sap/personal-vehicles/car/recommended-for-your-car
Processed mobil-super-product URL 1/1: https://www.mobil.com/en/sap/our-products/mobil-super
Processed mobil-super-moto-products URL 1/12: https://www.mobil.com/en/sap/our-products/mobil-super-moto
Processed mobil-super-moto-products URL 2/12: https://www.mobil.com/en/sap/our-products/products/mobil-super-moto-scooter-gear-oil
Processed mobil-super-moto-products URL 3/12: https://www.mobil.com/en/sap/our-products/products/mobil-super-moto-20w50
Processed mobil-super-moto-products URL 4/12: https://www.mobil.com/en/sap/our