In [22]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

import time
import os
import json
from tqdm.auto import tqdm
import logging

from langchain_community.document_loaders import PyPDFLoader

## Step 1. Extract all sublinks

URLs and PDF files  

In [23]:
# Global Parameters 
faqs_url = 'https://philnat.unibas.ch/de/faqs/' # ground truth dataset

main_urls = ['https://philnat.unibas.ch/de/studium/'
            'https://philnat.unibas.ch/de/studium/bachelor/',
            'https://philnat.unibas.ch/de/studium/master/',
            'https://philnat.unibas.ch/de/studium/ausserfakultaere-studienfaecher/',
            'https://philnat.unibas.ch/de/examen/',
            'https://tales.nmc.unibas.ch/en/von-adam-bis-zoom-25/',
            'https://philnat.unibas.ch/de/forschung',
            'https://philnat.unibas.ch/de/forschung/habilitation/'
            'https://philnat.unibas.ch/de/forschung/promotionphd/',
            'https://philnat.unibas.ch/de/forschung/promotionphd/immatrikulation-ab-hs-2016-registered-fall-semester-2016-or-later/',
            'https://philnat.unibas.ch/de/forschung/promotionphd/immatrikulation-bis-und-mit-fs-2016-registered-before-fall-semester-2016/',
            'https://philnat.unibas.ch/de/forschung/promotionphd/doktoratsprogramme/',
            'https://philnat.unibas.ch/de/forschung/qualitaet/',
            'https://philnat.unibas.ch/de/forschung/tenure-verfahren/',
            'https://philnat.unibas.ch/de/forschung/titularprofessuren/',
            'https://philnat.unibas.ch/de/termine-aktuelles/',
            'https://philnat.unibas.ch/de/studium/bachelorfeier/',
            'https://www.unibas.ch/de/Studium/Studierendenordnung.html',
            'https://www.unibas.ch/de/Studium/Im-Studium/Datenabschrift.html',]


limit = 300 # how many PDFs and HTML pages will be processed; in total 2*limit

filename = 'faculty_of_science' # resulting json file 

bot_name = 'Unibasel_RAG_bot/1.0'

directory_path = '../../../../data'   # for PDFs 

In [24]:
def save_to_json(data, filename):
    with open(filename, 'w') as f:
        json.dump(data, f, indent=4)


In [25]:
# only for depth = 1

def extract_sublinks(url):
    sublinks = set()
    pdfs = set()
    try:
        headers = {'User-Agent': bot_name}
        response = requests.get(url, headers=headers)
    
        soup = BeautifulSoup(response.text, 'html.parser')
        
        base_url = f"{urlparse(url).scheme}://{urlparse(url).netloc}"
        
        a_tags = soup.find_all('a', href=True)
        for a_tag in tqdm(a_tags, desc='Processing links', unit='link'):
            link = a_tag['href']
            if link.startswith('http://') or link.startswith('https://'):
                full_url = link
            else:
                full_url = urljoin(base_url, link)
            
            if full_url.endswith('.pdf'):
                pdfs.add(full_url)
            else:
                sublinks.add(full_url)

            time.sleep(0.5)
                
    except Exception as e:
        print(f"Error: {e}")
    
    return sublinks, pdfs

In [26]:
# Example

url = "https://philnat.unibas.ch/de/studium/"

sublinks, pdfs = extract_sublinks(url)

print(f"Total sublinks found: {len(sublinks)}")
print(f"Total PDFs found: {len(pdfs)}")

Processing links:   0%|          | 0/70 [00:00<?, ?link/s]

Total sublinks found: 45
Total PDFs found: 14


In [27]:
data = {
    "main_url": url,
    "sublinks": list(sublinks)
}

# Save to JSON file
save_to_json(data, "sublinks.json")

print(f"Extracted {len(sublinks)} sublinks and saved them to sublinks.json")
print(f"Found {len(pdfs)} PDF documents.")

Extracted 45 sublinks and saved them to sublinks.json
Found 14 PDF documents.


In [28]:
def get_all_links_and_pdfs(urls):

    links = set()
    pdfs = set()

    for url in urls:

        sublinks, subpdfs = extract_sublinks(url)
        links.update(sublinks)
        pdfs.update(subpdfs)

    return list(links), list(pdfs)

links, pdfs = get_all_links_and_pdfs(main_urls)

Processing links:   0%|          | 0/36 [00:00<?, ?link/s]

Processing links:   0%|          | 0/130 [00:00<?, ?link/s]

Processing links:   0%|          | 0/70 [00:00<?, ?link/s]

Processing links:   0%|          | 0/54 [00:00<?, ?link/s]

Processing links:   0%|          | 0/1 [00:00<?, ?link/s]

Processing links:   0%|          | 0/50 [00:00<?, ?link/s]

Processing links:   0%|          | 0/36 [00:00<?, ?link/s]

Processing links:   0%|          | 0/66 [00:00<?, ?link/s]

Processing links:   0%|          | 0/57 [00:00<?, ?link/s]

Processing links:   0%|          | 0/48 [00:00<?, ?link/s]

Processing links:   0%|          | 0/39 [00:00<?, ?link/s]

Processing links:   0%|          | 0/39 [00:00<?, ?link/s]

Processing links:   0%|          | 0/40 [00:00<?, ?link/s]

Processing links:   0%|          | 0/41 [00:00<?, ?link/s]

Processing links:   0%|          | 0/40 [00:00<?, ?link/s]

Processing links:   0%|          | 0/129 [00:00<?, ?link/s]

Processing links:   0%|          | 0/142 [00:00<?, ?link/s]

In [29]:
print(f"Extracted {len(links)} sublinks")
print(f"Found {len(pdfs)} PDF documents.")

Extracted 251 sublinks
Found 121 PDF documents.


## Step 2. Count characters

In [30]:
def count_characters_on_webpage(url):
 
    headers = {'User-Agent': bot_name}
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract all text from the webpage
        text = soup.get_text()
        
        # Count the characters
        character_count = len(text)
        print(f"The URL contains {character_count} characters.")
        
        return character_count
    else:
        print(f"Failed to fetch the webpage. Status code: {response.status_code}")
        return None

count_characters_on_webpage(url)

The URL contains 8810 characters.


8810

In [31]:
# Load the PDF document
file_path = 'https://www.unibas.ch/dam/jcr:afdc85dc-b51c-4b7c-bdc4-9e301871d6e0/446_710_10.pdf'

def count_characters_in_pdf(file_path, include_whitespace=True, log_level=logging.ERROR):
    logging.basicConfig(level=log_level)
    total_characters = 0
    try:
        loader = PyPDFLoader(file_path) #PdfReader(file)
        # Load and split the document into pages
        pages = loader.load_and_split()
        
        # Iterate over each page and count characters
        for page in pages:
            text = page.page_content
        
            total_characters += len(text)
            logging.info(f"Processed page. Current total: {total_characters}")

        print(f"The PDF contains {total_characters} characters.")
        return total_characters
    
    except Exception as e:
        logging.error(f"Error processing PDF: {e}")
        return 0

count_1_pdf = count_characters_in_pdf(file_path)

The PDF contains 24009 characters.


## Step 3. Check  

In [32]:
def is_pdf_downloadable(url):
    try:
        headers = {'User-Agent': bot_name}
        response = requests.get(url, headers=headers)
        
        # Send a HEAD request
        #response = requests.head(url, headers=headers, allow_redirects=True, timeout=5)
        
        # Check if the request was successful
        if response.status_code == 200:
            # Check the Content-Type header
            content_type = response.headers.get('Content-Type', '').lower()
            if 'application/pdf' in content_type:
                # Check if Content-Length header is present
                if 'Content-Length' in response.headers:
                    return True
                else:
                    print("Warning: Content-Length header is missing")
                    return True
            else:
                print(f"Not a PDF file. Content-Type: {content_type}")
                return False
        else:
            print(f"Failed to access the URL. Status code: {response.status_code}")
            return False
    
    except requests.RequestException as e:
        print(f"Error occurred: {e}")
        return False

In [33]:
is_pdf_downloadable('http://informatik.unibas.ch/fileadmin/downloads/BSc-MSc_Computer_Science_genehmigt_philnat_20160524.pdf')

Not a PDF file. Content-Type: text/html; charset=utf-8


False

In [34]:
def find_all_pdf_files_in_web(pdfs_list):
  
    total_vol = 0
    pdfs_checked = []
    
    for pdf in pdfs_list:

        # Fetch the webpage content
        if is_pdf_downloadable(pdf):

            total_doc = 0

            print(f"Name: {pdf}")

            total_doc = count_characters_in_pdf(pdf)
    
            total_vol += total_doc
            
            pdfs_checked.append(pdf)
                
            print("------------------------------------------------")
        else:
            print(f"Name: {pdf}")
            print(f"---> Failed to fetch the webpage.")
            print("------------------------------------------------")

        time.sleep(1)

    print(f"The database contains {total_vol} characters.")

    return total_vol, pdfs_checked
    
count_web_pdfs, pdfs_checked = find_all_pdf_files_in_web(pdfs)


Name: https://philnat.unibas.ch/fileadmin/user_upload/philnat/2_Studium_QP/MSc_Epidemiology.pdf
The PDF contains 5616 characters.
------------------------------------------------
Name: https://philnat.unibas.ch/fileadmin/user_upload/philnat/3_Forschung/Formular_Betreuungszusage_2.pdf
The PDF contains 1807 characters.
------------------------------------------------
Name: https://philnat.unibas.ch/fileadmin/user_upload/philnat/3_Forschung/Formular_Abbruch_der_Doktoratsausbildung_April_2024.pdf
The PDF contains 1262 characters.
------------------------------------------------
Name: https://www.unibas.ch/dam/jcr:2eb607be-80ab-42c6-876d-fb1acac4d6b9/DA.pdf
The PDF contains 1663 characters.
------------------------------------------------
Name: https://www.unibas.ch/dam/jcr:81f378a3-1625-4df8-883c-0162768e031f/Anhang%2012_MSG_Oekologie_00.pdf
The PDF contains 2600 characters.
------------------------------------------------
Name: https://philnat.unibas.ch/fileadmin/user_upload/philnat/3_For

In [35]:
print("Total found PDF files:", len(pdfs))
print("Downloadable PDF files:", len(pdfs_checked))

Total found PDF files: 121
Downloadable PDF files: 114


In [36]:
def check_url(url, timeout=1):
    try:
        # Ensure the URL has a scheme
        if not urlparse(url).scheme:
            url = 'http://' + url

        # Send a HEAD request
        headers = {'User-Agent': bot_name}
        response = requests.head(url, timeout=timeout, headers = headers, allow_redirects=True)
        
        # Check if the status code indicates success (2xx)
        if 200 <= response.status_code < 300:
            return True, response.status_code, "URL is accessible"
        else:
            return False, response.status_code, f"URL returned status code {response.status_code}"

    except requests.ConnectionError:
        return False, None, "Failed to establish a connection"
    except requests.Timeout:
        return False, None, "Request timed out"
    except requests.TooManyRedirects:
        return False, None, "Too many redirects"
    except requests.RequestException as e:
        return False, None, f"An error occurred: {str(e)}"

## Step 4. Select 

In [37]:
def filter_url(url):
    return not (url.endswith("faqs/") or url.endswith("FAQ-Studium.html"))

def select_links(urls, limit):
    not_selected = []
    selected = []

    for url in urls:
        if check_url(url)[0] and filter_url(url):
            if len(selected) < limit:
                selected.append(url)
            else:
                not_selected.append(url)
        else:
            not_selected.append(url)

        if len(selected) == limit:
            break

    return selected, not_selected 
            
selected_urls, not_selected_urls  = select_links(links, limit)

print("Included: ", selected_urls)
print("Excluded:", not_selected_urls)

Included:  ['https://philnat.unibas.ch/de/studium/', 'https://dmi.unibas.ch/de/studium/computer-science-informatik/organisatorisches/computer-science-als-zweitfach/', 'https://msd.unibas.ch/en/home/', 'http://intranet.unibas.ch', 'https://dmi.unibas.ch/de/studium/actuarial-science/dokumente-links/', 'https://phdschoolqcqt.unibas.ch/en/', 'http://ub.unibas.ch', 'https://chemie.unibas.ch/de/studium/bachelor/', 'https://philnat.unibas.ch/de/forschung/tenure-verfahren/', 'https://evasys.unibas.ch/evasys/online.php?p=UEGED', 'https://philnat.unibas.ch/de/forschung/qualitaet/', 'https://philnat.unibas.ch/de/studium/bachelorfeier/', 'https://www.bio.unibas.ch/de/studium/msc-plant-science/', 'https://www.biozentrum.unibas.ch/education/degree-programms/msc-in-physics-of-life', 'https://philnat.unibas.ch/de/studium/master/', 'https://www.enable-javascript.com/de/', 'https://dmi.unibas.ch/de/studium/mathematik/mathematik-als-zweitfach/', 'https://pharma.unibas.ch/de/education/', 'https://personen

In [38]:

def select_pdfs(pdfs, limit):

    not_selected = []
    selected = []

    for pdf in pdfs:

        if len(selected) < limit:
            selected.append(pdf)
        else:
            not_selected.append(pdf)

        if len(selected) == limit:
            break

    return selected, not_selected
            
selected_pdfs, not_selected_pdfs  = select_pdfs(pdfs_checked, limit)
print("Included: ", selected_pdfs)
print("Excluded:", not_selected_pdfs)

Included:  ['https://philnat.unibas.ch/fileadmin/user_upload/philnat/2_Studium_QP/MSc_Epidemiology.pdf', 'https://philnat.unibas.ch/fileadmin/user_upload/philnat/3_Forschung/Formular_Betreuungszusage_2.pdf', 'https://philnat.unibas.ch/fileadmin/user_upload/philnat/3_Forschung/Formular_Abbruch_der_Doktoratsausbildung_April_2024.pdf', 'https://www.unibas.ch/dam/jcr:2eb607be-80ab-42c6-876d-fb1acac4d6b9/DA.pdf', 'https://www.unibas.ch/dam/jcr:81f378a3-1625-4df8-883c-0162768e031f/Anhang%2012_MSG_Oekologie_00.pdf', 'https://philnat.unibas.ch/fileadmin/user_upload/philnat/3_Forschung/PhD_application_March_2024.pdf', 'https://www.unibas.ch/dam/jcr:4c4e2ac4-a0a3-44a6-9b28-603876c2f06f/MSG_Data%20Science_00.pdf', 'https://www.unibas.ch/dam/jcr:c67b41e1-b339-4404-91ae-7a60a66a7a28/446_710_11.pdf', 'https://philnat.unibas.ch/fileadmin/user_upload/philnat/3_Forschung/Doktoratsstudium_-_Leitfaden_Version_06.2023_update.pdf', 'https://philnat.unibas.ch/fileadmin/user_upload/philnat/2_Studium_QP/MSc_C

## Step 5. Save 

In [39]:
def download_pdf(url, directory):

    headers = {'User-Agent': bot_name}   
    response = requests.get(url, headers=headers, stream=True)

    if response.status_code == 200:
        # Get the filename from the URL
        filename = os.path.basename(urlparse(url).path)
        
        # Create the full path for saving the file
        filepath = os.path.join(directory, filename)
        
        # Open the file in write-binary mode
        with open(filepath, 'wb') as file:
            # Write the content to the file
            file.write(response.content)

        time.sleep(1)
        print(f"Downloaded: {filename}")
        total_doc = count_characters_in_pdf(url)
        print("------------------------------------------------")
        return True
    else:
        print(f"Failed to download: {url}")
        return False

def download_pdfs(urls, directory):
    
    # Create the directory if it doesn't exist
    # os.makedirs(directory, exist_ok=True)
    
    for url in urls:
        if url.lower().endswith('.pdf'):
            download_pdf(url, directory)
        else:
            print(f"Skipped: {url} (not a PDF)")
            

In [40]:
# comment out to download docs into /data folder  
# download_pdfs(selected_pdfs, directory_path)

In [41]:
def find_all_pdf_files_in_dir(directory_from):
    pdf_files = []
    total_vol = 0
    
    for root, dirs, files in os.walk(directory_from):
        for file in files:
            if file.endswith('.pdf'):
                path_to_file = os.path.join(directory_from, file)
                pdf_files.append(path_to_file)

                total_characters = 0

                total_characters = count_characters_in_pdf(path_to_file)

                total_vol += total_characters
                    
                print(f"Name: {file}")
                print("------------------------------------------------")

    print(f"The database contains {total_vol} characters.")

    return pdf_files

def count_urls_size(urls):
    total = 0
    
    for url in urls:
        total += count_characters_on_webpage(url)

        time.sleep(0.5)
        print(f"Name: {url}")
        print("------------------------------------------------")

    print(f"The database contains {total} characters.")

    return total
    

In [42]:
# comment out to check saved docs
# pdf_files_list = find_all_pdf_files_in_dir(directory_path)

In [43]:
count_urls_size(selected_urls) 

The URL contains 8810 characters.
Name: https://philnat.unibas.ch/de/studium/
------------------------------------------------
The URL contains 2942 characters.
Name: https://dmi.unibas.ch/de/studium/computer-science-informatik/organisatorisches/computer-science-als-zweitfach/
------------------------------------------------
The URL contains 3174 characters.
Name: https://msd.unibas.ch/en/home/
------------------------------------------------
The URL contains 2127 characters.
Name: http://intranet.unibas.ch
------------------------------------------------
The URL contains 8572 characters.
Name: https://dmi.unibas.ch/de/studium/actuarial-science/dokumente-links/
------------------------------------------------
The URL contains 2900 characters.
Name: https://phdschoolqcqt.unibas.ch/en/
------------------------------------------------
The URL contains 5693 characters.
Name: http://ub.unibas.ch
------------------------------------------------
The URL contains 6819 characters.
Name: https:/

656939

### Evaluate the number of chunks 

The exact number of chunks can be found in MongoDB Compass after ingestion.

In [44]:
import math
# chunk_size < N
N = 656939  + 780625 

chunk_size = 3000
overlap_char = 600 
# 10-20% is usually recommended
overlap = overlap_char/chunk_size
 
rag_input_char = N/(1-overlap)
print(" Input size:", N, "\n Expected RAG size:", round(rag_input_char))
print(" Number of chunks:", math.ceil(rag_input_char/chunk_size))

 Input size: 1437564 
 Expected RAG size: 1796955
 Number of chunks: 599


In [45]:
# Create a dictionary with URLs
# PDF files have been already saved and will be processed later in get_config.py
data = {
        "urls": selected_urls,
        "pdfs": selected_pdfs

}

# Save to JSON file
save_to_json(data, f"../{filename}_links.json")

print(f"Extracted {len(selected_urls)} sublinks and saved them to {filename}_links.json")
print(f"Saved {len(selected_pdfs)} PDF documents.")


Extracted 119 sublinks and saved them to faculty_of_science_links.json
Saved 114 PDF documents.


---
* Author: Anastasiia Popova
* Email: anastasiia.popova@stud.unibas.ch

[Perplexity AI](https://www.perplexity.ai/) assisted in code writing, editing, and more effective information searches. The generated output underwent critical evaluation. The author is solely responsible for the content.
