This short project is an example of web scraping.
It scrapes https://elibrary.judiciary.gov.ph to retrieve the jurisprudence posted on the site.

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import json
import random

# Base URL of the e-library site; this is the page handed to
# extract_years_months to discover the per-month listing links
elibrary_url = 'https://elibrary.judiciary.gov.ph/'

# Regular expression patterns applied to link hrefs by extract_years_months:
# - year_pattern captures a run of four digits enclosed by slashes
# - month_pattern matches a three-letter English month abbreviation
#   (the caller searches with re.IGNORECASE)
year_pattern = r'/(\d{4})/'
month_pattern = r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)'


def extract_years_months(url, timeout=30):
    """
    Extracts all links with available data

    Fetches `url`, locates the ``<div id="container_date">`` element and
    collects every anchor inside it that carries an ``href``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``(href, year, month)`` tuples. ``year`` is the four-digit
        string captured by ``year_pattern`` and ``month`` is the text matched
        by ``month_pattern`` (case-insensitive search); either one is ``None``
        when the href does not contain it. The list is empty when the page
        has no ``container_date`` element.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Without a timeout a stalled connection would block the run indefinitely
    response = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of parsing an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    container = soup.find('div', id='container_date')
    if container is None:
        # No date container on the page: there are no links to report
        return []

    # Extract year and month, running each regex only once per link
    year_months_list = []
    for a in container.find_all('a'):
        if 'href' not in a.attrs:
            continue
        href = a['href']
        year_match = re.search(year_pattern, href)
        month_match = re.search(month_pattern, href, re.IGNORECASE)
        year_months_list.append((
            href,
            year_match.group(1) if year_match else None,
            month_match.group(0) if month_match else None,
        ))
    return year_months_list


def extract_text_links(url, timeout=30):
    """
    Function to extract the text links listed on a given elibrary URL

    Fetches `url` and reads the anchors found inside the
    ``<div id="container_title">`` element.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``(href, anchor_text)`` tuples, one per anchor that has an
        ``href``. The list is empty when the page has no ``container_title``
        element.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Without a timeout a stalled connection would block the run indefinitely
    response = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of parsing an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    container = soup.find('div', id='container_title')
    if container is None:
        # No title container on the page: there are no links to report
        return []
    # Anchors without an href are skipped (same rule as extract_years_months)
    # so that a['href'] cannot raise KeyError
    return [
        (a['href'], a.text)
        for a in container.find_all('a')
        if 'href' in a.attrs
    ]

def extract_text_data(url, timeout=30):
    """
    Function to extract resolution text

    Fetches `url` and reads the text of the first element matching
    ``<div class="single_content">``.

    Args:
        url: Address of the page holding the text.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The container's text with its first 6 lines removed, or an empty
        string when the server answers with an error status or the page has
        no ``single_content`` element.
    """
    # Without a timeout a stalled connection would block the run indefinitely
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        # Keep the best-effort contract: an error page yields no text rather
        # than being parsed as if it were a decision
        return ""

    soup = BeautifulSoup(response.content, 'html.parser')
    container = soup.find('div', class_='single_content')
    if not container:
        return ""

    # Remove the first 6 lines (For printing purposes) and join the
    # remaining lines back into a single string
    lines = container.get_text().split('\n')
    return '\n'.join(lines[6:])


def save_text_to_files(text, url, file_base_name):
    """
    Write a text and the URL it came from to "<file_base_name>.json"

    The file holds one JSON object with the keys "text" and "url", encoded
    as UTF-8, indented by 4 spaces and with non-ASCII characters kept as-is.
    An existing file of the same name is overwritten.
    """
    target_path = "{}.json".format(file_base_name)
    payload = dict(text=text, url=url)
    with open(target_path, mode='w', encoding='utf-8') as handle:
        json.dump(payload, handle, indent=4, ensure_ascii=False)

In [2]:
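# DEPRECATED data extraction technique: an earlier version of the cell above
# that split each text into GPT-2 token-limited chunks ("_PartN" files) before
# saving. Kept commented out for reference only.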
# import requests
# from bs4 import BeautifulSoup
# import re
# import time
# from transformers import GPT2Tokenizer
# import json

# # Initialize the tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# # Base URL
# elibrary_url = 'https://elibrary.judiciary.gov.ph/'
# max_tokens = 7800  # Assuming a token limit, adjust based on actual requirements

# # Regular expression patterns
# year_pattern = r'/(\d{4})/'
# month_pattern = r'(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)'

# # Function to extract years and months
# def extract_years_months(url):
#     response = requests.get(url)
#     soup = BeautifulSoup(response.content, 'html.parser')
#     container = soup.find('div', id='container_date')
#     # Extract year and month
#     year_months_list = [
#         (
#             a['href'],
#             re.search(year_pattern, a['href']).group(1) if re.search(year_pattern, a['href']) else None,
#             re.search(month_pattern, a['href'], re.IGNORECASE).group(0) if re.search(month_pattern, a['href'], re.IGNORECASE) else None
#         )
#         for a in container.find_all('a') if 'href' in a.attrs
#     ]
#     return year_months_list

# # Function to extract decision links for a specific month
# def extract_decision_links(url):
#     response = requests.get(url)
#     soup = BeautifulSoup(response.content, 'html.parser')
#     container = soup.find('div', id='container_title')
#     decision_links = container.find_all('a')
#     # print("Count decisions: ", len(decision_links))
#     return [(a['href'], a.text) for a in decision_links]

# # Function to extract resolution text
# def extract_resolution_text(url):
#     response = requests.get(url)
#     soup = BeautifulSoup(response.content, 'html.parser')
#     container = soup.find('div', class_='single_content')
    
#     if container:
#         resolution_text = container.get_text()
        
#         # Remove the first 6 lines (For printing purposes)
#         lines = resolution_text.split('\n')
#         filtered_lines = lines[6:]

#         # Join the remaining lines back into a single string
#         filtered_text = '\n'.join(filtered_lines)
#         return filtered_text
#     else:
#         return ""

# # Function to save text to files
# def save_text_to_files(text, url, file_base_name):
#     tokens = tokenizer.encode(text)
#     part = 1
#     while tokens:
#         chunk = tokens[:max_tokens]
#         tokens = tokens[max_tokens:]
#         chunk_text = tokenizer.decode(chunk)
#         file_name = f"{file_base_name}_Part{part}.json"
#         data = {
#             'text': chunk_text,
#             'url': url
#         }
#         with open(file_name, 'w', encoding='utf-8') as output_file:
#             json.dump(data, output_file, ensure_ascii=False, indent=4)
#         part += 1


In [3]:
# Collect the (href, year, month) tuples linked from the base URL
year_month_urls = extract_years_months(elibrary_url)
# Last expression of the cell: shows how many links were found
len(year_month_urls)

333

In [4]:
def sanitize_filename(filename):
    """
    Return `filename` with every character that is invalid in a file name
    swapped for an underscore.

    The replaced characters are: less-than, greater-than, colon, double
    quote, forward slash, backslash, pipe, question mark and asterisk.
    """
    forbidden_chars = re.compile(r'[<>:"/\\|?*]')
    return forbidden_chars.sub('_', filename)

In [7]:
# Starting point for scraping (adjust as needed).
# The scraping loop skips every year later than start_year and, within
# start_year itself, every month that comes before start_month.
start_year = 2015
start_month = 'Jan'
# Month abbreviations in calendar order; their list positions are compared
# to decide whether a month comes before start_month
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

In [8]:
# Scrape a random sample of text data for each outer link
# year_month_urls is the outer URL list: (href, year, month) tuples
samples_per_month = 1  # how many randomly chosen texts are saved per month

for ym_link, year, month in year_month_urls:
    # A link whose href carries no recognisable year or month cannot be
    # compared with the starting point, so skip it instead of crashing
    if year is None or month is None:
        continue
    # The month is matched case-insensitively upstream; normalise it before
    # looking it up in the calendar-ordered list
    month_key = month.capitalize()
    if month_key not in months:
        continue

    # Check if we need to skip this year and month
    if int(year) > start_year or (
        int(year) == start_year and months.index(month_key) < months.index(start_month)
    ):
        continue
    print(f"---------- {year}-{month} ----------")
    text_links = extract_text_links(ym_link)
    print("Count text links: ", len(text_links))
    # Iterate through each inner link to get the text
    # text_links is the inner URL

    # Shuffle and keep the first `samples_per_month` text links, if available
    random.shuffle(text_links)
    selected_text_links = text_links[:samples_per_month]

    for link_to_text, anchor in selected_text_links:
        # The title is the first line of the anchor text. Deriving it on
        # every iteration (even when the anchor has no newline) prevents a
        # NameError and stops a previous link's title from being reused,
        # which would overwrite that link's file.
        title = sanitize_filename(anchor.split('\n', 1)[0])
        print(title)

        text_data = extract_text_data(link_to_text)
        if text_data:
            # Cap the title length to keep the file name reasonably short
            case_number = title[:100]
            file_base_name = f'{year}_{month}_{case_number}'
            save_text_to_files(text_data, link_to_text, file_base_name)

---------- 2015-Jan ----------
Count text links:  95
G.R. No. 156995
---------- 2015-Feb ----------
Count text links:  105
G.R. No. 171672
---------- 2015-Mar ----------
Count text links:  94
G.R. No. 179047
---------- 2015-Apr ----------
Count text links:  70
G.R. No. 171601
---------- 2015-Jun ----------
Count text links:  109
A.C. No. 10138 (Formerly CBD Case No. 06-1876)
---------- 2015-Jul ----------
Count text links:  138
G.R. No. 205228
---------- 2015-Aug ----------
Count text links:  101
A.C. No. 6738
---------- 2015-Sep ----------
Count text links:  98
G.R. No. 205379
---------- 2015-Oct ----------
Count text links:  57
G.R. No. 181284
---------- 2015-Nov ----------
Count text links:  61
A.C. No. 8507
---------- 2015-Dec ----------
Count text links:  61
G.R. No. 159979
---------- 2014-Jan ----------
Count text links:  79
A.C. No. 8644 [Formerly CBD Case No. 11-2908]
---------- 2014-Feb ----------
Count text links:  75
G.R. No. 179625
---------- 2014-Mar ----------
Count text 