# Approach


#### I first tested Beautiful Soup for the extraction, but it didn't work, so I built a system that uses the Perplexity LLM API to extract structured data about saints. By crafting dynamic queries that include a worked example (a few-shot prompting approach), I guide the LLM to return each saint's information in a specific JSON format. After validating the response, I save the data to both JSON and CSV files for easy access and scalability.

In [None]:
!pip install --upgrade numpy pandas

Collecting numpy
  Using cached numpy-2.0.2-cp39-cp39-win_amd64.whl.metadata (59 kB)
Collecting pandas
  Downloading pandas-2.2.3-cp39-cp39-win_amd64.whl.metadata (19 kB)
Collecting python-dateutil>=2.8.2 (from pandas)
  Downloading python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Using cached numpy-2.0.2-cp39-cp39-win_amd64.whl (15.9 MB)
Downloading pandas-2.2.3-cp39-cp39-win_amd64.whl (11.6 MB)
   ---------------------------------------- 11.6/11.6 MB 38.2 MB/s eta 0:00:00
Downloading python_dateutil-2.9.0.post0-py2.py3-none-any.whl (229 kB)
Installing collected packages: python-dateutil, numpy, pandas
  Attempting uninstall: python-dateutil
    Found existing installation: python-dateutil 2.8.1
    Uninstalling python-dateutil-2.8.1:
      Successfully uninstalled python-dateutil-2.8.1
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'd:\\anaconda3\\envs\\env\\lib\\site-packages\\numpy\\linalg\\_umath_linalg.cp39-win_amd64.pyd'
Consider using the `--user` option or check the permissions.



# BeautifulSoup Approach

In [None]:
import requests
from bs4 import BeautifulSoup
import csv

saint_data = []

# Label text of the <td> cell that precedes each value on the page, listed in
# CSV column order (columns 1-14; column 0 is the name taken from the <h1>).
FIELD_LABELS = [
    'Birth Name:',
    'Birth Date:',
    'Birth Place:',
    'Death Date:',
    'Death Place:',
    'Feast Day:',
    'Virtues/Traits:',
    'Patron Of:',
    'Martyr?',
    'Priest/Religious?',
    'Religious Order:',
    'Major Events:',
    'Notable Contributions:',
    'Quotes:',
]

# Header row of the CSV: the name column followed by one column per label.
CSV_HEADER = ['Name', 'Birth Name', 'Birth Date', 'Birthplace', 'Death Date', 'Death Place', 'Feast Day',
              'Virtues/Traits', 'Patron Of', 'Martyr?', 'Priest/Religious?', 'Religious Order',
              'Major Events', 'Notable Contributions', 'Quotes']

# Without a timeout a stalled connection would hang the whole loop.
REQUEST_TIMEOUT_SECONDS = 10


def extract_field(soup, label):
    """Return the stripped text of the <td> that follows the <td> whose text is *label*.

    Returns None when the label cell (or a following <td>) is not present.
    """
    # `string=` is the supported spelling; the older `text=` keyword is deprecated.
    label_cell = soup.find('td', string=label)
    if label_cell is None:
        return None
    value_cell = label_cell.find_next('td')
    return value_cell.text.strip() if value_cell is not None else None


# Goes through saint IDs 1-100
for saint_id in range(1, 101):
    # Saint URL formula
    url = f'https://www.catholic.org/saints/saint.php?saint_id={saint_id}'

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as e:
        # One unreachable page should not abort the remaining IDs.
        print(f"Skipping saint ID {saint_id} due to request error: {e}")
        continue

    if response.status_code != 200:
        print(f"Skipping saint ID {saint_id} due to error {response.status_code}")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    try:
        # A page without an <h1> raises AttributeError here and is reported below.
        saint_info = [soup.find('h1').text.strip()]  # Name
        saint_info.extend(extract_field(soup, label) for label in FIELD_LABELS)

        # Append only if we have a name
        if saint_info[0]:
            saint_data.append(saint_info)
        else:
            print(f"Skipping saint ID {saint_id} because no name was found.")
    except Exception as e:
        print(f"Error processing saint ID {saint_id}: {e}")

# Write to our CSV
with open('catholic_saints.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Write the header
    writer.writerow(CSV_HEADER)

    # Write the saint data
    writer.writerows(saint_data)

print(f"Data has been written to catholic_saints.csv with {len(saint_data)} saints.")


  birth_name = soup.find('td', text='Birth Name:').find_next('td').text.strip() if soup.find('td', text='Birth Name:') else None
  birth_date = soup.find('td', text='Birth Date:').find_next('td').text.strip() if soup.find('td', text='Birth Date:') else None
  birthplace = soup.find('td', text='Birth Place:').find_next('td').text.strip() if soup.find('td', text='Birth Place:') else None
  death_date = soup.find('td', text='Death Date:').find_next('td').text.strip() if soup.find('td', text='Death Date:') else None
  death_place = soup.find('td', text='Death Place:').find_next('td').text.strip() if soup.find('td', text='Death Place:') else None
  feast_day = soup.find('td', text='Feast Day:').find_next('td').text.strip() if soup.find('td', text='Feast Day:') else None
  virtues_traits = soup.find('td', text='Virtues/Traits:').find_next('td').text.strip() if soup.find('td', text='Virtues/Traits:') else None
  patron_of = soup.find('td', text='Patron Of:').find_next('td').text.strip() if so

Data has been written to catholic_saints.csv with 100 saints.


In [None]:
# !pip install openai

Collecting openai
  Downloading openai-1.63.0-py3-none-any.whl.metadata (27 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.8.2-cp39-cp39-win_amd64.whl.metadata (5.3 kB)
Downloading openai-1.63.0-py3-none-any.whl (472 kB)
Downloading distro-1.9.0-py3-none-any.whl (20 kB)
Downloading jiter-0.8.2-cp39-cp39-win_amd64.whl (207 kB)
Installing collected packages: jiter, distro, openai
Successfully installed distro-1.9.0 jiter-0.8.2 openai-1.63.0


# LLM Approach

In [None]:
from openai import OpenAI
import json
import csv
import os

def get_api_key():
    """Return the Perplexity API key after checking its format.

    The key is read from the ``PERPLEXITY_API_KEY`` environment variable. If
    that is unset or empty, a module-level ``YOUR_API_KEY`` variable is used
    as a fallback when one has been defined.

    Returns:
        The API key string.

    Raises:
        ValueError: If no key is configured, or the key does not start with
            ``pplx-``.
    """
    # Look the fallback up dynamically: naming an undefined YOUR_API_KEY
    # directly would raise NameError instead of a helpful message.
    api_key = os.getenv("PERPLEXITY_API_KEY") or globals().get("YOUR_API_KEY")
    if not api_key:
        raise ValueError(
            "No Perplexity API key found. Set the PERPLEXITY_API_KEY environment variable."
        )
    if not api_key.startswith("pplx-"):
        raise ValueError("Invalid Perplexity API key format. Key should start with 'pplx-'")
    return api_key

# Worked example record (Saint Benedict of Nursia). It is serialized into the
# prompt as the example the LLM must imitate, and its key order is reused as
# the CSV column order.
SAINT_TEMPLATE = {
    # --- Identity and dates (YYYY-MM-DD strings, zero-padded years) ---
    "Name": "Saint Benedict of Nursia",
    "Birth_Name": "Benedict of Nursia",
    "Birth_Date": "0480-03-02",
    "Birthplace": "Nursia, Kingdom of the Lombards",
    "Death_Date": "0547-03-21",
    "Death_Place": "Monte Cassino",
    "Feast_Day": "0547-07-11",
    # --- Multi-valued descriptive fields (lists of strings) ---
    "Virtues_Traits": ["Balance", "Moderation", "Reasonableness"],
    "Patron_Of": ["Europe", "Monks", "Farmers"],
    "Martyr": False,
    "Religious_Order": "Benedictines",
    "Major_Events": [
        "Founded twelve monasteries at Subiaco",
        "Established the monastery at Monte Cassino",
        "Formulated the Benedictine Rule",
    ],
    "Notable_Contributions": [
        "Founded Western Christian monasticism",
        "Developed the Benedictine Rule",
        "Established a structured monastic lifestyle",
    ],
    "Quotes": ["Idleness is the enemy of the soul."],
    # --- Vocation, conversion and papacy details ---
    "Vocation_Job": "Monk, Abbot, Theologian",
    "Conversion_Status": "Lifelong Catholic",
    "Popes": {"Was_Pope": False, "Pontifical_Motto": ""},
}

def create_saints_query(saint_name):
    """Build the user prompt that asks the LLM for one saint's data as JSON.

    The prompt names the saint, embeds SAINT_TEMPLATE (pretty-printed with a
    two-space indent) as the example to imitate, and ends with ten numbered
    formatting rules.
    """
    example_json = json.dumps(SAINT_TEMPLATE, indent=2)

    request_section = (
        f"For the saint named {saint_name}, provide detailed information about their life and legacy. "
        "Format your response EXACTLY like this template, maintaining the same structure and field types. "
    )
    template_section = f"Here is the template to follow:\n\n{example_json}\n\n"

    # One entry per rule; joined with newlines, no trailing newline after the last.
    formatting_rules = (
        "1. Dates should be in YYYY-MM-DD format, using leading zeros for years before 1000 (e.g., 0480-03-02)",
        "2. Use 'Unknown' for any unknown dates or information",
        "3. Boolean values should be true or false (lowercase)",
        "4. Lists should be formatted with each item separated by commas",
        "5. Return ONLY the JSON object, no additional text",
        "6. Ensure 'Vocation_Job' explicitly states the saint’s occupation(s)",
        "7. Clearly indicate if the saint was a Convert or Lifelong Catholic in 'Conversion_Status'",
        "8. If the saint was a pope, include their 'Pontifical_Motto' in the 'Popes' field",
        "9. Ensure all quotes are properly escaped",
        "10. Only extract direct quotes from the saint, not general information",
    )
    rules_section = "Important formatting rules:\n" + "\n".join(formatting_rules)

    return request_section + template_section + rules_section

def query_saint_info(saint_name):
    """Ask the Perplexity chat API about one saint and return the answer as a JSON string.

    The reply is parsed as JSON directly; if that fails, the span from the
    first "{" to the last "}" in the reply is parsed instead. The parsed value
    is re-serialized with json.dumps before being returned.

    Any exception is printed with the saint's name and then re-raised.
    """
    try:
        key = get_api_key()

        system_prompt = "You are a JSON formatter that returns only valid JSON objects containing information about Catholic saints. Match the template format exactly. Please remember to do not put something you don't know and please put the correct format to help build this dataset for the Catholic Church. Only extract direct quotes from the saints, not general information."
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": create_saints_query(saint_name)},
        ]

        # Perplexity is reached through the OpenAI client pointed at its base URL.
        perplexity = OpenAI(api_key=key, base_url="https://api.perplexity.ai")
        completion = perplexity.chat.completions.create(
            model="sonar-pro",
            messages=conversation,
            temperature=0.7,
        )

        reply_text = completion.choices[0].message.content.strip()

        try:
            # Happy path: the whole reply is a JSON document.
            return json.dumps(json.loads(reply_text))
        except json.JSONDecodeError:
            # Fallback: pull the outermost {...} span out of surrounding text.
            import re
            braces = re.search(r'(\{[\s\S]*\})', reply_text)
            if braces is None:
                raise ValueError("Could not extract valid JSON from response")
            # json.loads doubles as validation of the extracted span.
            return json.dumps(json.loads(braces.group(1)))

    except Exception as err:
        print(f"Error querying information for {saint_name}: {str(err)}")
        raise

def _saint_to_csv_row(saint, fields):
    """Flatten one saint record into a CSV-ready dict limited to *fields*."""
    row = {}
    for key in fields:
        value = saint.get(key, '')
        if isinstance(value, list):
            # Multi-valued fields share one cell, separated by "; ".
            row[key] = '; '.join(str(item) for item in value)
        elif isinstance(value, dict):
            # Nested objects (e.g. "Popes") are stored as JSON text so the
            # cell can be parsed back, instead of as a Python dict repr.
            row[key] = json.dumps(value, ensure_ascii=False)
        else:
            row[key] = value
    return row


def save_saints_data(saints_list, base_filename="saints_data"):
    """Query every saint in *saints_list* and save the results as JSON and CSV.

    Saints whose query fails, or whose response is not a JSON object, are
    reported and skipped; the remaining records are still saved.

    Args:
        saints_list: Iterable of saint names to query.
        base_filename: Prefix for the output files. The JSON file is
            ``<base_filename>.json`` and the CSV file is
            ``<base_filename>_perplexity_10v5.csv``.

    Returns:
        The list of parsed saint records, or an empty list when nothing was
        collected or an unexpected error occurred.
    """
    saints_data = []

    try:
        # Fail fast on a missing or malformed key before any API call is made.
        get_api_key()

        for saint in saints_list:
            try:
                print(f"Processing {saint}...")
                saint_info = query_saint_info(saint)
                parsed_info = json.loads(saint_info)
                # Valid JSON that is not an object (e.g. a list) would break
                # the CSV stage later and lose every record, so reject it here.
                if not isinstance(parsed_info, dict):
                    raise ValueError(
                        f"Expected a JSON object but got {type(parsed_info).__name__}"
                    )
                saints_data.append(parsed_info)
                print(f"Successfully processed {saint}")
            except Exception as e:
                print(f"Error processing {saint}: {str(e)}")
                continue

        if not saints_data:
            print("No saint data was successfully collected. Files will not be created.")
            return []

        # Save JSON - best for data and language accent preservation
        json_filename = f"{base_filename}.json"
        with open(json_filename, 'w', encoding='utf-8') as f:
            json.dump(saints_data, f, indent=2, ensure_ascii=False)

        # Save CSV file - sometimes has issues with accents in other languages FR, SP, etc.
        csv_filename = f"{base_filename}_perplexity_10v5.csv"

        # Use template keys to ensure consistent field order
        fields = list(SAINT_TEMPLATE.keys())

        with open(csv_filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fields)
            writer.writeheader()
            writer.writerows(_saint_to_csv_row(saint, fields) for saint in saints_data)

        print(f"Data successfully saved to {json_filename} and {csv_filename}")
        return saints_data

    except Exception as e:
        print(f"Unexpected error: {str(e)}")
        return []

if __name__ == "__main__":
    # Demo batch: the saints to look up in this run.
    saints_to_query = [
        "Saint Benedict of Nursia",
        "Saint Catherine of Siena",
        "Saint Joan of Arc",
        "Saint Peter",
        "Saint Paul",
        "Saint Ignatius of Loyola",
        "Saint Therese of Lisieux",
        "Saint Elizabeth Ann Seton",
        "Saint Vincent de Paul",
        "Saint Maximilian Kolbe",
    ]

    # Query each saint and write the combined results to JSON and CSV files.
    saints_data = save_saints_data(
        saints_list=saints_to_query,
        base_filename="saints_data",
    )

Processing Saint Benedict of Nursia...
Successfully processed Saint Benedict of Nursia
Processing Saint Catherine of Siena...
Successfully processed Saint Catherine of Siena
Processing Saint Joan of Arc...
Successfully processed Saint Joan of Arc
Processing Saint Peter...
Successfully processed Saint Peter
Processing Saint Paul...
Successfully processed Saint Paul
Processing Saint Ignatius of Loyola...
Successfully processed Saint Ignatius of Loyola
Processing Saint Therese of Lisieux...
Successfully processed Saint Therese of Lisieux
Processing Saint Elizabeth Ann Seton...
Successfully processed Saint Elizabeth Ann Seton
Processing Saint Vincent de Paul...
Successfully processed Saint Vincent de Paul
Processing Saint Maximilian Kolbe...
Successfully processed Saint Maximilian Kolbe
Data successfully saved to saints_data.json and saints_data_perplexity_10v5.csv
