In [58]:
import requests
from bs4 import BeautifulSoup
import json
import re

# Dictionary mapping Unicode fraction symbols to their decimal equivalents
fraction_mapping = {
    '\u00BD': '1/5',  # 1/2
    '\u00BC': '1/4', # 1/4
    '\u00BE': '3/4', # 3/4
    '\u2153': '1/3', # 1/3
    '\u2154': '2/3', # 2/3
    '\u215b': '1/8', # 1/8
    '\u00b0': 'degrees ', #°
    '\u2013': '-', # -
    '\u00a0': ' ' # space
    # Add more mappings as needed
}

def replace_fraction_symbols(text):
    # Define a regex pattern that captures numbers possibly adjacent to fraction symbols
    pattern = r'(\d*)(%s)(\d*)' % '|'.join(re.escape(key) for key in fraction_mapping.keys())

    # Function to replace each match with appropriate spacing
    def replace(match):
        # Pre-number, fraction, and post-number
        pre, frac, post = match.groups()
        # Replace the fraction with its decimal equivalent from the dictionary
        frac_decimal = fraction_mapping[frac]
        # Add space if there is a preceding or succeeding number
        if frac_decimal == 'degrees ' or frac_decimal == '-' or frac_decimal == ' ':
            if pre and post:
                return f'{pre} {frac_decimal} {post}'
            elif pre:
                return f'{pre} {frac_decimal}'
            elif post:
                return f'{frac_decimal} {post}' 
        elif pre and post:
            return f'{pre} and {frac_decimal} {post}'
        elif pre:
            return f'{pre} and {frac_decimal}'
        elif post:
            return f'{frac_decimal} {post}'
        return frac_decimal

    return re.sub(pattern, replace, text)


# Global sets to store unique ingredients and categories
all_ingredients = set()
all_categories = set()


## Scrapes detailed information about a recipe from its page. ##
def scrape_recipe_details(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
            
    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the page with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Attempting to extract data based on typical Wikibooks structure
        full_title = soup.title.text.replace(" - Wikibooks, open books for an open world", "").strip()
        # Use regex to find everything after "Cookbook:"
        match = re.search(r'Cookbook:(.*)', full_title)
        if match:
            title = match.group(1).strip()
        else:
            title = full_title  # Fallback to the full title if "Cookbook:" is not found
        difficulty = soup.find('table', class_='infobox')
        if difficulty:
            difficulty_row = difficulty.find('th', text=lambda text: text and 'Difficulty' in text)
            if difficulty_row:
                # The difficulty value might be represented by an image alt text
                difficulty_image = difficulty_row.find_next_sibling('td').find('img')
                if difficulty_image and 'alt' in difficulty_image.attrs:
                    difficulty = difficulty_image['alt'].strip()
                else:
                    difficulty = "Difficulty image or alt text not found."
            else:
                difficulty = "Difficulty row not found."
        else:
            difficulty = "Infobox table not found."

        servings = soup.find('th', text='Servings').find_next_sibling('td').text if soup.find('th', text='Servings') else 'Not specified'

        cooking_time = soup.find('table', class_='infobox')
        if cooking_time:
            time_row = cooking_time.find('th', string='Time')
            if time_row:
                time_data = time_row.find_next_sibling('td')
                if time_data:
                    time_text = time_data.text.strip()
                    # Use regex to capture everything after "Cooking:"
                    match = re.search(r"Cooking:\s*(.*)", time_text)
                    if match:
                        cooking_time = match.group(1).strip()  # Give only the text after "Cooking:"
                    else:
                        cooking_time = time_text
                else:
                    cooking_time = "Cooking time data not found."
            else:
                cooking_time = "Time row not found."
        else:
            cooking_time = "Infobox table not found."

        ingredients_list = [li.text.strip() for li in soup.find('span', text='Ingredients').parent.find_next_sibling('ul').find_all('li')] if soup.find('span', text='Ingredients') else []
        
        directions_list = soup.find('ol')
        if directions_list:
            directions_list = [li.text.strip() for li in directions_list.find_all('li')]
        else:
            directions_list = ["Procedure section not found."]
        
        categories_elements = [a.text for a in soup.find_all('a', href=True) if 'Category:' in a['href']]
        
        procedure_span = soup.find('span', string='Procedure')
        notes_list = []
        if procedure_span and procedure_span.parent:
            notes_ul = procedure_span.parent.find_next_sibling('ul')
            if notes_ul:
                # Find all list item elements within the <ul> and extract their text
                notes_list = [li.text.strip() for li in notes_ul.find_all('li')]
            if not notes_list:  # If the list is empty, no notes were found
                notes_list = ['No notes available']
        else:
            notes_list = ['No procedure section found.']

    recipe_info = {
        'title': title,
        'level': difficulty,
        'yield': servings,
        'cooking time': replace_fraction_symbols(cooking_time),
        'ingredients': [replace_fraction_symbols(string) for string in ingredients_list], ## TODO: before adding ingredients to recipe info
        'steps': [replace_fraction_symbols(string) for string in directions_list],
        'cook note': [replace_fraction_symbols(string) for string in notes_list],
        'categories': categories_elements
    }
    
    all_ingredients.update(recipe_info['ingredients']) ## TODO: remove the measurments before adding to ingredients
    all_categories.update(recipe_info['categories'])
    
    return recipe_info


## Writes data to a JSON file. ##
def write_json(data, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)


def main():
    counter = 0
    test_urls = ["https://en.wikibooks.org/wiki/Cookbook:A_Nice_Cup_of_Tea", "https://en.wikibooks.org/wiki/Cookbook:Baked_Penne"]
    recipes = {}
    
    for url in test_urls:
        print(f"Processing {url}...")

        # Extract information from the recipe
        details = scrape_recipe_details(url)
        recipes[counter] = details
        counter += 1
        print(f"{url} has been processed.")
    
    # Writing the index file and ingredient/category accumulations
    write_json(recipes, "recipes.json")
    write_json(list(all_ingredients), "all_ingredients.json")
    write_json(list(all_categories), "all_categories.json")

main()

Processing https://en.wikibooks.org/wiki/Cookbook:A_Nice_Cup_of_Tea...
https://en.wikibooks.org/wiki/Cookbook:A_Nice_Cup_of_Tea has been processed.
Processing https://en.wikibooks.org/wiki/Cookbook:Baked_Penne...
https://en.wikibooks.org/wiki/Cookbook:Baked_Penne has been processed.


  difficulty_row = difficulty.find('th', text=lambda text: text and 'Difficulty' in text)
  servings = soup.find('th', text='Servings').find_next_sibling('td').text if soup.find('th', text='Servings') else 'Not specified'
  ingredients_list = [li.text.strip() for li in soup.find('span', text='Ingredients').parent.find_next_sibling('ul').find_all('li')] if soup.find('span', text='Ingredients') else []
