In [1]:
# !pip install --upgrade crawl4ai 
# !playwright install --with-deps chromium
# !pip install openai-agents
# !pip install duckduckgo_search

In [2]:
# !crawl4ai-setup
# !crawl4ai-doctor

In [3]:
from duckduckgo_search import DDGS
from dotenv import load_dotenv
import nest_asyncio
import os

from playwright.async_api import async_playwright

from crawl4ai.content_filter_strategy import PruningContentFilter, LLMContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer

import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from openai import OpenAI
from IPython.display import Markdown, display, update_display

# Load variables from a local .env file into the process environment
# (GOOGLE_API_KEY is read from the environment further down).
load_dotenv(override=True)
# Patch asyncio so asyncio.run() can be used from inside the notebook kernel,
# which presumably already has an event loop running.
nest_asyncio.apply()

# Current user's home directory; used as the base of data_path.
home_dir = os.path.expanduser("~")

In [4]:
# Recipe page URLs to process; the driver loop handles them one at a time.
# NOTE(review): the single '' entry looks like a placeholder — fill in real
# URLs before running the notebook.
url_list = [
            ''
            ]

# Folder that the generated recipe text files are written into.
data_path = f'{home_dir}/Google Drive/My Drive/Projects/Data/Knowledge-base/Unsorted'

In [5]:
# Settings for the locally served model; call_llm() passes these to the
# OpenAI client (base_url / api_key) and to chat.completions.create (model).
local_model = "qwen3-14b-mlx"
local_api_key = 'None'
local_base_url = 'http://127.0.0.1:1234/v1'

# Settings for Google's OpenAI-compatible endpoint. The API key is read from
# the environment rather than hard-coded.
# NOTE(review): none of the google_* values are used by the code visible in
# this notebook.
google_model = 'gemini-2.0-flash-lite'
google_api_key = os.getenv('GOOGLE_API_KEY')
google_base_url = 'https://generativelanguage.googleapis.com/v1beta/openai/'

# llm_option = 'google'

# topic='google top treads for singapore'

In [6]:
# System prompt for the first pass: write a recipe in markdown using only the
# text extracted from the crawled page.
# NOTE(review): the prompt text contains typos ("Assistance", "helpswrite",
# "unavaliable"). It is runtime text sent to the model, so it is left as-is
# here; fix deliberately if desired.
system_recipe = """You are a helpful Assistance, 
                    that helpswrite a detail recipe based on the extracted text.
                    
                    Respond in markdown. 
        
                    It is very important to only use the extracted text for the recipe 
                    and to cross check only with the extracted text 
                    and if you don't know,
                    don't try to make up any details.
                """

            
# User prompt for the first pass. It lists the sections the recipe must have
# and ends with an "extracted text:" label; get_recipe_from_url() appends the
# page URL and the extracted text after it.
# The f-prefix is harmless: the string contains no {placeholders}.
prompt_recipe = f"""Below is an extracted text from a website. 
                    Please write a detail Recipe only based on extracted text in markdown.

                    Include only the following:
                        Title: Recipe title the with Chef's name (remove Chef's name if unavaliable).
                        URL link to webpage recipe.
                        URL link to the youtube video , remove if unavaliable.
                        Chef's name, remove if unavaliable. 
                        Executive summary.
                        Total Cooking timing and preparation timing, remove if unavaliable.
                        Servings size, remove if unavaliable.
                        Detailed Ingredient List (Remove any URL Links).
                        Method of the recipes.
                        Tips and tricks.
                        Conclusion.
                    
                    It is very important to only use the extracted text for the recipe 
                    and to cross check only with the extracted text 
                    and if you don't know,
                    don't try to make up any details.
                    
                   extracted text:
                   
            """

# System prompt for the second pass: cross-check a drafted recipe against the
# extracted text and correct it without changing its format.
system_validation = """
                        You are a helpful Assistance that looks at a recipe and extracted text
                        Based only on the extracted text. Cross check the recipe and validate the recipe
                        
                        Respond in markdown. 
                        
                        It is very important not to change the format of the recipe.
                        and fix any error in the recipe and remove any unavaliable information in the recipe.
                        
                        It is very important to only based it on the extracted text,
                        If you don't know,don't try to make up any details
                        
                    """

            
# User prompt for the second pass; recipe_validation() appends the URL, the
# drafted recipe and the extracted text to it.
# The f-prefix is harmless: the string contains no {placeholders}.
# NOTE(review): the wording mixes "recipe" and "report" and contains typos;
# it is runtime text, so it is left unchanged here.
prompt_validation = f"""
                        Below is a recipe, helps Cross check the recipe and validate the recipe
                        Please write a detail Recipe only based on extracted text in markdown.

                        It is very important not to change the format of the recipe.
                        and fix any error in the report and remove any unavaliable information in the report.
               
                    """

# System prompt for the third pass: convert imperial measures to metric while
# keeping the recipe format and showing both units.
system_measure_unit_check = """
                                You are a helpful Assistance that looks at a recipe and helps to convert 
                                any imperial units of measure to metric units of measure.
            
                                Respond in markdown. 
                    
                                It is very important not to change the format of the recipe.
                                and only do the convertion.
                                show both units of measure in the recipe.

                                If you don't know,
                                don't try to make up any details
                                """

            
# User prompt for the third pass. It ends with a "Recipe:" label;
# get_check_unit_of_measure() appends the recipe text after it.
# The f-prefix is harmless: the string contains no {placeholders}.
# NOTE(review): the line about "extracted text" looks copied from the other
# prompts — no extracted text is supplied in this pass. Runtime text, so it
# is left unchanged here.
prompt_measure_unit_check = f"""
                                Below is a recipe, helps to convert any imperial units of measure to metric units of measure.
                                Please write a detail Recipe only based on extracted text in markdown.

                                It is very important not to change the format of the recipe.
                                and only do the convertion.
                                show both units of measure in the recipe.
                                
                                Recipe:
               
                            """

In [7]:
def get_url(topic, max_results):
    """Search DuckDuckGo for ``topic`` and collect the result links.

    Args:
        topic: Search phrase handed to the DuckDuckGo text search.
        max_results: Upper bound on the number of results requested.

    Returns:
        A list with the ``href`` of every hit, or — when the search yields
        nothing — a human-readable message string instead of a list.
    """
    print(f"Running DuckDuckGo news search for {topic}...")

    hits = DDGS().text(f"{topic} ", max_results=max_results)

    # Guard clause: an empty/None result set is reported as a message string.
    if not hits:
        return f"Could not find news results for {topic}."

    return [hit['href'] for hit in hits]


async def crawl_url(url):
    """Fetch one page with a headless Chromium crawler and return the crawl result.

    The markdown generator is given a ``PruningContentFilter``; the notebook
    later reads ``.markdown.fit_markdown`` from the returned object.

    Args:
        url: Address of the page to crawl.

    Returns:
        Whatever ``AsyncWebCrawler.arun`` returns for ``url``.
    """
    browser_settings = BrowserConfig(
        browser_type="chromium",  # browser engine to drive
        headless=True,            # no visible browser window
        verbose=False,            # keep crawler logging quiet
    )
    crawl_settings = CrawlerRunConfig(
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter()
        )
    )

    async with AsyncWebCrawler(config=browser_settings) as crawler:
        page_result = await crawler.arun(url, config=crawl_settings)

    return page_result

def call_llm(system_prompt, user_prompt, stream):
    """Send a system + user prompt to the local chat model and return its answer.

    Uses the module-level ``local_base_url``, ``local_api_key`` and
    ``local_model`` settings.

    Args:
        system_prompt: Instructions sent as the ``system`` message.
        user_prompt: Content sent as the ``user`` message.
        stream: When truthy, the answer is streamed and rendered live in the
            notebook as markdown; otherwise it is fetched in one request.

    Returns:
        The model's text with everything up to and including the last
        ``/think>`` marker removed, so a leading reasoning section is dropped.
    """
    # Two separate message dicts. The previous single dict literal repeated
    # the "role"/"content" keys, so the later pair overwrote the earlier one
    # and the system prompt never reached the model.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    client = OpenAI(base_url=local_base_url,
                    api_key=local_api_key)

    if stream:
        stream_response = client.chat.completions.create(model=local_model,
                                                         messages=messages,
                                                         stream=True,
                                                         temperature=0.0)

        response = ""
        display_handle = display(Markdown(""), display_id=True)

        for chunk in stream_response:
            # Some chunks may carry no choices; skip them rather than
            # failing on choices[0].
            if not chunk.choices:
                continue
            response += chunk.choices[0].delta.content or ''
            # Strip code fences / the "markdown" fence label from the
            # accumulated text so the live rendering is not shown as a code block.
            response = response.replace("```", "").replace("markdown", "")
            update_display(Markdown(response),
                           display_id=display_handle.display_id)

    else:
        completion = client.chat.completions.create(model=local_model,
                                                    messages=messages,
                                                    temperature=0.0)

        # content can be None; normalise to '' so the split below is safe.
        response = completion.choices[0].message.content or ''

    response_without_thinking = response.split('/think>')[-1]
    return response_without_thinking


def get_recipe_from_url(url, system_recipe, prompt_recipe,
                        stream, extracted_text=None):
    """Ask the model to write a recipe from the text extracted from ``url``.

    Args:
        url: Address of the recipe page; included in the prompt.
        system_recipe: System prompt for the recipe-writing pass.
        prompt_recipe: User prompt prefix; the URL and extracted text are
            appended to it.
        stream: Passed through to ``call_llm``.
        extracted_text: Text extracted from the page. When omitted, the
            notebook-level ``scrape_data_fit_markdown`` variable is used, which
            is what this function always read before the parameter existed.

    Returns:
        The recipe text returned by ``call_llm``.

    Raises:
        ValueError: If there is no extracted text to build the recipe from.
    """
    if extracted_text is None:
        # Backward-compatible fallback to the global set by the driver loop.
        extracted_text = scrape_data_fit_markdown

    # Without source text the model could only invent a recipe, so fail
    # loudly instead of sending an empty prompt (or hitting a TypeError on None).
    if not extracted_text:
        raise ValueError(f'No extracted text available for {url}')

    full_prompt = prompt_recipe + f'URL to website recipe: {url}\n' + extracted_text

    recipe = call_llm(system_recipe, full_prompt, stream=stream)

    return recipe

def recipe_validation(url, recipe, extracted_text, system_validation, prompt_validation, 
                        stream):
    """Have the model cross-check a drafted recipe against the extracted text.

    Args:
        url: Address of the recipe page.
        recipe: Recipe text produced by the previous pass.
        extracted_text: Text extracted from the page.
        system_validation: System prompt for the validation pass.
        prompt_validation: User prompt prefix; URL, recipe and extracted text
            are appended to it in that order.
        stream: Passed through to ``call_llm``.

    Returns:
        The validated recipe text returned by ``call_llm``.
    """
    context_parts = (
        f'URL to website recipe: {url} \n',
        f'Recipe: {recipe} \n',
        f'extracted_text : {extracted_text} \n',
    )
    full_prompt = prompt_validation + ''.join(context_parts)

    return call_llm(system_validation, full_prompt, stream=stream)


def get_check_unit_of_measure(recipe, system_measure_unit_check, prompt_measure_unit_check, 
                              stream):
    """Have the model add metric equivalents to a recipe's imperial measures.

    Args:
        recipe: Recipe text to convert.
        system_measure_unit_check: System prompt for the unit-conversion pass.
        prompt_measure_unit_check: User prompt prefix; the recipe is appended.
        stream: Passed through to ``call_llm``.

    Returns:
        The converted recipe text returned by ``call_llm``.
    """
    full_prompt = prompt_measure_unit_check + recipe
    return call_llm(system_measure_unit_check, full_prompt, stream=stream)

def save_response_to_txt(recipe, folder, MODEL):
    """Write a recipe to ``<folder>/<title>_<MODEL>.txt``.

    The title is the first line of the recipe that still contains text after
    markdown markers and path-unsafe characters are removed, and that is at
    most 200 characters long. If no line qualifies, ``untitled`` is used.

    Args:
        recipe: Recipe text (markdown); code fences are stripped before saving.
        folder: Existing directory to write into.
        MODEL: Model name appended to the file name.

    Returns:
        None. The file is written and the saved path is printed.
    """
    # Strip markdown code fences once; the same cleaned text is used for
    # both the title search and the file body.
    cleaned_recipe = recipe.replace('```markdown', '').replace('```', '')

    # Markers removed from a candidate title, in the same order as before.
    unwanted_tokens = ('#', ':', '*', "''", '_', '/', '|')

    file_name = ''
    for line in cleaned_recipe.splitlines():
        candidate = line
        for token in unwanted_tokens:
            candidate = candidate.replace(token, '')
        # Strip before testing, so a heading with no text (e.g. "# ") is
        # skipped instead of being chosen and later indexed while empty.
        candidate = candidate.strip()
        if candidate and len(candidate) <= 200:
            file_name = candidate
            break

    if not file_name:
        # No usable title line; previously this ended in an IndexError.
        file_name = 'untitled'

    file_path = f'{folder}/{file_name}_{MODEL}.txt'

    # Context manager closes the handle even if the write fails; explicit
    # UTF-8 keeps non-ASCII recipe text independent of the platform default.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(cleaned_recipe)

    # Report the real path (the old message joined folder and name with "_").
    print(f'Saved: {file_path} \n\n')

In [8]:
# Pipeline per URL: crawl -> draft recipe -> validate against the page text
# -> add metric units -> render -> save to a text file.
for url in url_list:
    # Skip blank placeholder entries instead of trying to crawl an empty URL.
    if not url or not url.strip():
        continue

    print(f"URL: {url} \n")
    scrape_data = asyncio.run(crawl_url(url=url))

    # Keep this exact global name: get_recipe_from_url() reads it.
    # getattr() tolerates a crawl result whose markdown is None.
    scrape_data_fit_markdown = getattr(scrape_data.markdown, 'fit_markdown', None)
    if not scrape_data_fit_markdown:
        # Nothing to ground a recipe on; move on rather than let the model invent one.
        print(f'\n No content extracted from {url}, skipping')
        continue

    print('\n Getting Recipe From Scrape Data')
    recipe = get_recipe_from_url(url, system_recipe, prompt_recipe,
                                 stream=False)

    print('\n Validating Recipe')
    recipe = recipe_validation(url, recipe, scrape_data_fit_markdown,
                               system_validation, prompt_validation,
                               stream=False)

    print('\n Checking Unit Of Measure')
    recipe = get_check_unit_of_measure(recipe, system_measure_unit_check,
                                       prompt_measure_unit_check,
                                       stream=False)

    # A bare Markdown(recipe) expression inside a loop renders nothing;
    # display() is required to actually show it.
    display(Markdown(recipe))

    save_response_to_txt(recipe, folder=data_path, MODEL=local_model)

    
    
# scrape_data_fit_markdown

URL: https://aaronandclaire.com/yakisoba-japanese-stir-fried-noodles/ 

[FETCH]... ↓ https://aaronandclaire.com/yakisoba-japanese-stir-fried-noodles/                                     | ✓ | ⏱: 0.98s
[SCRAPE].. ◆ https://aaronandclaire.com/yakisoba-japanese-stir-fried-noodles/                                     | ✓ | ⏱: 0.09s
[COMPLETE] ● https://aaronandclaire.com/yakisoba-japanese-stir-fried-noodles/                                     | ✓ | ⏱: 1.07s

 Getting Recipe From Scrape Data

 Validating Recipe

 Checking Unit Of Measure
Saved: /Users/daveng/Google Drive/My Drive/Projects/Data/Knowledge-base/Unsorted_Yakisoba (Japanese Stir-Fried Noodles) by Aaron and Claire  _qwen3-14b-mlx.txt 


URL: https://aaronandclaire.com/chicken-fried-rice/ 

[FETCH]... ↓ https://aaronandclaire.com/chicken-fried-rice/                                                       | ✓ | ⏱: 1.95s
[SCRAPE].. ◆ https://aaronandclaire.com/chicken-fried-rice/                                                       

In [9]:
# Markdown(recipe)

In [10]:
# save_response_to_txt(recipe, folder=data_path, MODEL=local_model)