In [1]:
import asyncio
import json
import os
import nest_asyncio
import pprint 
import base64
from io import BytesIO
from playwright.async_api import async_playwright
import google.generativeai as genai
#import tubulate
from PIL import Image
from IPython.display import display, HTML,Markdown
from pydantic import BaseModel
from helper import get_openai_api_key,visualizeCourses,get_openai_client

  from .autonotebook import tqdm as notebook_tqdm


work


In [17]:
# Build the LLM client through the local `helper` module.
# NOTE(review): the helper is called get_openai_client, yet the object is driven
# through `generate_content` below — confirm which provider it actually wraps.
client = get_openai_client()
# Apply nest_asyncio to handle async code inside Jupyter.
nest_asyncio.apply()

# Smoke test: send a single prompt and print the text of the reply.
response = client.generate_content("Explain AI in simple terms.")
print(response.text)

Imagine you're teaching a dog a trick.  You show it what to do, reward it when it gets it right, and correct it when it's wrong.  Eventually, the dog learns the trick.

AI is similar.  We "teach" computers to do things by showing them lots of examples and giving them feedback.  Instead of treats, we use data.  The computer learns patterns and rules from that data, and then uses those patterns to make decisions or predictions on its own, like recognizing faces in a photo or translating languages.

It's not actually "thinking" like a human, but it can appear that way because it can solve problems and learn from experience.  Think of it as a really smart calculator that can learn and adapt.



WebScraper Agent

In [18]:
class WebScraperAgent:
    """Drive a headless Chromium page through Playwright's async API.

    The browser is launched lazily by ``scrape_content`` and released by
    ``close``. ``playwright``, ``browser`` and ``page`` are ``None`` whenever
    no browser session is active.
    """

    # Chromium switches handed to ``chromium.launch``.
    # NOTE(review): ``--disable-web-security`` and ``--no-sandbox`` weaken
    # browser isolation — confirm they are acceptable for the pages scraped.
    _CHROMIUM_ARGS = [
        "--disable-dev-shm-usage",
        "--no-sandbox",
        "--disable-setuid-sandbox",
        "--disable-accelerated-2d-canvas",
        "--disable-gpu",
        "--no-zygote",
        "--disable-audio-output",
        "--disable-software-rasterizer",
        "--disable-webgl",
        "--disable-web-security",
        # Features are passed as ONE comma-separated list: when the switch is
        # repeated, only the last occurrence is expected to take effect.
        "--disable-features=LazyFrameLoading,IsolateOrigins",
        "--disable-background-networking",
    ]

    def __init__(self):
        self.playwright = None
        self.browser = None
        self.page = None
        # Apply nest_asyncio to handle async in Jupyter
        nest_asyncio.apply()

    async def init_browser(self):
        """Start Playwright, launch headless Chromium and open a blank page."""
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=True,
            args=list(self._CHROMIUM_ARGS),
        )
        self.page = await self.browser.new_page()

    def _require_page(self):
        """Return the open page, or raise if nothing has been loaded yet."""
        if not self.page or self.page.is_closed():
            raise RuntimeError(
                "No open page: call scrape_content(url) before taking a screenshot."
            )
        return self.page

    async def scrape_content(self, url):
        """Navigate to ``url`` and return the page's HTML as a string.

        A browser session is (re)started first when no usable page exists.
        """
        if not self.page or self.page.is_closed():
            # Release any half-open session first so relaunching does not
            # leak the previous browser / Playwright driver.
            if self.browser or self.playwright:
                await self.close()
            await self.init_browser()
        await self.page.goto(url, wait_until="load")
        # Fixed 2 s pause after the load event (presumably to let client-side
        # rendering settle — confirm the delay is still required).
        await self.page.wait_for_timeout(2000)
        return await self.page.content()

    async def take_screenshot(self, path="screenshot.png"):
        """Save a full-page screenshot to ``path`` and return ``path``.

        Raises:
            RuntimeError: if no page has been opened via ``scrape_content``.
        """
        await self._require_page().screenshot(path=path, full_page=True)
        return path

    async def screenshot_buffer(self):
        """Return a PNG screenshot of the current viewport as bytes.

        Raises:
            RuntimeError: if no page has been opened via ``scrape_content``.
        """
        screenshot_bytes = await self._require_page().screenshot(
            type="png", full_page=False
        )
        return screenshot_bytes

    async def close(self):
        """Shut down the browser and Playwright, then reset all handles.

        Shutdown errors are reported as warnings instead of being raised, so
        the handles are always cleared.
        """
        try:
            if self.browser:
                await self.browser.close()
        except Exception as e:
            print(f"Warning: Error closing browser: {e}")

        try:
            if self.playwright:
                await self.playwright.stop()
        except Exception as e:
            print(f"Warning: Error stopping playwright: {e}")

        self.playwright = None
        self.browser = None
        self.page = None

In [19]:
# Single shared agent instance; webscraper() scrapes through it and closes it
# again in its `finally` block.
scraper = WebScraperAgent()

Structured data format

In [35]:
from typing import Optional, List

# Schema for a single course record parsed out of the LLM's JSON reply.
class DeeplearningCourse(BaseModel):
    title: str
    description: str
    # Presenter names; the field may be left out (empty list) or be null.
    presenter: Optional[list[str]] = []
    # Image URL of the course; None when the reply carries none.
    imageUrl: Optional[str] = None
    courseURL: str

# Top-level container that process_with_llm builds from the parsed JSON reply.
class DeeplearningCourseList(BaseModel):
    # One entry per extracted course.
    courses: list[DeeplearningCourse]

LLM Client for Gemini AI

In [13]:
# Upper bound on the number of HTML characters embedded in the prompt.
_MAX_HTML_CHARS = 150000


def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if any."""
    text = text.strip()
    if text.startswith("```"):
        text = text[3:]
        # Drop an optional language tag (``json``) that follows the fence.
        if text[:4].lower() == "json":
            text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


async def process_with_llm(html, instructions, truncate=False):
    """Ask the LLM to turn course HTML into a ``DeeplearningCourseList``.

    Args:
        html: Raw HTML of the page. Only the first ``_MAX_HTML_CHARS``
            characters are embedded in the prompt.
        instructions: Free-form extraction instructions inserted into the prompt.
        truncate: Currently unused; kept so existing callers keep working. The
            HTML is capped regardless of this flag.

    Returns:
        The parsed ``DeeplearningCourseList``, or ``None`` when the reply is
        not valid JSON or does not match the expected schema.
    """
    prompt = f"""
    You are an expert web scraping agent. Your task is to:
    Extract relevant information from this HTML to JSON 
    following these instructions:
    {instructions}
    
    Extract the title, description, presenter, 
    the image URL and course URL for each of 
    all the courses for the deeplearning.ai website

    Return ONLY valid JSON in the following format:
    {{
        "courses": [
            {{
                "title": "Course Title",
                "description": "Course Description", 
                "presenter": ["Presenter Name"],
                "imageUrl": "Image URL",
                "courseURL": "Course URL"
            }}
        ]
    }}

    HTML Content:
    {html[:_MAX_HTML_CHARS]}
    """

    # NOTE: synchronous call — it blocks the event loop until the model answers.
    response = client.generate_content(prompt)
    raw_text = response.text

    try:
        # Models often wrap JSON in a Markdown fence, with or without a tag.
        parsed_data = json.loads(_strip_code_fence(raw_text))
    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
        print(f"Response text: {raw_text[:500]}...")
        return None

    # Tolerate a bare top-level array of courses.
    if isinstance(parsed_data, list):
        parsed_data = {"courses": parsed_data}

    try:
        return DeeplearningCourseList(**parsed_data)
    except (TypeError, ValueError) as e:
        # pydantic's ValidationError derives from ValueError; TypeError covers
        # payloads that are not a mapping or carry unexpected keys.
        print(f"Response validation error: {e}")
        print(f"Response text: {raw_text[:500]}...")
        return None

In [14]:
async def webscraper(target_url, instructions):
    """Scrape ``target_url`` with the shared browser agent and structure it.

    Loads the page through ``scraper``, grabs a viewport screenshot, then
    passes the HTML and ``instructions`` to ``process_with_llm``. The shared
    ``scraper`` is always closed afterwards.

    Returns:
        ``(result, screenshot)``. Either element is ``None`` if its step
        failed or was never reached; errors are printed, not raised.
    """
    result = None
    screenshot = None
    try:
        print("Extracting HTML Content\n")
        html_content = await scraper.scrape_content(target_url)

        print("Taking Screenshot \n")
        screenshot = await scraper.screenshot_buffer()

        print("Processing..")
        result = await process_with_llm(html_content, instructions, False)
        if result:
            print("\nGenerated Structured Response")
        else:
            print("\nFailed to generate structured response")
    except Exception as e:
        # Include the exception type: some errors (e.g. NotImplementedError)
        # carry no message and would otherwise print a bare "Error: ".
        print(f"Error: {type(e).__name__}: {e}")
    finally:
        await scraper.close()
    return result, screenshot

Examples

In [8]:
# Site root; handed to visualizeCourses as `base_url`.
base_url = "https://www.deeplearning.ai"
# Course catalogue page that the scrapers fetch.
target_url = f"{base_url}/courses"

In [15]:
instructions="""Get all the courses"""
result,screenshot = await webscraper(target_url,instructions)

Extracting HTML Content

Error: 


In [22]:
# Fix for Windows asyncio issues with Playwright
import sys
import platform

# On Windows (Python 3.8+), select the proactor event-loop policy for asyncio.
# NOTE(review): a policy only applies to event loops created afterwards —
# confirm this takes effect inside an already-running Jupyter kernel.
if platform.system() == 'Windows' and sys.version_info >= (3, 8):
    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

# Alternative: Let's try using requests and BeautifulSoup instead of Playwright for now
import requests
from bs4 import BeautifulSoup

async def simple_webscraper(target_url, instructions, timeout=30):
    """Fetch ``target_url`` over plain HTTP and structure it with the LLM.

    Browser-free alternative to the Playwright scraper: the page is downloaded
    with ``requests`` and the HTML is passed to ``process_with_llm``.

    Args:
        target_url: Page to download.
        instructions: Extraction instructions forwarded to ``process_with_llm``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        ``(result, screenshot)``. ``screenshot`` is always ``None`` because no
        browser is involved; ``result`` is ``None`` when fetching or parsing
        failed. Errors are printed with a traceback, not raised.
    """
    result = None
    screenshot = None
    try:
        print("Extracting HTML Content with requests...")

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        # Without a timeout a stalled server would hang the cell indefinitely.
        response = requests.get(target_url, headers=headers, timeout=timeout)
        # Fail fast on 4xx/5xx instead of sending an error page to the LLM.
        response.raise_for_status()
        html_content = response.text

        print(f"HTML extracted, length: {len(html_content)}")
        print("Processing with LLM...")

        result = await process_with_llm(html_content, instructions, False)
        if result:
            print("\nGenerated Structured Response")
        else:
            print("\nFailed to generate structured response")
    except Exception as e:
        print(f"Error: {str(e)}")
        import traceback
        traceback.print_exc()

    return result, screenshot

# Test the simple scraper
result, screenshot = await simple_webscraper(target_url, instructions)
if result:
    print(f"\nFound {len(result.courses)} courses!")
    for i, course in enumerate(result.courses[:3]):  # Show first 3 courses
        print(f"\n{i+1}. {course.title}")
        print(f"   Presenter: {course.presenter}")
        print(f"   URL: {course.courseURL}")
else:
    print("No courses found or parsing failed")

Extracting HTML Content with requests...
HTML extracted, length: 106065
Processing with LLM...

Generated Structured Response

Found 7 courses!

1. AI Python for Beginners
   Presenter: ['DeepLearning.AI']
   URL: /courses/ai-python-for-beginners

2. ChatGPT Prompt Engineering for Developers
   Presenter: ['OpenAI']
   URL: /courses/chatgpt-prompt-engineering-for-developers

3. Generative AI for Everyone
   Presenter: ['DeepLearning.AI']
   URL: /courses/generative-ai-for-everyone


In [None]:
# Pass the latest `result` / `screenshot` pair to visualizeCourses, which is
# imported from the local `helper` module (its implementation is not shown here).
await visualizeCourses(result=result, 
                       screenshot=screenshot, 
                       target_url=target_url, 
                       instructions=instructions, 
                       base_url=base_url)

work


### Scraped Course Data:

title,description,presenter,imageUrl,courseURL
AI Python for Beginners,"Learn Python programming with AI assistance. Gain skills writing, testing, and debugging code efficiently, and create real-world AI applications.",DeepLearning.AI,,AI Python for Beginners
ChatGPT Prompt Engineering for Developers,"Learn the fundamentals of prompt engineering for ChatGPT. Learn effective prompting, and how to use LLMs for summarizing, inferring, transforming, and expanding.",OpenAI,,ChatGPT Prompt Engineering for Developers
Generative AI for Everyone,"Learn how to use generative AI's capabilities & limitations. Get an overview of real-world examples, and impact on business & society for effective strategies.",DeepLearning.AI,,Generative AI for Everyone
Machine Learning Specialization,"Learn foundational AI concepts through an intuitive visual approach, then learn the code needed to implement the algorithms and math for ML.",Stanford Online,,Machine Learning Specialization
Multi AI Agent Systems with crewAI,Automate business workflows with multi-AI agent systems. Exceed the performance of prompting a single LLM by designing and prompting a team of AI agents through natural language.,crewAI,,Multi AI Agent Systems with crewAI
LangChain for LLM Application Development,"Use the powerful and extensible LangChain framework, using prompts, parsing, memory, chains, question answering, and agents.",LangChain,,LangChain for LLM Application Development
AI for Everyone,"Learn about AI technologies and how to use them. Examine AI's societal impact, and learn to navigate this technological shift.",DeepLearning.AI,,AI for Everyone


### No Screenshot Available

Screenshot was not captured (using simple HTTP scraper instead of browser).

In [30]:
subject = "Retrieval Augmented Generation (RAG) "

instructions = f"""
Read the description of the courses and only 
provide the three courses that are about {subject}. 
Make sure that we don't have any other
cources in the output
"""
result, screenshot = await simple_webscraper(target_url, instructions)

Extracting HTML Content with requests...
HTML extracted, length: 106151
Processing with LLM...
HTML extracted, length: 106151
Processing with LLM...

Generated Structured Response

Generated Structured Response


In [32]:
# Display the RAG-specific results
if result:
    print(f"Found {len(result.courses)} RAG-related courses:")
    for i, course in enumerate(result.courses):
        print(f"\n{i+1}. {course.title}")
        print(f"   Description: {course.description}")
        print(f"   Presenter: {course.presenter}")
        print(f"   URL: {course.courseURL}")
        print(f"   Image URL: {course.imageUrl or 'Not available'}")
    
    # Visualize the RAG courses
    await visualizeCourses(result=result, 
                           screenshot=screenshot, 
                           target_url=target_url, 
                           instructions=f"RAG courses: {instructions}", 
                           base_url=base_url)
else:
    print("No RAG courses found")

Found 3 RAG-related courses:

1. Retrieval Augmented Generation (RAG)
   Description: Gain fundamental understanding and the practical knowledge to develop production-ready RAG applications, from architecture to deployment and evaluation.
   Presenter: ['Zain Hasan']
   URL: /courses/retrieval-augmented-generation-rag
   Image URL: Not available

2. ACP: Agent Communication Protocol
   Description: Build agents that communicate and collaborate across different frameworks using ACP.
   Presenter: ['Sandi Besen', 'Nicholas Renotte']
   URL: /short-courses/acp-agent-communication-protocol
   Image URL: Not available

3. Orchestrating Workflows for GenAI Applications
   Description: Turn your GenAI prototype into an automated pipeline using Apache Airflow
   Presenter: ['Kenten Danas', 'Tamara Fingerlin']
   URL: /short-courses/orchestrating-workflows-for-genai-applications
   Image URL: Not available


### Scraped Course Data:

title,description,presenter,imageUrl,courseURL
Retrieval Augmented Generation (RAG),"Gain fundamental understanding and the practical knowledge to develop production-ready RAG applications, from architecture to deployment and evaluation.",Zain Hasan,,Retrieval Augmented Generation (RAG)
ACP: Agent Communication Protocol,Build agents that communicate and collaborate across different frameworks using ACP.,"Sandi Besen, Nicholas Renotte",,ACP: Agent Communication Protocol
Orchestrating Workflows for GenAI Applications,Turn your GenAI prototype into an automated pipeline using Apache Airflow,"Kenten Danas, Tamara Fingerlin",,Orchestrating Workflows for GenAI Applications


### No Screenshot Available

Screenshot was not captured (using simple HTTP scraper instead of browser).

In [36]:
# Topic interpolated into the prompt below.
subject = "Retrieval Augmented Generation (RAG) "
# Free-form request for a summary of the top course on `subject`; it is passed
# on as the `instructions` argument of simple_webscraper.
instructions = f"""
Can you get the summary of the top course on
{subject} provide the learnings from it
"""
# NOTE(review): process_with_llm always asks for the fixed course-list JSON
# format, so a summary may not fit the result — confirm the output matches
# this request.
result, screenshot = await simple_webscraper(target_url, instructions)

Extracting HTML Content with requests...
HTML extracted, length: 106151
Processing with LLM...
HTML extracted, length: 106151
Processing with LLM...

Generated Structured Response

Generated Structured Response


In [37]:
# Pass the `result` / `screenshot` pair from the previous scrape to
# visualizeCourses (imported from the local `helper` module).
await visualizeCourses(result=result,
                       screenshot=screenshot,
                       target_url=target_url,
                       instructions=instructions,
                       base_url=base_url)

### Scraped Course Data:

title,description,presenter,imageUrl,courseURL
AI Python for Beginners,"Learn Python programming with AI assistance. Gain skills writing, testing, and debugging code efficiently, and create real-world AI applications.",DeepLearning.AI,,AI Python for Beginners
ChatGPT Prompt Engineering for Developers,"Learn the fundamentals of prompt engineering for ChatGPT. Learn effective prompting, and how to use LLMs for summarizing, inferring, transforming, and expanding.",OpenAI,,ChatGPT Prompt Engineering for Developers
Generative AI for Everyone,"Learn how to use generative AI's capabilities & limitations. Get an overview of real-world examples, and impact on business & society for effective strategies.",DeepLearning.AI,,Generative AI for Everyone
Machine Learning Specialization,"Learn foundational AI concepts through an intuitive visual approach, then learn the code needed to implement the algorithms and math for ML.",Stanford Online,,Machine Learning Specialization
Multi AI Agent Systems with crewAI,Automate business workflows with multi-AI agent systems. Exceed the performance of prompting a single LLM by designing and prompting a team of AI agents through natural language.,crewAI,,Multi AI Agent Systems with crewAI
LangChain for LLM Application Development,"Use the powerful and extensible LangChain framework, using prompts, parsing, memory, chains, question answering, and agents.",LangChain,,LangChain for LLM Application Development
AI for Everyone,"Learn about AI technologies and how to use them. Examine AI's societal impact, and learn to navigate this technological shift.",DeepLearning.AI,,AI for Everyone


### No Screenshot Available

Screenshot was not captured (using simple HTTP scraper instead of browser).

In [38]:
# Display the course summary results
if result:
    print(f"Found {len(result.courses)} course(s) in the summary:")
    for i, course in enumerate(result.courses):
        print(f"\n{i+1}. {course.title}")
        print(f"   Description: {course.description}")
        print(f"   Presenter: {course.presenter or 'Not specified'}")
        print(f"   URL: {course.courseURL}")
        print(f"   Image URL: {course.imageUrl or 'Not available'}")
        print("-" * 80)
else:
    print("No course summary found")

Found 7 course(s) in the summary:

1. AI Python for Beginners
   Description: Learn Python programming with AI assistance. Gain skills writing, testing, and debugging code efficiently, and create real-world AI applications.
   Presenter: ['DeepLearning.AI']
   URL: /courses/ai-python-for-beginners
   Image URL: https://home-wordpress.deeplearning.ai/wp-content/uploads/2024/08/Most-Popular-section-card-2.png
--------------------------------------------------------------------------------

2. ChatGPT Prompt Engineering for Developers
   Description: Learn the fundamentals of prompt engineering for ChatGPT. Learn effective prompting, and how to use LLMs for summarizing, inferring, transforming, and expanding.
   Presenter: ['OpenAI']
   URL: /courses/chatgpt-prompt-engineering-for-developers
   Image URL: https://home-wordpress.deeplearning.ai/wp-content/uploads/2023/04/chatgpt-prompt-for-engineering.png
--------------------------------------------------------------------------------

3. 