In [1]:
import asyncio
import json
import os
import nest_asyncio
import pprint 
import base64
from io import BytesIO
from playwright.async_api import async_playwright
import google.generativeai as genai
#import tubulate
from PIL import Image
from IPython.display import display, HTML,Markdown
from pydantic import BaseModel
from helper import get_openai_api_key,visualizeCourses,get_openai_client

  from .autonotebook import tqdm as notebook_tqdm


work


In [17]:
client = get_openai_client()
nest_asyncio.apply()

response = client.generate_content("Explain AI in simple terms.")
print(response.text)

Imagine you're teaching a dog a trick.  You show it what to do, reward it when it gets it right, and correct it when it's wrong.  Eventually, the dog learns the trick.

AI is similar.  We "teach" computers to do things by showing them lots of examples and giving them feedback.  Instead of treats, we use data.  The computer learns patterns and rules from that data, and then uses those patterns to make decisions or predictions on its own, like recognizing faces in a photo or translating languages.

It's not actually "thinking" like a human, but it can appear that way because it can solve problems and learn from experience.  Think of it as a really smart calculator that can learn and adapt.



WebScraper Agent

In [18]:
class WebScraperAgent:
    def __init__(self):
        self.playwright = None
        self.browser = None
        self.page = None
        # Apply nest_asyncio to handle async in Jupyter
        nest_asyncio.apply()

    async def init_browser(self):
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=True,
            args=[
                "--disable-dev-shm-usage",
                "--no-sandbox",
                "--disable-setuid-sandbox",
                "--disable-accelerated-2d-canvas",
                "--disable-gpu",
                "--no-zygote",
                "--disable-audio-output",
                "--disable-software-rasterizer",
                "--disable-webgl",
                "--disable-web-security",
                "--disable-features=LazyFrameLoading",
                "--disable-features=IsolateOrigins",
                "--disable-background-networking"

            ]
        )
        self.page = await self.browser.new_page()

    async def scrape_content(self, url):
        if not self.page or self.page.is_closed():
            await self.init_browser()
        await self.page.goto(url, wait_until="load")
        await self.page.wait_for_timeout(2000)
        return await self.page.content()
    
    async def take_screenshot(self, path="screenshot.png"):
        await self.page.screenshot(path=path, full_page=True)
        return path
    
    async def screenshot_buffer(self):
        screenshot_bytes = await self.page.screenshot(type="png", full_page=False)
        return screenshot_bytes
    
    async def close(self):
        try:
            if self.browser:
                await self.browser.close()
        except Exception as e:
            print(f"Warning: Error closing browser: {e}")
        
        try:
            if self.playwright:
                await self.playwright.stop()
        except Exception as e:
            print(f"Warning: Error stopping playwright: {e}")
        
        self.playwright = None
        self.browser = None
        self.page = None

In [19]:
scraper = WebScraperAgent()

Structured data format

In [5]:
class DeeplearningCourse(BaseModel):
    title: str
    description: str
    presenter: list[str]
    imageUrl:str
    courseURL: str

class DeeplearningCourseList(BaseModel):
    courses: list[DeeplearningCourse]

LLM Client for Gemini Ai

In [13]:
async def process_with_llm(html, instructions, truncate=False):
    prompt = f"""
    You are an expert web scraping agent. Your task is to:
    Extract relevant information from this HTML to JSON 
    following these instructions:
    {instructions}
    
    Extract the title, description, presenter, 
    the image URL and course URL for each of 
    all the courses for the deeplearning.ai website

    Return ONLY valid JSON in the following format:
    {{
        "courses": [
            {{
                "title": "Course Title",
                "description": "Course Description", 
                "presenter": ["Presenter Name"],
                "imageUrl": "Image URL",
                "courseURL": "Course URL"
            }}
        ]
    }}

    HTML Content:
    {html[:150000]}
    """
    
    response = client.generate_content(prompt)
    
    # Parse the JSON response
    import json
    try:
        json_text = response.text.strip()
        # Remove markdown code blocks if present
        if json_text.startswith('```json'):
            json_text = json_text[7:]
        if json_text.endswith('```'):
            json_text = json_text[:-3]
        json_text = json_text.strip()
        
        parsed_data = json.loads(json_text)
        return DeeplearningCourseList(**parsed_data)
    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
        print(f"Response text: {response.text[:500]}...")
        return None

In [14]:
async def webscraper(target_url, instructions):
    result = None
    screenshot = None
    try:
        print("Extracting HTML Content\n")
        html_content = await scraper.scrape_content(target_url)

        print("Taking Screenshot \n")
        screenshot = await scraper.screenshot_buffer()

        print("Processing..")
        result: DeeplearningCourseList = await process_with_llm(html_content, instructions, False)
        if result:
            print("\nGenerated Structured Response")
        else:
            print("\nFailed to generate structured response")
    except Exception as e:
        print(f"Error: {str(e)}")
    finally:
        await scraper.close()
    return result, screenshot

examples

In [8]:
target_url="https://www.deeplearning.ai/courses"
base_url="https://www.deeplearning.ai"

In [15]:
instructions="""Get all the courses"""
result,screenshot = await webscraper(target_url,instructions)

Extracting HTML Content

Error: 


In [22]:
# Fix for Windows asyncio issues with Playwright
import sys
import platform

if platform.system() == 'Windows':
    # Set the event loop policy for Windows
    if sys.version_info >= (3, 8):
        asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

# Alternative: Let's try using requests and BeautifulSoup instead of Playwright for now
import requests
from bs4 import BeautifulSoup

async def simple_webscraper(target_url, instructions):
    result = None
    screenshot = None
    try:
        print("Extracting HTML Content with requests...")
        
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        response = requests.get(target_url, headers=headers)
        html_content = response.text
        
        print(f"HTML extracted, length: {len(html_content)}")
        print("Processing with LLM...")
        
        result: DeeplearningCourseList = await process_with_llm(html_content, instructions, False)
        if result:
            print("\nGenerated Structured Response")
        else:
            print("\nFailed to generate structured response")
    except Exception as e:
        print(f"Error: {str(e)}")
        import traceback
        traceback.print_exc()
    
    return result, screenshot

# Test the simple scraper
result, screenshot = await simple_webscraper(target_url, instructions)
if result:
    print(f"\nFound {len(result.courses)} courses!")
    for i, course in enumerate(result.courses[:3]):  # Show first 3 courses
        print(f"\n{i+1}. {course.title}")
        print(f"   Presenter: {course.presenter}")
        print(f"   URL: {course.courseURL}")
else:
    print("No courses found or parsing failed")

Extracting HTML Content with requests...
HTML extracted, length: 106065
Processing with LLM...

Generated Structured Response

Found 7 courses!

1. AI Python for Beginners
   Presenter: ['DeepLearning.AI']
   URL: /courses/ai-python-for-beginners

2. ChatGPT Prompt Engineering for Developers
   Presenter: ['OpenAI']
   URL: /courses/chatgpt-prompt-engineering-for-developers

3. Generative AI for Everyone
   Presenter: ['DeepLearning.AI']
   URL: /courses/generative-ai-for-everyone
