In [1]:
import asyncio
import json
import os
import nest_asyncio
import pprint 
import base64
from io import BytesIO
from playwright.async_api import async_playwright
import google.generativeai as genai
#import tabulate
from PIL import Image
from IPython.display import display, HTML,Markdown
from pydantic import BaseModel
from helper import get_openai_api_key,visualizeCourses,get_openai_client

  from .autonotebook import tqdm as notebook_tqdm


work


In [2]:
# Build the shared LLM client through the project helper; later cells read this
# module-level `client` (see `process_with_llm`).
client = get_openai_client()
# nest_asyncio patches asyncio so coroutines can be run from inside the
# notebook's already-running event loop.
nest_asyncio.apply()

# Smoke test: send one prompt and print the text of the reply.
# NOTE(review): this cell calls `client.generate_content(...)`, whereas
# `process_with_llm` calls `client.beta.chat.completions.parse(...)` on the same
# object. Those are two different client interfaces — confirm what
# `get_openai_client()` actually returns and align one of the two call sites.
response = client.generate_content("Explain AI in simple terms.")
print(response.text)

Imagine you have a really smart puppy.  You teach it tricks by showing it examples and rewarding good behavior.  AI is kind of like that, but instead of a puppy, it's a computer program.

We "teach" the computer program by feeding it lots of data (like showing the puppy lots of examples).  The program learns patterns and rules from this data, and then uses those patterns to do things like:

* **Understand speech:**  Like Siri or Alexa.
* **Recognize images:** Like identifying your face in a photo.
* **Translate languages:** Like Google Translate.
* **Play games:** Like chess or Go.
* **Recommend things:** Like movies or products.

The computer doesn't *understand* things like a human does, it just gets really good at following patterns and making predictions based on the data it's been given.  It's constantly learning and improving as it gets more data.



WebScraper Agent

In [3]:
class WebScraperAgent:
    """Small async wrapper around one headless Chromium page driven by Playwright.

    The browser is started lazily by `scrape_content` (or explicitly through
    `init_browser`) and released with `close`. After `close` the instance is
    back in its initial state and can be reused.
    """

    def __init__(self):
        # All three handles stay None until `init_browser` has run.
        self.playwright = None
        self.browser = None
        self.page = None

    async def init_browser(self):
        """Start Playwright, launch headless Chromium and open a new page."""
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(
            headless=True,
            # Command-line switches handed to Chromium at launch.
            args=[
                "--disable-dev-shm-usage",
                "--no-sandbox",
                "--disable-setuid-sandbox",
                "--disable-accelerated-2d-canvas",
                "--disable-gpu",
                "--no-zygote",
                "--disable-audio-output",
                "--disable-software-rasterizer",
                "--disable-webgl",
                "--disable-web-security",
                "--disable-features=LazyFrameLoading",
                "--disable-features=IsolateOrigins",
                "--disable-background-networking",
            ],
        )
        self.page = await self.browser.new_page()

    async def scrape_content(self, url):
        """Navigate to `url` and return the page's HTML as a string.

        Starts the browser first when there is no open page. After navigation
        reports the "load" state, waits a further fixed 2000 ms before reading
        the content.
        """
        if not self.page or self.page.is_closed():
            await self.init_browser()
        # The Playwright keyword is `wait_until`; the previous spelling
        # (`wait_untill`) made every call fail with a TypeError.
        await self.page.goto(url, wait_until="load")
        await self.page.wait_for_timeout(2000)
        return await self.page.content()

    async def take_screenshot(self, path="screenshot.png"):
        """Save a full-page screenshot of the current page to `path` and return `path`.

        Requires an open page (i.e. `scrape_content` or `init_browser` has run).
        """
        await self.page.screenshot(path=path, full_page=True)
        return path

    async def screenshot_buffer(self):
        """Return a PNG screenshot of the current viewport (not the full page) as bytes.

        Requires an open page (i.e. `scrape_content` or `init_browser` has run).
        """
        screenshot_bytes = await self.page.screenshot(type="png", full_page=False)
        return screenshot_bytes

    async def close(self):
        """Close the browser, stop Playwright and reset the instance.

        Safe to call when the browser was never started or was already closed.
        Errors raised while shutting down still propagate, but the handles are
        cleared regardless so a later `scrape_content` starts a fresh browser.
        """
        try:
            if self.browser is not None:
                await self.browser.close()
        finally:
            try:
                if self.playwright is not None:
                    await self.playwright.stop()
            finally:
                self.playwright = None
                self.browser = None
                self.page = None



In [4]:
# Module-level scraper instance; the `webscraper` coroutine below drives this
# object and closes it when it finishes.
scraper=WebScraperAgent()

Structured data format

In [5]:
# One course entry to be extracted from the scraped page.
# (Documented with comments rather than a docstring: pydantic copies a model's
# docstring into its generated JSON schema, and this model is passed to the LLM
# as part of `response_format`.)
class DeeplearningCourse(BaseModel):
    title: str  # course title
    description: str  # course description
    presenter: list[str]  # presenter(s) of the course, as a list of strings
    imageUrl:str  # image URL for the course
    courseURL: str  # URL of the course

# Top-level structured result: the list of extracted courses. `process_with_llm`
# passes this class as `response_format`.
# (Comment instead of a docstring so the generated JSON schema stays unchanged.)
class DeeplearningCourseList(BaseModel):
    courses: list[DeeplearningCourse]  # every course extracted from the page

LLM Client for Gemini AI

In [6]:
async def process_with_llm(html, instructions, truncate=False):
    """Ask the LLM to turn scraped HTML into a structured course list.

    Args:
        html: Page HTML. Only its first 150000 characters are sent.
        instructions: Extra extraction instructions, interpolated into the
            system prompt.
        truncate: Accepted for interface compatibility; this function does not
            read it (the HTML is always cut at 150000 characters).

    Returns:
        The `parsed` attribute of the first choice's message, as produced by
        `client.beta.chat.completions.parse` with
        `response_format=DeeplearningCourseList`.

    NOTE(review): the request is issued without `await`, so it presumably runs
    synchronously inside this coroutine — confirm the client is a sync client.
    """
    system_prompt = f"""
            You are an expert web scraping agent. Your task is to:
            Extract relevant information from this HTML to JSON 
            following these instructions:
            {instructions}
            
            Extract the title, description, presenter, 
            the image URL and course URL for each of 
            all the courses for the deeplearning.ai website

            Return ONLY valid JSON, no markdown or extra text."""

    # Cap the payload to stay under token limits.
    page_excerpt = html[:150000]

    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": page_excerpt},
    ]

    parsed_completion = client.beta.chat.completions.parse(
        model="gemini-1.5-flash",
        messages=chat_messages,
        temperature=0.1,
        response_format=DeeplearningCourseList,
    )
    first_choice = parsed_completion.choices[0]
    return first_choice.message.parsed


In [7]:
async def webscraper(target_url,instructions):
    """Scrape `target_url`, screenshot it, and extract structured data with the LLM.

    Uses the module-level `scraper` and always attempts to close it afterwards.
    Errors are printed rather than raised.

    Args:
        target_url: URL passed to `scraper.scrape_content`.
        instructions: Extraction instructions forwarded to `process_with_llm`.

    Returns:
        A `(result, screenshot)` tuple. `result` is whatever `process_with_llm`
        returned and `screenshot` is the value from `scraper.screenshot_buffer()`.
        Either element is None when its step was not reached because an
        earlier step failed.
    """
    # Bind both return values up front: previously `screenshot` was unbound when
    # scraping failed, so the final `return` raised UnboundLocalError after the
    # original error had already been caught and printed.
    result=None
    screenshot=None
    try:
        print("Extracting HTML Content\n")
        html_content = await scraper.scrape_content(target_url)

        print("Taking Screensot \n")
        screenshot = await scraper.screenshot_buffer()

        print("Processing..")
        result = await process_with_llm(html_content,instructions,False)
        print("\nGenerated Structured Response")
    except Exception as e:
        print(f"Error:{str(e)}")
    finally:
        # Cleanup is best effort: report a failed shutdown the same way as any
        # other error instead of letting it escape and discard the results.
        try:
            await scraper.close()
        except Exception as close_error:
            print(f"Error:{str(close_error)}")
    return result,screenshot