In [2]:
import os
from firecrawl import FirecrawlApp
from dotenv import load_dotenv
import pandas as pd
from typing import Dict,Any
from pydantic import BaseModel, Field

In [25]:
class WebsiteScraper:
    """Wrapper around the Firecrawl ``extract`` call with optional dynamic schemas.

    Field specs are plain dicts of the form ``{"name": <str>, "type": <str>}``,
    where ``type`` is one of the keys of ``_TYPE_MAPPING``.
    """

    # Type names accepted in a field spec, mapped to the Python type used as
    # the annotation on the generated pydantic model.
    _TYPE_MAPPING = {
        "str": str,
        "bool": bool,
        "int": int,
        "float": float,
    }

    def __init__(self):
        """Load variables from a ``.env`` file and create the Firecrawl client."""
        load_dotenv()
        self.firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
        self.app = FirecrawlApp(api_key=self.firecrawl_api_key)
        # Placeholder spec: a single unnamed field, which the schema builders skip.
        self.schema_fields = [{"name": "", "type": "str"}]

    def create_dynamic_model(self, fields):
        """Build a pydantic model class named ``ExtractSchema`` from field specs.

        Args:
            fields: Iterable of dicts with ``"name"`` and ``"type"`` keys.
                Entries whose name is empty are ignored.

        Returns:
            A new ``BaseModel`` subclass whose annotations mirror the named fields.

        Raises:
            KeyError: If a named field uses a type name not in ``_TYPE_MAPPING``,
                or a spec lacks the ``"name"``/``"type"`` key.
        """
        field_annotations = {
            field["name"]: self._TYPE_MAPPING[field["type"]]
            for field in fields
            if field["name"]
        }
        return type(
            "ExtractSchema",
            (BaseModel,),
            {"__annotations__": field_annotations},
        )

    def create_schema_from_fields(self, fields):
        """Return the JSON schema for ``fields``, or ``None`` if no field is named."""
        if not any(field["name"] for field in fields):
            return None

        model_class = self.create_dynamic_model(fields)
        return model_class.model_json_schema()

    def convert_to_table(self, data: Dict[str, Any]) -> str:
        """Render ``data["data"]`` as a plain-text table without the index column.

        The payload is wrapped in a single-element list before being handed to
        pandas, so a dict payload becomes one row with one column per key.

        Returns:
            The table as a string, or ``""`` when ``data`` is empty or has no
            ``"data"`` key.
        """
        if not data or "data" not in data:
            return ""

        df = pd.DataFrame([data["data"]])
        return df.to_string(index=False)

    def scrape_web(self, website_url: str, prompt: str, schema_fields=None):
        """Run a Firecrawl extraction against a single URL.

        Args:
            website_url: Page to extract from; must be non-empty.
            prompt: Natural-language extraction instruction. Forwarded to
                Firecrawl when non-empty.
            schema_fields: Optional field specs (see ``create_dynamic_model``).
                When they yield a schema, it is forwarded as ``schema``.

        Returns:
            Whatever ``self.app.extract`` returns.

        Raises:
            ValueError: If ``website_url`` is empty.
            Exception: Wrapping any error raised while building the schema or
                calling Firecrawl; the original error is chained as ``__cause__``.
        """
        if not website_url:
            raise ValueError("Please Provide a Website URL")

        try:
            extract_kwargs = {}
            # Previously the prompt was accepted but never sent to Firecrawl.
            if prompt:
                extract_kwargs["prompt"] = prompt

            schema = self.create_schema_from_fields(schema_fields) if schema_fields else None
            if schema:
                extract_kwargs["schema"] = schema

            return self.app.extract([website_url], **extract_kwargs)

        except Exception as e:
            # Chain the cause so the original traceback is preserved.
            raise Exception(f"An error occured: {str(e)}") from e

In [26]:
# Instantiate the scraper, then describe what to fetch and which fields to extract.
scraper = WebsiteScraper()
website_url = "https://www.anthropic.com/engineering/building-effective-agents"
prompt = "extract publish data, title and link of all article related to LLMs"

# Every requested field is string-typed, so build the specs from the names alone.
schema_fields = [
    {"name": field_name, "type": "str"}
    for field_name in ("Article_title", "Publish_data", "Article_link")
]

result = scraper.scrape_web(website_url, prompt, schema_fields)
# A newline separator reproduces the two separate print calls exactly.
print("Results: \n", result, sep="\n")

Results: 



In [28]:
# Hand-written pydantic model: one string field and three boolean fields, none
# with a default, so all four appear under "required" in the schema shown in
# the next cell.
# NOTE: avoid adding a class docstring here — pydantic would copy it into the
# generated schema as "description", changing that output.
# NOTE: the name ExtractSchema is rebound by a later cell in this notebook.
class ExtractSchema(BaseModel):
    mission: str
    supports_sso: bool
    is_open_source: bool
    is_in_yc: bool

In [29]:
# Display the JSON schema pydantic derives from the model above; it is the
# cell's last expression, so the notebook renders the returned dict.
ExtractSchema.model_json_schema()

{'properties': {'mission': {'title': 'Mission', 'type': 'string'},
  'supports_sso': {'title': 'Supports Sso', 'type': 'boolean'},
  'is_open_source': {'title': 'Is Open Source', 'type': 'boolean'},
  'is_in_yc': {'title': 'Is In Yc', 'type': 'boolean'}},
 'required': ['mission', 'supports_sso', 'is_open_source', 'is_in_yc'],
 'title': 'ExtractSchema',
 'type': 'object'}

In [30]:
# Build the JSON schema from the field specs defined earlier, for comparison
# with the schema of the hand-written model: same top-level keys
# (properties / required / title / type).
scraper.create_schema_from_fields(schema_fields)

{'properties': {'Article_title': {'title': 'Article Title', 'type': 'string'},
  'Publish_data': {'title': 'Publish Data', 'type': 'string'},
  'Article_link': {'title': 'Article Link', 'type': 'string'}},
 'required': ['Article_title', 'Publish_data', 'Article_link'],
 'title': 'ExtractSchema',
 'type': 'object'}

In [32]:
# Direct Firecrawl call that bypasses WebsiteScraper and uses a hand-written schema.
# Load .env explicitly so this cell does not depend on WebsiteScraper() having
# been instantiated earlier; by default load_dotenv leaves already-set
# environment variables untouched.
load_dotenv()
app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))


# NOTE: this rebinds the name ExtractSchema defined in an earlier cell.
class ExtractSchema(BaseModel):
    article_title: str
    publish_data: str
    article_link: str


data = app.extract(
    urls=["https://www.anthropic.com/engineering/building-effective-agents"],
    schema=ExtractSchema.model_json_schema()
)

print(data)

