In [None]:
# -*- coding: utf-8 -*-
"""
Crawl4AI - Advanced Web Crawling and Data Extraction

GitHub Repository: https://github.com/unclecode/crawl4ai
"""

In [None]:
# Install necessary dependencies
!sudo apt-get update && sudo apt-get install -y libwoff1 libopus0 libwebp6 libwebpdemux2 libenchant1c2a libgudev-1.0-0 libsecret-1-0 libhyphen0 libgdk-pixbuf2.0-0 libegl1 libnotify4 libxslt1.1 libevent-2.1-7 libgles2 libvpx6 libxcomposite1 libatk1.0-0 libatk-bridge2.0-0 libepoxy0 libgtk-3-0 libharfbuzz-icu0
!pip install crawl4ai
!pip install nest-asyncio
!playwright install

In [None]:
# Import necessary libraries
import asyncio
import getpass
import json
import os

import nest_asyncio
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
from pydantic import BaseModel, Field

In [None]:

# Apply nested asyncio for compatibility in certain environments (e.g., Jupyter/Colab)
# Those environments already have an event loop running; this call patches asyncio
# so that loop can be re-entered by the crawl code in the cells that follow.
nest_asyncio.apply()

In [None]:
# Set your OpenAI API key
# Never hardcode a key in the notebook: reuse OPENAI_API_KEY when it is already
# present in the environment, otherwise prompt for it without echoing the input.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('Enter your OpenAI API key: ')


In [None]:
# Define the schema for review extraction
# Pydantic model describing a single user review. Its JSON schema is passed as
# `schema=` to LLMExtractionStrategy in extract_user_reviews_with_pagination().
# All four fields are required (`...` as the Field default).
# NOTE(review): the extraction instruction asks the LLM for a wrapper object with
# "reviews_count" and a "reviews" list, while this schema describes one review --
# confirm which shape is intended.
class UserReviewSchema(BaseModel):
    # Headline of the review.
    title: str = Field(..., description="Title of the review")
    # Free-text content of the review.
    body: str = Field(..., description="Body text of the review")
    # Integer score; no range is enforced here.
    rating: int = Field(..., description="Rating given in the review")
    # Display name of the person who wrote the review.
    reviewer: str = Field(..., description="Name of the reviewer")


In [None]:
# Function to extract user reviews with pagination handling
async def extract_user_reviews_with_pagination(
    url='https://example.com/product-page',
    pagination_selector="button.next-page",
):
    """Crawl a product page, extract user reviews with an LLM, and print them as JSON.

    Args:
        url: Page to crawl. The default is a placeholder; replace it with your
            target product URL.
        pagination_selector: CSS selector for the 'Next Page' button, forwarded
            unchanged to ``crawler.arun``.

    Returns:
        None. The extracted reviews (or a short failure message) are printed.
    """
    # `.schema()` is deprecated in pydantic v2; prefer the v2 name when it exists
    # and fall back to the v1 name otherwise.
    if hasattr(UserReviewSchema, "model_json_schema"):
        review_schema = UserReviewSchema.model_json_schema()
    else:
        review_schema = UserReviewSchema.schema()

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url=url,
            word_count_threshold=1,
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-4o-mini-2024-07-18",
                api_token=os.getenv('OPENAI_API_KEY'),
                schema=review_schema,
                extraction_type="schema",
                instruction="""
                From the crawled content, extract user reviews in the following format:
                {
                    "reviews_count": 100,
                    "reviews": [
                        {
                            "title": "Review Title",
                            "body": "Review body text",
                            "rating": 5,
                            "reviewer": "Reviewer Name"
                        }
                    ]
                }
                Ensure the response includes all reviews across pagination.
                """
            ),
            bypass_cache=True,  # Ignore any cached data for fresh crawling
            # NOTE(review): `pagination` / `pagination_selector` are passed through
            # as-is; confirm the installed crawl4ai version actually acts on them.
            pagination=True,  # Enable pagination handling
            pagination_selector=pagination_selector,
        )

        # Report a failed crawl instead of trying to format missing content.
        if not getattr(result, "success", True):
            print(f"Crawl failed: {getattr(result, 'error_message', None)}")
            return

        raw_content = result.extracted_content
        if raw_content is None:
            print("No content was extracted.")
            return

        # The extracted content arrives as a JSON *string*; dumping it directly
        # would print one escaped string literal rather than indented JSON, so
        # decode it first.
        if isinstance(raw_content, (str, bytes, bytearray)):
            try:
                reviews = json.loads(raw_content)
            except json.JSONDecodeError:
                # Not valid JSON: show the raw text rather than raising.
                print(raw_content)
                return
        else:
            reviews = raw_content

        # Print the extracted reviews in JSON format
        print(json.dumps(reviews, indent=4))

# Run the function
# Top-level `await` works here because the notebook cell runs inside
# IPython/Jupyter; a plain Python script would have to drive the coroutine
# through an event loop instead.
await extract_user_reviews_with_pagination()