# Install dependencies

In [None]:
!pip install python-dotenv
!pip install pytest-playwright
!playwright install
!pip install nest_asyncio

# Fetch the page HTML and strip scripts, styles, meta/link tags and comments (works only for SSR pages).

In [6]:
from bs4 import BeautifulSoup, Comment
import requests

# Product page to fetch; swap in one of the commented alternatives to try another store.
url = "https://2717recovery.com/products/recovery-cream"
# url = "https://bhumi.com.au/products/sateen-sheet-set-stone?variant=46357555839133"
# url = "https://lyfefuel.com/products/essentials-nutrition-shake"

# Seconds to wait for the server. Without a timeout, requests waits indefinitely
# and a stalled server would hang this cell.
REQUEST_TIMEOUT_SECONDS = 30

try:
    # A timeout raises requests.exceptions.Timeout, which is a RequestException
    # and is therefore reported by the handler below.
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')

        # Drop tags that carry no visible page content to shrink the prompt.
        for tag in soup(['script', 'style', 'meta', 'link']):
            tag.decompose()

        # remove all comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Cleaned markup; the LLM cell below appends this to its prompt.
        cleaned_html = str(soup)

        # print(cleaned_html)
    else:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")


# Use the Gemini Flash model to determine the class name of the reviews' next-page button.

In [8]:
import requests
import json
from dotenv import load_dotenv
import os

# Read API_KEY (sent below as the OpenRouter bearer token) from .env / the environment.
load_dotenv()
api_key = os.getenv('API_KEY')
if not api_key:
    # Fail early with a clear message instead of sending "Bearer None".
    raise RuntimeError("API_KEY is not set; add it to your .env file or environment.")

# Alternative prompt (extracts the reviews themselves rather than the pagination class):
# "extract the user reviews from this codebase. Return in a json format with review text, review author and stars, if not present then return empty json like {} and nothing else. Also don't add any code formatting in output. Here is the code: " + cleaned_html
prompt = (
    "extract the class name of pagination next page button of review section from this codebase. "
    "Just return a single word with the classname, if not present then return 'null' and nothing else. "
    "Here is the code: " + cleaned_html
)

# Seconds to wait for the completion before giving up.
LLM_TIMEOUT_SECONDS = 120

try:
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        },
        data=json.dumps({
            "model": "google/gemini-2.0-flash-exp:free",
            "messages": [
                {
                    # The instruction is the user's turn; the model replies as "assistant".
                    "role": "user",
                    "content": prompt
                }
            ]
        }),
        timeout=LLM_TIMEOUT_SECONDS
    )
except requests.exceptions.RequestException as e:
    # Same reporting style as the page-fetch cell.
    print(f"An error occurred: {e}")
else:
    if response.status_code == 200:
        data = response.json()

        # print(data)
        # Guard against a 200 response that carries no completion.
        choices = data.get('choices') or []
        if choices:
            message_content = choices[0]['message']['content']
            print(message_content)
        else:
            print("Error: response contained no choices:", data)
    else:
        print("Error:", response.status_code, response.text)


jdgm-paginate__next-page



# An attempt to filter the HTML of client-side-rendered (CSR) pages. Note: this attempt failed (unknown bug) — the filter itself works as expected, but for some reason the result does not give the desired output when used with the LLM.

In [10]:
import asyncio
import nest_asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup, Comment

# Apply nest_asyncio to allow nested event loops
nest_asyncio.apply()

# URL of the webpage to scrape
# url = "https://bhumi.com.au/products/sateen-sheet-set-stone?variant=46357555839133"
url = "https://2717recovery.com/products/recovery-cream"

# Reset first so a failed scrape cannot leave HTML from an earlier cell behind.
cleaned_html = ""
rendered_html = ""


async def scrape_page(url):
    """Render ``url`` in headless Chromium and return its raw and cleaned HTML.

    Args:
        url: Address of the page to load.

    Returns:
        A ``(rendered_html, cleaned_html)`` tuple: the page markup as reported
        by the browser, and the same markup with script/style/meta/link tags
        and HTML comments removed.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)
            page_html = await page.content()
        finally:
            # Close the browser even if navigation fails.
            await browser.close()

    soup = BeautifulSoup(page_html, 'html.parser')

    for tag in soup(['script', 'style', 'meta', 'link']):
        tag.decompose()

    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # print(str(soup))

    # Return the results: assigning to names inside this function would only
    # create locals and leave the notebook-level variables as empty strings.
    return page_html, str(soup)


rendered_html, cleaned_html = asyncio.run(scrape_page(url))
