## Web Scraper
Here lies the core component of the project!

In [1]:
import random
import asyncio
import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, Browser, Page
from get_credentials import get_account
import sqlite3
from datetime import datetime

In [None]:
# function to launch a browser and create a new browser context with specified settings
async def create_context(p, proxy=None, *, headless=False):
    """Launch a Chromium browser and open a browser context in it.

    Args:
        p: A started Playwright instance (the object yielded by
            ``async_playwright()``); only ``p.chromium.launch`` is used.
        proxy: Optional proxy address. A bare ``host:port`` value is sent as
            ``http://host:port``; a value that already contains a scheme
            (for example ``socks5://host:port``) is passed through unchanged.
            A falsy value means no proxy.
        headless: Launch without a visible window when true. Defaults to
            ``False`` so the browser can be watched while it works.

    Returns:
        A ``(browser, context)`` tuple. The caller owns the browser and is
        responsible for closing it.

    Raises:
        Any error raised while launching the browser or creating the context.
        If context creation fails, the freshly launched browser is closed
        before the error is re-raised.
    """
    # Build the proxy settings only when a proxy was supplied; do not prepend
    # "http://" to an address that already names its own scheme.
    proxy_settings = None
    if proxy:
        server = str(proxy)
        if "://" not in server:
            server = f"http://{server}"
        proxy_settings = {"server": server}

    # Browser is only a type hint for the launched browser object.
    browser: Browser = await p.chromium.launch(
        proxy=proxy_settings,
        headless=headless,
        # slow every browser action down by a random 100-300 ms
        slow_mo=random.randint(100, 300),
    )

    try:
        context = await browser.new_context(
            # fixed 1280x720 viewport
            viewport={"width": 1280, "height": 720},
            # user agent string that mimics a desktop Chrome browser
            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
            locale="en-US",
            # optional: extra HTTP headers for more realistic requests (none used here)
            extra_http_headers={},
        )
    except BaseException:
        # Without this the launched browser would be left running, because
        # the caller never receives a handle it could close.
        await browser.close()
        raise

    return browser, context

In [None]:
# our main scraping function
async def scrape_page(p, url, username, password, proxy, account_id=None, status=None):
    """Log in at *url*, pick one random quote, and record the outcome in SQLite.

    A row is always written to the ``quotes`` table: the scraped quote on
    success, or ``"N/A"`` placeholders with status ``"error"`` when the
    scrape fails. The browser is closed before returning, whatever happens.

    Args:
        p: A started Playwright instance, forwarded to ``create_context``.
        url: Address of the login page.
        username: Value typed into the ``username`` form field.
        password: Value typed into the ``password`` form field (converted
            with ``str()`` first).
        proxy: Proxy address forwarded to ``create_context``; may be ``None``.
        account_id: Optional id stored with the quote row. When given, the
            matching ``accounts`` row gets its ``last_used`` refreshed.
        status: Status stored with a successful scrape. Defaults to
            ``"active"`` when omitted.

    Returns:
        None.

    Raises:
        Errors from creating the browser/page or from the database writes
        propagate to the caller. Errors raised while scraping are caught,
        printed, and recorded as an ``"error"`` row instead.
    """
    # get our browser and context from our previously defined function
    browser, context = await create_context(p, proxy)

    try:
        # Page is only a type hint for the page object created in our context
        page: Page = await context.new_page()

        # print statement to show which proxy is being used
        print(f"Scraping with Proxy: {proxy}")

        # Start from the failure values so the row written in ``finally`` is
        # well-defined even when the scrape is interrupted by something that
        # is not an ``Exception`` (for example task cancellation).
        text, author = "N/A", "N/A"
        outcome = "error"
        try:
            text, author = await _fetch_random_quote(page, url, username, password)
            # honor a caller-supplied status; "active" is only the default
            outcome = "active" if status is None else status
        except Exception as e:
            # best effort: report the failure and still record an error row
            print(f"[!] Error: {e}")
        finally:
            _record_scrape(text, author, outcome, account_id)
    finally:
        # always release the browser, even when the database write fails
        await browser.close()


async def _fetch_random_quote(page, url, username, password):
    """Log in through *page* and return ``(text, author)`` of one random quote."""
    # Step 1: navigate to the login page and wait for it to load
    await page.goto(url, wait_until="domcontentloaded", timeout=60000)

    # Step 2: fill in the login form and submit it
    await page.fill('input[name="username"]', username)
    await page.fill('input[name="password"]', str(password))
    await page.click('input[type="submit"]')

    # wait for div.quote to appear
    await page.wait_for_selector("div.quote")

    # Step 3: parse all quote blocks and pick one at random
    soup = BeautifulSoup(await page.content(), 'lxml')
    chosen = random.choice(soup.select("div.quote"))

    text = chosen.select_one("span.text").get_text(strip=True)
    author = chosen.select_one("small.author").get_text(strip=True)
    return text, author


def _record_scrape(text, author, status, account_id):
    """Insert one ``quotes`` row and refresh the account's ``last_used``."""
    now = datetime.now()
    conn = sqlite3.connect("../database/scraper.db")
    try:
        conn.execute("PRAGMA foreign_keys = ON")

        # ``isoformat(" ")`` is the text sqlite3's default datetime adapter
        # produced; that adapter is deprecated since Python 3.12, so the
        # timestamp is formatted explicitly here.
        conn.execute("""
            INSERT INTO quotes (quote, author, scraped_at, status, account_id)
            VALUES (?, ?, ?, ?, ?)
        """, (text, author, now.isoformat(" "), status, account_id))

        # only touch the accounts table when an account id was provided
        if account_id is not None:
            conn.execute("""
                UPDATE accounts
                SET last_used = ?
                WHERE id = ?
            """, (now.isoformat(), account_id))

        conn.commit()
    finally:
        conn.close()

In [None]:
# our final function to run the scraper with our inserted information
async def run_scraper(id):
    """Scrape one quote using the stored account identified by *id*.

    Starts Playwright, looks the account up with ``get_account`` and hands
    its ``username``, ``encrypted_password``, ``proxy`` and ``id`` fields to
    ``scrape_page`` together with the login URL.

    Args:
        id: Account identifier passed to ``get_account``. The name shadows
            the builtin ``id()``; it is kept because it is part of the
            function's signature.

    Returns:
        None.
    """
    # the page we log in to and scrape
    login_url = "https://quotes.toscrape.com/login"

    # the Playwright instance created here is the ``p`` argument the other
    # functions expect
    async with async_playwright() as playwright:
        # fetch the account; presumably a dict-like record -- only .get() is used
        record = get_account(id)

        login_name = record.get("username")
        login_secret = record.get("encrypted_password")
        proxy_address = record.get("proxy")
        record_id = record.get("id")

        await scrape_page(
            p=playwright,
            url=login_url,
            username=login_name,
            password=login_secret,
            proxy=proxy_address,
            account_id=record_id,
        )

In [None]:
# Run our scraper for the account with ID 4.
# NOTE: a bare top-level `await` is only valid where the environment supports
# it (such as a notebook cell); in a plain script this call would need to be
# wrapped, e.g. asyncio.run(run_scraper(4)).
await run_scraper(4) 

Scraping with Proxy: 156.242.45.155:3129
