## Ollama setup
- https://github.com/ollama/ollama
- https://ollama.com/download

## ChromeDriver download
- https://googlechromelabs.github.io/chrome-for-testing/#stable

In [None]:
# Scraping part
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

def scrape(url, driver_path="./chromedriver.exe"):
    """Open ``url`` in a Chrome session and return the page source.

    Args:
        url: Address of the page to load.
        driver_path: Location of the chromedriver executable. Defaults to
            ``./chromedriver.exe`` (the value previously hard-coded here) so
            existing calls keep working; pass another path on systems where
            the driver lives elsewhere.

    Returns:
        The value of ``driver.page_source`` read right after ``driver.get``
        returns; no additional wait is performed here.
    """
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=Service(driver_path), options=options)

    try:
        driver.get(url)
        return driver.page_source
    finally:
        # Always shut the browser down, even when loading the page raised.
        driver.quit()



In [None]:
# Quick check: run the scraper against the demo shop URL and display the returned HTML.
scrape("https://scrapeme.live/shop/")

'<html lang="en"><head>\n\t<!-- Google Tag Manager -->\n<script type="text/javascript" async="" src="https://www.google-analytics.com/analytics.js" nonce=""></script><script type="text/javascript" async="" src="https://www.googletagmanager.com/gtag/js?id=G-Q9KY1T6XJQ&amp;l=dataLayer&amp;cx=c&amp;gtm=45He4c40v79311205za200" nonce=""></script><script async="" src="https://www.googletagmanager.com/gtm.js?id=GTM-NVFPDWB"></script><script nonce="">(function (w, d, s, l, i) {\n\t\tw[l] = w[l] || [];\n\t\tw[l].push({\n\t\t\t\'gtm.start\':\n\t\t\t\tnew Date().getTime(), event: \'gtm.js\'\n\t\t});\n\t\tvar f = d.getElementsByTagName(s)[0],\n\t\t\tj = d.createElement(s), dl = l != \'dataLayer\' ? \'&l=\' + l : \'\';\n\t\tj.async = true;\n\t\tj.src =\n\t\t\t\'https://www.googletagmanager.com/gtm.js?id=\' + i + dl;\n\t\tf.parentNode.insertBefore(j, f);\n\t})(window, document, \'script\', \'dataLayer\', \'GTM-NVFPDWB\');</script>\n<!-- End Google Tag Manager -->\n\t<title>Allinone | Web Scraper Tes

In [27]:
# Parsing and cleaning
from bs4 import BeautifulSoup

def parse(html):
    """Reduce an HTML document to newline-separated visible text.

    Args:
        html: HTML markup as a string.

    Returns:
        The document text with ``<script>`` and ``<style>`` elements removed,
        every line stripped of surrounding whitespace and blank lines dropped,
        or ``None`` when the document has no ``<body>``.
    """
    soup = BeautifulSoup(html, "html.parser")
    body = soup.body
    if not body:
        return None

    # Remove script and style elements so their contents do not end up in the text.
    for tag in soup(["script", "style"]):
        tag.extract()

    # Text is taken from the whole document, not only from <body>.
    raw_text = soup.get_text(separator="\n")

    # Strip each line and keep only the ones that still contain something.
    stripped_lines = (line.strip() for line in raw_text.splitlines())
    return "\n".join(line for line in stripped_lines if line)



In [None]:
# Quick check: scrape the demo shop page and display the cleaned text.
parse(scrape("https://scrapeme.live/shop/"))

'Allinone | Web Scraper Test Sites\nToggle navigation\nWeb Scraper\nCloud Scraper\nPricing\nLearn\nDocumentation\nVideo Tutorials\nHow to\nTest Sites\nForum\nInstall\nCloud Login\nTest Sites\nHome\nComputers\nPhones\nE-commerce training site\nWelcome to WebScraper e-commerce site. You can use this site for training\nto learn how to use the Web Scraper. Items listed here are not for sale.\nTop items being scraped right now\n$1294.74\nToshiba Porteg...\nToshiba Portege Z30-C-16K Grey, 13.3" FHD, Core i5-6200U, 8GB, 256GB SSD, 4G, Windows 10 Pro\n6 reviews\n$899\nAsus ROG STRIX...\nAsus ROG STRIX GL553VD-DM256, 15.6" FHD, Core i5-7300HQ, 8GB, 1TB, GeForce GTX 1050 2GB, No OS + Windows 10 Home\n7 reviews\n$520.99\nHP 250 G3\n15.6", Core\xa0i5-4210U, 4GB, 500GB, Windows 8.1\n13 reviews\nProducts\nWeb Scraper browser extension\nWeb Scraper Cloud\nCompany\nAbout us\nContact\nWebsite Privacy Policy\nBrowser Extension Privacy Policy\nMedia kit\nJobs\nResources\nBlog\nDocumentation\nVideo Tutori

In [40]:
# Split the text into batches for the LLM (TODO: work out what the max length should be)
def split(content, max_length=8000):
    """Cut ``content`` into consecutive pieces of at most ``max_length`` characters.

    Pieces are cut purely by character count, so a line of text may be divided
    between two neighbouring pieces.

    Args:
        content: Text to split. ``None`` or an empty string yields ``[]``
            (``parse`` returns ``None`` for a document without a ``<body>``).
        max_length: Maximum number of characters per piece; must be at least 1.

    Returns:
        A list of substrings in their original order; joining them with no
        separator gives back ``content``.

    Raises:
        ValueError: If ``max_length`` is less than 1. A negative value would
            otherwise produce an empty list and silently drop all the text.
    """
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")
    if not content:
        return []
    # Step through the text in strides of max_length; the last piece may be shorter.
    return [content[start:start + max_length] for start in range(0, len(content), max_length)]

In [None]:
# Quick check: scrape, clean and chunk the demo shop page, then display the chunks.
split(parse(scrape("https://scrapeme.live/shop/")))

['Allinone | Web Scraper Test Sites\nToggle navigation\nWeb Scraper\nCloud Scraper\nPricing\nLearn\nDocumentation\nVideo Tutorials\nHow to\nTest Sites\nForum\nInstall\nCloud Login\nTest Sites\nHome\nComputers\nPhones\nE-commerce training site\nWelcome to WebScraper e-commerce site. You can use this site for training\nto learn how to use the Web Scraper. Items listed here are not for sale.\nTop items being scraped right now\n$1294.74\nToshiba Porteg...\nToshiba Portege Z30-C-16K Grey, 13.3" FHD, Core i5-6200U, 8GB, 256GB SSD, 4G, Windows 10 Pro\n6 reviews\n$899\nAsus ROG STRIX...\nAsus ROG STRIX GL553VD-DM256, 15.6" FHD, Core i5-7300HQ, 8GB, 1TB, GeForce GTX 1050 2GB, No OS + Windows 10 Home\n7 reviews\n$520.99\nHP 250 G3\n15.6", Core\xa0i5-4210U, 4GB, 500GB, Windows 8.1\n13 reviews\nProducts\nWeb Scraper browser extension\nWeb Scraper Cloud\nCompany\nAbout us\nContact\nWebsite Privacy Policy\nBrowser Extension Privacy Policy\nMedia kit\nJobs\nResources\nBlog\nDocumentation\nVideo Tutor

In [59]:
#llm part
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate

# LLM handle for the "llama3.1" model, accessed through Ollama (see the Ollama links at the top of the notebook).
model = OllamaLLM(model="llama3.1")

# Prompt text used for every chunk: {content} receives the chunk text and
# {user_prompt} receives the caller's description of what to extract.
# Each numbered instruction ends with a trailing space so that the implicitly
# concatenated string literals do not run into one another.
template = (
    "You are tasked with extracting specific information from the following text content: {content}. "
    "Please follow these instructions carefully: \n\n"
    "1. **Extract Information:** Only extract the information that directly matches the provided description: {user_prompt}. "
    "2. **No Extra Content:** Do not include any additional text, comments, or explanations in your response. "
    "3. **Empty Response:** If no information matches the description, return an empty string (''). "
    "4. **Direct Data Only:** Your output should contain only the data that is explicitly requested, with no other text."
)

def llm_parse(chunks, user_prompt):
    """Run the extraction prompt over each chunk and combine the model's answers.

    Args:
        chunks: Iterable of text pieces; each one is sent to the model in its
            own call, filling the ``{content}`` slot of ``template``.
        user_prompt: Description of the information to extract; fills the
            ``{user_prompt}`` slot of ``template``.

    Returns:
        The non-blank responses joined with newlines, in chunk order, or an
        empty string when no chunk produced any text.
    """
    prompt = ChatPromptTemplate.from_template(template)
    chain = prompt | model

    responses = []
    for chunk in chunks:
        response = chain.invoke({"content": chunk, "user_prompt": user_prompt})
        # The template asks the model for an empty reply when a chunk has no
        # match; skip those so the joined result is not padded with blank lines.
        if response.strip():
            responses.append(response)

    return "\n".join(responses)


In [60]:
# End-to-end run: scrape the page, reduce it to text, chunk it, then query the model.
url = "https://scrapeme.live/shop/"
html = scrape(url)
content = parse(html)  # NOTE: parse returns None when the document has no <body>
chunks = split(content)
prompt = "Give me the price of Charizard"
llm_parse(chunks, prompt)  # last expression of the cell, so its value is displayed as the output

'£156.00'