## Ollama setup
- https://github.com/ollama/ollama
- https://ollama.com/download

## ChromeDriver (Chrome for Testing)
- https://googlechromelabs.github.io/chrome-for-testing/#stable

In [16]:
# Scraping part
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

def scrape(url, driver_path="./chromedriver.exe"):
    """Open ``url`` in a Selenium-driven Chrome browser and return the page HTML.

    Args:
        url: Address of the page to load.
        driver_path: Path to the ChromeDriver executable. Defaults to
            ``./chromedriver.exe`` (the location that used to be hard-coded),
            so existing ``scrape(url)`` calls behave exactly as before.

    Returns:
        The value of ``driver.page_source`` once ``driver.get(url)`` has returned.
    """
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=Service(driver_path), options=options)

    try:
        driver.get(url)
        return driver.page_source
    finally:
        # Always shut the browser down, even when loading the page fails.
        driver.quit()



In [None]:
# Smoke test: fetch the scrapeme.live shop page; the returned HTML string is the cell output.
scrape("https://scrapeme.live/shop/")

'<html lang="en"><head>\n\t<!-- Google Tag Manager -->\n<script type="text/javascript" async="" src="https://www.google-analytics.com/analytics.js" nonce=""></script><script type="text/javascript" async="" src="https://www.googletagmanager.com/gtag/js?id=G-Q9KY1T6XJQ&amp;l=dataLayer&amp;cx=c&amp;gtm=45He4c40v79311205za200" nonce=""></script><script async="" src="https://www.googletagmanager.com/gtm.js?id=GTM-NVFPDWB"></script><script nonce="">(function (w, d, s, l, i) {\n\t\tw[l] = w[l] || [];\n\t\tw[l].push({\n\t\t\t\'gtm.start\':\n\t\t\t\tnew Date().getTime(), event: \'gtm.js\'\n\t\t});\n\t\tvar f = d.getElementsByTagName(s)[0],\n\t\t\tj = d.createElement(s), dl = l != \'dataLayer\' ? \'&l=\' + l : \'\';\n\t\tj.async = true;\n\t\tj.src =\n\t\t\t\'https://www.googletagmanager.com/gtm.js?id=\' + i + dl;\n\t\tf.parentNode.insertBefore(j, f);\n\t})(window, document, \'script\', \'dataLayer\', \'GTM-NVFPDWB\');</script>\n<!-- End Google Tag Manager -->\n\t<title>Allinone | Web Scraper Tes

In [60]:
# Parsing and cleaning
from bs4 import BeautifulSoup

def parse(html):
    """Return the text of an HTML document with one non-blank line per row.

    ``<script>`` and ``<style>`` elements are removed from the parsed tree
    first. The text is then taken from the whole document (not only the
    ``<body>``), each line is stripped, and blank lines are dropped.

    Returns ``None`` when the parsed document has no ``<body>`` element.
    """
    soup = BeautifulSoup(html, "html.parser")
    if not soup.body:
        return None

    # Detach script/style elements so their contents do not end up in the text.
    for tag in soup.find_all(["script", "style"]):
        tag.extract()

    raw_text = soup.get_text(separator="\n")
    # Trim every line and keep only those that still contain something.
    trimmed = (line.strip() for line in raw_text.splitlines())
    return "\n".join(line for line in trimmed if line)



In [3]:
# Split the text into batches for the LLM. TODO: tune max_length to the model's context window (8000 characters is a guess).
def split(content, max_length=8000):
    """Cut ``content`` into consecutive pieces of at most ``max_length`` characters.

    Args:
        content: Text to split. ``None`` or an empty string yields ``[]``.
        max_length: Maximum number of characters per piece; must be positive.

    Returns:
        The pieces in their original order. Concatenating them reproduces
        ``content``. Only the last piece can be shorter than ``max_length``.

    Raises:
        ValueError: If ``max_length`` is zero or negative. A negative value
            would otherwise silently return an empty list and drop the text.
    """
    if max_length <= 0:
        raise ValueError(f"max_length must be positive, got {max_length}")
    if not content:
        return []
    # Step through the text in max_length-sized strides; slicing clamps the final piece.
    return [content[start:start + max_length] for start in range(0, len(content), max_length)]

In [None]:
#llm part
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate

# LLM used for extraction: LangChain's Ollama wrapper configured for the "llama3.1" model.
# NOTE(review): presumably requires a running Ollama server with llama3.1 pulled — confirm.
model = OllamaLLM(model="llama3.1")

# Prompt template used for every chunk. {content} is replaced by the chunk text
# and {user_prompt} by the user's description of what to extract.
# Each fragment ends with a separator so the numbered instructions do not run
# together (instruction 3 used to be glued to "4.").
template = (
    "You are tasked with extracting specific information from the following text content: {content}. "
    "Please follow these instructions carefully: \n\n"
    "1. **Extract Information:** Only extract the information that directly matches the provided description: {user_prompt}. "
    "2. **No Extra Content:** Do not include any additional text, comments, or explanations in your response. "
    "3. **Empty Response:** If no information matches the description, return an empty string (''). "
    "4. **Direct Data Only:** Your output should contain only the data that is explicitly requested, with no other text."
)

def llm_parse(chunks, user_prompt):
    """Run the extraction prompt over every chunk and join the model's answers.

    Args:
        chunks: Iterable of text pieces, each substituted for ``{content}``.
        user_prompt: Description of the data to extract, substituted for
            ``{user_prompt}``.

    Returns:
        The responses for all chunks, in order, separated by newlines.
    """
    # Prompt template piped into the module-level model.
    pipeline = ChatPromptTemplate.from_template(template) | model

    answers = [
        pipeline.invoke({"content": piece, "user_prompt": user_prompt})
        for piece in chunks
    ]
    return "\n".join(answers)


## Testing

In [63]:
# Target page for the end-to-end test: an Amazon.sg best-sellers listing.
url = "https://www.amazon.sg/gp/bestsellers/electronics/18157234051?ref_=Oct_d_obs_S&pd_rd_w=mK26Q&content-id=amzn1.sym.a6e77fee-5294-433e-9edf-4884ef3ec196&pf_rd_p=a6e77fee-5294-433e-9edf-4884ef3ec196&pf_rd_r=TQX5PNVYEHC8R64CRXZX&pd_rd_wg=4ljqT&pd_rd_r=48e3fc54-0d47-45cd-bd72-6860cf085d58"
# Fetch the page source with the Selenium-based scraper defined above.
html = scrape(url)

In [64]:
# Reduce the scraped HTML to plain text and display it.
content = parse(html)
print(content)
# Save the cleaned text so it can be cross-checked against the LLM output.
with open("content.txt", "w", encoding='utf-8') as file:
    file.write(content)

Amazon.sg Best Sellers: The best items in Single-Board Computers & Accessories based on Amazon customer purchases
Skip to main content
.sg
Delivering to Singapore 640000
Update location
All
All Departments
Amazon Fresh
Amazon International Store
Automotive
Baby
Beauty & Personal Care
Books
CDs and Vinyl
Clothing, Shoes & Jewellery
Computer & Accessories
Electronics
Garden & Outdoor
Gift Cards
Grocery
Health, Household & Personal Care
Home
Industrial & Scientific
Kitchen & Dining
Little Farms
Luggage & Travel Gear
Luxury Beauty
Movies & TV
Musical Instruments
Office Products
Pet Supplies
Prime Video
Software
Sports and Outdoors
Tools & Home Improvement
Toys and Games
Video Games
Watsons
Search Amazon.sg
EN
Hello, sign in
Account and Lists
Returns
& Orders
0
Cart
Sign in
New customer?
Start here.
Your Lists
Create a List
Your Account
Your Account
Your Orders
Your Recommendations
Content and Devices
Your Prime Membership
Your Prime Video
Your Seller Account
Sign in
New customer?
Start her

In [71]:
# Cut the cleaned text into chunks using split()'s default max_length.
chunks = split(content)
print(chunks)
# Keep the list's string representation on disk for inspection.
with open("chunks.txt", "w", encoding='utf-8') as file:
    file.write(str(chunks))

['Amazon.sg Best Sellers: The best items in Single-Board Computers & Accessories based on Amazon customer purchases\nSkip to main content\n.sg\nDelivering to Singapore 640000\nUpdate location\nAll\nAll Departments\nAmazon Fresh\nAmazon International Store\nAutomotive\nBaby\nBeauty & Personal Care\nBooks\nCDs and Vinyl\nClothing, Shoes & Jewellery\nComputer & Accessories\nElectronics\nGarden & Outdoor\nGift Cards\nGrocery\nHealth, Household & Personal Care\nHome\nIndustrial & Scientific\nKitchen & Dining\nLittle Farms\nLuggage & Travel Gear\nLuxury Beauty\nMovies & TV\nMusical Instruments\nOffice Products\nPet Supplies\nPrime Video\nSoftware\nSports and Outdoors\nTools & Home Improvement\nToys and Games\nVideo Games\nWatsons\nSearch Amazon.sg\nEN\nHello, sign in\nAccount and Lists\nReturns\n& Orders\n0\nCart\nSign in\nNew customer?\nStart here.\nYour Lists\nCreate a List\nYour Account\nYour Account\nYour Orders\nYour Recommendations\nContent and Devices\nYour Prime Membership\nYour Prim

In [72]:
# Description of what the LLM should extract from each chunk.
prompt = "give me all the prices of a product names and prices in JSON" 
# Query the model once per chunk; the answers come back joined by newlines.
response = llm_parse(chunks, prompt)
print(response)
# Save the combined answer for comparison with content.txt.
with open("response.txt", "w", encoding='utf-8') as file:
    file.write(response)

[
    {
        "name": "ELEGOO Mega 2560 Project Most Complete Ultimate Starter Kit",
        "price": "$37.97"
    },
    {
        "name": "Freenove ESP32-S3-WROOM Board Lite (2 Pack)",
        "price": "$39.77"
    },
    {
        "name": "ESP32 Board ESP32-32D WROOM Module CP2012 USB C 38 Pin WiFi+Blue tooth Type-C Interface ESP32-DevKitC-32 Development Board STA/AP/STA+AP",
        "price": "$29.32"
    },
    {
        "name": "Super Breakout Board for ESP32 3.81mm / 0.15\" Terminal Blocks GPIO Expansion Board 3 Types Output for ESP32 Module ESP-WROOM-32 ESP32-DevKitC",
        "price": "$32.19"
    },
    {
        "name": "2pcs ESP32-S3-DevKit C N16R8 Development Board WiFi + Bluetooth MCU Module, Dual Type-C ESP32-S3-WROOM-1 Cores Microcontroller Processor Integrates Complete Wi-Fi and BLE Functions",
        "price": "$32.60"
    },
    {
        "name": "2 Pack 2.8\" ESP32 Development Board WiFi Bluetooth ESP32-2432S028R Dual-core 240x320 Smart LCD Display TFT Module Touch