In [1]:
from pathlib import Path
import re
import requests
import time
import datetime
import pandas as pd
from requests_html import HTML
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

In [2]:
# All output files live in a "data" folder next to wherever the notebook is run.
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "data"        # os.path.join(BASE_DIR, "data")
# mkdir(exist_ok=True) is already a no-op when the directory exists, so a
# separate exists() check only adds a race; parents=True also creates any
# missing ancestor directories.
DATA_DIR.mkdir(parents=True, exist_ok=True)   # os.makedirs(DATA_DIR, exist_ok=True)

# CSV of product links collected from the category listing pages.
product_category_links_output = DATA_DIR / "category-products.csv"
# CSV of per-product details (title, price, availability, ...).
product_output = DATA_DIR / "products.csv"

In [3]:
# Chrome options for the browser session; "--headless" is passed so Chrome is
# asked to run without opening a window.
c = Options()
c.add_argument("--headless")

# Single WebDriver instance used as a global by the scraping functions below.
# NOTE(review): nothing in this notebook calls driver.quit() — confirm the
# browser process is cleaned up elsewhere.
driver = webdriver.Chrome(options=c)

In [4]:
# Category listing pages to crawl. Each entry carries a "name" label and the
# "url" that scrape_category_product_links() opens in the browser.
categories = [
    {"name": "elektrocentrály", "url": "https://gude.gude.sk/7712-elektrocentraly"},
    {"name": "cerpadla", "url": "https://gude.gude.sk/7674-cerpadla-a-vodarne"},
    {"name": "nabijacky", "url": "https://gude.gude.sk/7711-nabijacky-autobaterii"},
    {"name": "naradie", "url": "https://gude.gude.sk/8089-rucne-naradie"}
]


In [5]:
# URL shapes that carry a product id, tried in order:
#   1. .../<slug>/<slug1>-<product_id>.html
#   2. .../<slug>/<product_id>.html
# Dots are escaped so "." only matches a literal dot (host name and ".html"
# suffix) instead of any character.
regex_options = [
    r"https://gude\.gude\.sk/(?P<slug>[\w-]+)/(?P<slug1>[\w-]+)-(?P<product_id>[0-9]+)\.html",
    r"https://gude\.gude\.sk/(?P<slug>[\w-]+)/(?P<product_id>[0-9]+)\.html"
]

def extract_product_id_from_url(url):
    """Return the product id embedded in a product page URL.

    Args:
        url: Candidate URL. Anything that is not a string yields None.

    Returns:
        The digits captured by the ``product_id`` group of the first pattern
        in ``regex_options`` that matches the start of ``url``, as a string;
        None when no pattern matches.
    """
    if not isinstance(url, str):
        return None
    for regex_str in regex_options:
        # re.match caches compiled patterns, so no explicit compile per call.
        match = re.match(regex_str, url)
        if match is None:
            continue
        # groupdict() tolerates a pattern that lacks a product_id group,
        # which the previous bare except was guarding against.
        product_id = match.groupdict().get("product_id")
        if product_id is not None:
            return product_id
    return None

In [6]:
def clean_page_links(page_links=[], category=None):
    """Keep only the links that carry a product id and annotate them.

    Args:
        page_links: Iterable of URL strings (only read, never modified).
        category: Value stored unchanged under the "category" key of every
            returned record.

    Returns:
        A list of ``{"url", "product_id", "category"}`` dicts, one per link
        for which ``extract_product_id_from_url`` returned an id, in input
        order.
    """
    # Pair each link with its extracted id lazily, then drop the id-less ones.
    links_with_ids = ((link, extract_product_id_from_url(link)) for link in page_links)
    return [
        {"url": link, "product_id": found_id, "category": category}
        for link, found_id in links_with_ids
        if found_id is not None
    ]

In [7]:
def scrape_category_product_links(categories=[]):
    """Collect product links from every category listing page.

    For each category the global ``driver`` loads ``category["url"]``, the
    inner HTML of the ``.product_list`` element is parsed, and links that
    start with the shop prefix and end in ".html" are passed to
    ``clean_page_links`` together with the whole category dict.

    Args:
        categories: Iterable of dicts with at least a "url" key.

    Returns:
        One flat list with the cleaned link records of all categories.
    """
    site_prefix = "https://gude.gude.sk/"
    collected = []
    for category in categories:
        driver.get(category.get("url"))
        listing_el = driver.find_element(By.CSS_SELECTOR, ".product_list")
        listing = HTML(html=listing_el.get_attribute("innerHTML"))
        # Only same-site links that point at an .html page are candidates.
        candidate_links = [
            href
            for href in listing.links
            if href.startswith(site_prefix) and href.endswith(".html")
        ]
        collected.extend(clean_page_links(page_links=candidate_links, category=category))
    return collected

In [8]:
def extract_categories_and_save(categories=[]):
    """Scrape the product links of ``categories`` and write them as CSV.

    The records returned by ``scrape_category_product_links`` are turned into
    a DataFrame and written to ``product_category_links_output`` without the
    index column. Returns None.
    """
    pd.DataFrame(scrape_category_product_links(categories)).to_csv(
        product_category_links_output, index=False
    )

In [9]:
# Crawl every listing page in `categories` and (re)write category-products.csv.
extract_categories_and_save(categories)


In [10]:
def scrape_product_page(url, title_lookup = "h1", price_lookup = ".productPrice", avail = "#availability_value"):
    """Load one product page and read its title, price and availability.

    Args:
        url: Product page address opened with the global ``driver``.
        title_lookup: CSS selector for the title element.
        price_lookup: CSS selector for the price element.
        avail: CSS selector for the availability element.

    Returns:
        ``(title_text, price_text, in_stock)`` where ``in_stock`` is True only
        when the availability text equals "Skladom (ihneď k odberu)".

    NOTE(review): presumably ``find(..., first=True)`` yields None when a
    selector matches nothing, in which case ``.text`` raises — confirm; both
    callers in this notebook wrap the call in try/except.
    """
    driver.get(url)
    page_body = driver.find_element(By.CSS_SELECTOR, "body")
    page = HTML(html=page_body.get_attribute("innerHTML"))
    product_title = page.find(title_lookup, first=True).text
    product_price = page.find(price_lookup, first=True).text
    availability_text = page.find(avail, first=True).text
    product_av = availability_text == "Skladom (ihneď k odberu)"
    return product_title, product_price, product_av

In [11]:
def perform_scrape(cleaned_items=[]):
    """Scrape title and price for every cleaned link record.

    Args:
        cleaned_items: Iterable of dicts with "url" and "product_id" keys
            (only read, never modified).

    Returns:
        A list with one ``{"url", "product_id", "title", "price"}`` dict per
        input item. ``title`` and ``price`` are None when scraping that page
        failed.
    """
    data_extracted = []
    for obj in cleaned_items:
        link = obj['url']
        product_id = obj['product_id']
        title, price = (None, None)
        try:
            # scrape_product_page returns (title, price, availability).
            # Unpacking into just two names raised ValueError on every call,
            # so nothing was ever recorded; the starred name absorbs the
            # extra value(s).
            title, price, *_ = scrape_product_page(link)
        except Exception:
            # Best effort: a page that fails to scrape is recorded with None
            # values. Exception (not a bare except) lets Ctrl-C through.
            pass
        if title is not None and price is not None:
            print (link, title, price)
        product_data = {
            "url": link,
            "product_id": product_id,
            "title": title,
            "price": price
        }
        data_extracted.append(product_data)
    return data_extracted

In [12]:
#extracted_data = perform_scrape(cleaned_links)

In [13]:
#print (extracted_data)

In [14]:
def row_scrape_event(row, *args, **kwargs):
    """Scrape one product row and fill in its detail columns.

    Meant to be used with ``DataFrame.apply(..., axis=1)``.

    Args:
        row: Mapping-like row with a "url" entry and optionally a "scraped"
            flag.
        *args, **kwargs: Accepted and ignored.

    Returns:
        The same ``row``. Rows whose "scraped" flag is 1 or "1" are returned
        untouched; otherwise "title", "price", "avail", "scraped" (set to 1)
        and "timestamp" (current POSIX time) are written. The three scraped
        fields are None when the page could not be scraped.
    """
    link = row['url']
    try:
        scraped = row['scraped']
    except KeyError:
        # No flag column yet: treat the row as not scraped.
        scraped = 0
    if scraped == 1 or scraped == "1":
        return row
    title, price, avail = (None, None, None)
    try:
        title, price, avail = scrape_product_page(link)
    except Exception:
        # Best effort: keep None values for a page that fails. Catching
        # Exception rather than everything lets KeyboardInterrupt stop a
        # long-running apply().
        pass
    row['title'] = title
    row['price'] = price
    row['avail'] = avail
    # NOTE(review): the row is flagged as scraped even when the scrape failed,
    # so a failed row is skipped on a later pass — confirm this is intended.
    row['scraped'] = 1
    row['timestamp'] = datetime.datetime.now().timestamp()
    return row

In [15]:
# Load the product links that extract_categories_and_save() wrote to category-products.csv.
df = pd.read_csv(product_category_links_output)

In [16]:
# Scrape every row (axis=1 hands each row to row_scrape_event); this opens one
# product page per row that is not already flagged as scraped.
# NOTE(review): `df` is overwritten in place, so re-running this cell operates
# on the already-scraped frame rather than the freshly loaded one.
df = df.apply(row_scrape_event, axis = 1)

In [17]:
# Persist this run's rows. When products.csv already exists, its rows are kept
# and the new rows are appended after them; otherwise the file is created.
if product_output.exists():
    products_df = pd.read_csv(product_output)
    final_df = pd.concat([products_df, df])
    final_df.to_csv(product_output, index=False)
else:
    df.to_csv(product_output, index=False)