# Scrapp: product and price scraper for vea.com.ar

In [1]:
import html
import json
import time
import warnings

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException, StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# Suppress every Python warning for the rest of the session to keep cell
# outputs clean. NOTE(review): this also hides deprecation notices.
warnings.filterwarnings('ignore')

#### Create and set up the driver

In [2]:
# Chrome runs headless and with image loading disabled.
options = webdriver.ChromeOptions()
for chrome_flag in ("--headless=new", '--blink-settings=imagesEnabled=false'):
    options.add_argument(chrome_flag)

# Single browser session shared by the cells below.
driver = webdriver.Chrome(options=options)

#### List of categories

In [3]:
def get_main_entities(url, driver, time_wait):
    """Open ``url`` in the browser and return its footer link entries.

    The page is loaded with ``driver``, given ``time_wait`` seconds before
    its source is read, and the ``<template>`` tag carrying the
    ``__RUNTIME__`` extensions JSON is parsed.

    Parameters
    ----------
    url : str
        Page to load.
    driver
        Object exposing ``get(url)`` and ``page_source`` (a Selenium driver).
    time_wait : int or float
        Seconds to sleep after requesting the page.

    Returns
    -------
    pandas.DataFrame
        ``pd.json_normalize`` of the ``opciones`` list found under the
        ``footer-oculto`` block of the runtime JSON.
    """
    driver.get(url)
    time.sleep(time_wait)

    soup = BeautifulSoup(driver.page_source, 'html5lib')
    template_attrs = {
        'data-field': 'extensions',
        'data-type': 'json',
        'data-varname': '__RUNTIME__',
    }
    runtime_templates = soup.find_all('template', template_attrs)

    # The JSON text is read from the second child node of the first match.
    runtime_json = json.loads(runtime_templates[0].contents[1].string)
    footer_block = runtime_json['store.home/$after_footer/footer-layout.desktop/footer-oculto']
    df_main_entities = pd.json_normalize(footer_block['content']['opciones'])

    return df_main_entities

In [4]:
def get_entity_marcas(df_main_entities):
    """Return the ``MARCAS`` rows of ``df_main_entities`` with renamed columns.

    Rows whose ``correspondeA`` equals ``'MARCAS'`` are kept, the index is
    rebuilt from zero, and the three columns are renamed by position to
    ``corresponde_a``, ``texto_marcas`` and ``url_marcas``. The input frame
    is left untouched.
    """
    es_marca = df_main_entities['correspondeA'] == 'MARCAS'
    marcas = df_main_entities.loc[es_marca].reset_index(drop=True)
    marcas.columns = ['corresponde_a', 'texto_marcas', 'url_marcas']
    return marcas

def get_entity_productos(df_main_entities):
    """Return the ``PRODUCTOS`` rows of ``df_main_entities`` with renamed columns.

    Rows whose ``correspondeA`` equals ``'PRODUCTOS'`` are kept, the index is
    rebuilt from zero, and the three columns are renamed by position to
    ``corresponde_a``, ``texto_productos`` and ``url_productos``. The input
    frame is left untouched.
    """
    es_producto = df_main_entities['correspondeA'] == 'PRODUCTOS'
    productos = df_main_entities.loc[es_producto].reset_index(drop=True)
    productos.columns = ['corresponde_a', 'texto_productos', 'url_productos']
    return productos

def get_entity_departamentos(df_main_entities):
    """Return the ``DEPARTAMENTO`` rows with a lower-cased department slug.

    Rows whose ``correspondeA`` equals ``'DEPARTAMENTO'`` are kept, the index
    is rebuilt from zero, and the three columns are renamed by position to
    ``corresponde_a``, ``texto_departamento`` and ``url_departamento``.

    ``url_texto_departamento`` is the lower-cased fourth ``/``-separated
    piece of the URL (``https://host/<departamento>``). URLs with too few
    pieces yield NaN, and an input with no matching rows returns an empty
    frame with the same columns instead of raising ``KeyError``.
    """
    mask = df_main_entities['correspondeA'] == 'DEPARTAMENTO'
    df_departamentos = df_main_entities.loc[mask].reset_index(drop=True)
    df_departamentos.columns = ['corresponde_a', 'texto_departamento', 'url_departamento']

    # Lower-case before splitting ('/' is unaffected) so the positional lookup
    # is the final step; `.str[i]` returns NaN for out-of-range positions and
    # works on an empty Series, unlike indexing the columns of `expand=True`.
    url_parts = df_departamentos['url_departamento'].str.lower().str.split('/')
    df_departamentos['url_texto_departamento'] = url_parts.str[3]
    return df_departamentos

def get_entity_categorias(df_main_entities):
    """Return the ``CATEGORIA`` rows with lower-cased URL slugs.

    Rows whose ``correspondeA`` equals ``'CATEGORIA'`` are kept, the index is
    rebuilt from zero, and the three columns are renamed by position to
    ``corresponde_a``, ``texto_categorias`` and ``url_categorias``.

    Two columns are derived from the URL
    (``https://host/<departamento>/<categoria>``):
    ``url_texto_departamento`` (fourth ``/``-separated piece) and
    ``url_texto_categorias`` (fifth piece), both lower-cased. URLs with too
    few pieces yield NaN, and an input with no matching rows returns an
    empty frame with the same columns instead of raising ``KeyError``.
    """
    mask = df_main_entities['correspondeA'] == 'CATEGORIA'
    df_categorias = df_main_entities.loc[mask].reset_index(drop=True)
    df_categorias.columns = ['corresponde_a', 'texto_categorias', 'url_categorias']

    # Split once and reuse. Lower-casing first ('/' is unaffected) makes the
    # positional lookup the final step; `.str[i]` returns NaN for missing
    # positions and works on an empty Series.
    url_parts = df_categorias['url_categorias'].str.lower().str.split('/')
    df_categorias['url_texto_departamento'] = url_parts.str[3]
    df_categorias['url_texto_categorias'] = url_parts.str[4]
    return df_categorias

def get_entity_subcategorias(df_main_entities):
    """Return the ``SUBCATEGORIA`` rows with lower-cased URL slugs.

    Rows whose ``correspondeA`` equals ``'SUBCATEGORIA'`` are kept, the index
    is rebuilt from zero, and the three columns are renamed by position to
    ``corresponde_a``, ``texto_subcategorias`` and ``url_subcategorias``.

    Three columns are derived from the URL
    (``https://host/<departamento>/<categoria>/<subcategoria>``):
    ``url_texto_departamento``, ``url_texto_categorias`` and
    ``url_texto_subcategorias`` (fourth, fifth and sixth ``/``-separated
    pieces), all lower-cased. URLs with too few pieces yield NaN, and an
    input with no matching rows returns an empty frame with the same columns
    instead of raising ``KeyError``.
    """
    mask = df_main_entities['correspondeA'] == 'SUBCATEGORIA'
    df_subcategorias = df_main_entities.loc[mask].reset_index(drop=True)
    df_subcategorias.columns = ['corresponde_a', 'texto_subcategorias', 'url_subcategorias']

    # Split once and reuse. Lower-casing first ('/' is unaffected) makes the
    # positional lookup the final step; `.str[i]` returns NaN for missing
    # positions and works on an empty Series.
    url_parts = df_subcategorias['url_subcategorias'].str.lower().str.split('/')
    df_subcategorias['url_texto_departamento'] = url_parts.str[3]
    df_subcategorias['url_texto_categorias'] = url_parts.str[4]
    df_subcategorias['url_texto_subcategorias'] = url_parts.str[5]
    return df_subcategorias

In [5]:
%%time
url_main = 'https://www.vea.com.ar/'
time_wait = 10

df_main_entities = get_main_entities(url=url_main, driver=driver, time_wait=time_wait)

df_marcas = get_entity_marcas(df_main_entities)
df_productos = get_entity_productos(df_main_entities)
df_departamentos = get_entity_departamentos(df_main_entities)
df_categorias = get_entity_categorias(df_main_entities)
df_subcategorias = get_entity_categorias(df_main_entities)

del df_main_entities

CPU times: total: 422 ms
Wall time: 13.8 s


In [6]:
# Preview the first category rows, including the slugs derived from each URL.
df_categorias.head()

Unnamed: 0,corresponde_a,texto_categorias,url_categorias,url_texto_departamento,url_texto_categorias
0,CATEGORIA,Aire Acondicionado y Ventilación,https://www.vea.com.ar/Electro/Aire-Acondicion...,electro,aire-acondicionado-y-ventilacion
1,CATEGORIA,"Calefacción, Calefones y Termotanques",https://www.vea.com.ar/Electro/Calefaccion-Cal...,electro,calefaccion-calefones-y-termotanques
2,CATEGORIA,Cocinas y Hornos,https://www.vea.com.ar/Electro/Cocinas-y-Hornos,electro,cocinas-y-hornos
3,CATEGORIA,Consolas y Videojuegos,https://www.vea.com.ar/Electro/Consolas-y-Vide...,electro,consolas-y-videojuegos
4,CATEGORIA,"Heladeras, Freezers y Cavas",https://www.vea.com.ar/Electro/Heladeras-Freez...,electro,heladeras-freezers-y-cavas


#### Get products and prices

In [7]:
# Cap on how many category pages are scraped; set to None to scrape them all.
MAX_CATEGORIES = 10

# Keep everything after the host as the category path, e.g.
# 'https://www.vea.com.ar/Electro/Lavado' -> 'Electro/Lavado'.
# The split uses the single character '/' (at most 3 times), which pandas
# treats literally. A multi-character pattern such as '.ar/' is interpreted as
# a regular expression, where '.' matches any character, so a path containing
# e.g. 'Hogar/' would be cut in the wrong place.
categories = (
    df_categorias['url_categorias']
    .str.split('/', n=3, expand=True)[3]
    .tolist()[:MAX_CATEGORIES]
)
len(categories)

10

In [8]:
%%time
from selenium.common.exceptions import ElementClickInterceptedException, StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

data = {key:{} for key in categories}

for idx,category in enumerate(categories):
    info = f'[{idx+1}/{len(categories)}] {category} '
    print(info, end='')
    driver.get('https://www.vea.com.ar/' + category)
    
    number_of_products = 0
    while number_of_products == 0:
        footer = WebDriverWait(driver,20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'p.text-content')))
        number_of_products = int(footer.text.split()[3])
        number_of_loaded_products = int(footer.text.split()[1])
    print(f'(loaded products={number_of_loaded_products}, total={number_of_products})', end='\r')
    
    while number_of_loaded_products < number_of_products:
        footer = driver.find_element(By.CSS_SELECTOR, 'p.text-content')
        driver.execute_script('arguments[0].scrollIntoView({block: "center"});', footer)
        show_more = driver.find_elements(By.XPATH, "//div[text()='Mostrar más']")
        if show_more:
            try:
                show_more[0].click()
            except (ElementClickInterceptedException, StaleElementReferenceException):
                continue
        number_of_loaded_products = int(footer.text.split()[1])
        print(info + f'(loaded products={number_of_loaded_products}, total={number_of_products})', end='\r')
        time.sleep(1)

    loaded_products = json.loads(driver.find_element(By.CSS_SELECTOR, "body script[type='application/ld+json']").get_attribute('innerText'))['itemListElement']
    products = {'item':[],'price':[]}
    for prod in loaded_products:
        products['item']  += [prod['item']['name']]
        products['price'] += [prod['item']['offers']['offers'][0]['price']]

    data[category] = products
    print()

[1/10] Electro/Aire-Acondicionado-y-Ventilacion (loaded products=6, total=6)
[2/10] Electro/Calefaccion-Calefones-y-Termotanques (loaded products=32, total=32)
[3/10] Electro/Cocinas-y-Hornos (loaded products=46, total=46)
[4/10] Electro/Consolas-y-Videojuegos (loaded products=7, total=7)
[5/10] Electro/Heladeras-Freezers-y-Cavas (loaded products=44, total=44)
[6/10] electro/informatica (loaded products=62, total=62)
[7/10] Electro/Lavado (loaded products=35, total=35)
[8/10] Electro/Pequenos-Electros (loaded products=178, total=178)
[9/10] electro/telefonos (loaded products=41, total=41)
[10/10] tiempo-libre/aire-libre (loaded products=49, total=49)
CPU times: total: 562 ms
Wall time: 1min 54s


In [9]:
%%time
data_list = []
for key, value in data.items():
    for i in range(len(value['item'])):
        record_dict = {
            'category': key,
            'item': value['item'][i],
            'price': value['price'][i]
        }
        data_list.append(record_dict)

df = pd.DataFrame(data_list)

CPU times: total: 0 ns
Wall time: 0 ns


In [10]:
# (rows, columns) of the flattened product table.
df.shape

(500, 3)

In [11]:
# Preview the first scraped products with their category and price.
df.head()

Unnamed: 0,category,item,price
0,Electro/Aire-Acondicionado-y-Ventilacion,Aire Acondicionado Nex Inverter Frio Calor 5000,269000.0
1,Electro/Aire-Acondicionado-y-Ventilacion,Turbo Ventilador 10 5a Liliana Rojo,7999.0
2,Electro/Aire-Acondicionado-y-Ventilacion,Aire Acondicionado Inverter Samsung 5500 Fc,461389.0
3,Electro/Aire-Acondicionado-y-Ventilacion,Aire Acondicionado Philco Inverter 3550w,274999.0
4,Electro/Aire-Acondicionado-y-Ventilacion,Ventilador De Mesa Retro Protalia 12&apos; Negro,17999.0
