In [20]:
import pandas as pd

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup

import re

In [2]:
# Read the book catalogue and collect its 'ean' column as a plain Python list
eans = (
    pd.read_csv('database.csv')
    .loc[:, 'ean']
    .tolist()
)

In [3]:
def scrap_Amazon_sales(ean):
    '''
    Take the EAN of a book and return its position in the different
    Amazon sales rankings, as raw (unparsed) text.

    A new headless Chrome session is started on every call. The EAN is
    typed into the ISBN field of Amazon's advanced book search, the first
    clickable element carrying the "s-line-clamp-2" class on the results
    page is opened, and the text of the second "detail-bullet-list" <ul>
    inside the "detailBulletsWrapper_feature_div" block is extracted.

    Parameters
    ----------
    ean : int or str
        EAN of the book; converted with ``str()`` before being typed.

    Returns
    -------
    tuple
        ``(ean, top_sales)``. ``top_sales`` is the text of the list, or
        ``None`` when the page does not contain the expected block.

    Raises
    ------
    Selenium exceptions (e.g. a timeout of the 10 second waits, or the
    search field not being found) propagate to the caller. The browser
    is closed before they propagate.
    '''
    # Driver settings
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    # Everything that uses the browser runs inside try/finally so that the
    # Chrome process is always shut down, even when a lookup or a wait fails.
    try:
        wait = WebDriverWait(driver, 10)
        driver.delete_all_cookies()

        # Open the Amazon advanced search and look the book up by its EAN
        amazon_searcher = "https://www.amazon.com/advanced-search/books"
        driver.get(amazon_searcher)

        search_isbn = driver.find_element(By.XPATH, '//*[(@id = "field-isbn")]')
        search_isbn.send_keys(str(ean))
        wait.until(EC.element_to_be_clickable(search_isbn)).submit()

        # Wait until the browser has left the search form
        wait.until(EC.url_changes(amazon_searcher))

        # Open the first clickable result entry
        wait.until(EC.element_to_be_clickable((
            By.XPATH,
            '//*[contains(concat( " ", @class, " " ), concat( " ", "s-line-clamp-2", " " ))]'
        ))).click()

        # Extract the sales information from the product page
        soup = BeautifulSoup(driver.page_source)
        try:
            top_sales = soup.find("div", {"id": "detailBulletsWrapper_feature_div"}).find_all(
                "ul", class_="a-unordered-list a-nostyle a-vertical a-spacing-none detail-bullet-list"
            )[1].get_text()
        except (AttributeError, IndexError):
            # AttributeError: the wrapper <div> is missing (find() returned None).
            # IndexError: fewer than two matching <ul> lists were found.
            top_sales = None
    finally:
        # Close the driver
        driver.quit()

    return (ean, top_sales)

In [None]:
# Run the web scraping: one (ean, top_sales) tuple is stored per EAN that
# could be scraped; EANs whose scrape raised are collected in failed_eans.

top_sales_by_ean = []
failed_eans = []
for ean in eans:
    try:
        top_sales_by_ean.append(scrap_Amazon_sales(ean))
    except Exception as exc:
        # Best effort: a failing book must not abort the whole run. Catching
        # Exception (instead of a bare except) still lets KeyboardInterrupt
        # stop this long-running loop.
        failed_eans.append(ean)
        print('Scraping failed for EAN', ean, '->', repr(exc))

In [37]:
# Clean up and normalise the scraped values

d_tops = dict()
for ean, top in top_sales_by_ean:
    # Skip books without ranking text (None) or with whitespace-only text
    if top is None or not top.strip():
        continue
    # Every ranking entry is introduced by 'nº'; whatever precedes the first
    # marker is discarded. The "(Ver el Top 100 ...)" link text is removed
    # from each entry before trimming it.
    d_tops[ean] = [
        re.sub(r'\(Ver el Top 100.*\)', '', entry).strip()
        for entry in top.strip().split('nº')[1:]
    ]

# One row per EAN; the 'tops_amazon' cell holds the list of ranking strings
df_amazon_tops = pd.DataFrame(list(d_tops.items()), columns=['ean', 'tops_amazon'])

# Show the first rows
display(df_amazon_tops.head())

Unnamed: 0,ean,tops_amazon
0,9788433981066,"[3,322,967 en Tienda Kindle, 6,137 en Biografí..."
1,9788433981134,"[1,835,602 en Libros, 63,596 en Memorias (Libr..."
2,9788433981073,"[804,658 en Tienda Kindle, 641 en Biografías y..."
3,9788433981103,"[377,300 en Tienda Kindle, 844 en Literatura y..."
4,9788433981110,"[2,090,463 en Libros, 99,130 en Libros en espa..."


In [43]:
# Attach the scraped information to the original dataset (left join on
# 'ean', so every row of database.csv is kept)
df = pd.read_csv("database.csv").merge(df_amazon_tops, on='ean', how='left')

display(df)

# Finally, save the definitive dataset to a CSV file
df.to_csv('coleccion_anagrama.csv', index=False)