In [2]:
from typing import Dict
import findspark
import os
import requests
from bs4 import BeautifulSoup
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row

# Make the local Spark distribution importable for this kernel.
# NOTE(review): all paths below are machine-specific absolute paths; consider
# moving them to a config cell / environment variables.
findspark.init(r"D:\spark\spark")

# Environment variables read by the Spark/JVM launcher.
# NOTE(review): HADOOP_HOME here points at a "bin" directory; it is commonly
# expected to be the parent of "bin" (where winutils.exe lives) - confirm.
os.environ.update({
    "SPARK_HOME": r"D:\spark\spark",
    "HADOOP_HOME": r"D:\spark\spark\hadoop\bin",
    "JAVA_HOME": r"D:\spark\jdk-21",
})

# SQL Server JDBC driver jar, added to both driver and executor classpaths below.
jarsql = r"C:\leads\venv\Lib\site-packages\pyspark\mssql-jdbc-9.4.0.jre11.jar"
# Absolute path of the "spark-warehouse" directory under the current working dir.
warehouse_location = os.path.abspath('spark-warehouse')

# Local-mode Spark configuration: all cores, 4g for driver and executor.
conf = SparkConf().setAll([
    ("spark.master", "local[*]"),
    ("spark.executor.memory", "4g"),
    ("spark.driver.memory", "4g"),
    ('spark.driver.extraClassPath', jarsql),
    ('spark.executor.extraClassPath', jarsql),
])

# Create (or reuse) the session with the configuration above.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .config("spark.sql.warehouse.dir", warehouse_location)
    .appName("Testing PySpark Example")
    .getOrCreate()
)

# Low-level context used by the RDD code in later cells.
sc = spark.sparkContext

In [3]:

def extract_product_details(url_produto):
    """Fetch a product page and return a record for it.

    This version only checks that the page can be downloaded; the returned
    record contains just the URL whether or not the request succeeds.

    Args:
        url_produto: Absolute URL of the product page.

    Returns:
        dict: ``{"urlprodutos": url_produto}``.
    """
    dict_item = {"urlprodutos": url_produto}

    try:
        # A timeout keeps one stalled server from blocking a Spark task forever;
        # requests.Timeout is a RequestException, so it is handled below.
        response = requests.get(url_produto, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Erro na requisição para {url_produto}: {e}")

    # The page body is not parsed here: no field is extracted from it in this
    # version, so building a BeautifulSoup tree would be wasted work.
    return dict_item

def scroll(start=0, step=500, sz=528):
    """Download one window of the porcelain-tile listing and parse it.

    Args:
        start: Value sent as the ``start`` query parameter.
        step: Unused; kept so existing callers that pass it keep working.
        sz: Value sent as the ``sz`` query parameter.

    Returns:
        BeautifulSoup: The parsed listing page (``lxml`` parser).

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    url = f"https://www.flooranddecor.com/porcelain-tile?start={start}&sz={sz}"
    # Bound the wait so a stalled connection cannot hang the notebook.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page as a listing.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')
    return soup

def get_urls_categorias_produtos():
    """Return the list of category listing URLs to crawl.

    Returns:
        list[str]: A new list holding the single porcelain-tile category URL.
    """
    return ["https://www.flooranddecor.com/porcelain-tile"]

def get_url_produtos():
    """Collect up to 500 distinct product page URLs from the listing pages.

    Listing pages are fetched with ``scroll`` and every
    ``a.b-product_tile-figure_link`` anchor is turned into an absolute URL.
    Errors are printed and whatever was collected so far is returned.

    Returns:
        list[str]: Unique product URLs in first-seen order (at most 500).
    """
    from urllib.parse import urljoin

    base_url = "https://www.flooranddecor.com"
    limite = 500
    lista_produtos = []
    # The page windows overlap (start advances by 100 while scroll() asks for
    # 528 items by default), so the same product can show up more than once.
    vistos = set()
    try:
        # NOTE(review): scroll() takes no category argument and always requests
        # the porcelain-tile listing, so the category value itself is unused.
        for _categoria in get_urls_categorias_produtos():
            for start in range(0, 500, 100):
                soup = scroll(start=start)
                for elemento in soup.select('a.b-product_tile-figure_link'):
                    href = elemento.get('href')
                    if not href:
                        # An anchor without href would otherwise raise KeyError
                        # and abort the whole crawl.
                        continue
                    # urljoin keeps already-absolute hrefs intact and resolves
                    # site-relative ones against the base URL.
                    url_produto = urljoin(base_url, href)
                    if url_produto in vistos:
                        continue
                    vistos.add(url_produto)
                    lista_produtos.append(url_produto)
                    if len(lista_produtos) >= limite:
                        return lista_produtos
    except Exception as e:
        print(f"Erro ao obter URLs de produtos: {e}")
    return lista_produtos

# Gather the product URLs on the driver, then fan the per-page work out with Spark.
lista_produtos = get_url_produtos()
rdd_produtos = sc.parallelize(lista_produtos)
# Cache the mapped records: every Spark action on the DataFrame below (schema
# inference in toDF, each collect()) would otherwise re-run the map and
# re-download the product pages.
rdd_detalhes = rdd_produtos.map(extract_product_details).cache()
df_produtos = rdd_detalhes.map(lambda x: Row(**x)).toDF()
#df_produtos.show(truncate=False)
df_produtos.collect()

[Row(urlprodutos='https://www.flooranddecor.com/porcelain-tile/bernini-avorio-polished-porcelain-tile-100655315.html'),
 Row(urlprodutos='https://www.flooranddecor.com/porcelain-tile/venato-ii-matte-porcelain-tile-100610781.html'),
 Row(urlprodutos='https://www.flooranddecor.com/porcelain-tile/venato-ii-polished-porcelain-tile-100610823.html'),
 Row(urlprodutos='https://www.flooranddecor.com/porcelain-tile/belucci-bianca-matte-porcelain-tile-100885334.html'),
 Row(urlprodutos='https://www.flooranddecor.com/porcelain-tile/black-matte-2-in.-hexagon-porcelain-mosaic-100782390.html'),
 Row(urlprodutos='https://www.flooranddecor.com/porcelain-tile/tauleto-bianco-polished-porcelain-tile-100572437.html'),
 Row(urlprodutos='https://www.flooranddecor.com/porcelain-tile/tauleto-bianco-matte-porcelain-tile-100572429.html'),
 Row(urlprodutos='https://www.flooranddecor.com/porcelain-tile/belucci-bianca-polished-porcelain-tile-100966621.html'),
 Row(urlprodutos='https://www.flooranddecor.com/porcela

In [4]:
# Bring every DataFrame row back to the driver as a plain Python dict.
lista_dicionarios = list(
    map(lambda linha: dict(linha.asDict()), df_produtos.collect())
)


In [5]:
# Print every scraped record, one dict per line.
for registro in lista_dicionarios:
    print(registro)

{'urlprodutos': 'https://www.flooranddecor.com/porcelain-tile/bernini-avorio-polished-porcelain-tile-100655315.html'}
{'urlprodutos': 'https://www.flooranddecor.com/porcelain-tile/venato-ii-matte-porcelain-tile-100610781.html'}
{'urlprodutos': 'https://www.flooranddecor.com/porcelain-tile/venato-ii-polished-porcelain-tile-100610823.html'}
{'urlprodutos': 'https://www.flooranddecor.com/porcelain-tile/belucci-bianca-matte-porcelain-tile-100885334.html'}
{'urlprodutos': 'https://www.flooranddecor.com/porcelain-tile/black-matte-2-in.-hexagon-porcelain-mosaic-100782390.html'}
{'urlprodutos': 'https://www.flooranddecor.com/porcelain-tile/tauleto-bianco-polished-porcelain-tile-100572437.html'}
{'urlprodutos': 'https://www.flooranddecor.com/porcelain-tile/tauleto-bianco-matte-porcelain-tile-100572429.html'}
{'urlprodutos': 'https://www.flooranddecor.com/porcelain-tile/belucci-bianca-polished-porcelain-tile-100966621.html'}
{'urlprodutos': 'https://www.flooranddecor.com/porcelain-tile/concept-g

In [6]:
from typing import List
import requests
from bs4 import BeautifulSoup

def extract_product_details(url_produto):
    """Download a product page and scrape its details into a flat dict.

    Fields are added only when the matching element exists on the page, so
    the set of keys varies from product to product. The finished record is
    also printed.

    Args:
        url_produto: Absolute URL of the product page.

    Returns:
        dict: Always contains ``"urlprodutos"``. Depending on the page it may
        also contain ``nome``, ``precos``, ``sku``, ``dimensao``,
        ``Categoria<i>``, ``imagem<i>``, one key per specification label,
        ``ProductDetails``, ``installation``, ``CairMaintence`` and
        ``UserGuide``. If the request fails, only ``"urlprodutos"`` is set.
    """
    dict_item = {"urlprodutos": url_produto}

    try:
        # Bound the wait; requests.Timeout is a RequestException and is
        # handled by the except clause below.
        response = requests.get(url_produto, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
    except requests.RequestException as e:
        print(f"Erro na requisição para {url_produto}: {e}")
        return dict_item

    nome = soup.select_one("div.b-pdp_main-title > h1")
    if nome is not None:
        dict_item['nome'] = nome.text.strip()

    preco = soup.select_one("div.b-pdp_price > span")
    if preco is not None:
        dict_item['precos'] = preco.text.strip()

    sku = soup.select_one("div.b-pdp_main-sku > div")
    if sku is not None:
        # Keep only what follows the last ":" in the SKU text.
        dict_item['sku'] = sku.text.strip().split(":")[-1].strip()

    dimensao = soup.select_one("div.b-pdp_main-size > div")
    if dimensao is not None:
        dict_item['dimensao'] = dimensao.text.strip()

    categorias = soup.select('div.b-pdp_categories-container > div')
    for i, categoria in enumerate(categorias):
        dict_item[f'Categoria{i}'] = categoria.text.strip()

    imagens = soup.select('img.b-pdp_thumbnail-figure_img')
    for i, imagem in enumerate(imagens):
        # .get avoids a KeyError on <img> tags that carry no src attribute.
        src = imagem.get('src')
        if src:
            dict_item[f'imagem{i}'] = src

    # Specification spans are consumed pairwise as (label, value).
    # NOTE(review): the sample output shows tooltip text ending up inside
    # labels (e.g. "PEI Rating"), so nested spans appear to shift the pairing
    # - verify against the page markup.
    specifications = soup.select('section.b-pdp_specifications-container article span')
    for i in range(0, len(specifications) - 1, 2):
        referencia = specifications[i].text.strip()
        valor = specifications[i + 1].text.strip()
        dict_item[referencia] = valor

    product_details = soup.select_one('section.b-pdp_details-container > div > div > div > p')
    if product_details is not None:
        dict_item["ProductDetails"] = product_details.text.strip()

    installation = soup.select_one('figure.b-products_install-figure > img')
    if installation is not None and installation.get('src'):
        dict_item["installation"] = installation['src']

    # Filter by text in Python instead of the ":contains()" CSS pseudo-class,
    # which Soup Sieve has deprecated; the first matching link is used.
    cair_maintenance = next(
        (
            link
            for link in soup.select('a.b-products_install-title')
            if "Care & Maintenance" in link.get_text()
        ),
        None,
    )
    if cair_maintenance is not None and cair_maintenance.get('href'):
        dict_item['CairMaintence'] = cair_maintenance['href']

    # NOTE(review): this selector is identical to the "installation" one above,
    # so UserGuide always equals installation - likely a copy-paste slip;
    # confirm the intended element.
    user_guide = soup.select_one('figure.b-products_install-figure > img')
    if user_guide is not None and user_guide.get('src'):
        dict_item["UserGuide"] = user_guide['src']

    print(dict_item)
    return dict_item

def extract_product_details_from_list(urls: List[str]) -> List[dict]:
    """Scrape every URL in ``urls`` and return the records in the same order.

    Args:
        urls: Product page URLs.

    Returns:
        One dict per URL, as produced by ``extract_product_details``.
    """
    return [extract_product_details(endereco) for endereco in urls]

# Two hard-coded product pages used to try the scraper outside Spark.
lista_urls = [
    'https://www.flooranddecor.com/porcelain-tile/tarsus-almond-ii-polished-porcelain-bullnose-100618594.html',
    'https://www.flooranddecor.com/porcelain-tile/nepal-gray-porcelain-bullnose-100248137.html'
    
]

# Scrape the pages sequentially on the driver; each record is also printed
# by extract_product_details as it is produced.
resultado = extract_product_details_from_list(lista_urls)




{'urlprodutos': 'https://www.flooranddecor.com/porcelain-tile/tarsus-almond-ii-polished-porcelain-bullnose-100618594.html', 'precos': '$3.46 / piece', 'imagem0': 'https://i8.amplience.net/i/flooranddecor/100618594_tarsus-almond-ii-polished-porcelain-bullnose_1?fmt=auto&qlt=85', 'Size': '3 x 12', 'Product Length': '11.80', 'Product Width': '3.20', 'Product Thickness': '10mm', 'Box Length': '12', 'Box Width': '3.34', 'Box Weight': '13.23 lbs', 'Box Quantity': '12', 'Coverage (sqft/pc)': '0.26', 'PEI Rating\n\n\n\nP.E.I. rating measures Tile design durability from 1-5:P.E.I. of 3 & above is suitable for home useP.E.I. of 5 is suitable for commercial & high traffic areas': 'P.E.I. rating measures Tile design durability from 1-5:P.E.I. of 3 & above is suitable for home useP.E.I. of 5 is suitable for commercial & high traffic areas', 'P.E.I. rating measures Tile design durability from 1-5:P.E.I. of 3 & above is suitable for home useP.E.I. of 5 is suitable for commercial & high traffic areas'