# Web Scraping Mercado Livre


In [1]:
# Web scraping Mercado Livre (busca)
# Obs.: o Mercado Livre muda o HTML com frequência; por isso, priorizamos JSON-LD (SEO) como fonte principal.

import json
import re
import time
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime

# HTTP headers sent with every request made by fetch_html: a desktop Chrome
# User-Agent, a Brazilian-Portuguese-first Accept-Language and an HTML-first
# Accept header.
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    "Accept-Language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}


def fetch_html(url, session=None, timeout=20, retries=3, backoff_s=1.5):
    """Download ``url`` and return the response body as text, retrying on failure.

    Args:
        url: Absolute URL to fetch.
        session: Optional ``requests.Session`` to reuse. When omitted, a
            temporary session is created and closed before returning.
        timeout: Per-request timeout in seconds, forwarded to ``session.get``.
        retries: Maximum number of attempts.
        backoff_s: Base delay in seconds. The wait before retry number ``k``
            is ``backoff_s * k`` (linear backoff).

    Returns:
        The response text, decoded with the detected (apparent) encoding when
        one is available.

    Raises:
        RuntimeError: If no attempt succeeded. The message carries the last
            error seen (exception or HTTP status) and, when that error was an
            exception, it is chained as ``__cause__``.
    """
    owns_session = session is None
    if owns_session:
        session = requests.Session()

    last_error = None
    try:
        for attempt in range(retries):
            if attempt:
                # Back off only *between* attempts; sleeping after the final
                # failure would just delay the error.
                time.sleep(backoff_s * attempt)

            try:
                resp = session.get(url, headers=DEFAULT_HEADERS, timeout=timeout)

                # Rate limiting / maintenance: remember the status so the final
                # error message is meaningful, then retry.
                if resp.status_code in (429, 503):
                    last_error = f"HTTP {resp.status_code}"
                    continue

                resp.raise_for_status()
            except requests.RequestException as exc:
                last_error = exc
                continue

            resp.encoding = resp.apparent_encoding or resp.encoding
            return resp.text
    finally:
        # Only close sessions created here; a caller-supplied one stays open.
        if owns_session:
            session.close()

    cause = last_error if isinstance(last_error, BaseException) else None
    raise RuntimeError(f"Falha ao baixar a página: {url}. Erro: {last_error}") from cause


def build_search_url(query, offset=1):
    """Return the Mercado Livre listing URL for ``query``.

    The query is URL-encoded and its spaces are turned into hyphens. When
    ``offset`` is greater than 1, the ``_Desde_<offset>`` pagination suffix is
    appended; otherwise the plain listing URL is returned.
    """
    encoded = quote_plus(query)
    suffix = f"_Desde_{offset}" if offset and offset > 1 else ""
    return "https://lista.mercadolivre.com.br/" + encoded.replace("+", "-") + suffix


def _to_number_br(value):
    if value is None:
        return None
    if isinstance(value, (int, float)):
        return float(value)

    s = str(value)
    s = re.sub(r"[^0-9,\.]", "", s)

    # Formato comum no BR: 1.234,56
    if s.count(",") == 1 and s.count(".") >= 1:
        s = s.replace(".", "").replace(",", ".")
    else:
        s = s.replace(",", ".")

    try:
        return float(s)
    except ValueError:
        return None


def parse_products_from_jsonld(soup):
    """Extract search-result products from the page's JSON-LD ``<script>`` tags.

    Every ``application/ld+json`` script is parsed; objects that carry an
    ``itemListElement`` list contribute one product per element whose ``item``
    is a dict. ``offers`` may be a single object or a list of objects (the
    first dict is used), matching how the product-detail parser reads it.

    Args:
        soup: Parsed listing page (BeautifulSoup document).

    Returns:
        A list of dicts with keys ``TITLE``, ``PRICE``, ``URL`` and
        ``CURRENCY``, de-duplicated by URL (or by title when the URL is
        missing). Entries with neither URL nor title are dropped.
    """
    products = []

    for script in soup.find_all("script", attrs={"type": "application/ld+json"}):
        raw = (script.string or script.get_text() or "").strip()
        if not raw:
            continue

        try:
            data = json.loads(raw)
        except json.JSONDecodeError:
            # Malformed JSON-LD blocks are skipped; other scripts may be fine.
            continue

        candidates = data if isinstance(data, list) else [data]
        for obj in candidates:
            if not isinstance(obj, dict):
                continue

            item_list = obj.get("itemListElement")
            if not isinstance(item_list, list):
                continue

            for el in item_list:
                item = el.get("item") if isinstance(el, dict) else None
                if not isinstance(item, dict):
                    continue

                # Accept both a single offer object and a list of offers.
                offers = _first_dict(item.get("offers"))

                products.append(
                    {
                        "TITLE": item.get("name"),
                        "PRICE": _to_number_br(offers.get("price")),
                        "URL": item.get("url"),
                        "CURRENCY": offers.get("priceCurrency"),
                    }
                )

    # De-duplicate by URL (or by title when the URL is missing); first wins.
    dedup = {}
    for p in products:
        key = p.get("URL") or p.get("TITLE")
        if key:
            dedup.setdefault(key, p)
    return list(dedup.values())


def parse_products_from_dom(soup):
    """Scrape products directly from the listing's result cards.

    Each ``li`` result card yields one dict with ``TITLE``, ``PRICE``, ``URL``
    and ``CURRENCY`` (always ``"BRL"``). Cards without a title element are
    skipped; a missing link or price element yields ``None`` for that field.
    """
    title_selector = (
        "h2.ui-search-item__title, h2.poly-component__title, a.poly-component__title"
    )
    link_selector = "a.ui-search-link, a.poly-component__title"
    price_selector = (
        "span.andes-money-amount__fraction, div.andes-money-amount-combo__main-container"
    )

    rows = []
    cards = soup.select("li.ui-search-layout__item, li.ui-search-layout__stack") or []
    for card in cards:
        heading = card.select_one(title_selector)
        if not heading:
            continue

        anchor = card.select_one(link_selector)
        amount = card.select_one(price_selector)
        price_text = amount.get_text(" ", strip=True) if amount else None

        rows.append(
            {
                "TITLE": heading.get_text(strip=True),
                "PRICE": _to_number_br(price_text),
                "URL": anchor.get("href") if anchor else None,
                "CURRENCY": "BRL",
            }
        )

    return rows


def _first_dict(value):
    if isinstance(value, dict):
        return value
    if isinstance(value, list) and value and isinstance(value[0], dict):
        return value[0]
    return {}


def parse_product_details_from_jsonld(soup):
    """Extract category, rating and availability from a product page's JSON-LD.

    The first ``Product`` object and the first ``BreadcrumbList`` object found
    across all ``application/ld+json`` scripts are used.

    Returns:
        A dict with keys ``CATEGORY`` (breadcrumb names joined by ``" > "``,
        or the product's ``category`` string as fallback), ``RATING``
        (aggregate rating value as float), ``IN_STOCK`` (1, 0 or ``None``)
        and ``AVAILABILITY`` (the raw availability string). Fields that
        cannot be determined are ``None``.
    """
    found = {"Product": None, "BreadcrumbList": None}

    for tag in soup.find_all("script", attrs={"type": "application/ld+json"}):
        payload = (tag.string or tag.get_text() or "").strip()
        if not payload:
            continue
        try:
            parsed = json.loads(payload)
        except json.JSONDecodeError:
            continue

        for node in parsed if isinstance(parsed, list) else [parsed]:
            if not isinstance(node, dict):
                continue
            kind = node.get("@type")
            # Keep only the first object of each wanted type.
            if isinstance(kind, str) and kind in found and found[kind] is None:
                found[kind] = node

    product_node = found["Product"]
    crumbs_node = found["BreadcrumbList"]

    # Category: prefer the breadcrumb trail when one exists.
    category = None
    trail = crumbs_node.get("itemListElement") if crumbs_node is not None else None
    if isinstance(trail, list) and trail:
        labels = []
        for entry in trail:
            target = entry.get("item") if isinstance(entry, dict) else None
            label = target.get("name") if isinstance(target, dict) else None
            if label:
                labels.append(str(label).strip())
        # The last crumb is normally the product itself; drop it when there
        # is more than one crumb.
        if len(labels) > 1:
            labels.pop()
        category = " > ".join(labels) if labels else None

    details = {"CATEGORY": category, "RATING": None, "IN_STOCK": None, "AVAILABILITY": None}
    if product_node is None:
        return details

    # Fallback: Product.category, when present.
    if details["CATEGORY"] is None:
        raw_category = product_node.get("category")
        if isinstance(raw_category, str) and raw_category.strip():
            details["CATEGORY"] = raw_category.strip()

    # Rating
    rating_node = product_node.get("aggregateRating")
    if isinstance(rating_node, dict):
        details["RATING"] = _to_number_br(rating_node.get("ratingValue"))

    # Availability
    availability = _first_dict(product_node.get("offers")).get("availability")
    if isinstance(availability, str) and availability:
        details["AVAILABILITY"] = availability
        for suffix, flag in (("InStock", 1), ("OutOfStock", 0)):
            if availability.endswith(suffix):
                details["IN_STOCK"] = flag
                break

    return details


def enrich_with_details(df, max_items=30, sleep_s=1.0):
    """Visit product pages and add category / rating / stock columns to ``df``.

    Args:
        df: DataFrame with a ``URL`` column (as produced by ``scrape_search``).
        max_items: Maximum number of distinct product URLs to visit; ``None``
            visits all of them.
        sleep_s: Pause in seconds after each product page request.

    Returns:
        ``df`` itself, unchanged, when it is empty or has no ``URL`` column.
        Otherwise a new DataFrame equal to ``df`` left-joined on ``URL`` with
        the columns ``CATEGORY``, ``RATING``, ``IN_STOCK`` and
        ``AVAILABILITY``. Those four columns are always present, holding
        missing values for rows whose page was not visited. Any pre-existing
        columns with those names are replaced.

    Raises:
        RuntimeError: Propagated from ``fetch_html`` when a page cannot be
            downloaded.
    """
    if df.empty or "URL" not in df.columns:
        return df

    detail_columns = ["CATEGORY", "RATING", "IN_STOCK", "AVAILABILITY"]

    urls = [
        u
        for u in df["URL"].dropna().unique().tolist()
        if isinstance(u, str) and u.startswith("http")
    ]
    if max_items is not None:
        urls = urls[: int(max_items)]

    details_by_url = {}
    if urls:
        # The context manager closes the connection pool even if a fetch fails.
        with requests.Session() as session:
            for url in urls:
                html = fetch_html(url, session=session)
                soup = BeautifulSoup(html, "html.parser")
                details_by_url[url] = parse_product_details_from_jsonld(soup)
                time.sleep(sleep_s)

    # Explicit columns guarantee the schema even when no page was visited.
    rows = [{"URL": url, **details} for url, details in details_by_url.items()]
    details_df = pd.DataFrame(rows, columns=["URL", *detail_columns])

    # Drop stale detail columns so re-enriching does not create _x/_y suffixes.
    base = df.drop(columns=[c for c in detail_columns if c in df.columns])
    out = base.merge(details_df, on="URL", how="left")
    out["RATING"] = pd.to_numeric(out["RATING"], errors="coerce")
    out["IN_STOCK"] = pd.to_numeric(out["IN_STOCK"], errors="coerce")
    return out


def scrape_search(query, pages=1, sleep_s=1.0):
    """Scrape Mercado Livre search results for ``query`` into a DataFrame.

    Each page is parsed from JSON-LD first, falling back to the DOM parser
    when JSON-LD yields nothing.

    Args:
        query: Search term.
        pages: Number of result pages to request (page ``k`` uses offset
            ``1 + k * 50``). ``0`` or less requests nothing.
        sleep_s: Pause in seconds between consecutive page requests.

    Returns:
        DataFrame with columns ``TITLE``, ``PRICE`` (numeric), ``URL``,
        ``CURRENCY``, ``SOURCE_URL`` and ``SCRAPY_DATETIME`` (local timestamp
        string). Rows without a title are dropped. The frame is empty, but
        has all columns, when no page was requested.

    Raises:
        RuntimeError: If the first page yields no products, or if a page
            cannot be downloaded (propagated from ``fetch_html``).

    Warns:
        RuntimeWarning: If a later page yields no products. Pagination stops
            and the products collected so far are returned instead of being
            discarded.
    """
    import warnings

    columns = ["TITLE", "PRICE", "URL", "CURRENCY", "SOURCE_URL"]
    all_products = []

    if pages > 0:
        # The context manager closes the connection pool even on failure.
        with requests.Session() as session:
            for page in range(pages):
                if page:
                    # Pause only between requests, not after the last one.
                    time.sleep(sleep_s)

                offset = 1 + page * 50
                url = build_search_url(query, offset=offset)

                html = fetch_html(url, session=session)
                soup = BeautifulSoup(html, "html.parser")

                products = parse_products_from_jsonld(soup) or parse_products_from_dom(soup)
                if not products:
                    if page == 0:
                        raise RuntimeError(
                            "Não foi possível extrair produtos desta página. "
                            "O HTML pode ter mudado ou a requisição foi bloqueada."
                        )
                    # Likely past the last results page: keep what we have.
                    warnings.warn(
                        f"Nenhum produto extraído de {url}; paginação interrompida.",
                        RuntimeWarning,
                        stacklevel=2,
                    )
                    break

                for p in products:
                    p["SOURCE_URL"] = url
                all_products.extend(products)

    # Explicit columns keep the schema stable even with zero rows.
    df = pd.DataFrame(all_products, columns=columns)
    df["SCRAPY_DATETIME"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    df["PRICE"] = pd.to_numeric(df["PRICE"], errors="coerce")
    df = df.dropna(subset=["TITLE"]).reset_index(drop=True)
    return df


# Example: search for a term and paginate through the results
df_mercado_livre = scrape_search(query="iphone", pages=3, sleep_s=1.0)

# Enrichment (category, rating and availability) via the product detail pages
df_mercado_livre = enrich_with_details(df_mercado_livre, max_items=30, sleep_s=1.0)

# Minimum requested fields
df_mercado_livre[["TITLE", "PRICE", "CATEGORY", "RATING", "IN_STOCK"]].head(10)


Unnamed: 0,TITLE,PRICE,CATEGORY,RATING,IN_STOCK
0,Apple iPhone 13 (128gb) Preto Novo + Acessórios,3.729,Celulares e Telefones,,1.0
1,Apple iPhone 13 (128gb) Branco Novo + Acessórios,3.699,Celulares e Telefones,,1.0
2,iPhone 16e (128 GB) - Preto - Distribuidor Aut...,5.799,Celulares e Telefones > Celulares e Smartphones,4.9,1.0
3,Apple iPhone 16 (128 GB) - Preto - Distribuido...,7.799,Celulares e Telefones > Celulares e Smartphones,4.9,1.0
4,Apple iPhone 15 (128 GB) - Preto - Distribuido...,7.209,Celulares e Telefones > Celulares e Smartphones,4.9,1.0
5,iPhone 17 de 256 GB - Lavanda - Distribuidor A...,7.999,Celulares e Telefones,4.9,1.0
6,Apple iPhone 16 (256 GB) - Ultramarino - Distr...,9.538,Celulares e Telefones > Celulares e Smartphones,4.9,1.0
7,Apple iPhone 15 verde 256 GB - Distribuidor Au...,8.095,Celulares e Telefones,4.9,1.0
8,iPhone 17 Pro Max 256GB - Prateado - Distribui...,12.499,Celulares e Telefones,4.9,1.0
9,iPhone 17 Pro 256GB - Azul-profundo - Distribu...,11.499,Celulares e Telefones,4.9,1.0
