In [1]:
import os
from unidecode import unidecode
import json
import requests
import pandas as pd

In [2]:
# Remote API root and the local directory layout used for the raw downloads.
base_url = "https://api.mercadolibre.com"

data_path = os.path.join(os.getcwd(), "data")
input_dir = os.path.join(data_path, "input")  # shared parent of every raw input

sellers_path = os.path.join(input_dir, "sellers")
seller_items_path = os.path.join(input_dir, "seller_items")
categories_path = os.path.join(input_dir, "categories", "main_categories.json")

In [None]:
# ----------------------- load and save data -----------------------

# --------------- categories ---------------
# Download the top-level category list of the MLA site and cache it as JSON
# at `categories_path`.

# Make sure the target directory exists. A previous cache file is overwritten
# further down, and only after the download has succeeded, so a failed request
# never destroys the existing cache.
os.makedirs(os.path.dirname(categories_path), exist_ok=True)

# Request all categories from the site. Fail loudly on an HTTP error instead of
# iterating over an error payload as if it were the category list.
response = requests.get(f"{base_url}/sites/MLA/categories", timeout=30)
response.raise_for_status()
categories = response.json()

# Transliterate the names to plain ASCII for readability.
for category in categories:
    category["name"] = unidecode(category["name"])

# Save the categories (opening with "w" replaces any previous file).
with open(categories_path, "w", encoding="utf-8") as f:
    json.dump(categories, f)

# Release the in-memory list; the data now lives in the cache file.
categories = None

# --------------------------------------------------

In [None]:
# --------------- sellers from top items per category ---------------
# For every cached category, page through the site search and store the seller
# id of each returned item in <sellers_path>/<category id>.json.

os.makedirs(sellers_path, exist_ok=True)

with open(categories_path, "r") as f:
    categories = json.load(f)

# Get the sellers of (up to) the top 1k items of each category.
for counter, category in enumerate(categories, start=1):
    print(f"Processing category: {category['name']}, {counter}/{len(categories)}")

    # Opening with "w" below replaces any previous file for this category.
    file_path = os.path.join(sellers_path, f"{category['id']}.json")

    all_sellers = []  # seller ids of every item seen in this category

    # Fetch up to 1,000 items per category: 20 pages of 50 items.
    for offset in range(0, 951, 50):
        response = requests.get(
            f"{base_url}/sites/MLA/search?category={category['id']}&offset={offset}&limit=50",
            timeout=30,
        )
        if not response.ok:
            # Report the failure instead of silently treating it as an empty page.
            print(
                f"  HTTP {response.status_code} at offset {offset}; "
                "skipping the remaining pages of this category"
            )
            break

        sellers = response.json()
        results = sellers.get("results", [])
        if not results:
            # An empty page means the listing is exhausted; further requests
            # would only return more empty pages.
            break

        # Keep only the seller id of each item.
        for item in results:
            # `or {}` also covers an explicit null "seller" value.
            seller_id = (item.get("seller") or {}).get("id")
            if seller_id:
                all_sellers.append(seller_id)

    # Write all seller ids at once as a valid JSON array.
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(all_sellers, f, ensure_ascii=False, indent=4)

# Release the per-category buffers.
all_sellers, sellers = None, None
# --------------------------------------------------

Processing category: Accesorios para Vehiculos, 1/32
Processing category: Agro, 2/32
Processing category: Alimentos y Bebidas, 3/32
Processing category: Animales y Mascotas, 4/32
Processing category: Antiguedades y Colecciones, 5/32
Processing category: Arte, Libreria y Merceria, 6/32
Processing category: Autos, Motos y Otros, 7/32
Processing category: Bebes, 8/32
Processing category: Belleza y Cuidado Personal, 9/32
Processing category: Camaras y Accesorios, 10/32
Processing category: Celulares y Telefonos, 11/32
Processing category: Computacion, 12/32
Processing category: Consolas y Videojuegos, 13/32
Processing category: Construccion, 14/32
Processing category: Deportes y Fitness, 15/32
Processing category: Electrodomesticos y Aires Ac., 16/32
Processing category: Electronica, Audio y Video, 17/32
Processing category: Entradas para Eventos, 18/32
Processing category: Herramientas, 19/32
Processing category: Hogar, Muebles y Jardin, 20/32
Processing category: Industrias y Oficinas, 2

In [None]:
# ----------------------- TOP 1K ITEMS PER TOP SELLER -----------------------
# Function to fetch and process the items of one seller
def get_seller_items(seller_id, seller_items_path=seller_items_path):
    """Download up to 1,000 listings of one seller and save them as JSON Lines.

    The search endpoint is paged 50 items at a time. Every item is flattened
    with `_build_item_record` and written, one JSON object per line, to
    ``<seller_items_path>/<seller_id>_items.json``.

    The file is written only after all pages were fetched. If a request or the
    JSON decoding raises, no file is created, so a partially downloaded seller
    is not mistaken for a finished one by code that checks for the file.

    Args:
        seller_id: Seller identifier interpolated into the search URL and
            into the output file name.
        seller_items_path: Directory that receives the output file.

    Returns:
        None.
    """
    file_path = os.path.join(seller_items_path, f"{seller_id}_items.json")

    records = []
    # Up to 1k items per seller: 20 pages of 50 items.
    for offset in range(0, 951, 50):
        url = f"{base_url}/sites/MLA/search?seller_id={seller_id}&offset={offset}&limit=50"
        response = requests.get(url, timeout=30)
        items = response.json()

        # A payload without "results" is not a result page; stop paging.
        if "results" not in items:
            break
        # An empty page means the seller has no more items.
        if not items["results"]:
            break

        records.extend(_build_item_record(seller_id, item) for item in items["results"])

    # One JSON document per line.
    with open(file_path, "w", encoding="utf-8") as output_file:
        for record in records:
            json.dump(record, output_file)
            output_file.write("\n")


def _build_item_record(seller_id, item):
    """Flatten one search-result item into the record stored for a seller."""
    price = item.get("price", 0)
    listed_original = item.get("original_price")

    # Nested objects may be missing or explicitly null; treat both as empty.
    shipping = item.get("shipping") or {}
    installments = item.get("installments") or {}
    address = item.get("address") or {}

    # Relative discount; 0 when there is no (non-zero) original price.
    if listed_original:
        discount = (listed_original - (price or 0)) / listed_original
    else:
        discount = 0

    # Only attributes that actually carry a value are counted.
    valued_attributes = [
        attr for attr in (item.get("attributes") or []) if attr.get("value_name")
    ]

    return {
        "seller_id": seller_id,
        "category_id": item.get("category_id", ""),
        "price": price,
        # Fall back to the current price when the original price is absent or null.
        "original_price": listed_original or price,
        "discount": discount,
        "available_qty": item.get("available_quantity", 0),
        "cataloged": 1 if item.get("catalog_product_id") else 0,
        "free_shipping": 1 if shipping.get("free_shipping", False) else 0,
        "condition": item.get("condition", ""),  # "new" or "used"
        "installments": installments.get("quantity", 0),
        "buying_mode": item.get("buying_mode", ""),
        "city": address.get("city_name", ""),
        "state": address.get("state_name", ""),
        "has_gtin": 1 if any(attr.get("id") == "GTIN" for attr in valued_attributes) else 0,
        "num_attributes": len(valued_attributes),
    }

# Make sure the output directory exists.
os.makedirs(seller_items_path, exist_ok=True)

# List all per-category seller files.
files = [f for f in os.listdir(sellers_path) if f.endswith(".json")]

# Gather the seller ids stored in every file.
sellers_ids = []
for file in files:
    with open(os.path.join(sellers_path, file), "r", encoding="utf-8") as f:
        sellers_ids.extend(json.load(f))

sellers_ids = list(set(sellers_ids))  # Remove duplicates

# Keep only the sellers that do not have a saved items file yet, so an
# interrupted run can be resumed. Output files are named "<seller_id>_items.json".
existing_files = {
    f.split("_")[0] for f in os.listdir(seller_items_path) if f.endswith("_items.json")
}
sellers_ids = [
    seller_id for seller_id in sellers_ids if str(seller_id) not in existing_files
]
# A bare expression in the middle of a cell is not displayed, so print the count.
print(f"Sellers still to download: {len(sellers_ids)}")

for seller_id in sellers_ids:
    get_seller_items(seller_id)