In [1]:
import os
from unidecode import unidecode
import json
import requests

In [2]:
data_path = os.path.join(os.getcwd(), 'data')
base_url = "https://api.mercadolibre.com"
categories_path = os.path.join(data_path, "input", "categories")
sellers_path = os.path.join(data_path, "input", "sellers")
seller_items_path = os.path.join(data_path, "input", "seller_items")
country_id_list = ["MLA", "MLB", "MLM", "MPY", "MPE", "MBO", "MLU"]
target_country_id = "MPY"

In [3]:
# -----------------------load and save data------------------------------------------------

# ---------------categories-------------------------
# verify if is soma file named categories.json and delete it from path

os.makedirs(categories_path, exist_ok=True)

for country_id in country_id_list:

    cat_path = os.path.join(categories_path, country_id)

    if os.path.exists(cat_path):

        continue

    else:

        os.makedirs(cat_path)

    # request all categories from site
    categories = requests.get(base_url + f"/sites/{country_id}/categories")

    categories = categories.json()

    # decoding strings for readability

    for category in categories:

        category["name"] = unidecode(category["name"])

    # saving categories
    with open(os.path.join(cat_path, "main_categories.json"), "w") as f:

        json.dump(categories, f)

    categories = None

# --------------------------------------------------

In [4]:
# ---------------sellers from top items per category-------------------------
# verify if is soma file named categories.json and delete it from path

os.makedirs(sellers_path, exist_ok=True)

cnt = 1

for country_id in country_id_list:

    print(f"Processing country: {country_id}, {cnt}/{len(country_id_list)}")

    sell_path = os.path.join(sellers_path, country_id)

    if os.path.exists(sell_path):

        continue

    else:

        os.makedirs(sell_path)

    with open(
        os.path.join(categories_path, country_id, "main_categories.json"), "r"
    ) as f:
        categories = json.load(f)

    counter = 1
    # get sellers from top 1k items for each category
    for category in categories:
        print(f"Processing category: {category['name']}, {counter}/{len(categories)}")
        counter += 1

        file_path = os.path.join(sell_path, f"{category['id']}.json")

        # Remove arquivo antigo se existir
        if os.path.exists(file_path):

            continue

        all_sellers = []  # Lista para armazenar os seller_id de todos os itens da categoria

        # Buscar até 1.000 itens por categoria
        for offset in range(0, 550, 50):
            response = requests.get(
                f"{base_url}/sites/{country_id}/search?category={category['id']}&offset={offset}"
            )
            sellers = response.json()

            # salvar apenas o seller id
            for seller in sellers.get("results", []):
                seller_id = seller.get("seller", {}).get("id", None)
                if seller_id:  # Verificar se o seller_id existe
                    all_sellers.append(seller_id)

        # Salvar todos os seller_ids de uma vez em formato JSON válido
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(all_sellers, f, ensure_ascii=False, indent=4)

    all_sellers, sellers = None, None

    cnt += 1
# --------------------------------------------------

Processing country: MLA, 1/7
Processing category: Accesorios para Vehiculos, 1/32
Processing category: Agro, 2/32
Processing category: Alimentos y Bebidas, 3/32
Processing category: Animales y Mascotas, 4/32
Processing category: Antiguedades y Colecciones, 5/32
Processing category: Arte, Libreria y Merceria, 6/32
Processing category: Autos, Motos y Otros, 7/32
Processing category: Bebes, 8/32
Processing category: Belleza y Cuidado Personal, 9/32
Processing category: Camaras y Accesorios, 10/32
Processing category: Celulares y Telefonos, 11/32
Processing category: Computacion, 12/32
Processing category: Consolas y Videojuegos, 13/32
Processing category: Construccion, 14/32
Processing category: Deportes y Fitness, 15/32
Processing category: Electrodomesticos y Aires Ac., 16/32
Processing category: Electronica, Audio y Video, 17/32
Processing category: Entradas para Eventos, 18/32
Processing category: Herramientas, 19/32
Processing category: Hogar, Muebles y Jardin, 20/32
Processing categ

In [3]:
# -----------------------TOP 1K ITEMS PER TOP SELERS--------------------------------------------
# Função para obter e processar itens de um seller
def get_seller_items(seller_id, country_id):
    
    os.makedirs(os.path.join(seller_items_path, country_id), exist_ok=True)
        
    # Criar um caminho para o arquivo do seller
    file_path = os.path.join(seller_items_path, country_id, f"{seller_id}_items.json")

    with open(file_path, "w", encoding="utf-8") as output_file:
        # Pegar até 1k items de cada seller
        for offset in range(0, 550, 50):
            url = f"{base_url}/sites/{country_id}/search?seller_id={seller_id}&offset={offset}"
            response = requests.get(url)
            items = response.json()

            if "results" not in items:
                break

            # Processar os itens
            for item in items["results"]:
                item_data = {
                    "seller_id": seller_id,
                    "category_id": item.get("category_id", ""),
                    "price": item.get("price", 0),
                    "original_price": item.get("original_price", item.get("price", 0)),
                    "discount": (
                        (item.get("original_price", 0) - item.get("price", 0))
                        / item.get("original_price", 1)
                        if item.get("original_price")
                        else 0
                    ),
                    "available_qty": item.get("available_quantity", 0),
                    "cataloged": (1 if item.get("catalog_product_id", None) else 0),
                    "free_shipping": (
                        1 if item.get("shipping", {}).get("free_shipping", False) else 0
                    ),
                    "condition": item.get("condition", ""),  # "new" or "used"
                    "installments": (
                        item.get("installments", {}).get("quantity", 0)
                        if item.get("installments")
                        else 0
                    ),
                    "buying_mode": item.get("buying_mode", ""),
                    "city": item.get("address", {}).get("city_name", ""),
                    "state": item.get("address", {}).get("state_name", ""),
                    "has_gtin": 0,
                    "num_attributes": 0,
                }

                # Verificar se tem GTIN e contar atributos com valor
                if "attributes" in item:
                    for attr in item["attributes"]:
                        if attr["id"] == "GTIN" and attr.get("value_name"):
                            item_data["has_gtin"] = 1
                        if attr.get("value_name"):
                            item_data["num_attributes"] += 1

                # Escrever cada item no arquivo
                json.dump(item_data, output_file)
                output_file.write("\n")

# INIT DIR IF NOT EXISTS
os.makedirs(seller_items_path, exist_ok=True)

# Initialize the dictionary to store the country_id as keys
country_files = {}

# Loop through each subfolder (country_id) in sellers_path
for country_id in os.listdir(sellers_path):

    country_path = os.path.join(sellers_path, country_id)

    # Ensure we're only processing directories
    if os.path.isdir(country_path):

        # List all JSON files in the current country folder
        json_files = [f for f in os.listdir(country_path) if f.endswith(".json")]
        # Add to the dictionary
        country_files[country_id] = json_files

sellers_ids = {}

# Create dict from sellers for each country
for country_id, files_list in country_files.items():

    # Initialize a list for each country
    if country_id not in sellers_ids:
        sellers_ids[country_id] = []

    for file in files_list:
        file_path = os.path.join(sellers_path, country_id, file)

        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Ensure data is a list before extending
        if isinstance(data, list):
            
            sellers_ids[country_id].extend(data)

# Remove duplicates from sellers_ids dictionary
for country_id in sellers_ids.keys():
    sellers_ids[country_id] = list(set(sellers_ids[country_id]))

existing_files = {}

for country_id in sellers_ids.keys():

    if not os.path.exists(os.path.join(seller_items_path, country_id)):

        continue 

    # Obter lista de sellers que ainda não têm arquivos salvos
    existing_files[country_id] = [
        f.split("_")[0]
        for f in os.listdir(os.path.join(seller_items_path,country_id))
        if f.endswith("_items.json")
    ]

for country_id in sellers_ids.keys():

    if country_id in existing_files.keys():  # Ensure the key exists before accessing it

        sellers_ids[country_id] = [
            seller_id
            for seller_id in sellers_ids[country_id]
            if str(seller_id) not in existing_files[country_id]
        ]

sum(len(sellers) for sellers in sellers_ids.values())

counter = 1
for country_id, s_ids in sellers_ids.items():

    # comment to run all countries
    if country_id != target_country_id:  # comment to run all countries

        continue  # comment to run all countries

    print(f"Processing country: {country_id}, {counter}/{len(sellers_ids)}")

    cnt=1

    for seller_id in s_ids:

        if cnt % 100 == 0 or cnt == 1:

            print(f"Processing seller: {seller_id}, {cnt}/{len(s_ids)}")

        get_seller_items(seller_id, country_id)
        cnt += 1

    counter += 1

Processing country: MPY, 1/7
Processing seller: 2049513472, 1/436
Processing seller: 1751562688, 100/436
Processing seller: 2215218070, 200/436
Processing seller: 1788650855, 300/436
Processing seller: 2253346642, 400/436
