Skip to content

Main_v5.2 #1

@eL2Walker2

Description

@eL2Walker2

#!/usr/bin/env python3

# -*- coding: utf-8 -*-

"""
Ozon Parser v5.2 — фикс цен, глубокий сбор карточек, чистый CSV

Функциональность:

  • Сбор карточек Ozon (title, url, price_text, price_num)
  • Улучшенный поиск цен (поддержка ui-price__price, b5v2, a1y5 и др.)
  • Удаление дублей и нечитабельных строк
  • Глубокая прокрутка страницы (до конца)
  • Поддержка прокси (--proxy user:pass@ip:port)
  • Поддержка профиля (--persist)
  • CSV в кодировке UTF-8 с цифрами цены отдельно
    """

import argparse
import re
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional

import pandas as pd
from playwright.sync_api import sync_playwright, Page, Frame

SEARCH_URL = "https://www.ozon.ru/search/?text={query}"
ACCEPT_LANG = "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7"

def log(msg: str) -> None:
    """Print a status message prefixed with ``[ozon]`` to stderr.

    Stderr is used so that log output does not mix with any data the
    script might emit on stdout.
    """
    print(f"[ozon] {msg}", file=sys.stderr)

def deep_scroll(page: Page, total_steps: int = 40, step_px: int = 2000, wait_ms: int = 1000) -> None:
    """Scroll the page down in fixed steps so lazy-loaded products render.

    Args:
        page: Playwright page to scroll.
        total_steps: Number of wheel events to send.
        step_px: Vertical distance per wheel event, in pixels.
        wait_ms: Pause after each step, giving the site time to load tiles.
    """
    # The loop index is irrelevant — only the number of repetitions matters.
    for _ in range(total_steps):
        page.mouse.wheel(0, step_px)
        page.wait_for_timeout(wait_ms)

def parse_proxy(proxy: Optional[str]) -> Optional[dict]:
    """Convert ``user:pass@ip:port`` or ``ip:port`` into a Playwright proxy dict.

    Args:
        proxy: Proxy string from the CLI, or None/empty when no proxy is used.

    Returns:
        A dict suitable for Playwright's ``proxy=`` argument, or None when
        the input is empty or cannot be parsed (a warning is logged then).
    """
    if not proxy:
        return None
    try:
        if "@" in proxy:
            creds, addr = proxy.split("@", 1)
            # Split credentials only on the first ':' so passwords may contain ':'.
            user, pw = creds.split(":", 1)
            host, port = addr.split(":")
            return {"server": f"http://{host}:{port}", "username": user, "password": pw}
        host, port = proxy.split(":")
        return {"server": f"http://{host}:{port}"}
    except Exception:
        log("⚠️ не удалось распарсить прокси")
        return None

def extract_price_num(price_text: str) -> Optional[int]:
    """Extract the integer price from text such as ``"1 234 ₽"``.

    All digit runs are concatenated, so thousands separators (regular or
    non-breaking spaces) are handled without explicit replacement.

    Returns:
        The price as an int, or None when the text is empty or has no digits.
    """
    if not price_text:
        return None
    # findall(r"\d+") skips every non-digit, making the old
    # space-stripping .replace() calls unnecessary.
    groups = re.findall(r"\d+", price_text)
    return int("".join(groups)) if groups else None

def extract_from_frame(frame: Frame, query: str) -> List[Dict]:
    """Collect product cards (title, url, price) from a single frame.

    Evaluates a JS snippet inside the frame to gather candidate product
    anchors, then filters out entries without a title or a proper product
    URL and attaches the numeric price via extract_price_num().

    Args:
        frame: Playwright frame to scan.
        query: The search query, copied into every result row.

    Returns:
        Dicts with keys: query, title, url, price_text, price_num.
    """
    js = """
(() => {
const anchors = Array.from(document.querySelectorAll('a[href*="/product/"]'));
const out = [];
for (const a of anchors) {
const href = a.getAttribute('href') || '';
if (!href) continue;
const url = href.startsWith('http') ? href : ('https://www.ozon.ru' + href);
if (url.includes('advert') || url.includes('action')) continue;

let title = (a.getAttribute('aria-label') || a.getAttribute('title') || '').trim();
const tile = a.closest('div[data-widget*="tile"]') || a.closest('div');
if (!title && tile) {
  const t = tile.querySelector('span[class*="tsBody"], span[class*="tsBodyL"], span[class*="tsHeadline"]');
  title = (t && t.textContent || a.textContent || '').trim();
}

let price_text = '';
if (tile) {
  const p = tile.querySelector(
    'span[class*="ui-price__price"], span[class*="b5v2"], span[class*="a1y5"], ' +
    'span[class*="price"], span[class*="Price"], span[class*="_price"]'
  );
  price_text = (p && p.textContent || '').trim();
}

out.push({ title, url, price_text });

}
return out;
})();
"""
    try:
        data = frame.evaluate(js)
    except Exception:
        # Cross-origin or detached frames can refuse evaluation — skip them.
        return []

    out_clean: List[Dict] = []
    for item in data:
        title = (item.get("title") or "").strip()
        url = (item.get("url") or "").strip()
        price_text = (item.get("price_text") or "").strip()
        if not url or not title:
            continue
        # Keep only real product pages.
        if "ozon.ru/product/" not in url:
            continue
        out_clean.append({
            "query": query,
            "title": title,
            "url": url,
            "price_text": price_text,
            "price_num": extract_price_num(price_text),
        })
    return out_clean

def dedup(items: List[Dict]) -> List[Dict]:
    """Drop duplicate cards, keeping the first occurrence of each URL.

    Items with an empty or missing ``url`` are discarded entirely.
    """
    seen: set = set()
    out: List[Dict] = []
    for item in items:
        url = item.get("url")
        if not url or url in seen:
            continue
        seen.add(url)
        out.append(item)
    return out

def run(query: str, limit: int, out_csv: Path, headless: bool, persist: bool, proxy: Optional[str]) -> None:
    """Run one search query against Ozon and save the scraped cards to CSV.

    Args:
        query: Search phrase.
        limit: Maximum number of cards to keep; falsy keeps all.
        out_csv: Base CSV path; an HHMMSS timestamp is appended to the stem.
        headless: Launch the browser without a visible window.
        persist: Reuse a persistent profile stored in ozon_profile/.
        proxy: Optional proxy string, "user:pass@ip:port" or "ip:port".
    """
    with sync_playwright() as p:
        proxy_conf = parse_proxy(proxy)
        if proxy_conf:
            log(f"используется прокси: {proxy_conf['server']}")

        # Hide the automation flag that bot detection commonly checks.
        args = ["--disable-blink-features=AutomationControlled"]
        if persist:
            user_data_dir = str(Path("ozon_profile").absolute())
            context = p.chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                headless=headless,
                viewport={"width": 1366, "height": 900},
                java_script_enabled=True,
                locale="ru-RU",
                proxy=proxy_conf,
                args=args,
            )
            page = context.pages[0] if context.pages else context.new_page()
            browser = None
        else:
            browser = p.chromium.launch(headless=headless, args=args, proxy=proxy_conf)
            context = browser.new_context(
                viewport={"width": 1366, "height": 900},
                locale="ru-RU",
                java_script_enabled=True,
            )
            page = context.new_page()

        # try/finally guarantees the browser is closed even if scraping fails.
        try:
            url = SEARCH_URL.format(query=query.replace(" ", "+"))
            log(f"goto {url}")
            page.goto(url, wait_until="domcontentloaded", timeout=90000)
            page.wait_for_timeout(3000)
            deep_scroll(page, total_steps=40)
            log("прокрутка завершена")

            # Product tiles may live inside iframes, so scan every frame.
            all_items: List[Dict] = []
            for fr in page.frames:
                items = extract_from_frame(fr, query)
                if items:
                    log(f"  [frame:{fr.url[:60]}...] карточек: {len(items)}")
                all_items.extend(items)

            all_items = dedup(all_items)
            if limit:
                all_items = all_items[:limit]

            log(f"всего карточек после очистки: {len(all_items)}")
            df = pd.DataFrame(all_items)

            out_csv.parent.mkdir(parents=True, exist_ok=True)
            timestamp = datetime.now().strftime("%H%M%S")
            final = out_csv.with_name(f"{out_csv.stem}_{timestamp}{out_csv.suffix}")
            # utf-8-sig adds a BOM so Excel opens Cyrillic text correctly.
            df.to_csv(final, index=False, encoding="utf-8-sig")
            log(f"✅ сохранено в {final}")
        finally:
            context.close()
            if browser:
                browser.close()

def main() -> None:
    """Parse CLI arguments and launch the scraper."""
    ap = argparse.ArgumentParser(description="Ozon search parser v5.2 (clean prices)")
    ap.add_argument("--query", required=True, help="Поисковый запрос, например: 'гидрогелевая пленка iphone'")
    ap.add_argument("--limit", type=int, default=100, help="Ограничение карточек")
    ap.add_argument("--out", type=Path, default=Path("ozon_results.csv"))
    ap.add_argument("--headed", action="store_true", help="Браузер с окном")
    ap.add_argument("--persist", action="store_true", help="Постоянный профиль (ozon_profile/)")
    ap.add_argument("--proxy", type=str, help="HTTP-прокси (user:pass@ip:port или ip:port)")
    args = ap.parse_args()

    # --headed inverts to headless so the default run is windowless.
    run(
        query=args.query,
        limit=args.limit,
        out_csv=args.out,
        headless=not args.headed,
        persist=args.persist,
        proxy=args.proxy,
    )

if name == "main":
main()

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions