#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Ozon Parser v5.2 — фикс цен, глубокий сбор карточек, чистый CSV
Функциональность:
- Сбор карточек Ozon (title, url, price_text, price_num)
- Улучшенный поиск цен (поддержка ui-price__price, b5v2, a1y5 и др.)
- Удаление дублей и нечитабельных строк
- Глубокая прокрутка страницы (до конца)
- Поддержка прокси (--proxy user:pass@ip:port)
- Поддержка профиля (--persist)
- CSV в кодировке UTF-8 с цифрами цены отдельно
"""
import argparse
import re
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional
import pandas as pd
from playwright.sync_api import sync_playwright, Page, Frame
# Ozon search URL template; ``{query}`` is filled in with str.format() in run().
SEARCH_URL = "https://www.ozon.ru/search/?text={query}"
# Looks like an Accept-Language header value.
# NOTE(review): not referenced anywhere in the visible code - confirm whether it is still needed.
ACCEPT_LANG = "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7"
def log(msg: str) -> None:
    """Write a diagnostic message to stderr, prefixed with ``[ozon]``.

    Args:
        msg: Text to print after the prefix.
    """
    print(f"[ozon] {msg}", file=sys.stderr)
def deep_scroll(page: Page, total_steps: int = 40, step_px: int = 2000, wait_ms: int = 1000) -> None:
    """Scroll the page down in fixed steps so more product tiles get loaded.

    The function always performs exactly ``total_steps`` wheel events; it does
    not check whether the bottom of the page has been reached.

    Args:
        page: Playwright page to scroll.
        total_steps: Number of wheel events to send.
        step_px: Vertical distance of each wheel event, in pixels.
        wait_ms: Pause after each wheel event, in milliseconds.
    """
    for _ in range(total_steps):
        page.mouse.wheel(0, step_px)
        # Pause after every step before sending the next wheel event.
        page.wait_for_timeout(wait_ms)
def parse_proxy(proxy: Optional[str]) -> Optional[Dict[str, str]]:
    """Convert a ``--proxy`` value into a Playwright proxy settings dict.

    Accepted forms are ``host:port`` and ``user:pass@host:port``. The server
    entry is always emitted with an ``http://`` scheme.

    Args:
        proxy: Raw proxy string, or ``None``/empty when no proxy is wanted.

    Returns:
        ``{"server": ...}`` (plus ``"username"``/``"password"`` when
        credentials are present), or ``None`` when *proxy* is empty or cannot
        be parsed. A parse failure is logged, not raised.
    """
    if not proxy:
        return None
    try:
        # Split on the LAST "@" so a password that itself contains "@" stays intact.
        creds, at, addr = proxy.rpartition("@")
        host, port = addr.split(":")
        conf = {"server": f"http://{host}:{port}"}
        if at:
            user, pw = creds.split(":", 1)
            conf["username"] = user
            conf["password"] = pw
        return conf
    except ValueError:
        # Unpacking fails when a ":" separator is missing or duplicated.
        log("неверный формат прокси (ожидается user:pass@ip:port или ip:port), прокси не используется")
        return None
def extract_price_num(price_text: Optional[str]) -> Optional[int]:
    """Return the first price found in *price_text* as an integer.

    Digit groups separated only by whitespace (regular, non-breaking or thin
    spaces) are treated as one number, so ``"1 299 ₽"`` yields ``1299``.
    Parsing stops at the first character that is neither a digit nor
    whitespace; a second number in the same string (for example an old price)
    or a fractional part after a comma is therefore not glued onto the result.

    Args:
        price_text: Raw price text scraped from the page; may be empty or None.

    Returns:
        The parsed integer, or ``None`` when the text contains no digits.
    """
    if not price_text:
        return None
    # ``\s`` matches Unicode whitespace, which covers the thousands separators
    # used in formatted prices.
    match = re.search(r"\d[\d\s]*", price_text)
    if match is None:
        return None
    # str.split() with no argument also splits on Unicode whitespace.
    return int("".join(match.group().split()))
def extract_from_frame(frame: Frame, query: str) -> List[Dict]:
    """Collect product cards from a single frame.

    A JavaScript snippet run inside the frame gathers every ``/product/``
    link together with a title and a price text taken from the link or its
    enclosing tile. The raw records are then filtered and normalised here.

    Args:
        frame: Playwright frame to inspect.
        query: Search query, copied into every returned record.

    Returns:
        A list of dicts with keys ``query``, ``title``, ``url``,
        ``price_text`` and ``price_num``. Empty when evaluation fails or
        nothing usable is found.
    """
    js = """
    (() => {
      const anchors = Array.from(document.querySelectorAll('a[href*="/product/"]'));
      const out = [];
      for (const a of anchors) {
        const href = a.getAttribute('href') || '';
        if (!href) continue;
        const url = href.startsWith('http') ? href : ('https://www.ozon.ru' + href);
        if (url.includes('advert') || url.includes('action')) continue;
        let title = (a.getAttribute('aria-label') || a.getAttribute('title') || '').trim();
        const tile = a.closest('div[data-widget*="tile"]') || a.closest('div');
        if (!title && tile) {
          const t = tile.querySelector('span[class*="tsBody"], span[class*="tsBodyL"], span[class*="tsHeadline"]');
          title = (t && t.textContent || a.textContent || '').trim();
        }
        let price_text = '';
        if (tile) {
          const p = tile.querySelector(
            'span[class*="ui-price__price"], span[class*="b5v2"], span[class*="a1y5"], ' +
            'span[class*="price"], span[class*="Price"], span[class*="_price"]'
          );
          price_text = (p && p.textContent || '').trim();
        }
        out.push({ title, url, price_text });
      }
      return out;
    })();
    """
    try:
        raw_items = frame.evaluate(js)
    except Exception:
        # Best effort: a frame that cannot be evaluated contributes no items.
        return []

    cleaned: List[Dict] = []
    # Guard against a non-list result so the loop below never fails on None.
    for item in raw_items or []:
        title = (item.get("title") or "").strip()
        url = (item.get("url") or "").strip()
        price_text = (item.get("price_text") or "").strip()
        # A card without a title or URL is not usable.
        if not url or not title:
            continue
        # Keep only links that point at an ozon.ru product page.
        if "ozon.ru/product/" not in url:
            continue
        cleaned.append({
            "query": query,
            "title": title,
            "url": url,
            "price_text": price_text,
            "price_num": extract_price_num(price_text),
        })
    return cleaned
def dedup(items: List[Dict]) -> List[Dict]:
    """Drop records with a missing URL and records that repeat a product URL.

    Two URLs count as the same product when they are equal after removing the
    query string, the fragment and a trailing slash. The first record seen for
    each product is kept unchanged, and input order is preserved.

    Args:
        items: Records carrying a ``"url"`` key.

    Returns:
        A new list containing the first occurrence of each distinct URL.
    """
    seen = set()
    unique: List[Dict] = []
    for item in items:
        url = item.get("url")
        if not url:
            continue
        # The same product may be linked with different query strings
        # (presumably tracking parameters), so compare on the bare path only.
        key = url.split("#", 1)[0].split("?", 1)[0].rstrip("/")
        if key in seen:
            continue
        seen.add(key)
        unique.append(item)
    return unique
def run(query: str, limit: int, out_csv: Path, headless: bool, persist: bool, proxy: Optional[str]):
    """Scrape Ozon search results for *query* and write them to a CSV file.

    Opens the search page in Chromium, scrolls to load more tiles, extracts
    product cards from every frame, de-duplicates them and saves the result.

    Args:
        query: Search text; it is URL-encoded before being placed in the URL.
        limit: Maximum number of records to keep; a falsy value keeps all.
        out_csv: Base output path. The file actually written has an
            ``_HHMMSS`` timestamp appended to the stem.
        headless: Whether to run the browser without a window.
        persist: Use a persistent profile stored in ``ozon_profile/`` under
            the current working directory.
        proxy: Optional proxy string understood by ``parse_proxy``.
    """
    # Function-scope import: the only name this block needs beyond the
    # file-level imports.
    from urllib.parse import quote_plus

    columns = ["query", "title", "url", "price_text", "price_num"]
    viewport = {"width": 1366, "height": 900}

    with sync_playwright() as p:
        proxy_conf = parse_proxy(proxy)
        if proxy_conf:
            log(f"используется прокси: {proxy_conf['server']}")
        args = ["--disable-blink-features=AutomationControlled"]

        browser = None
        if persist:
            user_data_dir = str(Path("ozon_profile").absolute())
            context = p.chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                headless=headless,
                viewport=viewport,
                java_script_enabled=True,
                locale="ru-RU",
                proxy=proxy_conf,
                args=args,
            )
            page = context.pages[0] if context.pages else context.new_page()
        else:
            browser = p.chromium.launch(headless=headless, args=args, proxy=proxy_conf)
            context = browser.new_context(
                viewport=viewport,
                locale="ru-RU",
                java_script_enabled=True,
            )
            page = context.new_page()

        try:
            # quote_plus turns spaces into "+" as before and also escapes
            # characters such as "&" or "#" that would otherwise cut the query.
            url = SEARCH_URL.format(query=quote_plus(query))
            log(f"goto {url}")
            page.goto(url, wait_until="domcontentloaded", timeout=90000)
            page.wait_for_timeout(3000)
            deep_scroll(page, total_steps=40)
            log("прокрутка завершена")

            all_items: List[Dict] = []
            for fr in page.frames:
                items = extract_from_frame(fr, query)
                if items:
                    log(f" [frame:{fr.url[:60]}...] карточек: {len(items)}")
                    all_items.extend(items)

            all_items = dedup(all_items)
            if limit:
                all_items = all_items[:limit]
            log(f"всего карточек после очистки: {len(all_items)}")

            # Explicit columns keep the header row even when nothing was found.
            df = pd.DataFrame(all_items, columns=columns)
            # Nullable integer dtype: a missing price would otherwise turn the
            # whole column into floats ("1299.0") in the CSV.
            df["price_num"] = df["price_num"].astype("Int64")

            out_csv.parent.mkdir(parents=True, exist_ok=True)
            timestamp = datetime.now().strftime("%H%M%S")
            final = out_csv.with_name(f"{out_csv.stem}_{timestamp}{out_csv.suffix}")
            df.to_csv(final, index=False, encoding="utf-8-sig")
            log(f"✅ сохранено в {final}")
        finally:
            # Always release the browser, including when navigation or
            # extraction raises.
            context.close()
            if browser:
                browser.close()
def main():
    """Parse command-line arguments and start the scraper.

    ``--headed`` is inverted into the ``headless`` flag expected by ``run``;
    all other options are passed through unchanged.
    """
    ap = argparse.ArgumentParser(description="Ozon search parser v5.2 (clean prices)")
    ap.add_argument("--query", required=True, help="Поисковый запрос, например: 'гидрогелевая пленка iphone'")
    ap.add_argument("--limit", type=int, default=100, help="Ограничение карточек")
    ap.add_argument("--out", type=Path, default=Path("ozon_results.csv"))
    ap.add_argument("--headed", action="store_true", help="Браузер с окном")
    ap.add_argument("--persist", action="store_true", help="Постоянный профиль (ozon_profile/)")
    ap.add_argument("--proxy", type=str, help="HTTP-прокси (user:pass@ip:port или ip:port)")
    args = ap.parse_args()
    run(
        query=args.query,
        limit=args.limit,
        out_csv=args.out,
        headless=not args.headed,
        persist=args.persist,
        proxy=args.proxy,
    )
# Run the CLI only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()