# data_mining (shoes)

Полный ноутбук для сбора, очистки и нормализации каталога обуви и загрузки результата в таблицу `bikes`.

In [None]:

# ==== 1) Config & imports ====
import os, re, time, math, json, random, itertools, typing as T
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

# Embeddings
from sentence_transformers import SentenceTransformer

# DB
import psycopg
from pgvector.psycopg import register_vector

# Paths & env (every value can be overridden through an environment variable)
USE_GPU = os.getenv("USE_GPU", "false").lower() == "true"
DB_DSN  = os.getenv("DB_DSN", "postgresql://postgres:postgres@localhost:5430/bikes")
OUT_JSON = os.getenv("OUT_JSON", "./data/parsed_data.json")
OUT_PARQUET = os.getenv("OUT_PARQUET", "./data/parsed_data.parquet")

# HTTP settings used for scraping
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/125.0 Safari/537.36"
}
REQ_TIMEOUT = 30           # seconds, passed as the `timeout` of each HTTP request
SLEEP_RANGE = (0.5, 1.8)   # seconds between requests

# The DSN may carry a password, and notebook output is routinely saved and
# shared, so only a redacted copy is printed. Both the URI form
# (scheme://user:password@host) and the keyword form (password=...) are masked.
DB_DSN_SAFE = re.sub(r"(://[^:/@\s]+:)[^@\s]*@", r"\1***@", DB_DSN)
DB_DSN_SAFE = re.sub(r"(?i)(password\s*=\s*)\S+", r"\1***", DB_DSN_SAFE)

print("DB_DSN:", DB_DSN_SAFE)
print("USE_GPU:", USE_GPU)


## Источники и адаптеры

In [None]:

# ==== 2) Sources and adapters ====
# Listing pages to crawl. Each entry carries a "name" (shown in the crawl log)
# and a "list_url" (the listing page that is fetched to discover product links).
SOURCES = [
    # Examples — replace with real ones:
    {
        "name": "shop_example_men_sneakers",
        "list_url": "https://example.com/men/sneakers",
    },
    {
        "name": "shop_example_women_boots",
        "list_url": "https://example.com/women/boots",
    },
]

def fetch(url: str) -> str:
    """HTTP GET with retries and back-off pauses.

    Makes up to three attempts. A response counts as a success only when its
    status code is 200. Between attempts the function sleeps 1.5 s, then 2.5 s.

    Args:
        url: Absolute URL to download.

    Returns:
        The response body as text, or an empty string when every attempt failed.
    """
    attempts = 3
    for attempt in range(attempts):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=REQ_TIMEOUT)
        except requests.RequestException as exc:
            # Report the failure instead of swallowing it silently, then retry.
            print(f"  ! fetch error ({attempt + 1}/{attempts}) {url}: {exc}")
        else:
            if resp.status_code == 200:
                return resp.text
            print(f"  ! HTTP {resp.status_code} ({attempt + 1}/{attempts}) {url}")
        # No point in pausing after the final attempt: nothing follows it.
        if attempt < attempts - 1:
            time.sleep(1.5 + attempt)
    return ""

def extract_links_example(html: str, base_url: str = "https://example.com") -> list:
    """Draft example of extracting product links from a listing page.

    Replace the selector with one that fits the real site.

    Args:
        html: Markup of the listing page.
        base_url: Site root used to resolve hrefs that start with "/".
            Defaults to the example shop, matching the previous hard-coded value.

    Returns:
        Absolute http(s) links in page order, without duplicates.
    """
    from urllib.parse import urljoin

    soup = BeautifulSoup(html, "html.parser")
    links = []
    for anchor in soup.select(".product-card a"):
        href = (anchor.get("href") or "").strip()
        if href.startswith("/"):
            # urljoin handles both root-relative ("/p/1") and protocol-relative
            # ("//cdn.shop/p/1") hrefs; plain concatenation mangled the latter.
            href = urljoin(base_url, href)
        if href.startswith("http"):
            links.append(href)
    # dict.fromkeys removes duplicates while keeping first-seen order
    return list(dict.fromkeys(links))

def _parse_price_text(raw: str):
    """Return the first price found in *raw* as a float, or None if there is none."""
    import re

    # First alternative: digits grouped in thousands by whitespace (regular,
    # non-breaking or narrow spaces), e.g. "12 990,50". Second: a plain number.
    match = re.search(r"\d{1,3}(?:\s\d{3})+(?:[.,]\d+)?|\d+(?:[.,]\d+)?", raw or "")
    if match is None:
        return None
    return float(re.sub(r"\s", "", match.group()).replace(",", "."))


def parse_product_example(url: str) -> dict:
    """Draft parser of a product page for example.com.

    Replace the CSS selectors with ones that fit the real site.

    Args:
        url: Absolute URL of the product page.

    Returns:
        A dict with the keys name, brand, category, gender, color, sizes_raw,
        price, old_price, discount, in_stock, url, image_url and description,
        or an empty dict when the page could not be downloaded. Text fields
        missing on the page are empty strings; `price` is None when no number
        was found; `old_price`, `discount` and `in_stock` are always None.
    """
    from urllib.parse import urljoin

    html = fetch(url)
    if not html:
        return {}

    soup = BeautifulSoup(html, "html.parser")

    def txt(sel, default=""):
        el = soup.select_one(sel)
        return el.get_text(strip=True) if el else default

    # Prices such as "12 990 ₽" use a space as thousands separator; taking the
    # first run of digits would yield 12 instead of 12990.
    price = _parse_price_text(txt(".price-current"))

    image_url = ""
    img = soup.select_one(".product-images img")
    if img:
        # data-src takes precedence over src, as before
        image_url = img.get("data-src") or img.get("src") or ""
        if image_url.startswith("/"):
            # Resolve against the product page so that both "/img/a.jpg" and
            # protocol-relative "//cdn.shop/a.jpg" become valid absolute URLs.
            image_url = urljoin(url, image_url)

    return {
        "name": txt("h1.product-title"),
        "brand": txt(".product-brand"),
        "category": txt(".breadcrumb .active") or "sneakers",
        "gender": txt(".gender-tag"),  # may be absent on the page
        "color": txt(".color-name"),
        "sizes_raw": " ".join(x.get_text(strip=True) for x in soup.select(".sizes .size")),
        "price": price,
        "old_price": None,
        "discount": None,
        "in_stock": None,
        "url": url,
        "image_url": image_url,
        "description": txt(".product-description"),
    }


In [None]:

# ==== 3) Normalization helpers ====
def norm_gender(s: str) -> str:
    """Map a free-form gender label to "women", "men", "kids" or "".

    Matching is a case-insensitive substring test on Russian and English
    markers. None and unrecognised text give an empty string.
    """
    lowered = (s or "").lower()
    # Order matters: "women" contains "men", so the women markers go first.
    markers = (
        ("women", ("жен", "women")),
        ("men", ("муж", "men")),
        ("kids", ("дет", "kid")),
    )
    for label, needles in markers:
        if any(needle in lowered for needle in needles):
            return label
    return ""

def parse_sizes(text: str) -> list[int]:
    """Extract shoe sizes from free text.

    Recognises standalone two-digit numbers and two-digit ranges such as
    "36-41"; a range is expanded inclusively regardless of the order of its
    bounds. Only values from 16 to 52 are kept. Order of appearance and
    duplicates are preserved.
    """
    import re

    candidates = []
    for token in re.findall(r"\b\d{2}(?:-\d{2})?\b", text or ""):
        # A token is either "NN" or "NN-NN"; the regex guarantees digits only,
        # so int() cannot fail here.
        bounds = [int(piece) for piece in token.split("-")]
        candidates.extend(range(min(bounds), max(bounds) + 1))
    return [size for size in candidates if 16 <= size <= 52]

# (regex, canonical colour) pairs, checked in order; the first match wins.
_COLOR_RULES = (
    (r"ч[её]рн|black|noir|schwarz", "black"),   # accepts both "черный" and "чёрный"
    (r"бел|white|blanc|wei(?:ß|ss)", "white"),  # "WEISS" lower-cases to "weiss"
    (r"син|blue|bleu", "blue"),
    (r"красн|red|rouge", "red"),
    (r"коричн|brown|brun", "brown"),
    (r"беж|beige", "beige"),
    (r"сер|grey|gray|grau", "grey"),
)


def norm_color(s: str) -> str:
    """Map a free-form colour name to a canonical English colour.

    The input is lower-cased and searched for Russian, English, French and
    German colour stems.

    Args:
        s: Raw colour text; None is treated as an empty string.

    Returns:
        One of "black", "white", "blue", "red", "brown", "beige", "grey",
        or "" when no known colour is found.
    """
    import re

    lowered = (s or "").lower()
    for pattern, label in _COLOR_RULES:
        if re.search(pattern, lowered):
            return label
    return ""

def extract_season(text: str) -> str:
    """Detect the season mentioned in a Russian product description.

    Stems are anchored at the start of a word, and the summer stem also needs
    a fitting ending. A bare substring test for "лет" fired on unrelated words
    such as "балетки" or "клетка" and on "5 лет" (five years).

    Args:
        text: Description text; None is treated as an empty string.

    Returns:
        "winter", "summer" or "demi", checked in that order, or "" when no
        season is mentioned.
    """
    import re

    lowered = (text or "").lower()
    if re.search(r"\bзим", lowered):
        return "winter"
    # "летний", "летняя", "лето", "летом", ... but not the bare word "лет"
    if re.search(r"\bлет(?:н\w*|о|а|у|ом|е)\b", lowered):
        return "summer"
    if re.search(r"\bдемисез", lowered):
        return "demi"
    return ""

def extract_materials(text: str) -> dict:
    """Return 0/1 flags telling which materials a description mentions.

    Each flag is 1 when any of its Russian or English keywords occurs as a
    substring of the lower-cased text; None is treated as an empty string.
    """
    lowered = (text or "").lower()
    keywords_by_flag = {
        "mat_leather": ("кож", "leather"),
        "mat_suede": ("замш", "suede"),
        "mat_textile": ("текстил", "textile"),
        "membrane": ("gore", "мембран"),
    }
    return {
        flag: int(any(word in lowered for word in words))
        for flag, words in keywords_by_flag.items()
    }

def build_full_description(rec: dict) -> str:
    """Build the searchable text of a product from its record.

    Joins name, brand, category, gender, color, the size list and the
    description with " | ", skipping empty parts, and appends Russian/English
    category synonyms detected from the category and the name.

    Args:
        rec: Product record. Keys that are absent, None or NaN are treated as
            empty. `sizes` may be any iterable of sizes.

    Returns:
        The combined description, or "" when the record has no usable fields.
    """
    def text_of(key):
        # None and float NaN (pandas' marker for a missing value) both mean
        # "no value"; anything else is rendered as text.
        value = rec.get(key, "")
        if value is None or (isinstance(value, float) and value != value):
            return ""
        return str(value)

    raw_sizes = rec.get("sizes")
    if hasattr(raw_sizes, "__iter__") and not isinstance(raw_sizes, str):
        sizes = list(raw_sizes)
    else:
        sizes = []
    sizes_str = f"размеры: {', '.join(map(str, sorted(set(sizes))))}" if sizes else ""

    parts = [
        text_of("name"),
        text_of("brand"),
        text_of("category"),
        text_of("gender"),
        text_of("color"),
        sizes_str,
        text_of("description"),
    ]

    # category synonyms, so that a query in either language finds the product
    cat = (text_of("category") + " " + text_of("name")).lower()
    synonyms = []
    if "sneaker" in cat or "крос" in cat:
        synonyms += ["кроссовки", "sneakers"]
    if "boot" in cat or "ботин" in cat:
        synonyms += ["ботинки", "boots"]
    if "sandal" in cat or "сандал" in cat:
        synonyms += ["сандалии", "sandals"]
    if "loaf" in cat or "лофер" in cat:
        synonyms += ["лоферы", "loafers"]
    if synonyms:
        parts.append(" ".join(synonyms))

    return " | ".join(p for p in parts if p)


In [None]:

# ==== 4) Crawl orchestration ====
def crawl_source(src: dict) -> list[dict]:
    """Crawl one source and return its parsed product records.

    Fetches the listing page at src["list_url"], extracts product links from
    it and parses every linked page, pausing a random SLEEP_RANGE interval
    after each one. Pages that parse to an empty result are skipped. Returns
    an empty list when the listing page could not be fetched.
    """
    print(">>> crawl:", src["name"], src["list_url"])
    listing_html = fetch(src["list_url"])
    if not listing_html:
        print("  ! no html")
        return []

    product_urls = extract_links_example(listing_html)  # swap in the adapter for your site
    total = len(product_urls)
    print("  found links:", total)

    collected = []
    for position, product_url in enumerate(product_urls, start=1):
        record = parse_product_example(product_url)  # swap in the adapter for your site
        if record:
            collected.append(record)
        if position % 10 == 0:
            print(f"  parsed {position}/{total}")
        time.sleep(random.uniform(*SLEEP_RANGE))
    return collected

# Crawl every configured source; a failing source is reported and skipped so
# that the remaining ones still run.
all_raw = []
for src in SOURCES:
    try:
        found = crawl_source(src)
    except Exception as e:
        print("source failed:", src["name"], e)
    else:
        all_raw.extend(found)

print("Total raw items:", len(all_raw))
df = pd.DataFrame(all_raw)
df.head()


In [None]:

# ==== 5) Normalize dataframe ====
if df.empty:
    print("WARNING: df is empty — заполни SOURCES и адаптеры под реальный сайт.")
else:
    # Derived columns. The raw "color" and "sizes_raw" columns are kept as-is.
    df["gender"] = df["gender"].map(norm_gender)
    df["color_norm"] = df["color"].map(norm_color)
    df["sizes"] = df["sizes_raw"].map(parse_sizes)
    df["season"] = df["description"].map(extract_season)

    # One 0/1 column per material flag, aligned to df by index.
    mats = pd.DataFrame(
        df["description"].map(extract_materials).tolist(), index=df.index
    ).fillna(0).astype(int)
    # Drop flag columns left over from a previous run of this cell so that
    # re-running it does not produce duplicated column names.
    df = pd.concat([df.drop(columns=mats.columns, errors="ignore"), mats], axis=1)

    # full_description
    df["full_description"] = df.apply(lambda r: build_full_description({
        "name": r.get("name",""),
        "brand": r.get("brand",""),
        "category": r.get("category",""),
        "gender": r.get("gender",""),
        "color": r.get("color_norm",""),
        "sizes": r.get("sizes",[]),
        "description": r.get("description","")
    }), axis=1)

    # Basic validation. The parser yields "" (not NaN) for a field it could
    # not find, so blank strings have to be rejected explicitly; dropna alone
    # would let products without a name through.
    df = df.dropna(subset=["name", "url"])
    has_name = df["name"].astype(str).str.strip() != ""
    has_url = df["url"].astype(str).str.strip() != ""
    df = df.loc[has_name & has_url].drop_duplicates(subset=["url"])
    print("Prepared rows:", len(df))
    display(df.head(3))


In [None]:

# ==== 6) Embeddings ====
if df.empty:
    print("Skip embeddings: df empty")
else:
    device = "cuda" if USE_GPU else "cpu"
    embedder = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B", device=device)

    texts = df["full_description"].fillna("").tolist()
    # Encode all texts in one call so the model processes them in batches
    # rather than one forward pass per row.
    # NOTE(review): these are catalogue documents, yet they are encoded with
    # the "query" prompt — confirm that this matches how the search side
    # encodes its queries before changing it.
    vecs = list(embedder.encode(texts, prompt_name="query", batch_size=32))
    df["embedding"] = vecs
    print("Embeddings computed:", len(vecs))


In [None]:

# ==== 7) DB load ====
if df.empty:
    print("Skip DB load: df empty")
else:
    # Columns referenced by the statement below.
    db_columns = [
        "name", "brand", "category", "price", "old_price", "discount",
        "in_stock", "url", "image_url", "full_description", "embedding",
    ]

    # A price that could not be parsed is None in the raw record but becomes
    # float NaN inside the DataFrame; it must reach the database as NULL, not
    # as the number NaN.
    rows = []
    for rec in df.to_dict(orient="records"):
        row = {}
        for col in db_columns:
            value = rec[col]
            if isinstance(value, float) and math.isnan(value):
                value = None
            row[col] = value
        rows.append(row)

    upsert_sql = """
                    INSERT INTO bikes (name, brand, category, price, old_price, discount, in_stock,
                                       url, image_url, full_description, embedding, fts_vector)
                    VALUES (%(name)s, %(brand)s, %(category)s, %(price)s, %(old_price)s, %(discount)s, %(in_stock)s,
                            %(url)s, %(image_url)s, %(full_description)s, %(embedding)s,
                            to_tsvector('russian', coalesce(%(full_description)s,'')))
                    ON CONFLICT (url) DO UPDATE SET
                        name=EXCLUDED.name,
                        brand=EXCLUDED.brand,
                        category=EXCLUDED.category,
                        price=EXCLUDED.price,
                        old_price=EXCLUDED.old_price,
                        discount=EXCLUDED.discount,
                        in_stock=EXCLUDED.in_stock,
                        image_url=EXCLUDED.image_url,
                        full_description=EXCLUDED.full_description,
                        embedding=EXCLUDED.embedding,
                        fts_vector=to_tsvector('russian', coalesce(EXCLUDED.full_description,''))
                    """

    with psycopg.connect(DB_DSN) as conn:
        register_vector(conn)
        with conn.cursor() as cur:
            # ON CONFLICT (url) needs a unique index or constraint on url.
            # Optional: create it here if the table does not have one yet.
            # cur.execute("CREATE UNIQUE INDEX IF NOT EXISTS ux_bikes_url ON bikes(url);")
            # conn.commit()

            cur.executemany(upsert_sql, rows)
        conn.commit()
    print("DB load complete.")


In [None]:

# ==== 8) Save artifacts ====
if not df.empty:
    # Create the target directory of each artifact. A bare file name has an
    # empty dirname, and os.makedirs("") raises, so that case is skipped.
    for target in (OUT_JSON, OUT_PARQUET):
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)

    df.to_json(OUT_JSON, orient="records", force_ascii=False, indent=2)
    print("Saved JSON ->", OUT_JSON)

    # Parquet is best-effort: a failure here is reported and does not
    # invalidate the JSON export.
    try:
        df.to_parquet(OUT_PARQUET, index=False)
        print("Saved Parquet ->", OUT_PARQUET)
    except Exception as e:
        print("Parquet save skipped:", e)
else:
    print("Skip save: df empty")


In [None]:

# ==== 9) Optional: quick preview ====
# Smoke test: run one full-text query against the loaded table and print the
# five best-ranked hits.
test_query = "мужские кроссовки черные 42"
with psycopg.connect(DB_DSN) as conn:
    register_vector(conn)
    with conn.cursor() as cur:
        # FTS: the same query text feeds both the filter and the ranking
        fts_params = (test_query, test_query)
        cur.execute(
            """
            SELECT name, price, url
            FROM bikes
            WHERE fts_vector @@ websearch_to_tsquery('russian', %s)
            ORDER BY ts_rank(fts_vector, websearch_to_tsquery('russian', %s)) DESC
            LIMIT 5
            """,
            fts_params,
        )
        print("FTS top-5:")
        top_hits = cur.fetchall()
        for hit in top_hits:
            print(" -", hit)
print("Done.")
