In [6]:
import os, time, sys, random, urllib.request, threading
from urllib.parse import urlencode, urlparse, parse_qs, urlunparse
from concurrent.futures import ThreadPoolExecutor, as_completed

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException, WebDriverException

In [7]:
# -------- PARAMETERS --------
BASE = "https://www.cian.ru/cat.php?deal_type=sale&engine_version=2&offer_type=flat&region=1"  # search URL that district/page query parameters are added to
OUTPUT = "links.txt"  # text file that collected links are appended to
MAX_PAGES = 40  # upper bound on result pages scanned per district
HEADLESS = True  # when true, Chrome is started with --headless=new
PAGELOAD_TIMEOUT = 20  # value passed to the driver's set_page_load_timeout
N_BROWSERS = 4  # number of parallel browser windows

In [8]:
# District identifiers to scan: 1 through 131 inclusive.
district_id = list(range(1, 132))

In [9]:
# Resolve the list of districts to scan: prefer a user-supplied `district_id`
# list, otherwise keep an existing `district_ids`, otherwise default to 1..132.
if isinstance(globals().get("district_id"), list):
    district_ids = globals()["district_id"]
else:
    globals().setdefault("district_ids", list(range(1, 133)))

In [None]:
def load_existing(path=OUTPUT) -> set[str]:
    """Return the set of non-blank, whitespace-stripped lines stored in ``path``.

    A missing file yields an empty set.
    """
    if not os.path.exists(path):
        return set()
    links = set()
    with open(path, "r", encoding="utf-8") as fh:
        for raw in fh:
            cleaned = raw.strip()
            if cleaned:
                links.add(cleaned)
    return links

In [11]:
seen_links = load_existing(OUTPUT)     # shared set of links already saved in OUTPUT
file_lock = threading.Lock()           # lock guarding the dedup check and file writes
def append_threadsafe(candidates:set[str]) -> int:
    """Append links that are not yet known to the OUTPUT file, thread-safely.

    The whole check-write-record sequence runs under ``file_lock``, so two
    threads never write the same link. Each batch is flushed and fsync'ed.

    Args:
        candidates: links found on a page (any iterable of strings; repeated
            items are written once).

    Returns:
        The number of links actually appended to the file.

    Raises:
        OSError: if the file cannot be opened or written. In that case
            ``seen_links`` is left unchanged, so the links are not treated as
            saved.
    """
    if not candidates:
        return 0
    with file_lock:
        # dict.fromkeys drops repeats (if a list was passed) and keeps order.
        new = [u for u in dict.fromkeys(candidates) if u not in seen_links]
        if not new:
            return 0
        with open(OUTPUT, "a", encoding="utf-8") as f:
            f.writelines(u + "\n" for u in new)
            f.flush()
            os.fsync(f.fileno())
        # Record the links only after they reached the disk: a failed write must
        # not leave them marked as saved. The lock is still held, so other
        # threads cannot write the same links in between.
        seen_links.update(new)
    return len(new)

In [12]:
def make_driver():
    """Create and return a Chrome WebDriver configured for link collection.

    Uses the "eager" page-load strategy, disables image loading, sets a fixed
    user agent, applies PAGELOAD_TIMEOUT as the page-load timeout and 10 as the
    script timeout. Headless mode is controlled by HEADLESS.
    """
    chrome_opts = Options()
    if HEADLESS:
        chrome_opts.add_argument("--headless=new")
    chrome_opts.page_load_strategy = "eager"
    for flag in (
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--blink-settings=imagesEnabled=false",
    ):
        chrome_opts.add_argument(flag)
    chrome_opts.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2}
    )
    chrome_opts.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124.0 Safari/537.36")

    browser = webdriver.Chrome(options=chrome_opts)
    browser.set_page_load_timeout(PAGELOAD_TIMEOUT)
    browser.set_script_timeout(10)
    return browser

In [13]:
def is_online(timeout=4):
    """Return True if a request to Google's generate_204 endpoint succeeds.

    Args:
        timeout: value passed to ``urlopen`` as its timeout.

    Returns:
        True when the request completes without raising, False on any error.
    """
    try:
        # The context manager closes the response, so repeated polling does not
        # leave connections open.
        with urllib.request.urlopen("https://www.google.com/generate_204", timeout=timeout):
            return True
    except Exception:
        return False

In [14]:
def safe_get(drv, url, wait_css=None, attempts=4, sleep_base=1.5):
    """Open ``url`` in ``drv`` with retries and return the driver to keep using.

    Before each attempt the function polls ``is_online()`` and raises
    ``TimeoutException`` if it stays false for more than 60 seconds. After
    ``drv.get`` it waits up to 12 seconds for an element matching ``wait_css``
    (or sleeps 1.2 seconds when ``wait_css`` is empty).

    On ``TimeoutException``/``WebDriverException`` it logs to stderr, sleeps
    ``sleep_base * attempt`` seconds and, if the message contains one of the
    restart markers or this was the last attempt, quits the driver and creates
    a new one with ``make_driver()``.

    The returned driver may be a different object from the one passed in.
    When every attempt fails, the returned driver is freshly created and has
    NOT loaded ``url``.
    """
    restart_markers = (
        "ERR_INTERNET_DISCONNECTED",
        "ERR_NETWORK_CHANGED",
        "ERR_NAME_NOT_RESOLVED",
        "disconnected: not connected to DevTools",
        "chrome not reachable",
        "timeout",
    )

    for attempt in range(1, attempts + 1):
        # Wait for connectivity, giving up after 60 seconds offline.
        offline_since = time.time()
        while True:
            if is_online():
                break
            if time.time() - offline_since > 60:
                raise TimeoutException("Сеть не восстановилась за 60 секунд.")
            time.sleep(1.5)

        try:
            drv.get(url)
            if not wait_css:
                time.sleep(1.2)
            else:
                WebDriverWait(drv, 12).until(
                    lambda d: len(d.find_elements(By.CSS_SELECTOR, wait_css)) > 0
                )
            return drv
        except (TimeoutException, WebDriverException) as err:
            text = str(err)
            print(f"[safe_get] попытка {attempt}/{attempts} неудачна: {text[:140]}...", file=sys.stderr)
            time.sleep(sleep_base * attempt)
            must_restart = attempt == attempts or any(m in text for m in restart_markers)
            if must_restart:
                # Replace the browser; quitting the old one is best-effort.
                try:
                    drv.quit()
                except Exception:
                    pass
                drv = make_driver()
    return drv

In [16]:
def safe_scroll_to_bottom(driver, url_for_reload: str,
                          max_same=3, sleep_between=2.0,
                          js_attempts=3, kb_attempts=2):
    """Scroll the page down until its height stops changing; return the driver.

    Each round scrolls with JavaScript; if that raises ``WebDriverException``
    the round falls back to keyboard scrolling (PAGE_DOWN x12, then END). The
    loop ends once the measured ``document.body.scrollHeight`` is equal to the
    previous measurement ``max_same`` times in a row.

    If JavaScript scrolling has failed ``js_attempts`` times in a row and
    keyboard scrolling ``kb_attempts`` times in a row, the driver is quit, a
    new one is created with ``make_driver()`` and ``url_for_reload`` is opened
    again. Exceptions raised by that restart are not caught here.

    Args:
        driver: WebDriver showing the page to scroll.
        url_for_reload: URL to reopen after a driver restart.
        max_same: number of consecutive unchanged height readings that ends
            the loop.
        sleep_between: seconds to sleep after each scroll before measuring.
        js_attempts: JS failure streak required for a restart.
        kb_attempts: keyboard failure streak required for a restart.

    Returns:
        The driver to keep using (a new object if a restart happened).
    """
    last_height, same = 0, 0
    js_fail_streak = kb_fail_streak = 0

    def note_height(height):
        """Update the unchanged-height counter with a new measurement."""
        nonlocal last_height, same
        same = same + 1 if height == last_height else 0
        # Compare against the PREVIOUS reading, not the maximum ever seen:
        # with max(), a page whose height shrank once would never match again
        # and the loop would never finish.
        last_height = height

    def js_scroll_round():
        """Scroll via JavaScript; return False if the driver call fails."""
        try:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(sleep_between)
            note_height(driver.execute_script("return document.body.scrollHeight"))
            return True
        except WebDriverException:
            return False

    def kb_scroll_round():
        """Scroll via keyboard events; return False on any failure."""
        try:
            body = driver.find_element(By.TAG_NAME, "body")
            ActionChains(driver).move_to_element(body).click(body).perform()
            for _ in range(12):
                body.send_keys(Keys.PAGE_DOWN)
                time.sleep(0.12)
            body.send_keys(Keys.END)
            time.sleep(sleep_between)
            note_height(driver.execute_script("return document.body.scrollHeight"))
            return True
        except Exception:
            return False

    while True:
        ok = js_scroll_round()
        if not ok:
            js_fail_streak += 1
            ok2 = kb_scroll_round()
            kb_fail_streak = 0 if ok2 else kb_fail_streak + 1
        else:
            js_fail_streak = 0

        if js_fail_streak >= js_attempts and kb_fail_streak >= kb_attempts:
            print("↻ вкладка зависла: перезапуск драйвера и повторное открытие страницы…")
            try:
                driver.quit()
            except Exception:
                pass
            driver = make_driver()
            driver.get(url_for_reload)
            time.sleep(1.5)
            last_height = same = js_fail_streak = kb_fail_streak = 0
            continue

        if same >= max_same:
            break

    time.sleep(1.1)
    return driver


In [17]:
def build_url(did:int, page:int) -> str:
    """Return BASE with its district filter set to ``did`` and page set to ``page``.

    Any existing ``district[...]`` and ``p`` query parameters in BASE are
    dropped; ``district[0]`` and ``p`` are then appended after the remaining
    parameters.
    """
    parts = urlparse(BASE)
    params = {
        key: values
        for key, values in parse_qs(parts.query, keep_blank_values=True).items()
        if key != "p" and not key.startswith("district[")
    }
    params["district[0]"] = [str(did)]
    params["p"] = [str(page)]
    new_query = urlencode(params, doseq=True)
    return urlunparse(
        (parts.scheme, parts.netloc, parts.path, parts.params, new_query, parts.fragment)
    )


In [18]:
def collect_links(driver) -> set[str]:
    """Return the listing links on the current page, without query strings.

    Elements are found with the CSS selector for listing cards. Each element's
    ``href`` is read once; elements without an ``href`` and hrefs that become
    empty after removing the query string are skipped.
    """
    cards = driver.find_elements(By.CSS_SELECTOR, "a[data-name='LinkArea'], a._93444fe79c--media--9P6wN")
    links = set()
    for card in cards:
        href = card.get_attribute("href")  # read once per element instead of twice
        if not href:
            continue
        clean = href.split("?")[0]
        if clean:  # an href like "?utm=1" would otherwise add "" to the set
            links.add(clean)
    return links


In [19]:
def scan_district(did: int):
    """Scan result pages of one district and append new links to the output file.

    Creates its own driver, walks pages 1..MAX_PAGES, and after each page
    writes the collected links immediately via ``append_threadsafe``. The
    district is abandoned as soon as a page contributes no new links. The
    driver is always quit at the end.

    Returns:
        A ``(did, total_added)`` tuple.
    """
    browser = make_driver()
    written_total = 0
    try:
        for page_no in range(1, MAX_PAGES + 1):
            page_url = build_url(did, page_no)
            print(f"[did {did} | p {page_no}] GET {page_url}")
            # Both helpers may hand back a replacement driver.
            browser = safe_get(browser, page_url, wait_css="body")
            browser = safe_scroll_to_bottom(browser, page_url)
            found = collect_links(browser)
            # Persist right away so progress survives a later failure.
            written = append_threadsafe(found)
            written_total += written
            print(f"[did {did} | p {page_no}] на странице: {len(found)} | новых ЗАПИСАНО: {written}")

            # Early-exit heuristic: no new links on this page ends the district.
            if written == 0:
                print(f"[did {did}] новых нет → стоп по району")
                break

            time.sleep(random.uniform(0.15, 0.35))
    finally:
        try:
            browser.quit()
        except Exception:
            pass
    return did, written_total


In [None]:
# Run the district scans in parallel and report progress as each one finishes.
print(f"Старт. Уже в {OUTPUT}: {len(seen_links)} ссылок.\n")
t0 = time.time()

with ThreadPoolExecutor(max_workers=N_BROWSERS) as pool:
    # Map each future to its district so a failure can still be attributed.
    futures = {pool.submit(scan_district, did): did for did in district_ids}
    for fut in as_completed(futures):
        did = futures[fut]
        try:
            _, added = fut.result()
        except Exception as exc:
            # One failed district must not abort reporting for the others or
            # skip the final summary below.
            print(f"✖ Район {did} завершился с ошибкой: {exc}", file=sys.stderr)
            continue
        print(f"✔ Район {did} завершён. Добавлено: {added}")

elapsed = time.time() - t0
print(f"\nГотово. В {OUTPUT}  {len(load_existing(OUTPUT))}  ссылок.")
print(f"Время: {elapsed:.1f} c. Параллельных окон: {N_BROWSERS}.")

Старт. Уже в links.txt: 39336 ссылок.

[did 1 | p 1] GET https://www.cian.ru/cat.php?deal_type=sale&engine_version=2&offer_type=flat&region=1&district%5B0%5D=1&p=1
[did 2 | p 1] GET https://www.cian.ru/cat.php?deal_type=sale&engine_version=2&offer_type=flat&region=1&district%5B0%5D=2&p=1
[did 3 | p 1] GET https://www.cian.ru/cat.php?deal_type=sale&engine_version=2&offer_type=flat&region=1&district%5B0%5D=3&p=1
[did 4 | p 1] GET https://www.cian.ru/cat.php?deal_type=sale&engine_version=2&offer_type=flat&region=1&district%5B0%5D=4&p=1
[did 2 | p 1] на странице: 0 | новых ЗАПИСАНО: 0
[did 2] новых нет → стоп по району
✔ Район 2 завершён. Добавлено: 0
[did 3 | p 1] на странице: 0 | новых ЗАПИСАНО: 0
[did 3] новых нет → стоп по району
✔ Район 3 завершён. Добавлено: 0
[did 5 | p 1] GET https://www.cian.ru/cat.php?deal_type=sale&engine_version=2&offer_type=flat&region=1&district%5B0%5D=5&p=1
[did 6 | p 1] GET https://www.cian.ru/cat.php?deal_type=sale&engine_version=2&offer_type=flat&region=1

In [None]:
# === Jupyter cell: CIAN scraper — parallel & incremental safe append ===
import os, time, sys, random, urllib.request, threading
from urllib.parse import urlencode, urlparse, parse_qs, urlunparse
from concurrent.futures import ThreadPoolExecutor, as_completed

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException, WebDriverException

# -------- PARAMETERS --------
BASE = "https://www.cian.ru/cat.php?deal_type=sale&engine_version=2&offer_type=flat&region=1"  # search URL that district/page query parameters are added to
OUTPUT = "links.txt"  # text file that collected links are appended to
MAX_PAGES = 40  # upper bound on result pages scanned per district
HEADLESS = True  # when true, Chrome is started with --headless=new
PAGELOAD_TIMEOUT = 20  # value passed to the driver's set_page_load_timeout
N_BROWSERS = 4  # number of parallel browser windows

# List of districts: prefer a user-supplied `district_id` list, otherwise keep
# an existing `district_ids`, otherwise default to 1..132.
if isinstance(globals().get("district_id"), list):
    district_ids = globals()["district_id"]
else:
    globals().setdefault("district_ids", list(range(1, 133)))

# -------- FILE / DEDUP --------
def load_existing(path=OUTPUT) -> set[str]:
    """Return the set of non-blank, whitespace-stripped lines stored in ``path``.

    A missing file yields an empty set.
    """
    if not os.path.exists(path):
        return set()
    links = set()
    with open(path, "r", encoding="utf-8") as fh:
        for raw in fh:
            cleaned = raw.strip()
            if cleaned:
                links.add(cleaned)
    return links

seen_links = load_existing(OUTPUT)     # shared set of links already saved in OUTPUT
file_lock = threading.Lock()           # lock guarding the dedup check and file writes
def append_threadsafe(candidates:set[str]) -> int:
    """Append links that are not yet known to the OUTPUT file, thread-safely.

    The whole check-write-record sequence runs under ``file_lock``, so two
    threads never write the same link. Each batch is flushed and fsync'ed.

    Args:
        candidates: links found on a page (any iterable of strings; repeated
            items are written once).

    Returns:
        The number of links actually appended to the file.

    Raises:
        OSError: if the file cannot be opened or written. In that case
            ``seen_links`` is left unchanged, so the links are not treated as
            saved.
    """
    if not candidates:
        return 0
    with file_lock:
        # dict.fromkeys drops repeats (if a list was passed) and keeps order.
        new = [u for u in dict.fromkeys(candidates) if u not in seen_links]
        if not new:
            return 0
        with open(OUTPUT, "a", encoding="utf-8") as f:
            f.writelines(u + "\n" for u in new)
            f.flush()
            os.fsync(f.fileno())
        # Record the links only after they reached the disk: a failed write must
        # not leave them marked as saved. The lock is still held, so other
        # threads cannot write the same links in between.
        seen_links.update(new)
    return len(new)

# -------- DRIVER / NETWORK --------
def make_driver():
    """Create and return a Chrome WebDriver configured for link collection.

    Uses the "eager" page-load strategy, disables image loading, sets a fixed
    user agent, applies PAGELOAD_TIMEOUT as the page-load timeout and 10 as the
    script timeout. Headless mode is controlled by HEADLESS.
    """
    chrome_opts = Options()
    if HEADLESS:
        chrome_opts.add_argument("--headless=new")
    chrome_opts.page_load_strategy = "eager"
    for flag in (
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--blink-settings=imagesEnabled=false",
    ):
        chrome_opts.add_argument(flag)
    chrome_opts.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2}
    )
    chrome_opts.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/124.0 Safari/537.36")

    browser = webdriver.Chrome(options=chrome_opts)
    browser.set_page_load_timeout(PAGELOAD_TIMEOUT)
    browser.set_script_timeout(10)
    return browser

def is_online(timeout=4):
    """Return True if a request to Google's generate_204 endpoint succeeds.

    Args:
        timeout: value passed to ``urlopen`` as its timeout.

    Returns:
        True when the request completes without raising, False on any error.
    """
    try:
        # The context manager closes the response, so repeated polling does not
        # leave connections open.
        with urllib.request.urlopen("https://www.google.com/generate_204", timeout=timeout):
            return True
    except Exception:
        return False

def safe_get(drv, url, wait_css=None, attempts=4, sleep_base=1.5):
    """Open ``url`` in ``drv`` with retries and return the driver to keep using.

    Before each attempt the function polls ``is_online()`` and raises
    ``TimeoutException`` if it stays false for more than 60 seconds. After
    ``drv.get`` it waits up to 12 seconds for an element matching ``wait_css``
    (or sleeps 1.2 seconds when ``wait_css`` is empty).

    On ``TimeoutException``/``WebDriverException`` it logs to stderr, sleeps
    ``sleep_base * attempt`` seconds and, if the message contains one of the
    restart markers or this was the last attempt, quits the driver and creates
    a new one with ``make_driver()``.

    The returned driver may be a different object from the one passed in.
    When every attempt fails, the returned driver is freshly created and has
    NOT loaded ``url``.
    """
    restart_markers = (
        "ERR_INTERNET_DISCONNECTED",
        "ERR_NETWORK_CHANGED",
        "ERR_NAME_NOT_RESOLVED",
        "disconnected: not connected to DevTools",
        "chrome not reachable",
        "timeout",
    )

    for attempt in range(1, attempts + 1):
        # Wait for connectivity, giving up after 60 seconds offline.
        offline_since = time.time()
        while True:
            if is_online():
                break
            if time.time() - offline_since > 60:
                raise TimeoutException("Сеть не восстановилась за 60 секунд.")
            time.sleep(1.5)

        try:
            drv.get(url)
            if not wait_css:
                time.sleep(1.2)
            else:
                WebDriverWait(drv, 12).until(
                    lambda d: len(d.find_elements(By.CSS_SELECTOR, wait_css)) > 0
                )
            return drv
        except (TimeoutException, WebDriverException) as err:
            text = str(err)
            print(f"[safe_get] попытка {attempt}/{attempts} неудачна: {text[:140]}...", file=sys.stderr)
            time.sleep(sleep_base * attempt)
            must_restart = attempt == attempts or any(m in text for m in restart_markers)
            if must_restart:
                # Replace the browser; quitting the old one is best-effort.
                try:
                    drv.quit()
                except Exception:
                    pass
                drv = make_driver()
    return drv

def safe_scroll_to_bottom(driver, url_for_reload: str,
                          max_same=3, sleep_between=2.0,
                          js_attempts=3, kb_attempts=2):
    """Scroll the page down until its height stops changing; return the driver.

    Each round scrolls with JavaScript; if that raises ``WebDriverException``
    the round falls back to keyboard scrolling (PAGE_DOWN x12, then END). The
    loop ends once the measured ``document.body.scrollHeight`` is equal to the
    previous measurement ``max_same`` times in a row.

    If JavaScript scrolling has failed ``js_attempts`` times in a row and
    keyboard scrolling ``kb_attempts`` times in a row, the driver is quit, a
    new one is created with ``make_driver()`` and ``url_for_reload`` is opened
    again. Exceptions raised by that restart are not caught here.

    Args:
        driver: WebDriver showing the page to scroll.
        url_for_reload: URL to reopen after a driver restart.
        max_same: number of consecutive unchanged height readings that ends
            the loop.
        sleep_between: seconds to sleep after each scroll before measuring.
        js_attempts: JS failure streak required for a restart.
        kb_attempts: keyboard failure streak required for a restart.

    Returns:
        The driver to keep using (a new object if a restart happened).
    """
    last_height, same = 0, 0
    js_fail_streak = kb_fail_streak = 0

    def note_height(height):
        """Update the unchanged-height counter with a new measurement."""
        nonlocal last_height, same
        same = same + 1 if height == last_height else 0
        # Compare against the PREVIOUS reading, not the maximum ever seen:
        # with max(), a page whose height shrank once would never match again
        # and the loop would never finish.
        last_height = height

    def js_scroll_round():
        """Scroll via JavaScript; return False if the driver call fails."""
        try:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(sleep_between)
            note_height(driver.execute_script("return document.body.scrollHeight"))
            return True
        except WebDriverException:
            return False

    def kb_scroll_round():
        """Scroll via keyboard events; return False on any failure."""
        try:
            body = driver.find_element(By.TAG_NAME, "body")
            ActionChains(driver).move_to_element(body).click(body).perform()
            for _ in range(12):
                body.send_keys(Keys.PAGE_DOWN)
                time.sleep(0.12)
            body.send_keys(Keys.END)
            time.sleep(sleep_between)
            note_height(driver.execute_script("return document.body.scrollHeight"))
            return True
        except Exception:
            return False

    while True:
        ok = js_scroll_round()
        if not ok:
            js_fail_streak += 1
            ok2 = kb_scroll_round()
            kb_fail_streak = 0 if ok2 else kb_fail_streak + 1
        else:
            js_fail_streak = 0

        if js_fail_streak >= js_attempts and kb_fail_streak >= kb_attempts:
            print("↻ вкладка зависла: перезапуск драйвера и повторное открытие страницы…")
            try:
                driver.quit()
            except Exception:
                pass
            driver = make_driver()
            driver.get(url_for_reload)
            time.sleep(1.5)
            last_height = same = js_fail_streak = kb_fail_streak = 0
            continue

        if same >= max_same:
            break

    time.sleep(1.1)
    return driver

def build_url(did:int, page:int) -> str:
    """Return BASE with its district filter set to ``did`` and page set to ``page``.

    Any existing ``district[...]`` and ``p`` query parameters in BASE are
    dropped; ``district[0]`` and ``p`` are then appended after the remaining
    parameters.
    """
    parts = urlparse(BASE)
    params = {
        key: values
        for key, values in parse_qs(parts.query, keep_blank_values=True).items()
        if key != "p" and not key.startswith("district[")
    }
    params["district[0]"] = [str(did)]
    params["p"] = [str(page)]
    new_query = urlencode(params, doseq=True)
    return urlunparse(
        (parts.scheme, parts.netloc, parts.path, parts.params, new_query, parts.fragment)
    )

def collect_links(driver) -> set[str]:
    """Return the listing links on the current page, without query strings.

    Elements are found with the CSS selector for listing cards. Each element's
    ``href`` is read once; elements without an ``href`` and hrefs that become
    empty after removing the query string are skipped.
    """
    cards = driver.find_elements(By.CSS_SELECTOR, "a[data-name='LinkArea'], a._93444fe79c--media--9P6wN")
    links = set()
    for card in cards:
        href = card.get_attribute("href")  # read once per element instead of twice
        if not href:
            continue
        clean = href.split("?")[0]
        if clean:  # an href like "?utm=1" would otherwise add "" to the set
            links.add(clean)
    return links

# -------- WORKER: one district --------
def scan_district(did: int):
    """Scan result pages of one district and append new links to the output file.

    Creates its own driver, walks pages 1..MAX_PAGES, and after each page
    writes the collected links immediately via ``append_threadsafe``. The
    district is abandoned as soon as a page contributes no new links. The
    driver is always quit at the end.

    Returns:
        A ``(did, total_added)`` tuple.
    """
    browser = make_driver()
    written_total = 0
    try:
        for page_no in range(1, MAX_PAGES + 1):
            page_url = build_url(did, page_no)
            print(f"[did {did} | p {page_no}] GET {page_url}")
            # Both helpers may hand back a replacement driver.
            browser = safe_get(browser, page_url, wait_css="body")
            browser = safe_scroll_to_bottom(browser, page_url)
            found = collect_links(browser)
            # Persist right away so progress survives a later failure.
            written = append_threadsafe(found)
            written_total += written
            print(f"[did {did} | p {page_no}] на странице: {len(found)} | новых ЗАПИСАНО: {written}")

            # Early-exit heuristic: no new links on this page ends the district.
            if written == 0:
                print(f"[did {did}] новых нет → стоп по району")
                break

            time.sleep(random.uniform(0.15, 0.35))
    finally:
        try:
            browser.quit()
        except Exception:
            pass
    return did, written_total

# -------- RUN --------
# Run the district scans in parallel and report progress as each one finishes.
print(f"Старт. Уже в {OUTPUT}: {len(seen_links)} ссылок.\n")
t0 = time.time()

with ThreadPoolExecutor(max_workers=N_BROWSERS) as pool:
    # Map each future to its district so a failure can still be attributed.
    futures = {pool.submit(scan_district, did): did for did in district_ids}
    for fut in as_completed(futures):
        did = futures[fut]
        try:
            _, added = fut.result()
        except Exception as exc:
            # One failed district must not abort reporting for the others or
            # skip the final summary below.
            print(f"✖ Район {did} завершился с ошибкой: {exc}", file=sys.stderr)
            continue
        print(f"✔ Район {did} завершён. Добавлено: {added}")

elapsed = time.time() - t0
print(f"\nГотово. В {OUTPUT}  {len(load_existing(OUTPUT))}  ссылок.")
print(f"Время: {elapsed:.1f} c. Параллельных окон: {N_BROWSERS}.")

Старт. Уже в links.txt: 39194 ссылок.

[did 3 | p 1] GET https://www.cian.ru/cat.php?deal_type=sale&engine_version=2&offer_type=flat&region=1&district%5B0%5D=3&p=1
[did 4 | p 1] GET https://www.cian.ru/cat.php?deal_type=sale&engine_version=2&offer_type=flat&region=1&district%5B0%5D=4&p=1
[did 2 | p 1] GET https://www.cian.ru/cat.php?deal_type=sale&engine_version=2&offer_type=flat&region=1&district%5B0%5D=2&p=1
[did 1 | p 1] GET https://www.cian.ru/cat.php?deal_type=sale&engine_version=2&offer_type=flat&region=1&district%5B0%5D=1&p=1
[did 3 | p 1] на странице: 0 | новых ЗАПИСАНО: 0
[did 3] новых нет → стоп по району
[did 2 | p 1] на странице: 0 | новых ЗАПИСАНО: 0
[did 2] новых нет → стоп по району
✔ Район 3 завершён. Добавлено: 0
✔ Район 2 завершён. Добавлено: 0
[did 5 | p 1] GET https://www.cian.ru/cat.php?deal_type=sale&engine_version=2&offer_type=flat&region=1&district%5B0%5D=5&p=1
[did 6 | p 1] GET https://www.cian.ru/cat.php?deal_type=sale&engine_version=2&offer_type=flat&region=1

In [4]:
# === SETTINGS ===
LINKS_PATH = "links.txt"        # file with links (one per line)
CSV_PATH   = "cian_flats.csv"   # where the result is written

WORKERS = 6                     # number of simultaneously open Chrome instances (threads)
BATCH_SIZE = 50                 # how many rows are collected before appending to the CSV
TIMEOUT_WAIT = 5.5              # wait for individual elements
TIMEOUT_PAGELOAD = 14           # page load timeout
PAUSE = (0.12, 0.4)             # random micro-pause between operations

# --- sharding (to split work across several machines/processes) ---
SHARDS = 4   # total number of shards/machines
SHARD  = 1   # index of the current shard (1..SHARDS)

In [5]:
def make_driver():
    """Create and return a headless Chrome WebDriver tuned for fast page parsing.

    The driver uses the "none" page-load strategy, blocks images, stylesheets,
    fonts and plugins through Chrome prefs, sets a fixed user agent, and applies
    TIMEOUT_PAGELOAD / TIMEOUT_WAIT as page-load and script timeouts. It then
    tries to block media, font and style URLs through CDP; a failure of those
    CDP calls is ignored.
    """
    chrome_opts = Options()
    chrome_opts.page_load_strategy = "none"
    for flag in (
        "--headless=new",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-notifications",
        "--mute-audio",
        "--disable-extensions",
        "--disable-infobars",
        "--window-size=1920,1080",
        "--log-level=3",
        # turn off images (prefs below also block styles, fonts and plugins)
        "--blink-settings=imagesEnabled=false",
    ):
        chrome_opts.add_argument(flag)

    blocked = 2
    chrome_opts.add_experimental_option("prefs", {
        "profile.managed_default_content_settings.images": blocked,
        "profile.managed_default_content_settings.stylesheets": blocked,
        "profile.managed_default_content_settings.fonts": blocked,
        "profile.managed_default_content_settings.plugins": blocked,
    })

    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
    )
    chrome_opts.add_argument("--user-agent=" + user_agent)

    browser = webdriver.Chrome(options=chrome_opts)
    browser.set_page_load_timeout(TIMEOUT_PAGELOAD)
    browser.set_script_timeout(TIMEOUT_WAIT)

    blocked_patterns = [
        "*.jpg", "*.jpeg", "*.png", "*.gif", "*.webp", "*.svg",
        "*.mp4", "*.webm", "*.avi", "*.mkv",
        "*.woff", "*.woff2", "*.ttf", "*.otf",
        "*.css", "*.map",
    ]
    try:
        browser.execute_cdp_cmd("Network.enable", {})
        browser.execute_cdp_cmd("Network.setBlockedURLs", {"urls": blocked_patterns})
    except Exception:
        # CDP blocking is optional; continue without it.
        pass

    return browser

def wait_any(drv, timeout=TIMEOUT_WAIT):
    """Wait until the title, price or address element is present.

    Waits at most ``timeout``; any exception (including the wait timing out)
    is swallowed, so the function always returns None.
    """
    try:
        conditions = [
            EC.presence_of_element_located(LOCATORS[field])
            for field in ("title", "price", "address")
        ]
        WebDriverWait(drv, timeout).until(EC.any_of(*conditions))
    except Exception:
        pass

def open_url(drv, url):
    """Open ``url`` in ``drv`` and stop further loading once key elements appear.

    Returns:
        None on success, otherwise a string describing the failure, prefixed
        with ``page_load_timeout:``, ``webdriver_error:`` or
        ``unknown_open_error:``.
    """
    def _stop_loading():
        # Best-effort: ask the page to stop loading; failures are ignored.
        try:
            drv.execute_script("return window.stop && window.stop();")
        except Exception:
            pass

    try:
        drv.get(url)
        human_pause(0.15, 0.3)
        try:
            wait_any(drv, timeout=max(2.5, TIMEOUT_WAIT - 2))
        finally:
            _stop_loading()
    except TimeoutException as e:
        _stop_loading()
        return f"page_load_timeout: {e}"
    except WebDriverException as e:
        return f"webdriver_error: {e}"
    except Exception as e:
        return f"unknown_open_error: {e}"
    return None

def safe_text(drv, locator, timeout=TIMEOUT_WAIT):
    """Return the stripped text of the element at ``locator``, or "" on any failure.

    Waits up to ``timeout`` for the element to be present.
    """
    try:
        waiter = WebDriverWait(drv, timeout)
        found = waiter.until(EC.presence_of_element_located(locator))
        raw_text = found.text
        return raw_text.strip()
    except Exception:
        return ""

def parse_one_url(url):
    """Parse one listing page with a dedicated driver and return its fields.

    Returns:
        A dict with ``url``, ``_error`` (empty string when opening succeeded)
        and one text value per listing field; fields that cannot be read are
        empty strings. The driver is always quit before returning.
    """
    browser = make_driver()
    try:
        open_error = open_url(browser, url)
        record = {"url": url, "_error": open_error or ""}
        human_pause(0.08, 0.25)

        field_names = (
            "title", "price", "address", "area", "floor",
            "year", "metro_name", "metro_time", "house_type",
        )
        for field in field_names:
            record[field] = safe_text(browser, LOCATORS[field])

        return record
    finally:
        try:
            browser.quit()
        except Exception:
            pass

In [6]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException

import time, random, csv, sys, os, hashlib

# Selenium locators, as (strategy, selector) pairs, for each field read from a
# listing page.
LOCATORS = {
    # NOTE: the second operand of the union is a subset of the first (//h1).
    "title":   (By.XPATH, "//h1 | //h1[contains(.,'квартира') or contains(.,'Квартира')]"),
    "price":   (By.CSS_SELECTOR, '[data-testid="price-amount"] span'),
    "address": (By.CSS_SELECTOR, 'div[data-name="AddressContainer"]'),
    # The next three take the span that follows a label span.
    "area":    (By.XPATH, "//span[contains(text(), 'Общая площадь')]/following-sibling::span"),
    "floor":   (By.XPATH, "//span[contains(text(), 'Этаж')]/following-sibling::span"),
    "year":    (By.XPATH, "//span[contains(text(), 'Год постройки')]/following-sibling::span"),
    # Only the first UndergroundItem entry is used.
    "metro_name": (By.XPATH, "(//li[@data-name='UndergroundItem'])[1]//a[contains(@class,'underground_link')]"),
    "metro_time": (By.XPATH, "(//li[@data-name='UndergroundItem'])[1]//span[contains(@class,'underground_time')]"),
    "house_type": (By.XPATH, "//div[@data-name='OfferSummaryInfoItem'][.//p[contains(., 'Тип дома')]]//p[2]")
}

# Column order of the output CSV.
CSV_KEYS = [
    "url","title","price","address","area","floor",
    "year","metro_name","metro_time","house_type","_error"
]

def human_pause(a=PAUSE[0], b=PAUSE[1]):
    """Sleep for a random duration between ``a`` and ``b``; skip when ``b`` is not positive."""
    if not b > 0:
        return
    delay = random.uniform(a, b)
    time.sleep(delay)

def stable_hash(s: str) -> int:
    """Return the MD5 digest of the UTF-8 encoding of ``s`` as a non-negative integer.

    Unlike the built-in ``hash``, the value is the same in every process, which
    makes it usable for splitting work into shards.
    """
    digest = hashlib.md5(s.encode("utf-8")).digest()
    return int.from_bytes(digest, "big")

def read_existing_urls(path: str) -> set:
    """Return the set of non-empty ``url`` values already present in the CSV at ``path``.

    A missing or empty file yields an empty set. Reading is best-effort: if the
    file cannot be read completely, the URLs collected so far are returned and
    the problem is reported on stderr instead of being silently ignored (a
    silently truncated result would cause already-parsed URLs to be processed
    and written again).

    Args:
        path: path to a CSV file with a header row containing a ``url`` column.

    Returns:
        Set of stripped URL strings.
    """
    seen = set()
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        return seen
    try:
        with open(path, "r", encoding="utf-8", newline="") as f:
            for row in csv.DictReader(f):
                u = (row.get("url") or "").strip()
                if u:
                    seen.add(u)
    except Exception as exc:
        print(
            f"[read_existing_urls] не удалось полностью прочитать {path}: {exc}",
            file=sys.stderr,
        )
    return seen

def save_csv_header_if_needed(path=CSV_PATH):
    """Write the CSV header row to ``path`` when the file is missing or empty."""
    already_has_content = os.path.exists(path) and os.path.getsize(path) != 0
    if already_has_content:
        return
    with open(path, "w", newline="", encoding="utf-8") as out:
        writer = csv.DictWriter(out, fieldnames=CSV_KEYS)
        writer.writeheader()

def append_csv(rows, path=CSV_PATH):
    """Append ``rows`` (dicts) to the CSV at ``path`` using the CSV_KEYS column order.

    Keys missing from a row are written as empty strings; keys not listed in
    CSV_KEYS are dropped. Nothing is done for an empty ``rows``.
    """
    if not rows:
        return
    with open(path, "a", newline="", encoding="utf-8") as out:
        writer = csv.DictWriter(out, fieldnames=CSV_KEYS)
        for record in rows:
            projected = {}
            for column in CSV_KEYS:
                projected[column] = record.get(column, "")
            writer.writerow(projected)

def load_links(path=LINKS_PATH):
    """Return the non-blank, stripped lines of ``path`` as a list, in file order.

    If the file does not exist, a message is printed to stderr and an empty
    list is returned.
    """
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as fh:
            stripped = (raw.strip() for raw in fh)
            return [line for line in stripped if line]
    print(f"Файл {path} не найден", file=sys.stderr)
    return []

In [7]:
# Read the links.
links = load_links(LINKS_PATH)
links = [u.strip() for u in links if u.strip()]
# De-duplicate while preserving order.
links = list(dict.fromkeys(links))

if not links:
    raise SystemExit("Список ссылок пуст. Заполни links.txt")

# Skip links that are already present in the CSV.
seen = read_existing_urls(CSV_PATH)
if seen:
    before = len(links)
    links = [u for u in links if u not in seen]
    print(f"Пропущено уже записанных: {before - len(links)}; осталось к обработке: {len(links)}")

# Sharding (when SHARDS > 1): keep only the links whose stable hash maps to
# this shard. SHARD is 1-based, the modulo result is 0-based.
if SHARDS > 1:
    if not (1 <= SHARD <= SHARDS):
        raise SystemExit(f"Неверный SHARD={SHARD}; должен быть 1..{SHARDS}")
    shard_idx = SHARD - 1
    links_sharded = [u for u in links if stable_hash(u) % SHARDS == shard_idx]
    print(f"Шард {SHARD}/{SHARDS}: беру {len(links_sharded)} из {len(links)}")
    links = links_sharded

if not links:
    raise SystemExit("Нечего обрабатывать — все ссылки уже в CSV или не попали в этот шард.")

# Make sure the CSV has a header row before rows are appended to it.
save_csv_header_if_needed(CSV_PATH)
print(f"К старту готово: {len(links)} URL")

Шард 1/4: беру 10191 из 40337
К старту готово: 10191 URL


In [None]:
# Parse every link in a thread pool and append the results to the CSV in batches.
done = 0      # number of URLs processed so far
errors = 0    # number of rows that carry an error
batch = []    # rows waiting to be appended to the CSV

t0 = time.time()
print(f"Старт: {len(links)} URL, воркеров={WORKERS}, batch={BATCH_SIZE}")

try:
    with ThreadPoolExecutor(max_workers=max(1, WORKERS)) as ex:
        # Map each future back to its URL so failures can be recorded per URL.
        futures = {ex.submit(parse_one_url, u): u for u in links}
        for fut in as_completed(futures):
            url = futures[fut]
            try:
                # NOTE: as_completed only yields finished futures, so this
                # timeout does not bound the time a worker spends on a URL.
                row = fut.result(timeout=TIMEOUT_PAGELOAD + 12)
                if row.get("_error"):
                    errors += 1
            except Exception as e:
                # A crashed worker still produces a row, holding only the error.
                row = {"url": url, "_error": f"future_error: {e}"}
                errors += 1

            batch.append(row)
            done += 1

            if len(batch) >= BATCH_SIZE:
                append_csv(batch, CSV_PATH)
                batch.clear()
                # short progress line in the cell output
                print(f"Сохранено {done}/{len(links)} (ошибок {errors})")

finally:
    # Flush the last partial batch, also when the loop was interrupted.
    if batch:
        append_csv(batch, CSV_PATH)

dt = time.time() - t0
print(f"Готово: {done} URL, ошибок {errors}. Файл -> {CSV_PATH}. Время: {dt:.1f} c")

Старт: 10191 URL, воркеров=6, batch=50


In [None]:
# fast_cian_scraper.py — один файл

from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
import time, random, csv, sys, os

# ---- settings ----
WORKERS = 4               # presumably the number of parallel workers; its use is not visible here
TIMEOUT_WAIT = 6          # default timeout for wait_any/safe_text; also the driver's script timeout
TIMEOUT_PAGELOAD = 15     # value passed to the driver's set_page_load_timeout
PAUSE = (0.15, 0.5)  # default (a, b) bounds for human_pause
CSV_PATH = "cian_flats.csv"  # presumably the output CSV; its use is not visible here
LINKS_PATH = "links.txt"  # presumably the input file with links; its use is not visible here

# Selenium locators, as (strategy, selector) pairs, for each field read from a
# listing page.
LOCATORS = {
    # NOTE: the second operand of the union is a subset of the first (//h1).
    "title":   (By.XPATH, "//h1 | //h1[contains(.,'квартира') or contains(.,'Квартира')]"),
    "price":   (By.CSS_SELECTOR, '[data-testid="price-amount"] span'),
    "address": (By.CSS_SELECTOR, 'div[data-name="AddressContainer"]'),
    # The next three take the span that follows a label span.
    "area":    (By.XPATH, "//span[contains(text(), 'Общая площадь')]/following-sibling::span"),
    "floor":   (By.XPATH, "//span[contains(text(), 'Этаж')]/following-sibling::span"),
    "year":    (By.XPATH, "//span[contains(text(), 'Год постройки')]/following-sibling::span"),
    # Only the first UndergroundItem entry is used.
    "metro_name": (By.XPATH, "(//li[@data-name='UndergroundItem'])[1]//a[contains(@class,'underground_link')]"),
    "metro_time": (By.XPATH, "(//li[@data-name='UndergroundItem'])[1]//span[contains(@class,'underground_time')]"),
    "house_type": (By.XPATH, "//div[@data-name='OfferSummaryInfoItem'][.//p[contains(., 'Тип дома')]]//p[2]")
}

def human_pause(a=PAUSE[0], b=PAUSE[1]):
    """Sleep for a random duration between ``a`` and ``b``."""
    delay = random.uniform(a, b)
    time.sleep(delay)

def make_driver():
    """Create and return a headless Chrome WebDriver tuned for fast page parsing.

    The driver uses the "none" page-load strategy (it does not wait for the
    full page load), disables images, sets a fixed user agent, and applies
    TIMEOUT_PAGELOAD / TIMEOUT_WAIT as page-load and script timeouts. It then
    tries to block image and video URLs through CDP; a failure of those CDP
    calls is ignored.
    """
    chrome_opts = Options()

    # Fast and quiet profile.
    chrome_opts.page_load_strategy = "none"
    for flag in (
        "--headless=new",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-notifications",
        "--mute-audio",
        "--disable-extensions",
        "--disable-infobars",
        "--window-size=1920,1080",
        "--log-level=3",
        "--blink-settings=imagesEnabled=false",  # images off, method 1
    ):
        chrome_opts.add_argument(flag)

    # Images off, method 2: Chrome prefs.
    chrome_opts.add_experimental_option(
        "prefs", {"profile.managed_default_content_settings.images": 2}
    )

    # A more browser-like user agent.
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0 Safari/537.36"
    )
    chrome_opts.add_argument("--user-agent=" + user_agent)

    browser = webdriver.Chrome(options=chrome_opts)
    browser.set_page_load_timeout(TIMEOUT_PAGELOAD)
    browser.set_script_timeout(TIMEOUT_WAIT)

    # Additionally block media requests through CDP.
    blocked_patterns = ["*.jpg", "*.jpeg", "*.png", "*.gif", "*.webp", "*.svg", "*.mp4", "*.webm", "*.avi"]
    try:
        browser.execute_cdp_cmd("Network.enable", {})
        browser.execute_cdp_cmd("Network.setBlockedURLs", {"urls": blocked_patterns})
    except Exception:
        # If the environment does not support CDP, just skip this step.
        pass

    return browser

def wait_any(drv, timeout=TIMEOUT_WAIT):
    """Wait until the title, price or address element is present in the page.

    Best effort: every exception raised while waiting (including the wait
    running out of time) is swallowed, and the function always returns None.

    Args:
        drv: WebDriver to poll.
        timeout: Value passed to ``WebDriverWait`` as the wait limit.
    """
    key_fields = ("title", "price", "address")
    try:
        conditions = [
            EC.presence_of_element_located(LOCATORS[name]) for name in key_fields
        ]
        WebDriverWait(drv, timeout).until(EC.any_of(*conditions))
    except Exception:
        # Not critical: the caller goes on and collects whatever is available.
        pass

def open_url(drv, url):
    """Open ``url`` in ``drv``, then stop any further page loading.

    After navigation the function pauses briefly, waits (best effort) for one
    of the key elements via ``wait_any`` and finally asks the page to stop
    loading with ``window.stop()``.

    Returns:
        None when navigation raised nothing; otherwise a string describing the
        failure, prefixed with ``page_load_timeout:``, ``webdriver_error:`` or
        ``unknown_open_error:``.
    """

    def _abort_loading():
        # Ask the browser to stop loading; failures here are ignored on purpose.
        try:
            drv.execute_script("return window.stop && window.stop();")
        except Exception:
            pass

    failure = None
    try:
        drv.get(url)
        human_pause(0.2, 0.4)
        try:
            wait_any(drv, timeout=4)
        finally:
            _abort_loading()
    except TimeoutException as exc:
        # Page-load timeout: stop the load, then report it to the caller.
        _abort_loading()
        failure = f"page_load_timeout: {exc}"
    except WebDriverException as exc:
        failure = f"webdriver_error: {exc}"
    except Exception as exc:
        failure = f"unknown_open_error: {exc}"
    return failure

def safe_text(drv, locator, timeout=TIMEOUT_WAIT):
    """Return the stripped text of the element matched by ``locator``.

    Waits (via ``WebDriverWait``) for the element to be present. Any exception
    along the way yields an empty string instead of propagating.
    """
    try:
        waiter = WebDriverWait(drv, timeout)
        element = waiter.until(EC.presence_of_element_located(locator))
        raw_text = element.text
        return raw_text.strip()
    except Exception:
        return ""

def parse_one_url(url):
    """Scrape one URL with its own driver, which is created and closed here.

    Returns:
        A dict with ``url``, ``_error`` (the message from ``open_url`` or an
        empty string) and one text entry per scraped field; fields that could
        not be read are empty strings (see ``safe_text``).
    """
    field_names = (
        "title", "price", "address", "area", "floor",
        "year", "metro_name", "metro_time", "house_type",
    )

    browser = make_driver()
    try:
        open_error = open_url(browser, url)
        record = {"url": url, "_error": open_error or ""}
        human_pause(0.1, 0.3)

        # Fields are still collected after an open error: keep whatever loaded.
        for name in field_names:
            record[name] = safe_text(browser, LOCATORS[name])

        return record
    finally:
        # Always release the browser; a failing quit() must not mask the result.
        try:
            browser.quit()
        except Exception:
            pass

# Column names, in order, used by the CSV DictWriter helpers that reference
# this list. "_error" carries the error message for a URL (empty string when
# there was none).
CSV_KEYS = [
    "url",
    "title",
    "price",
    "address",
    "area",
    "floor",
    "year",
    "metro_name",
    "metro_time",
    "house_type",
    "_error",
]

def save_csv_header(path=CSV_PATH):
    """Create (or truncate) the CSV file at ``path`` and write only the header row."""
    with open(path, "w", newline="", encoding="utf-8") as out:
        header_writer = csv.DictWriter(out, fieldnames=CSV_KEYS)
        header_writer.writeheader()

def append_csv(rows, path=CSV_PATH):
    """Append ``rows`` (dicts) to the CSV file at ``path``.

    Each row is projected onto ``CSV_KEYS``: missing keys become empty strings
    and keys outside ``CSV_KEYS`` are not written.
    """
    with open(path, "a", newline="", encoding="utf-8") as out:
        writer = csv.DictWriter(out, fieldnames=CSV_KEYS)
        writer.writerows(
            {key: row.get(key, "") for key in CSV_KEYS} for row in rows
        )

def load_links_fallback():
    """Read links from the file at ``LINKS_PATH``.

    Returns:
        The non-blank lines of the file with surrounding whitespace stripped,
        in file order. If the file does not exist, a message is printed to
        stderr and an empty list is returned.
    """
    if not os.path.exists(LINKS_PATH):
        print(f"Файл со ссылками {LINKS_PATH} не найден и переменная links не определена.", file=sys.stderr)
        return []

    with open(LINKS_PATH, "r", encoding="utf-8") as src:
        stripped = (line.strip() for line in src)
        return [item for item in stripped if item]

# ----------------- run -----------------
if __name__ == "__main__":
    # Links are always read from the links file. The former "use an externally
    # defined `links` variable" branch had been disabled by wrapping it in a
    # string literal whose indentation did not match the rest of this block,
    # which made the block fail with IndentationError; it is removed here.
    # dict.fromkeys drops duplicates while keeping the order from the file.
    links = list(dict.fromkeys(load_links_fallback()))

    if not links:
        print("Список ссылок пуст. Заполни переменную `links` или файл links.txt")
        sys.exit(1)

    save_csv_header()

    batch = []   # rows waiting to be flushed to the CSV
    done = 0     # URLs processed so far, whether or not they produced an error
    errors = 0   # rows that carry a non-empty "_error"

    with ThreadPoolExecutor(max_workers=WORKERS) as ex:
        futures = {ex.submit(parse_one_url, u): u for u in links}
        for fut in as_completed(futures):
            try:
                # as_completed yields only finished futures, so result()
                # returns (or raises) immediately; no timeout is needed.
                row = fut.result()
            except Exception as e:
                # The worker itself failed (e.g. the driver could not start):
                # still record the URL so it shows up in the CSV.
                row = {"url": futures[fut], "_error": f"future_error: {e}"}

            batch.append(row)
            done += 1  # count every processed URL so progress matches rows written
            if row.get("_error"):
                errors += 1

            # Flush in chunks of 50 rows so progress survives an interruption.
            if len(batch) >= 50:
                append_csv(batch)
                print(f"Сохранено {done}/{len(links)} (ошибок {errors})")
                batch.clear()

    # Write whatever is left after the last full chunk.
    if batch:
        append_csv(batch)

    print(f"Готово: {done} URL, ошибок {errors}. Результат -> {CSV_PATH}")