# In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
import random
from concurrent.futures import ThreadPoolExecutor, as_completed

# ----------------------------------------
# Config
# ----------------------------------------

# Listing index URL; the main loop requests it with a page-number query parameter.
base_url = 'https://www.avito.ma/fr/maroc/voitures'
# CSV file the scraped rows are appended to (created with a header row if missing).
output_file = 'avito_cars_data.csv'
# Last listing page number the main loop requests (inclusive upper bound).
max_pages = 2609
# Worker count for the thread pool that fetches car detail pages.
max_threads = 10
# Number of attempts fetch_with_retries makes before giving up on a URL.
retry_limit = 3

# ----------------------------------------
# Headers Rotation
# ----------------------------------------

# Pool of User-Agent header values; get_headers() picks one at random.
# NOTE(review): every entry contains a literal "..." -- these look like
# truncated placeholders rather than complete browser User-Agent strings.
# Confirm, and replace with full values if so.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)...',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)...',
    'Mozilla/5.0 (X11; Linux x86_64)...',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X)...',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)... Safari/537.36',
]

def get_headers():
    """Return HTTP request headers with a randomly chosen User-Agent.

    The value is drawn from the module-level ``user_agents`` list on every
    call, so consecutive requests may present different agents.
    """
    chosen_agent = random.choice(user_agents)
    return {'User-Agent': chosen_agent}

# ----------------------------------------
# CSV Setup
# ----------------------------------------

# On first run, create the output file containing only the header row.
# Later appends are written without a header, so the column order here
# has to match the key order of the dicts returned by scrape_car().
if not os.path.exists(output_file):
    header_frame = pd.DataFrame(
        columns=[
            'url',
            'type_boit',
            'type_carburant',
            'kilometrage',
            'marke',
            'model',
            'puissance',
            'price',
        ]
    )
    header_frame.to_csv(output_file, index=False)

# ----------------------------------------
# Safe Request with Retry
# ----------------------------------------

def fetch_with_retries(url, params=None):
    """GET *url*, making up to ``retry_limit`` attempts.

    Each attempt uses freshly randomised headers and a 10 second timeout.
    An attempt succeeds only when the status code is 200 and the body does
    not contain the word "captcha" (case-insensitive).

    Args:
        url: Address to request.
        params: Optional mapping of query-string parameters.

    Returns:
        The successful ``requests.Response``, or ``None`` once every attempt
        has failed.

    Only ``requests.RequestException`` (the base class of the errors raised
    by ``requests``) is treated as a retryable failure; unrelated exceptions
    propagate so that programming errors are not silently retried.
    """
    for attempt in range(1, retry_limit + 1):
        try:
            response = requests.get(url, headers=get_headers(), params=params, timeout=10)
        except requests.RequestException as e:
            print(f"[!] Error fetching {url}: {e}")
            delay = random.uniform(2, 4)
        else:
            blocked = "captcha" in response.text.lower()
            if response.status_code == 200 and not blocked:
                return response
            # A captcha page is served with status 200, so say explicitly
            # why the response was rejected.
            reason = " (captcha detected)" if blocked else ""
            print(f"[!] Retry {attempt} - Status: {response.status_code}{reason}")
            delay = random.uniform(2, 5)

        # Back off only when another attempt follows; sleeping after the
        # final failure would just delay returning None.
        if attempt < retry_limit:
            time.sleep(delay)
    return None

# ----------------------------------------
# Car Detail Scraper
# ----------------------------------------

def scrape_car(url):
    """Fetch one car detail page and extract its fields.

    A *url* that does not start with "http" is treated as site-relative and
    prefixed with "https://www.avito.ma".

    Returns:
        A dict with the keys ``url``, ``type_boit``, ``type_carburant``,
        ``kilometrage``, ``marke``, ``model``, ``puissance`` and ``price``
        (in that order), where any value not found on the page is ``None``.
        Returns ``None`` when the page could not be fetched or an exception
        was raised while scraping.

    NOTE(review): the CSS class names used as selectors look auto-generated
    and may change between site builds -- verify against the live markup.
    NOTE(review): most fields are read by position (value spans 1..5; span 0
    is never read), which assumes a fixed feature order on every listing --
    confirm.
    """
    try:
        if not url.startswith("http"):
            url = "https://www.avito.ma" + url

        page = fetch_with_retries(url)
        if not page:
            return None

        document = BeautifulSoup(page.text, 'html.parser')

        price_node = document.find('p', {'class': 'jdRkSM'})
        price = price_node.text.strip() if price_node else None

        # Start from an all-None record so both outcomes (feature block
        # present or absent) share one return path and one key order.
        record = {
            'url': url,
            'type_boit': None,
            'type_carburant': None,
            'kilometrage': None,
            'marke': None,
            'model': None,
            'puissance': None,
            'price': price
        }

        feature_block = document.find('div', {'class': 'dnArJl'})
        if not feature_block:
            return record

        value_spans = feature_block.find_all('span', {'class': 'fjZBup'})
        label_texts = [
            span.text.strip()
            for span in feature_block.find_all('span', {'class': 'bXFCIH'})
        ]

        def value_at(position):
            """Return the stripped text of the value span at *position*, or None."""
            if len(value_spans) > position:
                return value_spans[position].text.strip()
            return None

        record['type_boit'] = value_at(1)
        record['type_carburant'] = value_at(2)
        record['kilometrage'] = value_at(3)
        record['marke'] = value_at(4)
        record['model'] = value_at(5)

        # The fiscal power is located through its label; this relies on the
        # label list and the value list sharing the same indices.
        if 'Puissance fiscale' in label_texts:
            record['puissance'] = value_at(label_texts.index('Puissance fiscale'))

        return record

    except Exception as e:
        print(f"[!] Error scraping car {url}: {str(e)}")
        return None

# ----------------------------------------
# Main Scraping Loop
# ----------------------------------------

# Walk the listing pages, scrape every car linked from each one, and append
# the results to the CSV page by page.
# NOTE(review): the start page is hard-coded to 1358, presumably to resume an
# earlier interrupted run -- confirm before re-running from scratch.
for page in range(1358, max_pages + 1):
    print(f"\n🌐 Scraping page {page}...")
    params = {'p': page, 'radius': 200}
    car_links = []

    response = fetch_with_retries(base_url, params=params)
    if not response:
        print(f"[!] Failed to fetch page {page}, skipping.")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find('div', {'class': 'crKvIr'})
    if not container:
        print("[!] No container found on this page")
    else:
        links = container.find_all('a', {'class': 'jZXrfL'})
        car_links = [anchor['href'] for anchor in links if anchor.get('href')]
        print(f"[+] Found {len(car_links)} cars on page {page}")

    # Fetch the detail pages concurrently; rows are collected in completion
    # order and failed scrapes (None) are dropped.
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        futures = [executor.submit(scrape_car, url) for url in car_links]
        page_data = [
            row
            for row in (done.result() for done in as_completed(futures))
            if row
        ]

    # Persist after every page so an interruption loses at most one page.
    if not page_data:
        print(f"⚠️ No data to save for page {page}")
    else:
        pd.DataFrame(page_data).to_csv(output_file, mode='a', header=False, index=False)
        print(f"✅ Saved {len(page_data)} cars from page {page}")

    # Polite delay between pages, with a longer pause on every 20th page.
    long_break = page % 20 == 0
    if long_break:
        print("😴 Taking longer break...")
    pause_bounds = (10, 20) if long_break else (2, 4)
    time.sleep(random.uniform(*pause_bounds))

print("\n🎉 Done scraping all pages!")
