In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
import random
from concurrent.futures import ThreadPoolExecutor, as_completed

# ----------------------------------------
# Config
# ----------------------------------------

# Listing URL requested by the page loop; the page number is passed as the
# `o` query parameter.
base_url = 'https://www.avito.ma/fr/maroc/voitures'
# CSV file that receives a header row on first run and appended rows afterwards.
output_file = 'avito_cars.csv'
# NOTE(review): not referenced anywhere in the visible code (the page loop
# uses a hard-coded range) — presumably the total number of listing pages;
# confirm before wiring it in.
max_pages = 2609
# Worker count for the thread pool that scrapes car detail pages.
max_threads = 10
# Maximum number of attempts fetch_with_retries makes per URL.
retry_limit = 3

# ----------------------------------------
# Headers Rotation
# ----------------------------------------

# Pool of User-Agent strings; one is drawn at random for every request.
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)...',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)...',
    'Mozilla/5.0 (X11; Linux x86_64)...',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X)...',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)... Safari/537.36',
]


def get_headers():
    """Return request headers carrying a randomly selected User-Agent."""
    chosen_agent = random.choice(user_agents)
    headers = {'User-Agent': chosen_agent}
    return headers

# ----------------------------------------
# CSV Setup
# ----------------------------------------

# Write a header-only CSV on first run so that later appends (which are
# written without a header) land under the right column names.
if not os.path.exists(output_file):
    _base_columns = [
        'url', 'type_boit', 'type_carburant',
        'kilometrage', 'marke', 'model',
        'puissance', 'price',
    ]
    _option_columns = [
        'has_abs', 'has_airbags', 'has_bluetooth_audio', 'has_rear_camera',
        'has_ac', 'has_esp', 'has_alloy_wheels', 'has_speed_limiter',
        'has_trip_computer', 'has_parking_sensor', 'has_cruise_control',
        'has_leather_seats', 'has_gps', 'has_sunroof', 'has_remote_locking',
        'has_power_windows',
    ]
    _header_frame = pd.DataFrame(columns=_base_columns + _option_columns)
    _header_frame.to_csv(output_file, index=False)

# ----------------------------------------
# Safe Request with Retry
# ----------------------------------------

def fetch_with_retries(url, params=None):
    """GET *url* with retries and return the response, or None on failure.

    A response is accepted only when its status code is 200 and its body
    does not contain the word "captcha" (case-insensitive). Up to
    ``retry_limit`` attempts are made, each with headers from
    ``get_headers()`` and a 10-second timeout. A random pause separates
    consecutive attempts; no pause is taken after the last one.

    Args:
        url: Address to request.
        params: Optional query parameters forwarded to ``requests.get``.

    Returns:
        The accepted ``requests.Response``, or ``None`` when every attempt
        was rejected or raised a ``requests.RequestException``.
    """
    for attempt in range(retry_limit):
        try:
            response = requests.get(url, headers=get_headers(), params=params, timeout=10)
        except requests.RequestException as e:
            # Network-level failure (connection, timeout, bad URL, ...).
            print(f"[!] Error fetching {url}: {e}")
            delay_range = (2, 4)
        else:
            if response.status_code == 200 and "captcha" not in response.text.lower():
                return response
            print(f"[!] Retry {attempt+1} - Status: {response.status_code}")
            delay_range = (2, 5)

        # Only back off when another attempt follows; sleeping after the
        # final failure would just delay the caller for nothing.
        if attempt < retry_limit - 1:
            time.sleep(random.uniform(*delay_range))
    return None

# ----------------------------------------
# Car Detail Scraper
# ----------------------------------------
def scrape_car(url):
    """Scrape one car listing page into a flat dict of fields.

    A URL that does not start with "http" is prefixed with the site origin.
    The result holds the listing URL, the price text, five values read by
    position from the first ``div.dnArJl`` block, the value labelled
    'Puissance fiscale', and one 0/1 flag per known equipment option found
    in the second ``div.dnArJl`` block.

    Returns:
        The dict described above, or ``None`` when the page could not be
        fetched or any exception occurred while parsing (the error is
        printed).
    """
    try:
        if not url.startswith("http"):
            url = "https://www.avito.ma" + url

        response = fetch_with_retries(url)
        if not response:
            return None

        soup = BeautifulSoup(response.text, 'html.parser')

        # Price text, when the price element is present.
        price_node = soup.find('p', {'class': 'jdRkSM'})
        price = None
        if price_node:
            price = price_node.text.strip()

        # The first block carries the basic specs as value/label spans.
        sections = soup.find_all('div', {'class': 'dnArJl'})
        if sections:
            spec_section = sections[0]
            value_spans = spec_section.find_all('span', {'class': 'fjZBup'})
            label_texts = [
                span.text.strip()
                for span in spec_section.find_all('span', {'class': 'bXFCIH'})
            ]
        else:
            value_spans = []
            label_texts = []

        def value_at(position):
            """Return the stripped text of the value span at *position*, if any."""
            if position < len(value_spans):
                return value_spans[position].text.strip()
            return None

        def value_for(label):
            """Return the value at the same index as *label*, if it is listed."""
            if label not in label_texts:
                return None
            return value_at(label_texts.index(label))

        # Site label -> CSV column name for each equipment option.
        option_columns = {
            'ABS': 'has_abs',
            'Airbags': 'has_airbags',
            'CD/MP3/Bluetooth': 'has_bluetooth_audio',
            'Caméra de recul': 'has_rear_camera',
            'Climatisation': 'has_ac',
            'ESP': 'has_esp',
            'Jantes aluminium': 'has_alloy_wheels',
            'Limiteur de vitesse': 'has_speed_limiter',
            'Ordinateur de bord': 'has_trip_computer',
            'Radar de recul': 'has_parking_sensor',
            'Régulateur de vitesse': 'has_cruise_control',
            'Sièges cuir': 'has_leather_seats',
            'Système de navigation/GPS': 'has_gps',
            'Toit ouvrant': 'has_sunroof',
            'Verrouillage centralisé à distance': 'has_remote_locking',
            'Vitres électriques': 'has_power_windows',
        }

        # The second block lists the options the car has; with no second
        # block every flag stays 0.
        if len(sections) >= 2:
            listed_options = {
                span.text.strip()
                for span in sections[1].find_all('span', {'class': 'fjZBup'})
            }
        else:
            listed_options = set()
        option_flags = {
            column: int(label in listed_options)
            for label, column in option_columns.items()
        }

        record = {
            'url': url,
            'type_boit': value_at(1),
            'type_carburant': value_at(2),
            'kilometrage': value_at(3),
            'marke': value_at(4),
            'model': value_at(5),
            'puissance': value_for('Puissance fiscale'),
            'price': price,
        }
        record.update(option_flags)
        return record

    except Exception as e:
        print(f"[!] Error scraping car {url}: {str(e)}")
        return None


# Walk the listing pages, scrape every car linked from each page in parallel,
# and append the rows to the CSV one page at a time.
# NOTE(review): the range is hard-coded to pages 2-5 and does not use
# max_pages — confirm whether this is a leftover from a trial run.
for page in range(2, 6):
    print(f"\n🌐 Scraping page {page}...")
    params = {'o': page}
    car_links = []

    response = fetch_with_retries(base_url, params=params)
    if not response:
        # fetch_with_retries returns None once its attempts are exhausted,
        # so this check must come before anything reads from the response.
        print(f"[!] Failed to fetch page {page}, skipping.")
        continue

    print(response.url)
    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find('div', {'class': 'crKvIr'})
    if container:
        links = container.find_all('a', {'class': 'jZXrfL'})
        car_links = [link['href'] for link in links if link.get('href')]
        print(f"[+] Found {len(car_links)} cars on page {page}")
    else:
        print("[!] No container found on this page")

    # Fetch the detail pages concurrently; scrape_car returns None for
    # listings that could not be scraped, and those are dropped.
    page_data = []
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        futures = [executor.submit(scrape_car, url) for url in car_links]
        for future in as_completed(futures):
            result = future.result()
            if result:
                page_data.append(result)

    # Save after each page so an interrupted run keeps what it has collected.
    if page_data:
        pd.DataFrame(page_data).to_csv(output_file, mode='a', header=False, index=False)
        print(f"✅ Saved {len(page_data)} cars from page {page}")
    else:
        print(f"⚠️ No data to save for page {page}")

    # Random polite delay, with a longer break every 20th page.
    if page % 20 == 0:
        print("😴 Taking longer break...")
        time.sleep(random.uniform(10, 20))
    else:
        time.sleep(random.uniform(2, 4))

print("\n🎉 Done scraping all pages!")



🌐 Scraping page 2...
https://www.avito.ma/fr/maroc/voitures?o=2
[+] Found 34 cars on page 2
✅ Saved 34 cars from page 2

🌐 Scraping page 3...
https://www.avito.ma/fr/maroc/voitures?o=3
[+] Found 35 cars on page 3
[!] Retry 1 - Status: 511
[!] Retry 1 - Status: 511
✅ Saved 35 cars from page 3

🌐 Scraping page 4...
https://www.avito.ma/fr/maroc/voitures?o=4
[+] Found 34 cars on page 4
✅ Saved 34 cars from page 4

🌐 Scraping page 5...
https://www.avito.ma/fr/maroc/voitures?o=5
[+] Found 34 cars on page 5
[!] Retry 1 - Status: 511
✅ Saved 34 cars from page 5

🎉 Done scraping all pages!
