# Projet de Web Scraping - Dakar Auto

Ce notebook va scraper et nettoyer les donn√©es de trois cat√©gories sur dakar-auto.com:
1. Voitures
2. Motos et Scooters
3. Location de voitures

In [1]:
# Installation des biblioth√®ques n√©cessaires
# Ex√©cutez cette cellule une seule fois
import sys
!{sys.executable} -m pip install requests beautifulsoup4 pandas lxml



In [2]:
# Import des biblioth√®ques
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
from typing import List, Dict

## Fonctions utilitaires pour le scraping

In [3]:
def get_page_content(url: str) -> BeautifulSoup:
    """R√©cup√®re le contenu HTML d'une page"""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'lxml')
    except Exception as e:
        print(f"Erreur lors de la r√©cup√©ration de {url}: {e}")
        return None

def clean_text(text: str) -> str:
    """Nettoie le texte en supprimant les espaces superflus"""
    if text:
        return re.sub(r'\s+', ' ', text.strip())
    return ""

def extract_number(text: str) -> str:
    """Extrait les nombres d'un texte"""
    if text:
        numbers = re.findall(r'\d+', text.replace(' ', ''))
        return ''.join(numbers) if numbers else ""
    return ""

## 1. Scraping des Voitures

In [4]:
def scrape_voitures(base_url: str, max_pages: int = 2773) -> List[Dict]:
    """
    Scrape les donn√©es des voitures
    Variables: marque, ann√©e, prix, adresse, kilom√©trage, boite vitesse, carburant, propri√©taire
    """
    all_data = []
    
    for page in range(1, max_pages + 1):
        print(f"Scraping page {page}...")
        url = f"{base_url}?page={page}" if page > 1 else base_url
        soup = get_page_content(url)
        
        if not soup:
            break
            
        articles = soup.find_all('div', class_='listings-cards__list-item')
        
        if not articles:
            print(f"Aucun article trouv√© sur la page {page}")
            break
            
        for article in articles:
            try:
                data = {}
                
                # Marque (titre de l'annonce)
                title_elem = article.find('h2', class_='listing-card__header__title')
                if title_elem:
                    title_link = title_elem.find('a')
                    data['marque'] = clean_text(title_link.get_text()) if title_link else clean_text(title_elem.get_text())
                else:
                    data['marque'] = ""
                
                # Prix
                price_elem = article.find('h3', class_='listing-card__header__price')
                data['prix'] = clean_text(price_elem.get_text()) if price_elem else ""
                
                # Adresse (ville + province)
                address_parts = []
                town_elem = article.find('span', class_='town-suburb')
                if town_elem:
                    address_parts.append(clean_text(town_elem.get_text()))
                province_elem = article.find('span', class_='province')
                if province_elem:
                    address_parts.append(clean_text(province_elem.get_text()))
                data['adresse'] = ' '.join(address_parts)
                
                # Initialiser les caract√©ristiques
                data['ann√©e'] = ""
                data['kilom√©trage'] = ""
                data['boite_vitesse'] = ""
                data['carburant'] = ""
                
                # Extraire l'ann√©e du titre si pr√©sent
                if data['marque']:
                    year_match = re.search(r'\b(19|20)\d{2}\b', data['marque'])
                    if year_match:
                        data['ann√©e'] = year_match.group()
                
                # Caract√©ristiques dans les li de la liste
                specs = article.find_all('li', class_='listing-card__attribute')
                for spec in specs:
                    spec_text = clean_text(spec.get_text()).lower()
                    
                    # Kilom√©trage (contient "km")
                    if 'km' in spec_text and 'km' not in data['kilom√©trage']:
                        data['kilom√©trage'] = clean_text(spec.get_text())
                    
                    # Bo√Æte de vitesse
                    if 'automatique' in spec_text or 'manuelle' in spec_text:
                        data['boite_vitesse'] = clean_text(spec.get_text())
                    
                    # Carburant
                    if any(word in spec_text for word in ['essence', 'diesel', 'hybride', '√©lectrique']):
                        data['carburant'] = clean_text(spec.get_text())
                
                # Propri√©taire (dans le texte "Par [nom]")
                author_elem = article.find('p', class_='time-author')
                if author_elem:
                    author_link = author_elem.find('a')
                    if author_link:
                        author_text = clean_text(author_link.get_text())
                        # Enlever "Par " du d√©but
                        data['propri√©taire'] = author_text.replace('Par ', '').strip()
                    else:
                        data['propri√©taire'] = clean_text(author_elem.get_text()).replace('Par ', '').strip()
                else:
                    data['propri√©taire'] = ""
                
                all_data.append(data)
                
            except Exception as e:
                print(f"Erreur lors du traitement d'un article: {e}")
                continue
        
        time.sleep(1)  # Pause pour ne pas surcharger le serveur
    
    print(f"Total voitures scrap√©es: {len(all_data)}")
    return all_data

## 2. Scraping des Motos et Scooters

In [5]:
def scrape_motos(base_url: str, max_pages: int = 55) -> List[Dict]:
    """
    Scrape les donn√©es des motos et scooters
    Variables: marque, ann√©e, prix, adresse, kilom√©trage, propri√©taire
    """
    all_data = []
    
    for page in range(1, max_pages + 1):
        print(f"Scraping page {page}...")
        url = f"{base_url}?page={page}" if page > 1 else base_url
        soup = get_page_content(url)
        
        if not soup:
            break
            
        # Trouver tous les articles/annonces
        articles = soup.find_all('div', class_='listings-cards__list-item')
        
        if not articles:
            print(f"Aucun article trouv√© sur la page {page}")
            break
            
        for article in articles:
            try:
                data = {}
                
                # Marque (titre de l'annonce)
                title_elem = article.find('h2', class_='listing-card__header__title')
                if title_elem:
                    title_link = title_elem.find('a')
                    data['marque'] = clean_text(title_link.get_text()) if title_link else clean_text(title_elem.get_text())
                else:
                    data['marque'] = ""
                
                # Prix
                price_elem = article.find('h3', class_='listing-card__header__price')
                data['prix'] = clean_text(price_elem.get_text()) if price_elem else ""
                
                # Adresse (ville + province)
                address_parts = []
                town_elem = article.find('span', class_='town-suburb')
                if town_elem:
                    address_parts.append(clean_text(town_elem.get_text()))
                province_elem = article.find('span', class_='province')
                if province_elem:
                    address_parts.append(clean_text(province_elem.get_text()))
                data['adresse'] = ' '.join(address_parts)
                
                # Initialiser les caract√©ristiques
                data['ann√©e'] = ""
                data['kilom√©trage'] = ""
                
                # Extraire l'ann√©e du titre si pr√©sent
                if data['marque']:
                    year_match = re.search(r'\b(19|20)\d{2}\b', data['marque'])
                    if year_match:
                        data['ann√©e'] = year_match.group()
                
                # Caract√©ristiques dans les li
                specs = article.find_all('li', class_='listing-card__attribute')
                for spec in specs:
                    spec_text = clean_text(spec.get_text()).lower()
                    
                    # Kilom√©trage
                    if 'km' in spec_text and 'km' not in data['kilom√©trage']:
                        data['kilom√©trage'] = clean_text(spec.get_text())
                
                # Propri√©taire
                author_elem = article.find('p', class_='time-author')
                if author_elem:
                    author_link = author_elem.find('a')
                    if author_link:
                        author_text = clean_text(author_link.get_text())
                        data['propri√©taire'] = author_text.replace('Par ', '').strip()
                    else:
                        data['propri√©taire'] = clean_text(author_elem.get_text()).replace('Par ', '').strip()
                else:
                    data['propri√©taire'] = ""
                
                all_data.append(data)
                
            except Exception as e:
                print(f"Erreur lors du traitement d'un article: {e}")
                continue
        
        time.sleep(1)
    
    print(f"Total motos scrap√©es: {len(all_data)}")
    return all_data

## 3. Scraping des Locations de Voitures

In [6]:
def scrape_locations(base_url: str, max_pages: int = 9) -> List[Dict]:
    """
    Scrape les donn√©es des locations de voitures
    Variables: marque, ann√©e, prix, adresse, propri√©taire
    """
    all_data = []
    
    for page in range(1, max_pages + 1):
        print(f"Scraping page {page}...")
        url = f"{base_url}?page={page}" if page > 1 else base_url
        soup = get_page_content(url)
        
        if not soup:
            break
            
        # Trouver tous les articles/annonces
        articles = soup.find_all('div', class_='listings-cards__list-item')
        
        if not articles:
            print(f"Aucun article trouv√© sur la page {page}")
            break
            
        for article in articles:
            try:
                data = {}
                
                # Marque (titre de l'annonce)
                title_elem = article.find('h2', class_='listing-card__header__title')
                if title_elem:
                    title_link = title_elem.find('a')
                    data['marque'] = clean_text(title_link.get_text()) if title_link else clean_text(title_elem.get_text())
                else:
                    data['marque'] = ""
                
                # Prix
                price_elem = article.find('h3', class_='listing-card__header__price')
                data['prix'] = clean_text(price_elem.get_text()) if price_elem else ""
                
                # Adresse (ville + province)
                address_parts = []
                town_elem = article.find('span', class_='town-suburb')
                if town_elem:
                    address_parts.append(clean_text(town_elem.get_text()))
                province_elem = article.find('span', class_='province')
                if province_elem:
                    address_parts.append(clean_text(province_elem.get_text()))
                data['adresse'] = ' '.join(address_parts)
                
                # Ann√©e - extraire du titre
                data['ann√©e'] = ""
                if data['marque']:
                    year_match = re.search(r'\b(19|20)\d{2}\b', data['marque'])
                    if year_match:
                        data['ann√©e'] = year_match.group()
                
                # Propri√©taire
                author_elem = article.find('p', class_='time-author')
                if author_elem:
                    author_link = author_elem.find('a')
                    if author_link:
                        author_text = clean_text(author_link.get_text())
                        data['propri√©taire'] = author_text.replace('Par ', '').strip()
                    else:
                        data['propri√©taire'] = clean_text(author_elem.get_text()).replace('Par ', '').strip()
                else:
                    data['propri√©taire'] = ""
                
                all_data.append(data)
                
            except Exception as e:
                print(f"Erreur lors du traitement d'un article: {e}")
                continue
        
        time.sleep(1)
    
    print(f"Total locations scrap√©es: {len(all_data)}")
    return all_data

## 4. Fonction de nettoyage des donn√©es

In [7]:
def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Nettoie et standardise un DataFrame"""
    
    # Supprimer les lignes compl√®tement vides
    df = df.dropna(how='all')
    
    # Nettoyer les prix
    if 'prix' in df.columns:
        df['prix'] = df['prix'].apply(lambda x: extract_number(str(x)) if pd.notna(x) else "")
        # Convertir en num√©rique si possible
        df['prix_numeric'] = pd.to_numeric(df['prix'], errors='coerce')
    
    # Nettoyer les kilom√©trages
    if 'kilom√©trage' in df.columns:
        df['kilom√©trage_clean'] = df['kilom√©trage'].apply(lambda x: extract_number(str(x)) if pd.notna(x) else "")
        df['kilom√©trage_numeric'] = pd.to_numeric(df['kilom√©trage_clean'], errors='coerce')
    
    # Nettoyer les ann√©es
    if 'ann√©e' in df.columns:
        df['ann√©e'] = df['ann√©e'].apply(lambda x: extract_number(str(x)) if pd.notna(x) else "")
        df['ann√©e_numeric'] = pd.to_numeric(df['ann√©e'], errors='coerce')
    
    # Supprimer les doublons bas√©s sur marque et prix
    if 'marque' in df.columns and 'prix' in df.columns:
        df = df.drop_duplicates(subset=['marque', 'prix'], keep='first')
    
    # R√©initialiser l'index
    df = df.reset_index(drop=True)
    
    return df

## 5. Ex√©cution du scraping

In [12]:
# URLs √† scraper
url_voitures = "https://dakar-auto.com/senegal/voitures-4"
url_motos = "https://dakar-auto.com/senegal/motos-and-scooters-3"
url_locations = "https://dakar-auto.com/senegal/location-de-voitures-19"

# Nombre de pages √† scraper (ajustez selon vos besoins)
MAX_PAGES = 5

print("="*60)
print("SCRAPING DES VOITURES")
print("="*60)
voitures_data = scrape_voitures(url_voitures, max_pages=MAX_PAGES)
df_voitures = pd.DataFrame(voitures_data)
df_voitures = clean_dataframe(df_voitures)

print("\n" + "="*60)
print("SCRAPING DES MOTOS ET SCOOTERS")
print("="*60)
motos_data = scrape_motos(url_motos, max_pages=MAX_PAGES)
df_motos = pd.DataFrame(motos_data)
df_motos = clean_dataframe(df_motos)

print("\n" + "="*60)
print("SCRAPING DES LOCATIONS")
print("="*60)
locations_data = scrape_locations(url_locations, max_pages=MAX_PAGES)
df_locations = pd.DataFrame(locations_data)
df_locations = clean_dataframe(df_locations)

print("\n" + "="*60)
print("R√âSUM√â")
print("="*60)
print(f"Voitures scrap√©es: {len(df_voitures)}")
print(f"Motos scrap√©es: {len(df_motos)}")
print(f"Locations scrap√©es: {len(df_locations)}")

SCRAPING DES VOITURES
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Total voitures scrap√©es: 100

SCRAPING DES MOTOS ET SCOOTERS
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Total motos scrap√©es: 100

SCRAPING DES LOCATIONS
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Total locations scrap√©es: 100

R√âSUM√â
Voitures scrap√©es: 98
Motos scrap√©es: 95
Locations scrap√©es: 96


## 6. Affichage et analyse des donn√©es

In [13]:
# Afficher les premi√®res lignes de chaque dataset
print("="*60)
print("VOITURES - Premi√®res lignes")
print("="*60)
print(df_voitures.head())
print(f"\nShape: {df_voitures.shape}")
print(f"Colonnes: {list(df_voitures.columns)}")

print("\n" + "="*60)
print("MOTOS - Premi√®res lignes")
print("="*60)
print(df_motos.head())
print(f"\nShape: {df_motos.shape}")
print(f"Colonnes: {list(df_motos.columns)}")

print("\n" + "="*60)
print("LOCATIONS - Premi√®res lignes")
print("="*60)
print(df_locations.head())
print(f"\nShape: {df_locations.shape}")
print(f"Colonnes: {list(df_locations.columns)}")

VOITURES - Premi√®res lignes
              marque      prix               adresse ann√©e kilom√©trage  \
0  Peugeot 2008 2023   5000000         Fass, Kaolack  2008    20000 km   
1    Honda CR-V 2012   4950000           Fann, Dakar  2012   221094 km   
2   Ford Fusion 2014   3800000  Yeumbeul Nord, Dakar  2014   155000 km   
3   Opel Antara 2007   2100000     Nord Foire, Dakar  2007   171107 km   
4   Lexus GX460 2020  38000000    Ouest Foire, Dakar  2020    55493 km   

  boite_vitesse carburant   propri√©taire  prix_numeric kilom√©trage_clean  \
0   Automatique   Essence  MOURAD ENNAJY       5000000             20000   
1   Automatique   Essence    fafa ndiaye       4950000            221094   
2   Automatique   Essence  Cheikh Mback√©       3800000            155000   
3   Automatique   Essence    Assane Ndao       2100000            171107   
4   Automatique   Essence   Ulrich MI√âR√â      38000000             55493   

   kilom√©trage_numeric  ann√©e_numeric  
0                200

---

# PARTIE 2: Scraping SANS nettoyage de TOUTES les pages

Cette section scrappe toutes les pages disponibles et conserve les donn√©es brutes sans nettoyage.

## Fonction de d√©tection automatique du nombre de pages

In [None]:
def get_total_pages(base_url: str) -> int:
    """D√©tecte automatiquement le nombre total de pages"""
    soup = get_page_content(base_url)
    if not soup:
        return 1
    
    try:
        paginator = soup.find('nav', class_='paginator')
        if paginator:
            # Trouver tous les liens de pagination
            page_links = paginator.find_all('a', class_='page-link')
            max_page = 1
            
            for link in page_links:
                # Extraire le num√©ro de page de l'URL
                href = link.get('href', '')
                # Chercher tous les param√®tres page= dans l'URL
                matches = re.findall(r'page=(\d+)', href)
                if matches:
                    # Prendre le dernier param√®tre page= (le vrai num√©ro)
                    page_num = int(matches[-1])
                    max_page = max(max_page, page_num)
            # modification temporaire
            return max_page
    except Exception as e:
        print(f"Erreur lors de la d√©tection du nombre de pages: {e}")
    
    return 1

## Fonctions de scraping SANS nettoyage - Toutes les pages

In [24]:
def scrape_voitures_brut(base_url: str, max_pages: int = None) -> List[Dict]:
    """
    Scrape les donn√©es brutes des voitures (SANS NETTOYAGE) - TOUTES LES PAGES
    Variables: marque, ann√©e, prix, adresse, kilom√©trage, boite vitesse, carburant, propri√©taire
    """
    # D√©tecter automatiquement le nombre de pages si non sp√©cifi√©
    if max_pages is None:
        print("üîç D√©tection du nombre total de pages...")
        max_pages = get_total_pages(base_url)
        print(f"‚úì {max_pages} pages d√©tect√©es\n")
    
    all_data = []
    
    for page in range(1, max_pages + 1):
        print(f"üìÑ Scraping page {page}/{max_pages}...")
        url = f"{base_url}?page={page}" if page > 1 else base_url
        soup = get_page_content(url)
        
        if not soup:
            print(f"‚ùå Impossible de r√©cup√©rer la page {page}, arr√™t.")
            break
            
        # Trouver tous les articles
        articles = soup.find_all('div', class_='listings-cards__list-item')
        
        if not articles:
            print(f"‚ö†Ô∏è Aucun article trouv√© sur la page {page}, arr√™t.")
            break
            
        for article in articles:
            try:
                data = {}
                
                # V1: Marque - BRUT
                title_elem = article.find('h2', class_='listing-card__header__title')
                if title_elem:
                    title_link = title_elem.find('a')
                    data['marque'] = title_link.get_text().strip() if title_link else title_elem.get_text().strip()
                else:
                    data['marque'] = ""
                
                # V2: Ann√©e
                data['ann√©e'] = ""
                if data['marque']:
                    year_match = re.search(r'\b(19|20)\d{2}\b', data['marque'])
                    if year_match:
                        data['ann√©e'] = year_match.group()
                
                # V3: Prix - BRUT
                price_elem = article.find('h3', class_='listing-card__header__price')
                data['prix'] = price_elem.get_text().strip() if price_elem else ""
                
                # V4: Adresse - BRUT
                address_parts = []
                town_elem = article.find('span', class_='town-suburb')
                if town_elem:
                    address_parts.append(town_elem.get_text().strip())
                province_elem = article.find('span', class_='province')
                if province_elem:
                    address_parts.append(province_elem.get_text().strip())
                data['adresse'] = ' '.join(address_parts)
                
                # V5, V6, V7: Caract√©ristiques - BRUT
                data['kilom√©trage'] = ""
                data['boite_vitesse'] = ""
                data['carburant'] = ""
                
                specs = article.find_all('li', class_='listing-card__attribute')
                for spec in specs:
                    spec_text = spec.get_text().strip()
                    spec_lower = spec_text.lower()
                    
                    if 'km' in spec_lower and not data['kilom√©trage']:
                        data['kilom√©trage'] = spec_text
                    
                    if 'automatique' in spec_lower or 'manuelle' in spec_lower:
                        data['boite_vitesse'] = spec_text
                    
                    if any(word in spec_lower for word in ['essence', 'diesel', 'hybride', '√©lectrique']):
                        data['carburant'] = spec_text
                
                # V8: Propri√©taire - BRUT
                author_elem = article.find('p', class_='time-author')
                if author_elem:
                    author_link = author_elem.find('a')
                    data['propri√©taire'] = author_link.get_text().strip() if author_link else author_elem.get_text().strip()
                else:
                    data['propri√©taire'] = ""
                
                all_data.append(data)
                
            except Exception as e:
                print(f"‚ö†Ô∏è Erreur article: {e}")
                continue
        
        if page < max_pages:
            time.sleep(1)
    
    print(f"\n‚úÖ Total voitures scrap√©es: {len(all_data)}")
    return all_data

In [25]:
def scrape_motos_brut(base_url: str, max_pages: int = None) -> List[Dict]:
    """
    Scrape les donn√©es brutes des motos (SANS NETTOYAGE) - TOUTES LES PAGES
    Variables: marque, ann√©e, prix, adresse, kilom√©trage, propri√©taire
    """
    if max_pages is None:
        print("üîç D√©tection du nombre total de pages...")
        max_pages = get_total_pages(base_url)
        print(f"‚úì {max_pages} pages d√©tect√©es\n")
    
    all_data = []
    
    for page in range(1, max_pages + 1):
        print(f"üìÑ Scraping page {page}/{max_pages}...")
        url = f"{base_url}?page={page}" if page > 1 else base_url
        soup = get_page_content(url)
        
        if not soup:
            print(f"‚ùå Impossible de r√©cup√©rer la page {page}, arr√™t.")
            break
            
        articles = soup.find_all('div', class_='listings-cards__list-item')
        
        if not articles:
            print(f"‚ö†Ô∏è Aucun article trouv√© sur la page {page}, arr√™t.")
            break
            
        for article in articles:
            try:
                data = {}
                
                # V1: Marque - BRUT
                title_elem = article.find('h2', class_='listing-card__header__title')
                if title_elem:
                    title_link = title_elem.find('a')
                    data['marque'] = title_link.get_text().strip() if title_link else title_elem.get_text().strip()
                else:
                    data['marque'] = ""
                
                # V2: Ann√©e
                data['ann√©e'] = ""
                if data['marque']:
                    year_match = re.search(r'\b(19|20)\d{2}\b', data['marque'])
                    if year_match:
                        data['ann√©e'] = year_match.group()
                
                # V3: Prix - BRUT
                price_elem = article.find('h3', class_='listing-card__header__price')
                data['prix'] = price_elem.get_text().strip() if price_elem else ""
                
                # V4: Adresse - BRUT
                address_parts = []
                town_elem = article.find('span', class_='town-suburb')
                if town_elem:
                    address_parts.append(town_elem.get_text().strip())
                province_elem = article.find('span', class_='province')
                if province_elem:
                    address_parts.append(province_elem.get_text().strip())
                data['adresse'] = ' '.join(address_parts)
                
                # V5: Kilom√©trage - BRUT
                data['kilom√©trage'] = ""
                specs = article.find_all('li', class_='listing-card__attribute')
                for spec in specs:
                    spec_text = spec.get_text().strip()
                    if 'km' in spec_text.lower() and not data['kilom√©trage']:
                        data['kilom√©trage'] = spec_text
                        break
                
                # V6: Propri√©taire - BRUT
                author_elem = article.find('p', class_='time-author')
                if author_elem:
                    author_link = author_elem.find('a')
                    data['propri√©taire'] = author_link.get_text().strip() if author_link else author_elem.get_text().strip()
                else:
                    data['propri√©taire'] = ""
                
                all_data.append(data)
                
            except Exception as e:
                print(f"‚ö†Ô∏è Erreur article: {e}")
                continue
        
        if page < max_pages:
            time.sleep(1)
    
    print(f"\n‚úÖ Total motos scrap√©es: {len(all_data)}")
    return all_data

In [26]:
def scrape_locations_brut(base_url: str, max_pages: int = None) -> List[Dict]:
    """
    Scrape les donn√©es brutes des locations (SANS NETTOYAGE) - TOUTES LES PAGES
    Variables: marque, ann√©e, prix, adresse, propri√©taire
    """
    if max_pages is None:
        print("üîç D√©tection du nombre total de pages...")
        max_pages = get_total_pages(base_url)
        print(f"‚úì {max_pages} pages d√©tect√©es\n")
    
    all_data = []
    
    for page in range(1, max_pages + 1):
        print(f"üìÑ Scraping page {page}/{max_pages}...")
        url = f"{base_url}?page={page}" if page > 1 else base_url
        soup = get_page_content(url)
        
        if not soup:
            print(f"‚ùå Impossible de r√©cup√©rer la page {page}, arr√™t.")
            break
            
        articles = soup.find_all('div', class_='listings-cards__list-item')
        
        if not articles:
            print(f"‚ö†Ô∏è Aucun article trouv√© sur la page {page}, arr√™t.")
            break
            
        for article in articles:
            try:
                data = {}
                
                # V1: Marque - BRUT
                title_elem = article.find('h2', class_='listing-card__header__title')
                if title_elem:
                    title_link = title_elem.find('a')
                    data['marque'] = title_link.get_text().strip() if title_link else title_elem.get_text().strip()
                else:
                    data['marque'] = ""
                
                # V2: Ann√©e
                data['ann√©e'] = ""
                if data['marque']:
                    year_match = re.search(r'\b(19|20)\d{2}\b', data['marque'])
                    if year_match:
                        data['ann√©e'] = year_match.group()
                
                # V3: Prix - BRUT
                price_elem = article.find('h3', class_='listing-card__header__price')
                data['prix'] = price_elem.get_text().strip() if price_elem else ""
                
                # V4: Adresse - BRUT
                address_parts = []
                town_elem = article.find('span', class_='town-suburb')
                if town_elem:
                    address_parts.append(town_elem.get_text().strip())
                province_elem = article.find('span', class_='province')
                if province_elem:
                    address_parts.append(province_elem.get_text().strip())
                data['adresse'] = ' '.join(address_parts)
                
                # V5: Propri√©taire - BRUT
                author_elem = article.find('p', class_='time-author')
                if author_elem:
                    author_link = author_elem.find('a')
                    data['propri√©taire'] = author_link.get_text().strip() if author_link else author_elem.get_text().strip()
                else:
                    data['propri√©taire'] = ""
                
                all_data.append(data)
                
            except Exception as e:
                print(f"‚ö†Ô∏è Erreur article: {e}")
                continue
        
        if page < max_pages:
            time.sleep(1)
    
    print(f"\n‚úÖ Total locations scrap√©es: {len(all_data)}")
    return all_data

## Ex√©cution du scraping SANS nettoyage - TOUTES LES PAGES

In [27]:
# URLs
URL_VOITURES = "https://www.dakar-auto.com/senegal/voitures-4"
URL_MOTOS = "https://www.dakar-auto.com/senegal/motos-and-scooters-3"
URL_LOCATIONS = "https://www.dakar-auto.com/senegal/location-de-voitures-19"

print("=" * 80)
print("üöó SCRAPING DES VOITURES (TOUTES LES PAGES - SANS NETTOYAGE)")
print("=" * 80)
voitures_brutes = scrape_voitures_brut(URL_VOITURES)
df_voitures_brut = pd.DataFrame(voitures_brutes)

print("\n" + "=" * 80)
print("üèçÔ∏è SCRAPING DES MOTOS (TOUTES LES PAGES - SANS NETTOYAGE)")
print("=" * 80)
motos_brutes = scrape_motos_brut(URL_MOTOS)
df_motos_brut = pd.DataFrame(motos_brutes)

print("\n" + "=" * 80)
print("üöô SCRAPING DES LOCATIONS (TOUTES LES PAGES - SANS NETTOYAGE)")
print("=" * 80)
locations_brutes = scrape_locations_brut(URL_LOCATIONS)
df_locations_brut = pd.DataFrame(locations_brutes)

print("\n" + "=" * 80)
print("‚úÖ SCRAPING TERMIN√â!")
print("=" * 80)
print(f"Voitures: {len(df_voitures_brut)} lignes")
print(f"Motos: {len(df_motos_brut)} lignes")
print(f"Locations: {len(df_locations_brut)} lignes")

üöó SCRAPING DES VOITURES (TOUTES LES PAGES - SANS NETTOYAGE)
üîç D√©tection du nombre total de pages...
‚úì 5 pages d√©tect√©es

üìÑ Scraping page 1/5...
üìÑ Scraping page 2/5...
üìÑ Scraping page 3/5...
üìÑ Scraping page 4/5...
üìÑ Scraping page 5/5...

‚úÖ Total voitures scrap√©es: 100

üèçÔ∏è SCRAPING DES MOTOS (TOUTES LES PAGES - SANS NETTOYAGE)
üîç D√©tection du nombre total de pages...
‚úì 5 pages d√©tect√©es

üìÑ Scraping page 1/5...
üìÑ Scraping page 2/5...
üìÑ Scraping page 3/5...
üìÑ Scraping page 4/5...
üìÑ Scraping page 5/5...

‚úÖ Total motos scrap√©es: 100

üöô SCRAPING DES LOCATIONS (TOUTES LES PAGES - SANS NETTOYAGE)
üîç D√©tection du nombre total de pages...
‚úì 5 pages d√©tect√©es

üìÑ Scraping page 1/5...
üìÑ Scraping page 2/5...
üìÑ Scraping page 3/5...
üìÑ Scraping page 4/5...
üìÑ Scraping page 5/5...

‚úÖ Total locations scrap√©es: 100

‚úÖ SCRAPING TERMIN√â!
Voitures: 100 lignes
Motos: 100 lignes
Locations: 100 lignes


## Aper√ßu des donn√©es brutes

In [28]:
print("üöó VOITURES (donn√©es brutes):")
print(df_voitures_brut.head())
print(f"\nShape: {df_voitures_brut.shape}")
print(f"Valeurs manquantes:\n{df_voitures_brut.isnull().sum()}")

üöó VOITURES (donn√©es brutes):
              marque ann√©e              prix               adresse  \
0  Peugeot 2008 2023  2008   5‚ÄØ000‚ÄØ000 F CFA         Fass, Kaolack   
1    Honda CR-V 2012  2012   4‚ÄØ950‚ÄØ000 F CFA           Fann, Dakar   
2   Ford Fusion 2014  2014   3‚ÄØ800‚ÄØ000 F CFA  Yeumbeul Nord, Dakar   
3   Opel Antara 2007  2007   2‚ÄØ100‚ÄØ000 F CFA     Nord Foire, Dakar   
4   Lexus GX460 2020  2020  38‚ÄØ000‚ÄØ000 F CFA    Ouest Foire, Dakar   

  kilom√©trage boite_vitesse carburant        propri√©taire  
0    20000 km   Automatique   Essence   Par MOURAD ENNAJY  
1   221094 km   Automatique   Essence     Par fafa ndiaye  
2   155000 km   Automatique   Essence  Par Cheikh  Mback√©  
3   171107 km   Automatique   Essence    Par Assane  Ndao  
4    55493 km   Automatique   Essence    Par Ulrich MI√âR√â  

Shape: (100, 8)
Valeurs manquantes:
marque           0
ann√©e            0
prix             0
adresse          0
kilom√©trage      0
boite_vitesse    0
carbura

In [29]:
print("üèçÔ∏è MOTOS (donn√©es brutes):")
print(df_motos_brut.head())
print(f"\nShape: {df_motos_brut.shape}")
print(f"Valeurs manquantes:\n{df_motos_brut.isnull().sum()}")

üèçÔ∏è MOTOS (donn√©es brutes):
              marque ann√©e             prix                     adresse  \
0      Honda SH 2010  2010        450 F CFA           Gu√©diawaye, Dakar   
1     Honda CBF 2007  2007  1‚ÄØ500‚ÄØ000 F CFA  Parcelles Assainies, Dakar   
2      SYM 125S 2023  2023    620‚ÄØ000 F CFA             Rufisque, Dakar   
3   Yamaha TMax 2023  2023  4‚ÄØ300‚ÄØ000 F CFA                  VDN, Dakar   
4  Yamaha X-Max 2025  2025    800‚ÄØ000 F CFA         Sicap Baobab, Dakar   

  kilom√©trage        propri√©taire  
0      160 km  Par Babacar Diallo  
1    11000 km   Par AMADOU NDIAYE  
2     1200 km    Par Lamine  Ndao  
3        1 km    Par Rose  DIOMPY  
4      250 km    Par Rose  DIOMPY  

Shape: (100, 6)
Valeurs manquantes:
marque          0
ann√©e           0
prix            0
adresse         0
kilom√©trage     0
propri√©taire    0
dtype: int64


In [30]:
print("üöô LOCATIONS (donn√©es brutes):")
print(df_locations_brut.head())
print(f"\nShape: {df_locations_brut.shape}")
print(f"Valeurs manquantes:\n{df_locations_brut.isnull().sum()}")

üöô LOCATIONS (donn√©es brutes):
                  marque ann√©e             prix            adresse  \
0  Hyundai Santa Fe 2017  2017     35‚ÄØ000 F CFA       Thi√®s, Thi√®s   
1   Mitsubishi L200 2018  2018     50‚ÄØ000 F CFA       Thi√®s, Thi√®s   
2    Ford ESCAPE-SE 2013  2013     30‚ÄØ000 F CFA  Gu√©diawaye, Dakar   
3        Ford scape 2013  2013  4‚ÄØ500‚ÄØ000 F CFA  Gu√©diawaye, Dakar   
4         Ford Edge 2017  2017     45‚ÄØ000 F CFA  Gu√©diawaye, Dakar   

                  propri√©taire  
0  Par AlfaBusinessGroup BARRY  
1  Par AlfaBusinessGroup BARRY  
2            Par TERANGUA  BII  
3            Par TERANGUA  BII  
4           Par Mouhamed  Sene  

Shape: (100, 5)
Valeurs manquantes:
marque          0
ann√©e           0
prix            0
adresse         0
propri√©taire    0
dtype: int64
