In [1]:
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time  # Pour les pauses entre les requêtes
from datetime import datetime  # Pour générer des noms avec la date et l'heure

# --- Run parameters ---------------------------------------------------------
# Index of the first URL to process and how many URLs to process from there.
start_point = 0
num_of_movies_to_extract = 5

# --- File locations ---------------------------------------------------------
# Input CSV holding the movie page links.
input_csv = r"D:/Escritorio/Second period 2024-2025/Data analysis/assigment 2 ntbk/movies_with_links.csv"

# The output file name embeds the current date and time, so each run writes
# to a differently named file.
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_csv = f"D:/Escritorio/Second period 2024-2025/Data analysis/assigment 2 ntbk/movies_output{current_time}.csv"

# --- Browser setup ----------------------------------------------------------
chrome_options = webdriver.ChromeOptions()
for _chrome_flag in (
    "--window-size=1920,1080",
    "--disable-gpu",
    "--incognito",
    "--blink-settings=imagesEnabled=false",
):
    chrome_options.add_argument(_chrome_flag)

# webdriver_manager provides the path of the chromedriver binary to launch.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(options=chrome_options, service=service)

# Placeholder written to the CSV whenever a field cannot be scraped. Every
# field is reset for each film, so a failed lookup can neither leave a
# variable undefined nor silently reuse the previous film's value.
NOT_AVAILABLE = "N/A"


def _scrape_text(lookup, default=NOT_AVAILABLE):
    """Return the text produced by ``lookup()``, or ``default`` if it raises.

    Args:
        lookup: Zero-argument callable performing one Selenium lookup and
            returning the element text.
        default: Value returned when the lookup fails.

    Only ``Exception`` is caught (not a bare ``except``), so
    ``KeyboardInterrupt`` still stops the run.
    """
    try:
        return lookup()
    except Exception:
        return default


def _gross_xpath(label):
    """Build the XPath of the amount shown next to a gross ``label``."""
    return f'//div[span[contains(text(), "{label}")]]/span[2]'


try:
    # Read the input file; newline="" lets the csv module handle line endings
    # (including newlines embedded in quoted fields) itself.
    with open(input_csv, "r", newline="", encoding="utf-8") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # Skip the header; tolerate an empty file
        urls = [row[5] for row in reader if len(row) > 5 and row[5].strip()]

    # Restrict the run to the requested slice of films
    urls = urls[start_point:start_point + num_of_movies_to_extract]

    extracted_count = 0

    # Prepare the output file
    with open(output_csv, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Title", "Worldwide Gross", "Domestic Gross", "International Gross", "Director", "Cast and Crew"])

        for url in urls:
            driver.get(url)
            wait = WebDriverWait(driver, 10)

            # Main information. Each field is looked up independently so one
            # missing figure does not discard the others.
            title = _scrape_text(
                lambda: wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, ".a-size-extra-large"))
                ).text
            )
            worldwide_gross = _scrape_text(
                lambda: driver.find_element(By.XPATH, _gross_xpath("Worldwide")).text
            )
            domestic_gross = _scrape_text(
                lambda: driver.find_element(By.XPATH, _gross_xpath("Domestic")).text
            )
            international_gross = _scrape_text(
                lambda: driver.find_element(By.XPATH, _gross_xpath("International")).text
            )

            # Follow the "Cast and Crew" link when the page has one
            cast_and_crew = "Cast and Crew information not available"
            director = NOT_AVAILABLE
            try:
                cast_and_crew_link = driver.find_element(By.XPATH, '//a[contains(text(), "Cast and Crew")]')
                driver.get(cast_and_crew_link.get_attribute("href"))
                cast_and_crew_table = wait.until(EC.presence_of_element_located((By.ID, "principalCast")))
                rows = cast_and_crew_table.find_elements(By.XPATH, ".//tr[td]")
                cast_entries = [
                    f"{row.find_element(By.XPATH, './td[1]').text.strip()} ({row.find_element(By.XPATH, './td[2]').text.strip()})"
                    for row in rows if "See more" not in row.text  # Drop the "See more" rows
                ]
                # Join before assigning so the CSV cell is always a string,
                # never a half-processed list.
                cast_and_crew = ", ".join(cast_entries)
            except Exception as exc:
                print(f"Cast and Crew not extracted for {url}: {type(exc).__name__}")
            else:
                # The director is read from the cast page, only once it has loaded
                director = _scrape_text(
                    lambda: driver.find_element(By.CSS_SELECTOR, "a.a-link-normal[href*='/name/']").text
                )

            # Write the row for this film
            writer.writerow([title, worldwide_gross, domestic_gross, international_gross, director, cast_and_crew])
            if title != NOT_AVAILABLE:
                extracted_count += 1
                print(f"Data extracted successfully for: {title}")
            else:
                print(f"Could not extract title for: {url}")

            # Pause between requests to avoid being blocked
            time.sleep(2)

    print(f"Data extracted successfully for {extracted_count} films.")

finally:
    # Always close the browser, even when the run fails or is interrupted
    driver.quit()


Data extracted successfully for: The Dark Knight (2008)
Data extracted successfully for: Inception (2010)
Data extracted successfully for: Interstellar (2014)
Data extracted successfully for: The Lord of the Rings: The Fellowship of the Ring (2001)
Data extracted successfully for: The Lord of the Rings: The Return of the King (2003)
Data extracted successfully for 5 films.


KeyboardInterrupt: 