In [57]:
import re
import json
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent

In [58]:
# URL of the IMDb Top 250 chart page (Spanish-locale path, "/es/").
url_top = "https://www.imdb.com/es/chart/top/"

In [11]:
# Preview one randomly chosen User-Agent string produced by fake_useragent.
UserAgent().random

'Mozilla/5.0 (iPhone; CPU iPhone OS 18_3_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3.1 Mobile/15E148 Safari/604.1'

In [12]:
# Download the chart page, sending a randomized User-Agent header.
# An explicit timeout is passed because requests otherwise waits indefinitely
# for a server that never answers, which would freeze the notebook.
response = requests.get(
    url_top,
    headers={'User-Agent': UserAgent().random},
    timeout=10,  # seconds, applied to both connect and read
)
# Abort on 4xx/5xx instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
# soup.title is None when the document has no <title>; guard the attribute access.
page_title = soup.title.string if soup.title else None
print(f"Page Title: {page_title}")

Page Title: Las 250 mejores películas de IMDb


In [23]:
# Flatten, in document order, every summary <li> found under each
# 'ipc-metadata-list' <ul> of the chart page.
list_li = [
    li_tag
    for ul_tag in soup.find_all('ul', class_='ipc-metadata-list')
    for li_tag in ul_tag.find_all('li', class_='ipc-metadata-list-summary-item')
]

len(list_li)

25

In [24]:
# Re-display the number of collected <li> items (same expression as the end of the previous cell).
len(list_li)

25

In [44]:
# Locate the JSON-LD <script> block whose text mentions "ItemList".
script_tag = soup.find(
    'script',
    type='application/ld+json',
    string=lambda text: text and 'ItemList' in text
)

# Fail fast with a clear message: continuing without the tag would only
# surface later as an opaque AttributeError on `script_tag.string`.
if not script_tag:
    raise ValueError("Error: No se encontró el bloque ItemList JSON-LD")

# Raw JSON text held inside the <script> tag.
list_json_str = script_tag.string

try:
    item_list = json.loads(list_json_str)
except json.JSONDecodeError as e:
    # Re-raise so the cell never goes on with an undefined (or stale, from an
    # earlier kernel run) `data` variable.
    raise ValueError(f"Error al parsear JSON: {e}") from e

# A missing "itemListElement" key yields an empty list instead of None,
# so the loop below is always safe to run.
data = item_list.get("itemListElement") or []

# Build one flat record per chart entry (all entries are processed).
list_movies = []
for elem in data:
    movie = elem["item"]
    # Entries without a duration are stored as None instead of raising KeyError.
    duration = movie.get("duration")
    list_movies.append({
        'url': movie['url'],
        'name': movie['name'],
        'duration': duration,
        "rating": float(movie["aggregateRating"]["ratingValue"]) if "aggregateRating" in movie else None,
        # Same value as 'duration'; both keys are kept for backward compatibility.
        "duration_iso": duration,
    })

len(list_movies)

250

In [47]:
# Inspect the first parsed movie record.
list_movies[0]

{'url': 'https://www.imdb.com/es/title/tt0111161/',
 'name': 'The Shawshank Redemption',
 'duration': 'PT2H22M',
 'rating': 9.3,
 'duration_iso': 'PT2H22M'}

In [67]:
# Visit each movie's detail page and enrich its record in place with the
# release year and the Metascore; the cast list is built and printed only.
# The [:1] slice limits this run to the first movie.
for mov in list_movies[:1]:
    res_mov = requests.get(
        mov['url'],
        headers={'User-Agent': UserAgent().random},
        timeout=10,  # seconds; without it requests can block forever
    )
    # Stop on HTTP errors rather than parsing an error page.
    res_mov.raise_for_status()
    soup_mov = BeautifulSoup(res_mov.text, 'html.parser')

    # Collect the release-date <li> entries from every metadata <ul>.
    list_inf_li = []
    for ul_tag in soup_mov.find_all('ul', class_='ipc-metadata-list'):
        list_inf_li.extend(ul_tag.find_all('li', class_='ipc-metadata-list__item', attrs={"data-testid": "title-details-releasedate"}))

    # Fall back to an empty string when the page has no release-date item,
    # instead of raising IndexError on list_inf_li[0].
    text_from_tag = list_inf_li[0].get_text() if list_inf_li else ""
    print("len ul anio", len(list_inf_li), " texto: ", text_from_tag)
    year_pattern = r'\d{4}'

    # First run of four digits in the release-date text is taken as the year.
    match = re.search(year_pattern, text_from_tag)
    year = None
    if match:
        year = match.group(0)  # group(0) is the full matched string
        print(f"Año encontrado: {year}")

    # First Metascore box on the page, if any.
    score_tag = soup_mov.find('span', class_='metacritic-score-box')
    span_score = score_tag.get_text().strip() if score_tag else None
    print(f"metascore: {span_score}")
    mov['year'] = year
    mov['metascore'] = span_score

    seccion_cast = soup_mov.find('section', attrs={"data-testid": "title-cast"})
    actors_tag = []
    if seccion_cast:
        cast_items = seccion_cast.find_all('div', attrs={"data-testid": "title-cast-item"})
        for pos, cast_tag in enumerate(cast_items, start=1):
            actor_link = cast_tag.find('a', attrs={"data-testid": "title-cast-item__actor"})
            if actor_link is None:
                # Cast card without an actor link: skip it rather than crash.
                # `pos` still reflects the card's position on the page.
                continue
            actors_tag.append({
                "movie_title": mov['name'],
                "actor_name": actor_link.text.strip(),
                "position_order": pos
            })
    print(len(actors_tag), "actores")
    print(actors_tag[:3])

len ul anio 1  texto:  Fecha de lanzamiento23 de septiembre de 1994 (México)
Año encontrado: 1994
metascore: 82
18 actores
[{'movie_title': 'The Shawshank Redemption', 'actor_name': 'Tim Robbins', 'position_order': 1}, {'movie_title': 'The Shawshank Redemption', 'actor_name': 'Morgan Freeman', 'position_order': 2}, {'movie_title': 'The Shawshank Redemption', 'actor_name': 'Bob Gunton', 'position_order': 3}]


In [68]:
# Inspect the first record again; the detail loop assigned its 'year' and 'metascore' keys.
list_movies[0]

{'url': 'https://www.imdb.com/es/title/tt0111161/',
 'name': 'The Shawshank Redemption',
 'duration': 'PT2H22M',
 'rating': 9.3,
 'duration_iso': 'PT2H22M',
 'year': '1994',
 'metascore': '82'}