## Web scrapping de IMDB

Descarga la información correspondiente y guarda en un csv el top de las 250 películas mediante webscrapping. Encapsúlalo en un script.

Obtén:
* Título
* Año
* Duración
* Posición
* Rating

In [18]:
# Si la petición te devuelve un 403, puedes probar con:
# pip install fake-useragent
# from fake_useragent import UserAgent
# ua = UserAgent()
# headers = {'User-Agent': ua.random}
# response = requests.get(url, headers=headers)

## IMPORTS NECESARIOS

In [4]:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
from fake_useragent import UserAgent

## CODIGO

In [6]:
url = "https://www.imdb.com/chart/top/"
ua = UserAgent()
headers = {'User-Agent': ua.random}
response = requests.get(url, headers=headers)

In [7]:
response

<Response [200]>

In [9]:
sp = bs(response.content, "html.parser")

In [10]:
sp

<!DOCTYPE html>
<html lang="en-US" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/"><head><meta charset="utf-8"/><meta content="width=device-width" name="viewport"/><script>if(typeof uet === 'function'){ uet('bb', 'LoadTitle', {wb: 1}); }</script><script>window.addEventListener('load', (event) => {
        if (typeof window.csa !== 'undefined' && typeof window.csa === 'function') {
            var csaLatencyPlugin = window.csa('Content', {
                element: {
                    slotId: 'LoadTitle',
                    type: 'service-call'
                }
            });
            csaLatencyPlugin('mark', 'clickToBodyBegin', 1762417767391);
        }
    })</script><title>IMDb Top 250 movies</title><meta content="As rated by regular IMDb voters." data-id="main" name="description"/><meta content="0cadf7898134e79b" name="google-site-verification"/><meta content="C1DACEF2769068C0B0D2687C9E5105FA" name="msvalidate.01"/><meta content="m

In [39]:
titulos = []
for i in sp.find_all("h3", class_ = "ipc-title__text"):
    titulos.append(i.text)

In [15]:
titulos

['Cadena perpetua',
 'El padrino',
 'El caballero oscuro',
 'El padrino parte II',
 '12 hombres sin piedad',
 'El señor de los anillos: El retorno del rey',
 'La lista de Schindler',
 'El señor de los anillos: La comunidad del anillo',
 'Pulp Fiction',
 'El bueno, el feo y el malo',
 'El señor de los anillos: Las dos torres',
 'Forrest Gump',
 'El club de la lucha',
 'Origen',
 'El imperio contraataca',
 'Matrix',
 'Uno de los nuestros',
 'Interstellar',
 'Alguien voló sobre el nido del cuco',
 'Seven',
 'Qué bello es vivir',
 'El silencio de los corderos',
 'Los siete samuráis',
 'Salvar al soldado Ryan',
 'La milla verde',
 'Recently viewed']

In [30]:
anio = []
duracion = []
for j in sp.find_all("span", class_ = "cli-title-metadata-item"):
    if len(j.text) == 4:
        print(j.text)
        anio.append(j.text)
    elif "h" in j.text:
        duracion.append(j.text)

1994
1972
2008
1974
1957
2003
1993
2001
1994
1966
2002
1994
1999
2010
1980
1999
1990
2014
1975
1995
1946
1991
1954
1998
1999


In [31]:
print(anio)
print(duracion)

['1994', '1972', '2008', '1974', '1957', '2003', '1993', '2001', '1994', '1966', '2002', '1994', '1999', '2010', '1980', '1999', '1990', '2014', '1975', '1995', '1946', '1991', '1954', '1998', '1999']
['2h 22m', '2h 55m', '2h 32m', '3h 22m', '1h 36m', '3h 21m', '3h 15m', '2h 58m', '2h 34m', '3h 2m', '2h 59m', '2h 22m', '2h 19m', '2h 28m', '2h 4m', '2h 16m', '2h 25m', '2h 49m', '2h 13m', '2h 7m', '2h 10m', '1h 58m', '3h 27m', '2h 49m', '3h 9m']


In [32]:
print(len(anio))
print(len(titulos))

25
26


In [33]:
rating = []
for i in sp.find_all("span", class_ = "ipc-rating-star--rating"):
    rating.append(i.text)

In [35]:
print(rating)

['9.3', '9.2', '9.1', '9.0', '9.0', '9.0', '9.0', '8.9', '8.8', '8.8', '8.8', '8.8', '8.8', '8.8', '8.7', '8.7', '8.7', '8.7', '8.6', '8.6', '8.6', '8.6', '8.6', '8.6', '8.6']


In [42]:
print(len(titulos))
print(len(anio))
print(len(duracion))
print(len(rating))

25
25
25
25


In [41]:
titulos = titulos[:-1]
print(len(titulos))

25


In [45]:
dic = {
    "titulo" : titulos,
    "año" : anio,
    "duracion" : duracion,
    "rating": rating
}

df = pd.DataFrame(dic)

In [46]:
df

Unnamed: 0,titulo,año,duracion,rating
0,Cadena perpetua,1994,2h 22m,9.3
1,El padrino,1972,2h 55m,9.2
2,El caballero oscuro,2008,2h 32m,9.1
3,El padrino parte II,1974,3h 22m,9.0
4,12 hombres sin piedad,1957,1h 36m,9.0
5,El señor de los anillos: El retorno del rey,2003,3h 21m,9.0
6,La lista de Schindler,1993,3h 15m,9.0
7,El señor de los anillos: La comunidad del anillo,2001,2h 58m,8.9
8,Pulp Fiction,1994,2h 34m,8.8
9,"El bueno, el feo y el malo",1966,3h 2m,8.8


## JSON

In [48]:
import json

In [82]:
data = json.loads(sp.find("script", {"type":"application/ld+json"}).text)

In [51]:
data

{'@type': 'ItemList',
 'itemListElement': [{'@type': 'ListItem',
   'item': {'@type': 'Movie',
    'url': 'https://www.imdb.com/title/tt0111161/',
    'name': 'The Shawshank Redemption',
    'alternateName': 'Cadena perpetua',
    'description': 'A banker convicted of uxoricide forms a friendship over a quarter century with a hardened convict, while maintaining his innocence and trying to remain hopeful through simple compassion.',
    'image': 'https://m.media-amazon.com/images/M/MV5BMDAyY2FhYjctNDc5OS00MDNlLThiMGUtY2UxYWVkNGY2ZjljXkEyXkFqcGc@._V1_.jpg',
    'aggregateRating': {'@type': 'AggregateRating',
     'bestRating': 10,
     'worstRating': 1,
     'ratingValue': 9.3,
     'ratingCount': 3116421},
    'contentRating': '13',
    'genre': 'Drama',
    'duration': 'PT2H22M'}},
  {'@type': 'ListItem',
   'item': {'@type': 'Movie',
    'url': 'https://www.imdb.com/title/tt0068646/',
    'name': 'The Godfather',
    'alternateName': 'El padrino',
    'description': 'The aging patriar

In [53]:
print(len(data["itemListElement"]))

250


In [59]:
data["itemListElement"][4]

{'@type': 'ListItem',
 'item': {'@type': 'Movie',
  'url': 'https://www.imdb.com/title/tt0050083/',
  'name': '12 Angry Men',
  'alternateName': '12 hombres sin piedad',
  'description': 'The jury in a New York City murder trial is frustrated by a single member whose skeptical caution forces them to more carefully consider the evidence before jumping to a hasty verdict.',
  'image': 'https://m.media-amazon.com/images/M/MV5BYThhOGFhODktNGEwNi00MzY2LTg3YWYtNzAzZTE0MTFlMWQxXkEyXkFqcGc@._V1_.jpg',
  'aggregateRating': {'@type': 'AggregateRating',
   'bestRating': 10,
   'worstRating': 1,
   'ratingValue': 9,
   'ratingCount': 954478},
  'contentRating': 'A',
  'genre': 'Crime, Drama',
  'duration': 'PT1H36M'}}

In [83]:
l_titulo = []
l_anio = []
l_duracion  = []
l_rating = []
for i in data["itemListElement"]:
    l_titulo.append(i["item"].get("alternateName", i["item"]["name"]))
    l_duracion.append(i["item"]["duration"][2:])
    l_rating.append(i["item"]["aggregateRating"]["ratingValue"])
   
        

In [64]:
print(len(l_titulo))
print(len(l_duracion))
print(len(l_rating))

250
250
250


In [85]:
data = json.loads(sp.find("script", {"id":"__NEXT_DATA__"}).text)


In [75]:
data["props"]["pageProps"]["pageData"]["chartTitles"]["edges"]

[{'currentRank': 1,
  'node': {'id': 'tt0111161',
   'titleText': {'text': 'Cadena perpetua', '__typename': 'TitleText'},
   'titleType': {'id': 'movie',
    'text': 'Movie',
    'canHaveEpisodes': False,
    'displayableProperty': {'value': {'plainText': '',
      '__typename': 'Markdown'},
     '__typename': 'DisplayableTitleTypeProperty'},
    '__typename': 'TitleType'},
   'originalTitleText': {'text': 'The Shawshank Redemption',
    '__typename': 'TitleText'},
   'primaryImage': {'id': 'rm1690056449',
    'width': 1200,
    'height': 1800,
    'url': 'https://m.media-amazon.com/images/M/MV5BMDAyY2FhYjctNDc5OS00MDNlLThiMGUtY2UxYWVkNGY2ZjljXkEyXkFqcGc@._V1_.jpg',
    'caption': {'plainText': 'Tim Robbins in Cadena perpetua (1994)',
     '__typename': 'Markdown'},
    '__typename': 'Image'},
   'releaseYear': {'year': 1994, 'endYear': None, '__typename': 'YearRange'},
   'ratingsSummary': {'aggregateRating': 9.3,
    'voteCount': 3116421,
    '__typename': 'RatingsSummary'},
   'runt

In [86]:
l_posicion = []
for i in data["props"]["pageProps"]["pageData"]["chartTitles"]["edges"]:
    l_posicion.append(i["currentRank"])
    l_anio.append(i["node"]["releaseYear"]["year"])

In [87]:
print(len(l_posicion))
print(len(l_titulo))
print(len(l_duracion))
print(len(l_rating))
print(len(l_anio))

250
250
250
250
250


In [89]:
dic = {
    "l_posicion":l_posicion,
    "titulo" : l_titulo,
    "año" : l_anio,
    "duracion" : l_duracion,
    "rating": l_rating
}

df = pd.DataFrame(dic)

In [90]:
df

Unnamed: 0,l_posicion,titulo,año,duracion,rating
0,1,Cadena perpetua,1994,2H22M,9.3
1,2,El padrino,1972,2H55M,9.2
2,3,El caballero oscuro,2008,2H32M,9.1
3,4,El padrino parte II,1974,3H22M,9.0
4,5,12 hombres sin piedad,1957,1H36M,9.0
...,...,...,...,...,...
245,246,Las uvas de la ira,1940,2H9M,8.1
246,247,Ser o no ser,1942,1H39M,8.1
247,248,Gangs of Wasseypur,2012,5H21M,8.2
248,249,Drishyam,2015,2H43M,8.2


In [91]:
df.to_csv("top_250.csv")