In [1]:
import re

import requests
from bs4 import BeautifulSoup

In [2]:
def titles(movies_container):
    """Collect the movie titles shown in one search-results container.

    Args:
        movies_container: parsed element exposing ``findAll``; every ``<h3>``
            inside it is expected to hold an ``<a>`` whose text is the title.

    Returns:
        list: the link text of each ``<h3>``, in document order.
    """
    collected = []
    for heading in movies_container.findAll('h3'):
        link = heading.find('a')
        collected.append(link.get_text())
    return collected

In [3]:
def mpaa_ratings(imdb_movie_list):
    """Read the certificate (MPAA rating) of every movie entry.

    Args:
        imdb_movie_list: iterable of parsed movie elements exposing ``find``.

    Returns:
        list: text of each entry's ``<span class="certificate">``, or the
        string ``"NaN"`` for entries that have no such span.
    """
    # Lazy lookup keeps the find/get_text calls interleaved per movie.
    certificate_tags = (
        movie.find('span', class_='certificate') for movie in imdb_movie_list
    )
    return [tag.get_text() if tag else "NaN" for tag in certificate_tags]

In [4]:
def runtimes(imdb_movie_list):
    """Read the runtime label of every movie entry.

    Args:
        imdb_movie_list: iterable of parsed movie elements exposing ``find``.

    Returns:
        list: text of each entry's ``<span class="runtime">``, or the string
        ``"NaN"`` for entries that have no such span.
    """
    def _runtime_text(movie):
        # One entry -> its runtime text, or the placeholder when absent.
        span = movie.find('span', class_='runtime')
        return span.get_text() if span else "NaN"

    return list(map(_runtime_text, imdb_movie_list))

In [17]:
def genres(imdb_movie_list):
    """Read the genre label of every movie entry, trimmed of whitespace.

    Args:
        imdb_movie_list: iterable of parsed movie elements exposing ``find``.

    Returns:
        list: stripped text of each entry's ``<span class="genre">``, or the
        string ``"NaN"`` for entries that have no such span.
    """
    cleaned = []
    for movie in imdb_movie_list:
        span = movie.find('span', class_='genre')
        label = span.get_text() if span else "NaN"
        # The raw span text carries surrounding whitespace/newlines.
        cleaned.append(label.strip())
    return cleaned

In [60]:
def imdb_id(imdb_movie_list):
    """Extract the IMDb title id (e.g. ``tt6751668``) of every movie entry.

    The id is taken from the ``href`` of the first link inside each entry's
    ``<h3 class="lister-item-header">``.

    Args:
        imdb_movie_list: iterable of parsed movie elements exposing ``find``.

    Returns:
        list: one id string per entry, in input order. Entries whose header,
        link, or id cannot be found yield the string ``"NaN"`` so the result
        stays aligned with the other per-movie lists.
    """
    all_ids = []
    for movie in imdb_movie_list:
        header = movie.find('h3', class_="lister-item-header")
        link = header.find('a') if header else None
        href = link.attrs.get('href', '') if link else ''
        # Match the id itself instead of str.lstrip('/title/'), which strips a
        # *set of characters* rather than a prefix and leaves trailing query
        # strings such as "?ref_=adv_li_tt" glued to the id.
        match = re.search(r'tt\d+', href)
        all_ids.append(match.group(0) if match else "NaN")
    return all_ids

In [6]:
def star_ratings(imdb_movie_list):
    """Read the star (user) rating of every movie entry.

    Args:
        imdb_movie_list: iterable of parsed movie elements exposing ``find``.

    Returns:
        list: text of the first ``<strong>`` element in each entry, or the
        string ``"NaN"`` for entries that have none.
    """
    scores = []
    for movie in imdb_movie_list:
        strong_tag = movie.find('strong')
        if not strong_tag:
            scores.append("NaN")
            continue
        scores.append(strong_tag.get_text())
    return scores

In [7]:
import time

In [61]:
#collect data from all pages
IMDB_SEARCH_URL = "https://www.imdb.com/search/title/?title_type=feature&release_date=2010-01-01,2019-12-31"
RESULTS_PER_PAGE = 50         # step of the `start` offset between result pages
TOTAL_RESULTS = 10_000        # stop after this many search results
REQUEST_TIMEOUT_SECONDS = 30  # fail instead of hanging forever on a stalled request
REQUEST_DELAY_SECONDS = 0.5   # pause between requests to avoid hammering the site

all_titles = []
all_mpaa_ratings = []
all_runtimes = []
all_genres = []
all_star_ratings = []
all_ids = []

# The first page is the bare search URL; every later page adds a 1-based
# `start` offset (51, 101, ..., 9951), so one loop handles all pages.
page_urls = [IMDB_SEARCH_URL]
page_urls += [
    f"{IMDB_SEARCH_URL}&start={start}&ref_=adv_nxt"
    for start in range(RESULTS_PER_PAGE + 1, TOTAL_RESULTS + 1, RESULTS_PER_PAGE)
]

for url in page_urls:
    html_page = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # Surface HTTP errors (403, 5xx, ...) here rather than parsing an error page.
    html_page.raise_for_status()
    soup = BeautifulSoup(html_page.content, 'html.parser')
    movies_container = soup.find('div', class_="lister-list")
    if movies_container is None:
        raise RuntimeError(
            f"No 'lister-list' container found at {url}; "
            "the page layout may have changed or the request was blocked."
        )
    imdb_movie_list = movies_container.findAll('div', class_="lister-item-content")

    # Each extractor returns one value per movie on the page; appending them
    # page by page keeps the six lists aligned.
    all_titles.extend(titles(movies_container))
    all_mpaa_ratings.extend(mpaa_ratings(imdb_movie_list))
    all_runtimes.extend(runtimes(imdb_movie_list))
    all_genres.extend(genres(imdb_movie_list))
    all_star_ratings.extend(star_ratings(imdb_movie_list))
    all_ids.extend(imdb_id(imdb_movie_list))
    time.sleep(REQUEST_DELAY_SECONDS)

In [62]:
#assemble the scraped per-movie lists into a single table
import pandas as pd

# One column per scraped attribute; every list holds one entry per movie.
columns = dict(
    imdb_id=all_ids,
    title=all_titles,
    mpaa_rating=all_mpaa_ratings,
    runtime=all_runtimes,
    genre=all_genres,
    star_rating=all_star_ratings,
)

imdb_df = pd.DataFrame(columns)

In [63]:
# Show the assembled frame as this cell's output.
imdb_df

Unnamed: 0,imdb_id,title,mpaa_rating,runtime,genre,star_rating
0,tt6751668,Parasite,R,132 min,"Comedy, Drama, Thriller",8.6
1,tt8946378,Knives Out,PG-13,131 min,"Comedy, Crime, Drama",8.0
2,tt2584384,Jojo Rabbit,PG-13,108 min,"Comedy, Drama, War",8.0
3,tt8579674,1917,R,119 min,"Drama, War",8.4
4,tt8367814,The Gentlemen,R,113 min,"Action, Comedy, Crime",8.1
...,...,...,...,...,...,...
9995,tt5584732,Die Hölle,Not Rated,92 min,"Action, Crime, Thriller",6.5
9996,tt1533817,Stomp the Yard 2: Homecoming,PG-13,89 min,"Drama, Music",4.2
9997,tt5258904,B&B,TV-MA,87 min,Thriller,5.3
9998,tt2585736,American Bistro,TV-MA,94 min,"Adventure, Comedy, Drama",7.1


In [64]:
# Persist the scraped table to imdb_data.csv. No `index` argument is passed,
# so pandas' default applies and the row index is written as the first column.
imdb_df.to_csv("imdb_data.csv")

In [70]:
# Sanity check: number of IMDb ids collected by the scrape.
len(all_ids)

10000

In [75]:
#use IMDb ids to look up each movie's budget and revenue on themoviedb (TMDB)
import json
import os
import urllib.error
import urllib.parse
import urllib.request

# The API key is read from the environment so it is never stored in the
# notebook. NOTE(review): the key that was previously hardcoded here should be
# treated as leaked and revoked.
TMDB_API_KEY = os.environ.get("TMDB_API_KEY")
if not TMDB_API_KEY:
    raise RuntimeError("Set the TMDB_API_KEY environment variable before running this cell.")

TMDB_API_ROOT = "https://api.themoviedb.org/3"
TMDB_TIMEOUT_SECONDS = 30
TMDB_DELAY_SECONDS = 0.5  # pause between movies to stay polite to the API


def _tmdb_get_json(path, **params):
    """GET ``TMDB_API_ROOT/path`` and return the decoded JSON body.

    Returns None when TMDB answers 404 (unknown id); any other HTTP error is
    re-raised so problems such as an invalid API key are not silently hidden.
    """
    query = urllib.parse.urlencode({"api_key": TMDB_API_KEY, "language": "en-US", **params})
    url = f"{TMDB_API_ROOT}/{path}?{query}"
    try:
        # The context manager closes the HTTP response once it is parsed.
        with urllib.request.urlopen(url, timeout=TMDB_TIMEOUT_SECONDS) as response:
            return json.load(response)
    except urllib.error.HTTPError as err:
        if err.code == 404:
            return None
        raise


all_budgets = []
all_revenues = []
for xid in all_ids:
    # None marks "not available"; one entry is appended per id so both lists
    # stay aligned with all_ids.
    movie_id = None
    budget = None
    revenue = None

    # Step 1: translate the IMDb id into a TMDB movie id.
    data = _tmdb_get_json(f"find/{urllib.parse.quote(xid, safe='')}", external_source="imdb_id")
    movie_results = data.get('movie_results', []) if data else []
    # Use the first match that carries an id; a later id-less result must not
    # wipe out values already found.
    movie_id = next((result['id'] for result in movie_results if result.get('id')), None)

    # Step 2: fetch the movie details holding budget and revenue.
    if movie_id is not None:
        new_data = _tmdb_get_json(f"movie/{movie_id}")
        if new_data:
            budget = new_data.get('budget')
            revenue = new_data.get('revenue')

    all_budgets.append(budget)
    all_revenues.append(revenue)
    time.sleep(TMDB_DELAY_SECONDS)

In [77]:
# print(all_budgets, all_revenues)

[11363000, 40000000, 14000000, 100000000, 18400000] [201055038, 163700000, 82468705, 200483309, 48441089]


In [69]:
#     movie_name = urllib.parse.quote(title, safe='')

ValueError: not enough values to unpack (expected 3, got 1)