In [10]:
from bs4 import BeautifulSoup
import requests

In [11]:
# Fetch the first page of IMDb's advanced search: feature films released 2010-01-01 .. 2019-12-31.
# An explicit timeout is passed because requests otherwise waits indefinitely on a stalled connection.
html_page = requests.get('https://www.imdb.com/search/title/?title_type=feature&release_date=2010-01-01,2019-12-31', timeout=30)

In [12]:
# Show the Response object's repr, which includes the HTTP status code of the request.
html_page

<Response [200]>

In [13]:
# Parse the raw response bytes with Python's built-in 'html.parser' backend.
soup = BeautifulSoup(html_page.content, 'html.parser')

In [14]:
# Sanity check: confirm the parse produced a BeautifulSoup document object.
type(soup)

bs4.BeautifulSoup

In [15]:
# Locate the first <div class="lister-list">; soup.find returns None when no such element exists.
movies_container = soup.find('div', class_='lister-list')

In [16]:
# Collect every <h3> inside the results container and display the first one to inspect its markup.
# NOTE: the name `titles` is rebound to a function by a later cell (`def titles(...)`), so
# re-running this cell after that definition replaces the function with this list.
titles = movies_container.findAll('h3')
titles[0]

<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt6751668/">Parasite</a>
<span class="lister-item-year text-muted unbold">(2019)</span>
</h3>

In [17]:
# The first <a> inside the header carries the movie's link (href) and its title text.
titles[0].find('a')

<a href="/title/tt6751668/">Parasite</a>

In [18]:
# get_text() reduces the anchor to just its text content, i.e. the movie title.
titles[0].find('a').get_text()

'Parasite'

In [19]:
# One <div class="lister-item-content"> per movie; the per-field extractor functions below iterate over this list.
imdb_movie_list = movies_container.findAll('div', class_="lister-item-content")

In [20]:
def titles(movie_container):
    """Return the title text of every movie header inside a results container.

    Parameters
    ----------
    movie_container : bs4.element.Tag
        Element wrapping one page of search results. Every ``<h3>`` found
        inside it is treated as one movie header.

    Returns
    -------
    list of str
        The text of the first ``<a>`` in each ``<h3>``, in document order.
        A header without a link contributes the string "NaN" so the result
        keeps one entry per header.
    """
    all_titles = []
    # Use the argument itself: the previous body read the notebook-level
    # `movies_container` variable and silently ignored what was passed in.
    for h3 in movie_container.findAll('h3'):
        link = h3.find('a')
        all_titles.append(link.get_text() if link is not None else "NaN")
    return all_titles

In [21]:
def mpaa_ratings(imdb_movie_list):
    """Collect the certificate text for each movie entry.

    For every element of ``imdb_movie_list`` the first
    ``<span class="certificate">`` is looked up. Its text is used when the
    span is found; otherwise the string "NaN" is used, so the returned list
    has exactly one item per input entry, in the same order.
    """
    certificate_spans = (
        entry.find('span', class_='certificate') for entry in imdb_movie_list
    )
    return [span.get_text() if span else "NaN" for span in certificate_spans]

In [22]:
def runtimes(imdb_movie_list):
    """Collect the runtime text for each movie entry.

    Returns a list with one item per element of ``imdb_movie_list``: the text
    of its first ``<span class="runtime">``, or the string 'NaN' when the
    entry has no such span.
    """
    def _runtime_text(entry):
        span = entry.find('span', class_='runtime')
        if not span:
            return 'NaN'
        return span.get_text()

    return list(map(_runtime_text, imdb_movie_list))

In [23]:
def genres(imdb_movie_list):
    """Collect the whitespace-trimmed genre text for each movie entry.

    Each element of ``imdb_movie_list`` is searched for its first
    ``<span class="genre">``. The span's text (or the placeholder "NaN" when
    the span is absent) is stripped of leading/trailing whitespace and
    appended, so the output stays index-aligned with the input.
    """
    cleaned = []
    for entry in imdb_movie_list:
        span = entry.find('span', class_='genre')
        raw_text = span.get_text() if span else "NaN"
        cleaned.append(raw_text.strip())
    return cleaned

In [24]:
def imdb_id(imdb_movie_list):
    """Extract the IMDb title id (e.g. ``"tt6751668"``) for each movie entry.

    Parameters
    ----------
    imdb_movie_list : iterable of bs4.element.Tag
        Movie entries; each is searched for
        ``<h3 class="lister-item-header">`` and the first ``<a>`` inside it.

    Returns
    -------
    list of str
        One id per entry, taken from the path segment that follows
        ``/title/`` in the link's ``href``. Entries with no header, no link,
        or an href that does not contain ``/title/`` yield "NaN".
    """
    all_ids = []
    for movie in imdb_movie_list:
        header = movie.find('h3', class_="lister-item-header")
        link = header.find('a') if header is not None else None
        href = link.attrs.get('href', '') if link is not None else ''
        # str.lstrip/rstrip remove *character sets*, not prefixes/suffixes, so
        # the id is cut out positionally instead: everything after "/title/"
        # up to the next "/" (or "?" if a query string follows directly).
        _, marker, tail = href.partition('/title/')
        xid = tail.split('/', 1)[0].split('?', 1)[0] if marker else ''
        all_ids.append(xid if xid else "NaN")
    return all_ids

In [25]:
def star_ratings(imdb_movie_list):
    """Collect the text of the first ``<strong>`` element of each movie entry.

    Returns one string per element of ``imdb_movie_list``, in order; entries
    without a ``<strong>`` element contribute the placeholder "NaN".
    """
    scores = []
    for entry in imdb_movie_list:
        strong_tag = entry.find('strong')
        if not strong_tag:
            scores.append("NaN")
            continue
        scores.append(strong_tag.get_text())
    return scores

In [26]:
import time

In [27]:
#collect data from all pages
all_titles = []
all_mpaa_ratings = []
all_runtimes = []
all_genres = []
all_star_ratings = []
all_ids = []

SEARCH_URL = "https://www.imdb.com/search/title/?title_type=feature&release_date=2010-01-01,2019-12-31"
RESULTS_PER_PAGE = 50    # the `start` offset advances by this much per page
TOTAL_RESULTS = 10_000   # stop once this many results have been requested

# A single loop handles every page. The first page (start == 1) is requested
# without the `start`/`ref_` parameters; later pages add them.
for start in range(1, TOTAL_RESULTS + 1, RESULTS_PER_PAGE):
    if start == 1:
        url = SEARCH_URL
    else:
        url = f"{SEARCH_URL}&start={start}&ref_=adv_nxt"

    # Time out instead of hanging forever, and fail loudly on an HTTP error
    # status rather than trying to parse an error page.
    html_page = requests.get(url, timeout=30)
    html_page.raise_for_status()

    # These stay notebook-level names (html_page, soup, movies_container,
    # imdb_movie_list), exactly as in the original cell.
    soup = BeautifulSoup(html_page.content, 'html.parser')
    movies_container = soup.find('div', class_="lister-list")
    if movies_container is None:
        raise RuntimeError(f"No 'lister-list' container found on results page: {url}")
    imdb_movie_list = movies_container.findAll('div', class_="lister-item-content")

    # Titles are read from the container's <h3> tags while every other column
    # is read from the per-movie content divs; verify both selectors agree
    # before extending so the column lists can never drift out of alignment.
    page_titles = titles(movies_container)
    if len(page_titles) != len(imdb_movie_list):
        raise RuntimeError(
            f"Found {len(page_titles)} titles but {len(imdb_movie_list)} movie entries on {url}"
        )

    all_titles.extend(page_titles)
    all_mpaa_ratings.extend(mpaa_ratings(imdb_movie_list))
    all_runtimes.extend(runtimes(imdb_movie_list))
    all_genres.extend(genres(imdb_movie_list))
    all_star_ratings.extend(star_ratings(imdb_movie_list))
    all_ids.extend(imdb_id(imdb_movie_list))

    # Pause between requests to avoid hammering the server.
    time.sleep(0.5)

In [28]:
# Map each output column name to the list of scraped values for that column.
movie_dict = {
    'movie_id': all_ids,
    'movie_title': all_titles,
    'mpaa_rating': all_mpaa_ratings,
    'runtime': all_runtimes,
    'genre': all_genres,
    'star_rating': all_star_ratings,
}

In [29]:
import pandas as pd

In [30]:
# Build the table from the column lists; pandas requires all lists to have the same length.
df = pd.DataFrame(movie_dict)
# Preview the first five rows.
df.head()

Unnamed: 0,movie_id,movie_title,mpaa_rating,runtime,genre,star_rating
0,tt6751668,Parasite,R,132 min,"Comedy, Drama, Thriller",8.6
1,tt8946378,Knives Out,PG-13,131 min,"Comedy, Crime, Drama",8.0
2,tt2584384,Jojo Rabbit,PG-13,108 min,"Comedy, Drama, War",8.0
3,tt8579674,1917,R,119 min,"Drama, War",8.4
4,tt8367814,The Gentlemen,R,113 min,"Action, Comedy, Crime",8.1


In [31]:
# Persist the scraped IMDb table as CSV. The target path has no file extension, and because
# `index` is not passed, pandas also writes the row index as the first (unnamed) column.
df.to_csv('IMDB_Data')

In [32]:
import configparser

# Load the TMDb API key from a local INI-formatted file (named config.py) so
# the secret is not hardcoded in the notebook.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it parsed, so an empty result means the key file is missing.
if not config.read('config.py'):
    raise FileNotFoundError(
        "config.py not found or unreadable; it must contain an [API] section with an 'apikey' entry"
    )
if not config.has_option('API', 'apikey'):
    raise KeyError("config.py must define 'apikey' under the [API] section")
api_key = config['API']['apikey']

In [35]:
#use IMDb ids to look up each movie's budget and revenue on themoviedb (TMDb)
import urllib.parse
import urllib.request  # urlopen lives here; importing urllib.parse alone does not guarantee it is loaded
from urllib.error import HTTPError
import json

TMDB_API_ROOT = "https://api.themoviedb.org/3"
TMDB_TIMEOUT_SECONDS = 30


def _tmdb_get_json(url):
    """Fetch ``url`` and return the decoded JSON body, closing the connection afterwards."""
    with urllib.request.urlopen(url, timeout=TMDB_TIMEOUT_SECONDS) as response:
        return json.load(response)


def _fetch_budget_and_revenue(xid, api_key):
    """Return ``(budget, revenue)`` for an IMDb id, or ``("NaN", "NaN")`` when unavailable.

    The id is first resolved to a TMDb movie id through the ``/find`` endpoint,
    then the movie details are fetched. The first ``movie_results`` entry with
    a truthy ``id`` is used. Any network-level failure is treated as "no data"
    so that one bad id cannot abort the whole run.
    """
    try:
        find_url = f"{TMDB_API_ROOT}/find/{xid}?api_key={api_key}&language=en-US&external_source=imdb_id"
        found = _tmdb_get_json(find_url)
        for result in found['movie_results']:
            if result['id']:
                movie_url = f"{TMDB_API_ROOT}/movie/{result['id']}?api_key={api_key}&language=en-US"
                details = _tmdb_get_json(movie_url)
                return details['budget'], details['revenue']
    except OSError:
        # HTTPError and URLError both derive from OSError, as do socket
        # timeouts, so this covers HTTP error statuses and connection failures.
        return "NaN", "NaN"
    # No matching TMDb movie: use the same "NaN" placeholder as the error path
    # so every "missing" case is represented the same way.
    return "NaN", "NaN"


all_budgets = []
all_revenues = []
for xid in all_ids:
    budget, revenue = _fetch_budget_and_revenue(xid, api_key)
    # Exactly one entry is appended per id, keeping both lists aligned with all_ids.
    all_budgets.append(budget)
    all_revenues.append(revenue)
    # Pause between ids to stay well under the API's rate limits.
    time.sleep(0.5)

In [64]:
# Sanity check: the number of budget entries collected (one is appended per id in all_ids).
len(all_budgets)

10000

In [65]:
# Rebuild the column mapping, now including the revenue and budget lists.
movie_dict = {
    'movie_id': all_ids,
    'movie_title': all_titles,
    'mpaa_rating': all_mpaa_ratings,
    'runtime': all_runtimes,
    'genre': all_genres,
    'star_rating': all_star_ratings,
    'gross_revenue': all_revenues,
    'budget': all_budgets,
}

In [66]:
# Combined table: the scraped IMDb columns plus the gross_revenue and budget columns.
movie_df = pd.DataFrame(movie_dict)

In [67]:
# Preview the first five rows of the combined table.
movie_df.head()

Unnamed: 0,movie_id,movie_title,mpaa_rating,runtime,genre,star_rating,gross_revenue,budget
0,tt6751668,Parasite,R,132 min,"Comedy, Drama, Thriller",8.6,201055038,11363000
1,tt8946378,Knives Out,PG-13,131 min,"Comedy, Crime, Drama",8.0,163700000,40000000
2,tt2584384,Jojo Rabbit,PG-13,108 min,"Comedy, Drama, War",8.0,82468705,14000000
3,tt8579674,1917,R,119 min,"Drama, War",8.4,200483309,100000000
4,tt8367814,The Gentlemen,R,113 min,"Action, Comedy, Crime",8.1,48441089,18400000


In [69]:
# Write the combined table to disk. Because `index` is not passed, pandas also writes the
# row index as the first (unnamed) column of the CSV.
movie_df.to_csv('movie_data.csv')