# Data Collection

First, we import the necessary packages.

In [2]:
import configparser
import json
import time
import urllib.parse
import urllib.request
from urllib.error import HTTPError

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Functions used for collecting individual fields of data

The functions below each return a specific field of data from the IMDB search result list.

In [3]:
#get all titles from the container and put into list all_titles
def titles(movies_container):
    """Returns list of movie titles from IMDB search results."""
    all_titles = [h3.find('a').get_text() for h3 in movies_container.findAll('h3')]
    return all_titles

In [4]:
#get all ratings from container and put into mpaa_ratings list
#if no rating exists, put NaN
def mpaa_ratings(imdb_movie_list):
    """Returns list of movie MPAA ratings from IMDB search results."""
    mpaa_ratings = []
    for movie in imdb_movie_list:
        rating = movie.find('span', class_='certificate')
        if rating:
            mpaa_ratings.append(rating.get_text())
        else:
            mpaa_ratings.append("NaN")
    return mpaa_ratings

In [5]:
#get all runtime from container and put into all_runtimes list
#if no runtimes exists, put NaN
def runtimes(imdb_movie_list):
    """Returns list of movie runtimes from IMDB search results."""
    all_runtimes = []
    for movie in imdb_movie_list:
        runtime = movie.find('span', class_='runtime')
        if runtime:
            all_runtimes.append(runtime.get_text())
        else:
            all_runtimes.append("NaN")
    return all_runtimes

In [6]:
#get all genres from container and put into all_genres list
#if no genres exists, put NaN
def genres(imdb_movie_list):
    """Returns list of movie genres from IMDB search results."""
    all_genres = []
    for movie in imdb_movie_list:
        genre = movie.find('span', class_='genre')
        if genre:
            all_genres.append(genre.get_text())
        else:
            all_genres.append("NaN")
    all_genres = [genre.strip() for genre in all_genres]
    return all_genres

In [7]:
def imdb_id(imdb_movie_list):
    """Returns list of IMDB IDs for movies from IMDB search results."""
    all_ids = []
    for movie in imdb_movie_list:
        header = movie.find('h3', class_="lister-item-header")
        xid = header.find('a').attrs['href']
        xid = xid.lstrip('/title/')
        xid = xid.rstrip('/')
        xid = "tt"+xid
        all_ids.append(xid)
    return all_ids

In [8]:
#get all star ratings from container and put into star_ratings list
#if no star ratings exists, put NaN
def star_ratings(imdb_movie_list):
    """Returns list of movie ratings from IMDB search results."""
    star_ratings = []
    for movie in imdb_movie_list:
        rating = movie.find('strong')
        if rating:
            star_ratings.append(rating.get_text())
        else:
            star_ratings.append("NaN")
    return star_ratings

### Function for grabbing API key

Next, we need a way to grab the API key from the configuration file.

In [None]:
def get_api_key(config_file):
    """Returns API key from config file"""
    config = configparser.ConfigParser()
    config.read(config_file)
    api_key = config['API']['apikey']

### Functions for gathering field data into a dictionary

Using the URL for an IMDB search, this function navigates to the page and calls the functions above to collect all fields. The first page of the search results is collected separately because its URL differs from that of every later page, which carries a `start` offset. We then iterate through the remaining pages and run the same functions again. To avoid being blocked by IMDB, we pause for 0.5 seconds on each iteration of the page loop; this pause is what makes the function slow. With the maximum number of results allowed, 10,000 (200 pages), the function should take around 100 seconds.

In [None]:
def collect_imdb_data(imdb_seach_url, total_results):
    """Returns tuple of lists in format below for given IMDB search results: 
    (titles, MPAA ratings, runtimes, genres, star ratings, IMDB IDs)
    Expect this function to take (total_results/50)/2 seconds.
    Total results should be less than 10,000."""
    #initialize all lists
    all_titles = []
    all_mpaa_ratings = []
    all_runtimes = []
    all_genres = []
    all_star_ratings = []
    all_ids = []
    #create soup for first page
    html_page = requests.get(imdb_seach_url)
    soup = BeautifulSoup(html_page.content, 'html.parser')
    #create containers for first page
    movies_container = soup.find('div', class_="lister-list")
    imdb_movie_list = movies_container.findAll('div', class_="lister-item-content")
    #collect first page data
    for title in titles(movies_container):
        all_titles.append(title)
    for rating in mpaa_ratings(imdb_movie_list):
        all_mpaa_ratings.append(rating)
    for runtime in runtimes(imdb_movie_list):
        all_runtimes.append(runtime)
    for genre in genres(imdb_movie_list):
        all_genres.append(genre)
    for rating in star_ratings(imdb_movie_list):
        all_star_ratings.append(rating)
    for xid in imdb_id(imdb_movie_list):
        all_ids.append(xid)
    #check if total_results is greater than 10,000 since the IMDB URL changes after that many results
    if total_results > 10_000:
        print("The amount of results is too large, this function can only support up to 10,000. Collecting data for top 10,000 results only.")
        total_results = 10_001
    #iterate through the rest of the results to collect data
    for i in range(51,total_results,50):
        #create soup for current page
        url = imdb_seach_url+"&start={i}&ref_=adv_nxt"
        html_page = requests.get(url)
        soup = BeautifulSoup(html_page.content, 'html.parser')
        #create containers for current page
        movies_container = soup.find('div', class_="lister-list")
        imdb_movie_list = movies_container.findAll('div', class_="lister-item-content")
        #collect current page data
        for title in titles(movies_container):
            all_titles.append(title)
        for rating in mpaa_ratings(imdb_movie_list):
            all_mpaa_ratings.append(rating)
        for runtime in runtimes(imdb_movie_list):
            all_runtimes.append(runtime)
        for genre in genres(imdb_movie_list):
            all_genres.append(genre)
        for rating in star_ratings(imdb_movie_list):
            all_star_ratings.append(rating)
        for xid in imdb_id(imdb_movie_list):
            all_ids.append(xid)
        #buffer for half a second so as to not DDOS IMDB
        time.sleep(0.5)
    return all_titles, all_mpaa_ratings, all_runtimes, all_genres, all_star_ratings, all_ids

The function below uses The Movie DB's API to collect the last of the data, revenue and budget. To do this, it uses the IMDB movie IDs to first find the movie in TMDB's database. This gives us the movie's ID for this database and allows us to avoid having mismatched movie results that we may see by using movie title. Next, it takes the TMDB movie ID and uses it to find the movie's details. Inside the movie's details are the fields we are looking for.

The function returns a dictionary storing our field lists, which can then easily be used to build a DataFrame.

In [None]:
def collect_movie_data(imdb_seach_url, total_results, config_file):
    """Returns a dictionary of detailed movie data for given IMDB search results.
    Expect this function to take (total_results/2)+((total_results/50)/2) seconds.
    Total results should be less than 10,000."""
    #call function to collect IMDB data
    imdb_data = collect_imdb_data(imdb_seach_url, total_results)
    #assign IMDB ID data to list
    all_ids = imdb_data[5]
    #initialize lists
    all_budgets = []
    all_revenues = []
    #call function to get api key
    api_key = get_api_key(config_file)
    #iterate through all IDs collected from IMDB
    for xid in all_ids:
        #try-except block to catch any pages which return an HTTP error
        try:
            movie_id = None
            budget = None
            revenue = None
            #use API to get movie information from The Movie DB using the IMDB ID
            url = f"https://api.themoviedb.org/3/find/{xid}?api_key={api_key}&language=en-US&external_source=imdb_id"
            data = json.load(urllib.request.urlopen(url))
            #iterate through movie results in the data received from API
            for result in data['movie_results']:
                #if the movie ID field exists
                if result['id']:
                    #use movie ID in API to get detailed movie information
                    movie_id = result['id']
                    new_url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}&language=en-US"
                    new_data = json.load(urllib.request.urlopen(new_url))
                    budget = new_data['budget']
                    revenue = new_data['revenue']
                else:
                    #else, set values to null
                    movie_id = "NaN"
                    budget = "NaN"
                    revenue = "NaN"
        except HTTPError as err:
            #if there is an HTTP error, set values to null and continue
            movie_id = "NaN"
            budget = "NaN"
            revenue = "NaN"
            all_budgets.append(budget)
            all_revenues.append(revenue)
            #buffer for half a second 
            time.sleep(0.5)
            continue
        #if the try block succeeded, add values to lists and 
        all_budgets.append(budget)
        all_revenues.append(revenue)
        #buffer for half a second 
        time.sleep(0.5)
    #store all movie data collected in a dictionary
    movie_data_dictionary = {'movie_id': all_ids, 'movie_title': imdb_data[0], 'mpaa_rating': imdb_data[1], 'runtime': imdb_data[2], 'genre': imdb_data[3], 'star_rating': imdb_data[4], 'gross_revenue': all_revenues, 'budget': all_budgets}
    return movie_data_dictionary

### Creating the dataframe and storing it into a CSV

Finally, we can call our function for collecting movie data and store the returned dictionary into `movie_dict`. This dictionary is used to make our DataFrame, which we then store into a CSV file.

In [None]:
#Use functions above to create a dataframe of movie data and save this data to a CSV file.
url = "https://www.imdb.com/search/title/?title_type=feature&release_date=2010-01-01,2019-12-31"
config_file = "config.py"

movie_dict = collect_movie_data(url, 10_001, config_file)
movie_df = pd.DataFrame(movie_dict)
movie_df.to_csv('movie_data.csv')