---
title: "Data Collection"
---

## Import Required Libraries

In [None]:
from tmdbv3api import TMDb, Genre, Discover, Movie
import json
import requests
import pandas as pd
import time

# Show every column when a DataFrame is displayed (None removes the column limit)
pd.set_option("display.max_columns", None)

## API Keys

In [None]:
# Load the credentials for both movie APIs from the JSON key file
with open("../../api-keys.json") as key_file:
    keys = json.load(key_file)

# Individual service keys used by the collection cells below
API_KEY_TMDB = keys["TMDB"]
API_KEY_OMDB = keys["OMDB"]

## TMDB

In [None]:
# Configure the tmdbv3api client with the TMDB key
tmdb = TMDb()
tmdb.api_key = API_KEY_TMDB

movie = Movie()
genre = Genre()
discover = Discover()

# Store all available genres in a variable
all_genres = genre.movie_list()

# Establish a page limit to search (pages requested per genre)
page_limit = 20

# Seconds to wait for any single TMDB HTTP response; without a timeout a
# stalled connection would block the collection loop indefinitely
tmdb_request_timeout = 30


def _tmdb_movie_json(movie_id, endpoint=""):
    """Fetch a TMDB ``/3/movie/{movie_id}`` resource and return its JSON.

    Args:
        movie_id: TMDB id of the movie.
        endpoint: Optional sub-resource suffix, e.g. ``"/keywords"``.

    Returns:
        The decoded JSON body of the response.
    """
    url = f"https://api.themoviedb.org/3/movie/{movie_id}{endpoint}"
    response = requests.get(
        url,
        params={"api_key": API_KEY_TMDB},
        timeout=tmdb_request_timeout,
    )
    return response.json()


def _certification_for_country(release_dates_response, country_code="US"):
    """Return the first non-empty certification listed for one country.

    Args:
        release_dates_response: Decoded JSON of the ``/release_dates``
            endpoint (a dict with a ``"results"`` list).
        country_code: Value of ``iso_3166_1`` to look for.

    Returns:
        The certification string, or None when the country is absent, has
        no release entries, or none of its entries carries a certification.
    """
    for country in release_dates_response.get("results", []):
        if country.get("iso_3166_1") != country_code:
            continue
        # Scan every release entry: the first one may be missing or have a
        # blank certification while a later entry holds the actual rating
        for release in country.get("release_dates", []):
            certification = release.get("certification")
            if certification:
                return certification
        return None
    return None


# Create a blank movie list
movie_list = []

# Loop through all genres
for g in all_genres:
    genre_id = g["id"]
    genre_name = g["name"]

    # Loop through the amount of pages previously established
    for page in range(1, page_limit + 1):
        movies = discover.discover_movies({
            "with_genres": genre_id,
            "sort_by": "revenue.desc",
            "page": page,
            "include_adult": False
        })

        # Stop paging this genre once a page comes back empty
        if not movies:
            break

        # Pull movies from each page and up to 20 pages within each genre.
        # The loop variable is deliberately not called `movie` so that the
        # Movie() client created above is not overwritten.
        for movie_result in movies:
            # General movie details, plus the fields that the details
            # endpoint does not include (age ratings and keywords)
            movie_details = _tmdb_movie_json(movie_result.id)
            age_rating_response = _tmdb_movie_json(movie_result.id, "/release_dates")
            keywords_response = _tmdb_movie_json(movie_result.id, "/keywords")

            # US age rating; change the country code if another is desired
            age_rating = _certification_for_country(age_rating_response, "US")

            # Extract keywords from the response
            keywords = [kw["name"] for kw in keywords_response.get("keywords", [])]

            # Put data for each movie in a dictionary
            movie_data = {
                "IMDB_ID": movie_details.get("imdb_id", None),
                "Title": movie_result.title,
                "Release_Date": movie_details.get("release_date", None),
                "Age_Rating": age_rating,
                "Overview": movie_result.overview,
                "Popularity": movie_result.popularity,
                "Genre": genre_name,
                "TMDB_Rating": movie_result.vote_average,
                "Budget": movie_details.get("budget", None),
                "Revenue": movie_details.get("revenue", None),
                "Keywords": keywords
            }

            # Append the movie data to the movies list
            movie_list.append(movie_data)

# Create a movie df from the movie list
columns = ["IMDB_ID",
           "Title",
           "Release_Date",
           "Age_Rating",
           "Overview",
           "Popularity",
           "Genre",
           "TMDB_Rating",
           "Budget",
           "Revenue",
           "Keywords"]

TMDB_movies_df = pd.DataFrame(movie_list, columns = columns)

In [None]:
# Number of rows collected from TMDB
print(f"Total Movie Count: {len(TMDB_movies_df)}\n")

# (rows, columns) of the raw TMDB frame
print(f"Raw Dataset Shape: {TMDB_movies_df.shape}\n")

# Preview the first five rows (rendered as the cell output)
TMDB_movies_df.head(5)

## OMDB

In [None]:
# Request the additional data from OMDB API

# Maps each output column to the top-level OMDB response field it is read
# from. Rotten_Tomatoes_Rating is not listed because it is extracted from
# the nested "Ratings" list instead.
_OMDB_FIELD_MAP = {
    "Year": "Year",
    "Director": "Director",
    "Actors": "Actors",
    "Runtime": "Runtime",
    "Awards": "Awards",
    "Metascore_Rating": "Metascore",
    "IMDB_Rating": "imdbRating",
}


def _empty_omdb_record():
    """Return a record with every OMDB-derived column set to None."""
    record = dict.fromkeys(_OMDB_FIELD_MAP)
    record["Rotten_Tomatoes_Rating"] = None
    return record


def additional_omdb_data(parameter):
    """Look up one movie on OMDB by IMDB id and return the extra columns.

    Args:
        parameter: IMDB id sent as OMDB's ``i`` query parameter.

    Returns:
        A dict with the keys Year, Director, Actors, Runtime, Awards,
        Metascore_Rating, IMDB_Rating and Rotten_Tomatoes_Rating. Every
        value is None when the id is missing (None, NaN or blank), the
        request fails, the body is not valid JSON, or OMDB does not answer
        with ``"Response": "True"``.
    """
    # A missing id cannot match anything, so skip the request entirely
    # instead of spending API quota on it (`x != x` is True only for NaN)
    if parameter is None or parameter != parameter or not str(parameter).strip():
        return _empty_omdb_record()

    try:
        # Pass the key via params over https so it is URL-encoded and not
        # sent in clear text; the timeout keeps one stalled call from
        # hanging the whole enrichment loop
        response = requests.get(
            "https://www.omdbapi.com/",
            params={"apikey": API_KEY_OMDB, "i": parameter},
            timeout=30,
        )
        data = response.json()
    except (requests.RequestException, ValueError) as error:
        # Stay best-effort: report the failure and fall back to the empty
        # record rather than aborting the caller's loop
        print(f"OMDB request failed for {parameter}: {error}")
        return _empty_omdb_record()

    if data.get("Response") != "True":
        return _empty_omdb_record()

    # The Rotten Tomatoes score lives inside the list of rating sources
    rotten_tomatoes_score = next(
        (
            rating.get("Value")
            for rating in data.get("Ratings", [])
            if rating.get("Source") == "Rotten Tomatoes"
        ),
        None,
    )

    record = {column: data.get(field) for column, field in _OMDB_FIELD_MAP.items()}
    record["Rotten_Tomatoes_Rating"] = rotten_tomatoes_score
    return record


# Number of rows to enrich; also the denominator of the progress report
df_length = len(TMDB_movies_df)

# Report progress roughly every 10% of the rows. Clamped to 1 so a frame
# with fewer than 10 rows does not cause a modulo-by-zero.
progress_step = max(df_length // 10, 1)

# Empty list to hold the newly obtained data (one record per row, in row order)
additional_data = []

# OMDB answers keyed by IMDB id. An id can occur on several rows of
# TMDB_movies_df, so each distinct id is only requested once.
omdb_cache = {}

# Loop through all IMDB ids of the TMDB_movies_df to add OMDB data
for loop_counter, imdb_id in enumerate(TMDB_movies_df["IMDB_ID"], start=1):
    # Fetch via the previously created OMDB function on first sight of an id
    if imdb_id not in omdb_cache:
        omdb_cache[imdb_id] = additional_omdb_data(imdb_id)
    additional_data.append(omdb_cache[imdb_id])

    # Print percentage complete
    percent_complete = loop_counter / df_length
    if loop_counter % progress_step == 0:
        print(f"Percent Complete: {percent_complete * 100}%")


# Convert the additional data to a DataFrame
additional_df = pd.DataFrame(additional_data)

# Append the new columns to the existing DataFrame (rows align by position
# because both frames carry the default integer index)
movies_df = pd.concat([TMDB_movies_df, additional_df], axis = 1)

movies_df.to_csv("../data/raw/movies.csv", index = False)


## Preview Full Dataset

In [None]:
# Number of rows in the combined TMDB + OMDB dataset
print(f"Total Movie Count: {len(movies_df)}\n")

# (rows, columns) of the combined dataset
print(f"Raw Dataset Shape: {movies_df.shape}\n")

# Rows per genre. Computed outside the f-string because reusing double
# quotes inside a replacement field is a SyntaxError before Python 3.12.
genre_counts = movies_df["Genre"].value_counts()
print(f"{genre_counts}\n")

# Preview the first five rows (rendered as the cell output)
movies_df.head(5)