---
title: "Data Collection"
---

## Import Required Libraries

In [5]:
from tmdbv3api import TMDb, Genre, Discover, Movie
import json
import requests
import pandas as pd
from tqdm import tqdm
import time

pd.set_option("display.max_columns", None)

## API Keys

In [10]:
# Obtain API Key for TMDB
with open("../api_keys/api-keys.json") as f:
    keys = json.load(f)

# API Keys
API_KEY_TMDB = keys["TMDB"]
API_KEY_OMDB = keys["OMDB"]

## TMDB

In [28]:
# Call the TMDB API
tmdb = TMDb()        
tmdb.api_key = API_KEY_TMDB

movie = Movie()
genre = Genre()
discover = Discover()

# Store all available genres in a variable
all_genres = genre.movie_list()

# Remove the 'TV Movie' Genre
filtered_genres = []

for g in all_genres:
    if g["name"] != "TV Movie":
        filtered_genres.append(g)

all_genres = filtered_genres

# Establish a page limit to search
page_limit = 35

# Create a blank movie list
movie_list = []

# Create a set of already visited movies
visited_ids = set()

# Loop through all genres
for g in tqdm(all_genres, desc = "TMDB Search"):
    genre_id = g["id"]
    genre_name = g["name"]
    
    # Loop through the amount of pages previously set
    for page in range(1, page_limit + 1):
        movies = discover.discover_movies({
            "with_genres": genre_id,
            "sort_by": "revenue.desc",
            "page": page,
            "include_adult": False
        })
        
        if not movies:
            break
        
        # Pull movies from each page and up to 30 pages within each genre
        for movie in movies:
            # Check if the movie has lready been queried
            if movie.id in visited_ids:
                continue

            # Add the current movie to the set of visited movies
            visited_ids.add(movie.id)

            TMDB_url = f"https://api.themoviedb.org/3/movie/{movie.id}?api_key={API_KEY_TMDB}"

            age_rating_url = f"https://api.themoviedb.org/3/movie/{movie.id}/release_dates?api_key={API_KEY_TMDB}"

            keywords_url = f"https://api.themoviedb.org/3/movie/{movie.id}/keywords?api_key={API_KEY_TMDB}"

            credits_url = f"https://api.themoviedb.org/3/movie/{movie.id}/credits?api_key={API_KEY_TMDB}"


            # Create requests for general movie details, as well as the fields that aren't pulled with the regular url
            movie_details = requests.get(TMDB_url).json()
            age_rating_response = requests.get(age_rating_url).json()
            keywords_response = requests.get(keywords_url).json()
            credits_response = requests.get(credits_url).json()

            # Loop through age ratings for countries to determine if there is a US rating
            age_rating = None
            for country in age_rating_response.get("results", []):
                if country["iso_3166_1"] == "US":
                    age_rating = country["release_dates"][0].get("certification", None)
                    break


            # Extract keywords from the response
            keywords = [kw["name"] for kw in keywords_response.get("keywords", [])]

            # Extract Cast
            cast = credits_response.get("cast", [])
            top_actors = [actor["name"] for actor in cast[:10]]

            # Extract Crew names
            crew = credits_response.get("crew", [])
            directors = [p["name"] for p in crew if p["job"] == "Director"]
            producers = [p["name"] for p in crew if p["job"] == "Producer"]
            writers = [p["name"] for p in crew if p["job"] in ["Screenplay", "Writer"]]
            composers = [p["name"] for p in crew if p["job"] in ["Original Music Composer", "Composer"]]
            cinematographers = [p["name"] for p in crew if p["job"] in ["Director of Photography", "Cinematographer"]]

            # Extract production companies
            production_companies = [p["name"] for p in movie_details.get("production_companies", [])]

            # Extract genres
            genres = [g["name"] for g in movie_details.get("genres", [])]

            # Extract collection name (if any)
            collection_info = movie_details.get("belongs_to_collection")
            collection_name = collection_info["name"] if collection_info else None

            # Put data for each movie in a dictionary
            movie_data = {
                "IMDB_ID": movie_details.get("imdb_id", None),
                "Title": movie.title,
                "Release_Date": movie_details.get("release_date", None),
                "Age_Rating": age_rating,
                "Overview": movie.overview,
                "Popularity": movie.popularity,
                "Genre": genres,
                "TMDB_Rating": movie.vote_average,
                "Budget": movie_details.get("budget", None),
                "Revenue": movie_details.get("revenue", None),
                "Keywords": keywords,
                "Production_Companies": production_companies,
                "Producers": producers,
                "Writers": writers,
                "Composers": composers,
                "Cinematographers": cinematographers,
                "Franchise_Name": collection_name,
                "Actors": top_actors,
                "Directors": directors
            }

            # Append the movie data to the movies list
            movie_list.append(movie_data)

# Create a movie df from the movie list
columns = ["IMDB_ID", 
           "Title", 
           "Release_Date", 
           "Age_Rating", 
           "Overview", 
           "Popularity", 
           "Genre", 
           "TMDB_Rating", 
           "Budget", 
           "Revenue", 
           "Keywords",
           "Production_Companies",
           "Producers",
           "Writers",
           "Composers",
           "Cinematographers",
           "Franchise_Name",
           "Actors",
           "Directors"
           ]

TMDB_movies_df = pd.DataFrame(movie_list, columns = columns)

TMDB Search: 100%|██████████| 18/18 [1:07:13<00:00, 224.09s/it]


In [29]:
TMDB_movies_df.to_csv("../data/raw/TMDB_movies.csv", index = False)

In [30]:
test_df = pd.read_csv("../data/raw/TMDB_movies.csv")

print(f"Total Movie Count: {len(test_df)}\n")

print(f"Raw Datatset Shape: {test_df.shape}\n")

test_df.head(5)

Total Movie Count: 7333

Raw Datatset Shape: (7333, 19)



Unnamed: 0,IMDB_ID,Title,Release_Date,Age_Rating,Overview,Popularity,Genre,TMDB_Rating,Budget,Revenue,Keywords,Production_Companies,Producers,Writers,Composers,Cinematographers,Franchise_Name,Actors,Directors
0,tt0499549,Avatar,2009-12-15,PG-13,"In the 22nd century, a paraplegic Marine is di...",22.0122,"['Action', 'Adventure', 'Fantasy', 'Science Fi...",7.591,237000000,2923706026,"['paraplegic', 'attachment to nature', 'cultur...","['Dune Entertainment', 'Lightstorm Entertainme...","['James Cameron', 'Jon Landau']",['James Cameron'],['James Horner'],['Mauro Fiore'],Avatar Collection,"['Sam Worthington', 'Zoe Saldaña', 'Sigourney ...",['James Cameron']
1,tt4154796,Avengers: Endgame,2019-04-24,PG-13,After the devastating events of Avengers: Infi...,16.825,"['Adventure', 'Science Fiction', 'Action']",8.24,356000000,2799439100,"['superhero', 'time travel', 'space travel', '...",['Marvel Studios'],['Kevin Feige'],"['Stephen McFeely', 'Christopher Markus']",['Alan Silvestri'],['Trent Opaloch'],The Avengers Collection,"['Robert Downey Jr.', 'Chris Evans', 'Mark Ruf...","['Anthony Russo', 'Joe Russo']"
2,tt1630029,Avatar: The Way of Water,2022-12-14,PG-13,Set more than a decade after the events of the...,23.372,"['Science Fiction', 'Adventure', 'Action']",7.611,460000000,2320250281,"['dying and death', 'loss of loved one', 'alie...","['20th Century Studios', 'Lightstorm Entertain...","['Jon Landau', 'James Cameron']","['James Cameron', 'Amanda Silver', 'Rick Jaffa']",['Simon Franglen'],['Russell Carpenter'],Avatar Collection,"['Sam Worthington', 'Zoe Saldaña', 'Sigourney ...",['James Cameron']
3,tt34956443,Ne Zha 2,2025-01-29,NR,"Following the Tribulation, although the souls ...",35.9241,"['Animation', 'Fantasy', 'Adventure', 'Action']",8.1,80000000,2213230000,"['based on myths, legends or folklore', '3d an...","['Chengdu Coco Cartoon', 'Beijing Enlight Pict...",['Liu Wenzhang'],['Yang Yu'],"['Roc Chen', 'Wan Pin Chu', 'Yang Rui']",[],Ne Zha Collection,"['Lu Yanting', 'Joseph', 'Han Mo', 'Chen Hao',...",['Yang Yu']
4,tt2488496,Star Wars: The Force Awakens,2015-12-15,,Thirty years after defeating the Galactic Empi...,9.878,"['Adventure', 'Action', 'Science Fiction']",7.26,245000000,2068223624,"['android', 'spacecraft', 'space opera', 'requ...","['Lucasfilm Ltd.', 'Bad Robot']","['Bryan Burk', 'Kathleen Kennedy', 'J.J. Abrams']","['Michael Arndt', 'Lawrence Kasdan', 'J.J. Abr...",['John Williams'],['Dan Mindel'],Star Wars Collection,"['Harrison Ford', 'Mark Hamill', 'Carrie Fishe...",['J.J. Abrams']


## OMDB

In [31]:
# Request the additional data from OMDB API
def additional_omdb_data(parameter):
    url = f"http://www.omdbapi.com/?apikey={API_KEY_OMDB}&i={parameter}"

    response = requests.get(url)
    data = response.json()

    if data.get("Response") == "True":

        ratings = data.get("Ratings", [])
        rotten_tomatoes_score = next((r["Value"] for r in ratings if r["Source"] == "Rotten Tomatoes"), None)

        return {
            "Year": data.get("Year", None),
            "Runtime": data.get("Runtime", None),
            "Awards": data.get("Awards", None),
            "Country": data.get("Country", None),
            "Language": data.get("Language", None),
            "Metascore_Rating": data.get("Metascore", None),
            "IMDB_Rating": data.get("imdbRating", None),
            "Rotten_Tomatoes_Rating": rotten_tomatoes_score
        }
    
    else:
        return {"Year": None,
                "Runtime": None, 
                "Awards": None, 
                "Country": None,
                "Language": None,
                "Metascore_Rating": None, 
                "IMDB_Rating": None, 
                "Rotten_Tomatoes_Rating": None}

# Empty list to hold the newly obtained data
additional_data = []

# Loop through all titles of the TMDB_movies_df to add OMDB data
for imdb_id in tqdm(TMDB_movies_df["IMDB_ID"], desc = "OMDB Data"):
    # Append new data into the list using the previously created function for OMDB
    additional_data.append(additional_omdb_data(imdb_id))

# Convert the additional data to a DataFrame
OMDB_df = pd.DataFrame(additional_data)

# Append the new data to the existing DataFrame
movies_df = pd.concat([TMDB_movies_df, OMDB_df], axis = 1)

OMDB Data: 100%|██████████| 7333/7333 [09:31<00:00, 12.84it/s]


In [32]:
movies_df.to_csv("../data/raw/movies.csv", index = False)

## Preview Full Dataset

In [33]:
test_df = pd.read_csv("../data/raw/movies.csv")

print(f"Total Movie Count: {len(test_df)}\n")

print(f"Raw Datatset Shape: {test_df.shape}\n")

test_df.head(5)

Total Movie Count: 7333

Raw Datatset Shape: (7333, 27)



Unnamed: 0,IMDB_ID,Title,Release_Date,Age_Rating,Overview,Popularity,Genre,TMDB_Rating,Budget,Revenue,Keywords,Production_Companies,Producers,Writers,Composers,Cinematographers,Franchise_Name,Actors,Directors,Year,Runtime,Awards,Country,Language,Metascore_Rating,IMDB_Rating,Rotten_Tomatoes_Rating
0,tt0499549,Avatar,2009-12-15,PG-13,"In the 22nd century, a paraplegic Marine is di...",22.0122,"['Action', 'Adventure', 'Fantasy', 'Science Fi...",7.591,237000000,2923706026,"['paraplegic', 'attachment to nature', 'cultur...","['Dune Entertainment', 'Lightstorm Entertainme...","['James Cameron', 'Jon Landau']",['James Cameron'],['James Horner'],['Mauro Fiore'],Avatar Collection,"['Sam Worthington', 'Zoe Saldaña', 'Sigourney ...",['James Cameron'],2009,162 min,Won 3 Oscars. 91 wins & 131 nominations total,"United States, United Kingdom","English, Spanish",83.0,7.9,81%
1,tt4154796,Avengers: Endgame,2019-04-24,PG-13,After the devastating events of Avengers: Infi...,16.825,"['Adventure', 'Science Fiction', 'Action']",8.24,356000000,2799439100,"['superhero', 'time travel', 'space travel', '...",['Marvel Studios'],['Kevin Feige'],"['Stephen McFeely', 'Christopher Markus']",['Alan Silvestri'],['Trent Opaloch'],The Avengers Collection,"['Robert Downey Jr.', 'Chris Evans', 'Mark Ruf...","['Anthony Russo', 'Joe Russo']",2019,181 min,Nominated for 1 Oscar. 70 wins & 133 nominatio...,United States,"English, Japanese, Xhosa, German",78.0,8.4,94%
2,tt1630029,Avatar: The Way of Water,2022-12-14,PG-13,Set more than a decade after the events of the...,23.372,"['Science Fiction', 'Adventure', 'Action']",7.611,460000000,2320250281,"['dying and death', 'loss of loved one', 'alie...","['20th Century Studios', 'Lightstorm Entertain...","['Jon Landau', 'James Cameron']","['James Cameron', 'Amanda Silver', 'Rick Jaffa']",['Simon Franglen'],['Russell Carpenter'],Avatar Collection,"['Sam Worthington', 'Zoe Saldaña', 'Sigourney ...",['James Cameron'],2022,192 min,Won 1 Oscar. 75 wins & 153 nominations total,United States,English,67.0,7.5,76%
3,tt34956443,Ne Zha 2,2025-01-29,NR,"Following the Tribulation, although the souls ...",35.9241,"['Animation', 'Fantasy', 'Adventure', 'Action']",8.1,80000000,2213230000,"['based on myths, legends or folklore', '3d an...","['Chengdu Coco Cartoon', 'Beijing Enlight Pict...",['Liu Wenzhang'],['Yang Yu'],"['Roc Chen', 'Wan Pin Chu', 'Yang Rui']",[],Ne Zha Collection,"['Lu Yanting', 'Joseph', 'Han Mo', 'Chen Hao',...",['Yang Yu'],2025,143 min,3 wins total,China,"Mandarin, Chinese, English, Hindi",63.0,8.1,96%
4,tt2488496,Star Wars: The Force Awakens,2015-12-15,,Thirty years after defeating the Galactic Empi...,9.878,"['Adventure', 'Action', 'Science Fiction']",7.26,245000000,2068223624,"['android', 'spacecraft', 'space opera', 'requ...","['Lucasfilm Ltd.', 'Bad Robot']","['Bryan Burk', 'Kathleen Kennedy', 'J.J. Abrams']","['Michael Arndt', 'Lawrence Kasdan', 'J.J. Abr...",['John Williams'],['Dan Mindel'],Star Wars Collection,"['Harrison Ford', 'Mark Hamill', 'Carrie Fishe...",['J.J. Abrams'],2015,138 min,Nominated for 5 Oscars. 64 wins & 140 nominati...,"United States, United Kingdom",English,80.0,7.8,93%
