---
title: "Data Collection"
---

## Import Required Libraries

In [9]:
from tmdbv3api import TMDb, Genre, Discover, Movie
import json
import requests
import pandas as pd
from tqdm import tqdm
import time

pd.set_option("display.max_columns", None)

## API Keys

In [10]:
# Obtain API Key for TMDB
with open("../api_keys/api-keys.json") as f:
    keys = json.load(f)

# API Keys
API_KEY_TMDB = keys["TMDB"]
API_KEY_OMDB = keys["OMDB"]

In [13]:
genre = Genre()
all_genres = genre.movie_list()

filtered_genres = []

for g in all_genres:
    if g["name"] != "TV Movie":
        filtered_genres.append(g)

all_genres = filtered_genres

print(all_genres)

[{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 80, 'name': 'Crime'}, {'id': 99, 'name': 'Documentary'}, {'id': 18, 'name': 'Drama'}, {'id': 10751, 'name': 'Family'}, {'id': 14, 'name': 'Fantasy'}, {'id': 36, 'name': 'History'}, {'id': 27, 'name': 'Horror'}, {'id': 10402, 'name': 'Music'}, {'id': 9648, 'name': 'Mystery'}, {'id': 10749, 'name': 'Romance'}, {'id': 878, 'name': 'Science Fiction'}, {'id': 53, 'name': 'Thriller'}, {'id': 10752, 'name': 'War'}, {'id': 37, 'name': 'Western'}]


## TMDB

In [15]:
# Call the TMDB API
tmdb = TMDb()        
tmdb.api_key = API_KEY_TMDB

movie = Movie()
genre = Genre()
discover = Discover()

# Store all available genres in a variable
all_genres = genre.movie_list()

# Remove the 'TV Movie' Genre
filtered_genres = []

for g in all_genres:
    if g["name"] != "TV Movie":
        filtered_genres.append(g)

all_genres = filtered_genres

# Establish a page limit to search
page_limit = 30

# Create a blank movie list
movie_list = []

# Create a set of already visited movies
visited_ids = set()

# Loop through all genres
for g in tqdm(all_genres, desc = "TMDB Search"):
    genre_id = g["id"]
    genre_name = g["name"]
    
    # Loop through the amount of pages previously set
    for page in range(1, page_limit + 1):
        movies = discover.discover_movies({
            "with_genres": genre_id,
            "sort_by": "revenue.desc",
            "page": page,
            "include_adult": False
        })
        
        if not movies:
            break
        
        # Pull movies from each page and up to 30 pages within each genre
        for movie in movies:
            # Check if the movie has lready been queried
            if movie.id in visited_ids:
                continue

            # Add the current movie to the set of visited movies
            visited_ids.add(movie.id)

            TMDB_url = f"https://api.themoviedb.org/3/movie/{movie.id}?api_key={API_KEY_TMDB}"

            age_rating_url = f"https://api.themoviedb.org/3/movie/{movie.id}/release_dates?api_key={API_KEY_TMDB}"

            keywords_url = f"https://api.themoviedb.org/3/movie/{movie.id}/keywords?api_key={API_KEY_TMDB}"

            # Create requests for general movie details, as well as the fields that aren't pulled with the regular url
            movie_details = requests.get(TMDB_url).json()
            age_rating_response = requests.get(age_rating_url).json()
            keywords_response = requests.get(keywords_url).json()

            # Loop through age ratings for countries to determine if there is a US rating
            age_rating = None
            for country in age_rating_response.get("results", []):
                if country["iso_3166_1"] == "US":
                    age_rating = country["release_dates"][0].get("certification", None)
                    break


            # Extract keywords from the response
            keywords = [kw["name"] for kw in keywords_response.get("keywords", [])]

            # Put data for each movie in a dictionary
            movie_data = {
                "IMDB_ID": movie_details.get("imdb_id", None),
                "Title": movie.title,
                "Release_Date": movie_details.get("release_date", None),
                "Age_Rating": age_rating,
                "Overview": movie.overview,
                "Popularity": movie.popularity,
                "Genre": [g["name"] for g in movie_details.get("genres", [])],
                "TMDB_Rating": movie.vote_average,
                "Budget": movie_details.get("budget", None),
                "Revenue": movie_details.get("revenue", None),
                "Keywords": keywords,
                "Production_Companies": [p["name"] for p in movie_details.get("production_companies", [])],
            }

            # Append the movie data to the movies list
            movie_list.append(movie_data)

# Create a movie df from the movie list
columns = ["IMDB_ID", 
           "Title", 
           "Release_Date", 
           "Age_Rating", 
           "Overview", 
           "Popularity", 
           "Genre", 
           "TMDB_Rating", 
           "Budget", 
           "Revenue", 
           "Keywords",
           "Production_Companies"
           ]

TMDB_movies_df = pd.DataFrame(movie_list, columns = columns)

TMDB Search:   0%|          | 0/18 [00:00<?, ?it/s]

TMDB Search: 100%|██████████| 18/18 [1:10:01<00:00, 233.44s/it]


In [16]:
TMDB_movies_df.to_csv("../data/raw/TMDB_movies.csv", index = False)

In [17]:
test_df = pd.read_csv("../data/raw/TMDB_movies.csv")

print(f"Total Movie Count: {len(test_df)}\n")

print(f"Raw Datatset Shape: {test_df.shape}\n")

test_df.head(5)

Total Movie Count: 6309

Raw Datatset Shape: (6309, 12)



Unnamed: 0,IMDB_ID,Title,Release_Date,Age_Rating,Overview,Popularity,Genre,TMDB_Rating,Budget,Revenue,Keywords,Production_Companies
0,tt0499549,Avatar,2009-12-15,PG-13,"In the 22nd century, a paraplegic Marine is di...",22.0122,"['Action', 'Adventure', 'Fantasy', 'Science Fi...",7.591,237000000,2923706026,"['paraplegic', 'attachment to nature', 'cultur...","['Dune Entertainment', 'Lightstorm Entertainme..."
1,tt4154796,Avengers: Endgame,2019-04-24,PG-13,After the devastating events of Avengers: Infi...,16.825,"['Adventure', 'Science Fiction', 'Action']",8.24,356000000,2799439100,"['superhero', 'time travel', 'space travel', '...",['Marvel Studios']
2,tt1630029,Avatar: The Way of Water,2022-12-14,PG-13,Set more than a decade after the events of the...,23.372,"['Science Fiction', 'Adventure', 'Action']",7.611,460000000,2320250281,"['dying and death', 'loss of loved one', 'alie...","['20th Century Studios', 'Lightstorm Entertain..."
3,tt34956443,Ne Zha 2,2025-01-29,NR,"Following the Tribulation, although the souls ...",35.9241,"['Animation', 'Fantasy', 'Adventure', 'Action']",8.1,80000000,2213230000,"['based on myths, legends or folklore', '3d an...","['Chengdu Coco Cartoon', 'Beijing Enlight Pict..."
4,tt2488496,Star Wars: The Force Awakens,2015-12-15,,Thirty years after defeating the Galactic Empi...,9.878,"['Adventure', 'Action', 'Science Fiction']",7.26,245000000,2068223624,"['android', 'spacecraft', 'space opera', 'requ...","['Lucasfilm Ltd.', 'Bad Robot']"


## OMDB

In [18]:
# Request the additional data from OMDB API
def additional_omdb_data(parameter):
    url = f"http://www.omdbapi.com/?apikey={API_KEY_OMDB}&i={parameter}"

    response = requests.get(url)
    data = response.json()

    if data.get("Response") == "True":

        ratings = data.get("Ratings", [])
        rotten_tomatoes_score = next((r["Value"] for r in ratings if r["Source"] == "Rotten Tomatoes"), None)

        return {
            "Year": data.get("Year", None),
            "Director": data.get("Director", None),
            "Actors": data.get("Actors", None),
            "Runtime": data.get("Runtime", None),
            "Awards": data.get("Awards", None),
            "Country": data.get("Country", None),
            "Language": data.get("Language", None),
            "Metascore_Rating": data.get("Metascore", None),
            "IMDB_Rating": data.get("imdbRating", None),
            "Rotten_Tomatoes_Rating": rotten_tomatoes_score
        }
    
    else:
        return {"Year": None, 
                "Director": None, 
                "Actors": None, 
                "Runtime": None, 
                "Awards": None, 
                "Country": None,
                "Language": None,
                "Metascore_Rating": None, 
                "IMDB_Rating": None, 
                "Rotten_Tomatoes_Rating": None}

# Empty list to hold the newly obtained data
additional_data = []

# Loop through all titles of the TMDB_movies_df to add OMDB data
for imdb_id in tqdm(TMDB_movies_df["IMDB_ID"], desc = "OMDB Data"):
    # Append new data into the list using the previously created function for OMDB
    additional_data.append(additional_omdb_data(imdb_id))

# Convert the additional data to a DataFrame
OMDB_df = pd.DataFrame(additional_data)

# Append the new data to the existing DataFrame
movies_df = pd.concat([TMDB_movies_df, OMDB_df], axis = 1)

OMDB Data: 100%|██████████| 6309/6309 [19:02<00:00,  5.52it/s] 


In [19]:
movies_df.to_csv("../data/raw/movies.csv", index = False)

## Preview Full Dataset

In [22]:
test_df = pd.read_csv("../data/raw/movies.csv")

print(f"Total Movie Count: {len(test_df)}\n")

print(f"Raw Datatset Shape: {test_df.shape}\n")

test_df.head(5)

Total Movie Count: 6309

Raw Datatset Shape: (6309, 22)



Unnamed: 0,IMDB_ID,Title,Release_Date,Age_Rating,Overview,Popularity,Genre,TMDB_Rating,Budget,Revenue,Keywords,Production_Companies,Year,Director,Actors,Runtime,Awards,Country,Language,Metascore_Rating,IMDB_Rating,Rotten_Tomatoes_Rating
0,tt0499549,Avatar,2009-12-15,PG-13,"In the 22nd century, a paraplegic Marine is di...",22.0122,"['Action', 'Adventure', 'Fantasy', 'Science Fi...",7.591,237000000,2923706026,"['paraplegic', 'attachment to nature', 'cultur...","['Dune Entertainment', 'Lightstorm Entertainme...",2009,James Cameron,"Sam Worthington, Zoe Saldaña, Sigourney Weaver",162 min,Won 3 Oscars. 91 wins & 131 nominations total,"United States, United Kingdom","English, Spanish",83.0,7.9,81%
1,tt4154796,Avengers: Endgame,2019-04-24,PG-13,After the devastating events of Avengers: Infi...,16.825,"['Adventure', 'Science Fiction', 'Action']",8.24,356000000,2799439100,"['superhero', 'time travel', 'space travel', '...",['Marvel Studios'],2019,"Anthony Russo, Joe Russo","Robert Downey Jr., Chris Evans, Mark Ruffalo",181 min,Nominated for 1 Oscar. 70 wins & 133 nominatio...,United States,"English, Japanese, Xhosa, German",78.0,8.4,94%
2,tt1630029,Avatar: The Way of Water,2022-12-14,PG-13,Set more than a decade after the events of the...,23.372,"['Science Fiction', 'Adventure', 'Action']",7.611,460000000,2320250281,"['dying and death', 'loss of loved one', 'alie...","['20th Century Studios', 'Lightstorm Entertain...",2022,James Cameron,"Sam Worthington, Zoe Saldaña, Sigourney Weaver",192 min,Won 1 Oscar. 75 wins & 153 nominations total,United States,English,67.0,7.5,76%
3,tt34956443,Ne Zha 2,2025-01-29,NR,"Following the Tribulation, although the souls ...",35.9241,"['Animation', 'Fantasy', 'Adventure', 'Action']",8.1,80000000,2213230000,"['based on myths, legends or folklore', '3d an...","['Chengdu Coco Cartoon', 'Beijing Enlight Pict...",2025,Yu Yang,"Yanting Lü, Mo Han, Hao Chen",143 min,3 wins total,China,"Mandarin, Chinese, English, Hindi",63.0,8.1,96%
4,tt2488496,Star Wars: The Force Awakens,2015-12-15,,Thirty years after defeating the Galactic Empi...,9.878,"['Adventure', 'Action', 'Science Fiction']",7.26,245000000,2068223624,"['android', 'spacecraft', 'space opera', 'requ...","['Lucasfilm Ltd.', 'Bad Robot']",2015,J.J. Abrams,"Daisy Ridley, John Boyega, Oscar Isaac",138 min,Nominated for 5 Oscars. 64 wins & 140 nominati...,"United States, United Kingdom",English,80.0,7.8,93%
