# Enriching data
#### 1. Adding data from www.themoviedb.org (TMDB)

In this section we use the API of www.themoviedb.org (TMDB) to populate the existing dataframe with more information about each movie. The script queries the website with the movie name and, among the returned results, picks the one whose release date is closest to the release date in our dataset. If no match is found, the code skips the movie and moves on to the next one.

Most importantly, this first step adds the TMDB_ID, a unique identifier for the movie that can be used to further enrich the data.

This script was run separately, from the file /scratch/add_TMDB_movie_metadata.py, because it had to be run several times: the API is rate-limited to 40 requests/s.
 


In [None]:
import tmdbsimple as tmdb # Wrapper library for the API of themoviedb.org  (TMDB)
from tqdm import tqdm # Progress bar for the script
import pandas as pd
from datetime import datetime # Used for comparing movie release dates
from dotenv import load_dotenv # Makes keeping the API-key as local environment file simpler
import os

load_dotenv()  # Load variables from a local .env file (expected to provide TMDB_API_KEY)

# Column names for the movie metadata TSV; passing them via `names=` makes pandas
# treat the file as having no header row.
headers_movie_metadata = ["Wikipedia Movie ID", "Freebase Movie ID", "Movie name", "Movie release date",
                          "Movie box office revenue", "Movie runtime", "Movie languages", "Movie countries",
                          "Movie genres"]
movie_metadata = pd.read_csv('../data/movie.metadata.tsv', sep="\t", names=headers_movie_metadata)


# Load API key. Fail fast when it is missing: otherwise every request below would
# fail and the processing loop would only print one error per movie.
TMDB_API_KEY = os.environ.get("TMDB_API_KEY")
if not TMDB_API_KEY:
    raise RuntimeError("TMDB_API_KEY is not set; define it in the environment or in a .env file")
tmdb.API_KEY = TMDB_API_KEY
tmdb.REQUESTS_TIMEOUT = 5  # Seconds, for both connect and read

# Indices of the rows that were successfully matched during this run
saved_progress = []

# Row index to resume from; rows with a smaller index are skipped by the loop.
# NOTE: set manually between runs (28000 is where the previous run stopped).
start_index = 28000

# Progress file that can be used to resume
#saved_progress = pd.read_json('progress.json')['index'].tolist()
#start_index = saved_progress[-1] + 1  # Start from the next index

# Work on a copy so the original metadata frame stays untouched
movie_metadata_TMDB = movie_metadata.copy()

# strptime format keyed by the length of the dataset's release-date string;
# any other length is parsed as a full "YYYY-MM-DD" date.
_DATE_FORMATS_BY_LENGTH = {4: "%Y", 7: "%Y-%m"}

# Fields copied from the matched TMDB search result into "TMDB_<field>" columns.
_TMDB_FIELDS = ("id", "original_language", "original_title", "overview", "popularity",
                "release_date", "title", "vote_average", "vote_count")


def _parse_release_date(date_str):
    """Convert a dataset release date ("YYYY", "YYYY-MM" or "YYYY-MM-DD") to a date.

    Raises ValueError when the string does not fit the format chosen by its length.
    """
    date_format = _DATE_FORMATS_BY_LENGTH.get(len(date_str), "%Y-%m-%d")
    return datetime.strptime(date_str, date_format).date()


def _closest_result(results, target_date):
    """Return the search result whose release date is closest to target_date.

    Results with a missing, empty or unparseable 'release_date' are ignored.
    Each time difference is kept together with its own result, so ignoring
    undated results can never shift the selection onto the wrong entry.
    Returns None when no result has a usable date; ties keep the first result.
    """
    dated_results = []
    for result in results:
        raw_date = result.get('release_date')
        if not raw_date:
            continue
        try:
            result_date = datetime.strptime(raw_date, "%Y-%m-%d").date()
        except (TypeError, ValueError):
            continue  # One malformed candidate should not discard the whole movie
        dated_results.append((abs(target_date - result_date), result))

    if not dated_results:
        return None
    return min(dated_results, key=lambda pair: pair[0])[1]


for index, row in tqdm(movie_metadata_TMDB.iterrows(), total=len(movie_metadata_TMDB), desc="Processing"): # Wraps for loop in progress bar.

    # Skip previously processed indices - Commented out in favor of manual start_index
    #if index in saved_progress:
    #    continue
    if index < start_index:
        continue
    try:
        movie_release_date_str = row["Movie release date"]
        if pd.isna(movie_release_date_str):
            continue  # Without a release date there is nothing to match against

        # Parse before querying so an unparseable date does not cost an API request
        movie_release_date = _parse_release_date(movie_release_date_str)

        # The call fills search.results with the candidate movies
        search = tmdb.Search()
        search.movie(query=row["Movie name"])

        match = _closest_result(search.results, movie_release_date)
        if match is None:
            continue  # No candidate with a usable release date
        # print(f"Closest match: {match['title']} (Release Date: {match['release_date']})")

        # Add info in dataframe about the movie
        for field in _TMDB_FIELDS:
            movie_metadata_TMDB.loc[index, f'TMDB_{field}'] = match[field]

        # Save the index as progress
        saved_progress.append(index)

        # Save progress periodically (in case of interruption).
        # NOTE: only reached for matched rows, so a checkpoint is skipped whenever
        # the row at a multiple of 50 had no date, no match, or raised an error.
        if index % 50 == 0:
            progress_df = pd.DataFrame({'index': saved_progress})
            progress_df.to_json('progress.json')
            movie_metadata_TMDB.to_csv('movie_metadata_TMDB.csv', index=False)

    except Exception as e:
        # Best effort: report the failure and carry on with the next movie
        print(f"Error at index {index}: {str(e)}")

# Save final progress (indices of all rows matched during this run)
progress_df = pd.DataFrame({'index': saved_progress})
progress_df.to_json('progress.json')

# Save your final DataFrame. Create the output directory first: to_csv does not
# create missing directories, and failing here would discard the whole run.
os.makedirs('modified_data', exist_ok=True)
movie_metadata_TMDB.to_csv('modified_data/movie_metadata_TMDB.csv')