## TMDB Data Loading

### Imports:

In [1]:
import pandas as pd
from time import sleep

import requests
from tqdm import tqdm
tqdm.pandas()

from ada_config.config import CONFIG

### Load TMDB Data:

In [2]:
# Read the v11 TMDB movie CSV from the configured TMDB directory and preview the first rows
tmdb_csv_path = CONFIG["tmdb_path"] / 'TMDB_movie_dataset_v11.csv'
tmdb_data = pd.read_csv(tmdb_csv_path)
tmdb_data.head()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,Interstellar,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,..."
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,...,The Dark Knight,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f..."
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,...,Avatar,"In the 22nd century, a paraplegic Marine is di...",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ..."
4,24428,The Avengers,7.71,29166,Released,2012-04-25,1518815515,143,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,...,The Avengers,When an unexpected enemy emerges and threatens...,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,Some assembly required.,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com..."


### Add Wikidata IDs to TMDB Data:

In [3]:
def get_wikidata_ids_batch(tmdb_ids, batch_size=512, max_retries=5, retry_delay=5, timeout=60):
    """
    Get Wikidata IDs for a list of TMDb IDs.

    The IDs are sent to the Wikidata SPARQL endpoint in batches; each batch is
    matched against property P4947 and every hit is recorded in the result.
    TMDb IDs without a matching Wikidata item are simply absent from the result.

    :param tmdb_ids: list of TMDb IDs
    :param batch_size: number of TMDb IDs per SPARQL query (must be positive)
    :param max_retries: how many times a rate-limited (HTTP 429) batch is retried
    :param retry_delay: seconds to wait before a retry when the server does not
        send a numeric ``Retry-After`` header
    :param timeout: per-request timeout in seconds passed to ``requests.get``
    :return: dictionary mapping TMDb IDs (as strings, exactly as returned by the
        endpoint) to Wikidata IDs (the last path segment of the item URI)
    :raises ValueError: if ``batch_size`` is not positive
    :raises requests.HTTPError: if a batch still fails after all retries, or the
        endpoint answers with any other error status
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    url = "https://query.wikidata.org/sparql"
    headers = {
        # NOTE(review): the contact address is a placeholder - replace it with a
        # real one before running this against the public endpoint.
        "User-Agent": "BatchTMDbToWikidata/1.0 (your_email@example.com)"
    }
    wikidata_ids = {}

    # Break TMDb IDs into batches
    for i in tqdm(range(0, len(tmdb_ids), batch_size)):
        batch = tmdb_ids[i:i+batch_size]
        values_clause = " ".join(f'"{tmdb_id}"' for tmdb_id in batch)
        
        query = f"""
        SELECT ?item ?tmdb_id WHERE {{
          VALUES ?tmdb_id {{{values_clause}}}
          ?item wdt:P4947 ?tmdb_id .
        }}
        """
        params = {'query': query, 'format': 'json'}

        # Retry while the endpoint keeps answering 429 (Too Many Requests),
        # up to max_retries extra attempts for this batch.
        for attempt in range(max_retries + 1):
            response = requests.get(url, params=params, headers=headers, timeout=timeout)
            if response.status_code != 429 or attempt == max_retries:
                break
            print("Rate limited. Waiting before retrying...")
            # Prefer the server-provided wait time when it is a plain number of seconds
            retry_after = str(response.headers.get("Retry-After", "")).strip()
            sleep(int(retry_after) if retry_after.isdigit() else retry_delay)

        # Fail loudly on any error status instead of trying to parse an error page as JSON
        response.raise_for_status()

        data = response.json()
        for result in data['results']['bindings']:
            tmdb_id = result['tmdb_id']['value']
            # The item value is an entity URI; keep only the trailing Q-identifier
            wikidata_id = result['item']['value'].split("/")[-1]
            wikidata_ids[tmdb_id] = wikidata_id

    return wikidata_ids


In [None]:
# Pull every TMDb ID out of the dataset and look up the matching Wikidata IDs
tmdb_ids = tmdb_data['id'].to_list()
wikidata_ids = get_wikidata_ids_batch(tmdb_ids=tmdb_ids)

 50%|████▉     | 1097/2211 [08:38<05:02,  3.69it/s]

In [None]:
# Re-key the lookup with integer TMDb IDs so the keys match the TMDb IDs type
wikidata_ids_copy = dict(zip(map(int, wikidata_ids), wikidata_ids.values()))

In [None]:
# Attach the Wikidata ID to each row (rows without a match get NaN) and save the result as v12
tmdb_v12_path = CONFIG["tmdb_path"] / 'TMDB_movie_dataset_v12.csv'
tmdb_data['wikidata_id'] = tmdb_data['id'].map(wikidata_ids_copy)
tmdb_data.to_csv(tmdb_v12_path, index=False)