## TMDB Data Loading

### Imports:

In [80]:
import pandas as pd
from time import sleep

import requests
from tqdm import tqdm
tqdm.pandas()

### Load TMDB Data:

In [None]:
# Relative directory that holds the dataset CSV files.
data_path = "../data"

In [None]:
# Read the TMDB movie dataset CSV into a DataFrame.
tmdb_data = pd.read_csv(
    filepath_or_buffer=f'{data_path}/TMDB_movie_dataset_v11.csv',
)
# Show the first five rows as a quick sanity check of the load.
tmdb_data.head(n=5)

### Add Wikidata IDs to TMDB Data:

In [101]:
def get_wikidata_ids_batch(tmdb_ids, batch_size=512, max_retries=5, timeout=60):
    """
    Get Wikidata IDs for a batch of TMDb IDs.

    The IDs are sent to the Wikidata SPARQL endpoint ``batch_size`` at a time;
    each query selects the items whose ``wdt:P4947`` value equals one of the
    IDs in the batch. TMDb IDs without a matching item are simply absent from
    the result. If several items match the same TMDb ID, the last one returned
    by the endpoint wins.

    :param tmdb_ids: list of TMDb IDs
    :param batch_size: batch size for SPARQL queries (must be >= 1)
    :param max_retries: how many times a rate-limited (HTTP 429) batch is retried
    :param timeout: per-request timeout in seconds
    :return: dictionary of TMDb IDs to Wikidata IDs; the keys are the ID
        strings exactly as returned by the endpoint, not integers
    :raises ValueError: if ``batch_size`` is smaller than 1
    :raises requests.HTTPError: if a batch still fails after all retries, or
        the endpoint answers with any other error status
    """
    if batch_size < 1:
        # A negative step would silently yield no batches and an empty result.
        raise ValueError("batch_size must be a positive integer")

    url = "https://query.wikidata.org/sparql"
    # NOTE(review): the contact address is a placeholder; replace it with a
    # real one before running against the public endpoint.
    headers = {
        "User-Agent": "BatchTMDbToWikidata/1.0 (your_email@example.com)"
    }
    default_delay = 5  # seconds to wait when the server gives no usable hint
    wikidata_ids = {}

    # Break TMDb IDs into batches
    for i in tqdm(range(0, len(tmdb_ids), batch_size)):
        batch = tmdb_ids[i:i+batch_size]
        # IDs are sent as quoted string literals in the VALUES clause.
        values_clause = " ".join(f'"{tmdb_id}"' for tmdb_id in batch)

        query = f"""
        SELECT ?item ?tmdb_id WHERE {{
          VALUES ?tmdb_id {{{values_clause}}}
          ?item wdt:P4947 ?tmdb_id .
        }}
        """
        params = {'query': query, 'format': 'json'}

        # Retry on rate limiting a bounded number of times instead of once,
        # so a second 429 does not surface as an obscure JSON decoding error.
        for attempt in range(max_retries + 1):
            response = requests.get(url, params=params, headers=headers, timeout=timeout)
            if response.status_code != 429 or attempt == max_retries:
                break
            print("Rate limited. Waiting before retrying...")
            # Prefer the server's Retry-After hint when it is a plain number
            # of seconds; otherwise fall back to the fixed delay.
            retry_after = response.headers.get("Retry-After", "")
            sleep(float(retry_after) if retry_after.isdigit() else default_delay)

        # Fail loudly on any remaining HTTP error (429 after retries, 5xx, ...).
        response.raise_for_status()

        data = response.json()
        for result in data['results']['bindings']:
            tmdb_id = result['tmdb_id']['value']
            # The item value is an entity URI; keep only its last path segment.
            wikidata_id = result['item']['value'].split("/")[-1]
            wikidata_ids[tmdb_id] = wikidata_id

    return wikidata_ids


100%|██████████| 2211/2211 [15:17<00:00,  2.41it/s]


In [None]:
# Look up the Wikidata ID for every TMDb ID in the dataset.
tmdb_ids = tmdb_data['id'].to_list()
wikidata_ids = get_wikidata_ids_batch(tmdb_ids=tmdb_ids)

In [108]:
# Rebuild the mapping with integer keys: the TMDb ID keys are converted with
# int() so they can be matched against the dataset's 'id' column; the
# Wikidata ID values are kept unchanged.
wikidata_ids_copy = {
    int(tmdb_key): wikidata_value
    for tmdb_key, wikidata_value in wikidata_ids.items()
}

In [110]:
# Add Wikidata IDs to TMDB data
# Rows whose TMDb ID has no entry in the mapping get NaN in the new column.
tmdb_data['wikidata_id'] = tmdb_data['id'].map(wikidata_ids_copy)
# Save the enriched table under a new file name, without the index column.
tmdb_data.to_csv(f'{data_path}/TMDB_movie_dataset_v12.csv', index=False)