In [8]:
import os
import logging

import aiohttp
import asyncio
import backoff
from tqdm.asyncio import tqdm_asyncio
from constants import API_KEY # import API_Key from another file

This code fetches only the collection title (name) and collection ID for each movie.
It can be extended to fetch more information if needed.

In [2]:
@backoff.on_exception(
    backoff.expo,  # Exponential backoff
    aiohttp.ClientResponseError,  # Retry on client response errors
    max_time=5,  # Maximum total wait time (5 seconds)
    giveup=lambda e: e.status != 429,  # Only retry if status code is 429 (Too Many Requests)
    factor=0.2
)
async def _fetch_movie_json(url: str, params: dict, session: aiohttp.ClientSession, semaphore: asyncio.Semaphore):
    """GET ``url`` and return ``(status, payload)``; ``payload`` is None unless the status is 200.

    A 429 response is raised as ``aiohttp.ClientResponseError`` so that the
    ``backoff`` decorator actually has an exception to retry on. Every other
    status is returned to the caller without raising.
    """
    # The semaphore is taken per attempt, so the slot is released while backoff sleeps.
    async with semaphore:
        async with session.get(url, params=params) as response:
            if response.status == 429:
                # session.get() does not raise on HTTP status by itself; without this
                # the retry decorator above would never trigger.
                response.raise_for_status()
            if response.status == 200:
                return response.status, await response.json()
            return response.status, None


async def get_collection_info(movie_id: int, session: aiohttp.ClientSession, semaphore: asyncio.Semaphore, logger: "logging.Logger | None" = None):
    """Fetch the collection a TMDB movie belongs to.

    Args:
        movie_id: TMDB movie id. ``None``/NaN ids are skipped without a request;
            integral floats (e.g. ``603.0`` coming from a float64 column) are
            converted to ``int`` before building the URL.
        session: Shared aiohttp session used to issue the request.
        semaphore: Limits how many requests are in flight at once.
        logger: Logger for diagnostics; defaults to this module's logger.

    Returns:
        ``(collection_name, collection_id)`` when the movie belongs to a
        collection, otherwise ``(None, None)`` (missing id, no collection,
        non-200 response, or 429 still failing after the retry budget).
    """
    if logger is None:
        logger = logging.getLogger(__name__)

    # NaN is the only float that is not equal to itself.
    if movie_id is None or (isinstance(movie_id, float) and movie_id != movie_id):
        logger.info(f"No TMDB id available ({movie_id}); skipping request")
        return None, None
    if isinstance(movie_id, float) and movie_id.is_integer():
        movie_id = int(movie_id)  # avoid URLs such as /movie/603.0

    url = f"https://api.themoviedb.org/3/movie/{movie_id}"
    params = {'api_key': API_KEY}

    try:
        status, data = await _fetch_movie_json(url, params, session, semaphore)
    except aiohttp.ClientResponseError as error:
        # Raised only for 429 once backoff has given up; keep the (None, None) contract
        # so one throttled movie does not abort the whole gather().
        logger.warning(f"Error: Unable to retrieve data for {movie_id}. Status Code: {error.status}")
        return None, None

    if status != 200:
        logger.warning(f"Error: Unable to retrieve data for {movie_id}. Status Code: {status}")
        return None, None

    collection = data.get('belongs_to_collection')
    if collection:
        return collection['name'], collection['id']

    logger.info(f"No collection found for {movie_id}")
    return None, None

# Example usage: look up many movies with a bounded number of concurrent requests
async def main(movie_ids, log_file_name, concurrent_requests: int = 10):
    """Fetch collection name and id for every movie id, logging to ``logs/<log_file_name>``.

    Args:
        movie_ids: Iterable of TMDB movie ids; results keep the same order.
        log_file_name: File name (inside the ``logs`` folder) for this run's log.
        concurrent_requests: Maximum number of requests in flight at once.

    Returns:
        ``(collection_names, collection_ids)`` — two lists aligned with
        ``movie_ids``; entries are ``None`` where no collection was obtained.

    Raises:
        ValueError: If ``concurrent_requests`` is smaller than 1.
        FileExistsError: If the log file already exists.
    """
    if concurrent_requests < 1:
        # A semaphore of 0 would block every request forever.
        raise ValueError("concurrent_requests must be at least 1")

    # create logs folder (no error if it is already there)
    os.makedirs('logs', exist_ok=True)
    log_file_path = f'logs/{log_file_name}'
    # check if the same file name exists, so an earlier run's log is never appended to
    if os.path.exists(log_file_path):
        raise FileExistsError(f"Log file {log_file_path} already exists. Please use a different file name.")
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s - %(levelname)s - %(message)s",
                        handlers=[logging.FileHandler(log_file_path)], # File handler to write logs to 'logs/*.log'
                        force=True  # replace handlers left over from a previous call
                        )
    logger = logging.getLogger(__name__)
    logger.info("Starting the main function ...")

    semaphore = asyncio.Semaphore(concurrent_requests)
    async with aiohttp.ClientSession() as session:
        tasks = [get_collection_info(movie_id, session, semaphore, logger) for movie_id in movie_ids]
        # gather() returns results in task order, with a progress bar
        results = await tqdm_asyncio.gather(*tasks)

    collection_name = [name for name, _ in results]
    collection_id = [identifier for _, identifier in results]
    return collection_name, collection_id

In [None]:
import pandas as pd

# Load the movie metadata table that already carries a `tmdb_id` column.
new_data = pd.read_csv(filepath_or_buffer='data/movie_metadata_with_tmdb_id.csv')

In [4]:
# Summarize column dtypes and non-null counts of the loaded metadata.
new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81740 entries, 0 to 81739
Data columns (total 10 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Wikipedia movie ID                         81740 non-null  int64  
 1   Freebase movie ID                          81740 non-null  object 
 2   Movie name                                 81740 non-null  object 
 3   Movie release date                         74838 non-null  object 
 4   Movie box office revenue                   8400 non-null   float64
 5   Movie runtime                              61290 non-null  float64
 6   Movie languages (Freebase ID:name tuples)  81740 non-null  object 
 7   Movie countries (Freebase ID:name tuples)  81740 non-null  object 
 8   Movie genres (Freebase ID:name tuples)     81740 non-null  object 
 9   tmdb_id                                    70215 non-null  float64
dtypes: float64(3), int64(1), object(6)

In [6]:
# Run the main function
movie_ids = new_data['tmdb_id']
collection_names, collection_ids = await main(movie_ids, 'get_collection_info.log')

100%|██████████| 81740/81740 [28:54<00:00, 47.12it/s]  


In [8]:
# Attach the fetched collection columns to the metadata frame, then persist it.
for column, values in (
    ('collection_name', collection_names),
    ('collection_id', collection_ids),
):
    new_data[column] = values
new_data.to_csv('data/movie_metadata_with_tmdb.csv', index=False)