In [1]:
import pandas as pd
import requests
import time
import os
from tqdm import tqdm

def get_person_data(person_id, api_key):
    url = f'https://api.themoviedb.org/3/person/{person_id}?api_key={api_key}'
    response = requests.get(url)
    data = response.json()
    return data

# Get current working directory
current_working_dir = os.getcwd()

# Construct the full path to your file
file_path = os.path.join(current_working_dir, "data", "movie_to_persons.csv")

# Load data
df = pd.read_csv(file_path)

# replace with your actual API key
api_key = '4ea37a8158ca566deb7455166dc1edda'

# Create a new DataFrame to store person data
df_person = pd.DataFrame()

# Only take unique person ids to avoid redundant requests
unique_person_ids = df['person_id'].unique()

for person_id in tqdm(unique_person_ids):
    data = get_person_data(person_id, api_key)
    
    # extract the needed information from the data
    row = {
        'id': data['id'], 
        'imdb_id': data.get('imdb_id', None),
        'name': data['name'],
        'gender': data['gender'],
        'birthday': data.get('birthday', None),
        'deathday': data.get('deathday', None),
        'profile_path': data.get('profile_path', None)
    }

    # Append the row to the DataFrame
    df_person = pd.concat([df_person, pd.DataFrame([row])], ignore_index=True)
    
    # Save the DataFrame to a csv file inside the loop
    df_person.to_csv('./data/person_data.csv', index=False)
    
    # time.sleep(0.3)  # delay to avoid hitting rate limit


 80%|███████▉  | 76852/96464 [3:38:43<55:48,  5.86it/s]  


ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))

In [None]:
# https://api.themoviedb.org/3/person/14435?api_key=4ea37a8158ca566deb7455166dc1edda

In [None]:
# # attempt at multi async runs
# import pandas as pd
# import asyncio
# import aiohttp
# import time
# import os
# from tqdm import tqdm

# # replace with your actual API key
# api_key = '4ea37a8158ca566deb7455166dc1edda'

# async def get_person_data(session, person_id):
#     url = f'https://api.themoviedb.org/3/person/{person_id}?api_key={api_key}'
#     async with session.get(url) as response:
#         return await response.json()

# async def main():
#     # Get current working directory
#     current_working_dir = os.getcwd()
#     print(current_working_dir)

#     # Construct the full path to your file
#     file_path = os.path.join(current_working_dir, 'code', "data", "movie_to_persons.csv")

#     # Load data
#     df = pd.read_csv(file_path)

#     # Create a new DataFrame to store person data
#     rows = []

#     # Only take unique person ids to avoid redundant requests
#     unique_person_ids = df['person_id'].unique()

#     async with aiohttp.ClientSession() as session:
#         tasks = []
#         for person_id in tqdm(unique_person_ids):
#             tasks.append(get_person_data(session, person_id))

#         persons_data = await asyncio.gather(*tasks, return_exceptions=True)

#         for data in persons_data:
#             if isinstance(data, Exception):
#                 print(f"Failed to get data: {data}")
#                 continue
                
#             # extract the needed information from the data
#             row = {
#                 'id': data['id'], 
#                 'imdb_id': data.get('imdb_id', None),
#                 'name': data['name'],
#                 'gender': data['gender'],
#                 'birthday': data.get('birthday', None),
#                 'deathday': data.get('deathday', None),
#                 'profile_path': data.get('profile_path', None)
#             }

#             # Append the row to the DataFrame
#             rows.append(row)

#     # Create a DataFrame from collected rows and save it to a CSV file
#     df_person = pd.DataFrame(rows)
#     df_person.to_csv('./code/data/person_data.csv', index=False)

# # Run the main function
# asyncio.run(main())


In [17]:
# Skip web scraping if data laready exists in person_data.csv
import pandas as pd
import requests
import time
import os
from tqdm import tqdm

def get_person_data(person_id, api_key):
    url = f'https://api.themoviedb.org/3/person/{person_id}?api_key={api_key}'
    response = requests.get(url)
    data = response.json()
    return data

# Get current working directory
current_working_dir = os.getcwd()

# Construct the full path to your files
file_path = os.path.join(current_working_dir, "data", "movie_to_persons.csv")
person_data_path = os.path.join(current_working_dir, "data", "person_data.csv")

# Load data
df = pd.read_csv(file_path)

# Load the person_data file if it exists
if os.path.isfile(person_data_path):
    df_person = pd.read_csv(person_data_path)
else:
    df_person = pd.DataFrame()

# replace with your actual API key
api_key = '4ea37a8158ca566deb7455166dc1edda'

# Only take unique person ids to avoid redundant requests
unique_person_ids = df['person_id'].unique()

# Get already processed IDs from person_data
already_processed_ids = df_person['id'].unique()

for person_id in tqdm(unique_person_ids):
    # If the id is already in the person_data.csv, skip this id
    if person_id in already_processed_ids:
        continue

    data = get_person_data(person_id, api_key)
    
    # extract the needed information from the data
    row = {
        'id': data['id'], 
        'imdb_id': data.get('imdb_id', None),
        'name': data['name'],
        'gender': data['gender'],
        'birthday': data.get('birthday', None),
        'deathday': data.get('deathday', None),
        'profile_path': data.get('profile_path', None)
    }

    # Append the row to the DataFrame
    df_person = pd.concat([df_person, pd.DataFrame([row])], ignore_index=True)
    

    # Save the DataFrame to a csv file inside the loop
df_person.to_csv(person_data_path, index=False)
    
    # time.sleep(0.1)  # delay to avoid hitting rate limit


 17%|█▋        | 16795/96464 [00:00<00:01, 54699.26it/s]