In [19]:
import ast
import os

import pandas as pd
import requests

from ada_config.config import CONFIG

In [None]:
# Read the enriched movie table from the location configured under
# CONFIG["data_path"] and preview its first rows.
movie_data_path = CONFIG["data_path"] / "enrich_movie_data.csv"
movie_data = pd.read_csv(movie_data_path)
movie_data.head()

In [None]:
from tqdm import tqdm
import time
import pandas as pd

# Credentials and endpoint for the credits request. The key is taken from the
# TMDB_API_KEY environment variable so a real secret never has to be saved in
# the notebook; the original placeholder value remains the fallback.
API_KEY = os.environ.get('TMDB_API_KEY', 'api_key')
BASE_URL = 'https://api.themoviedb.org/3/movie/{movie_id}/credits'

# Ids to query: rows without a 'tmdb_id' are dropped, the rest are cast to int.
movie_ids = movie_data['tmdb_id'].dropna().astype(int).tolist()

# Filled by the download loop with one record per successfully fetched movie.
crew_cast_data = []

# Batch size and rate limit configuration
BATCH_SIZE = 100
DELAY = 0.1

def fetch_movie_credits(movie_id, max_retries=5, timeout=30):
    """Fetch the credits payload of one movie from the endpoint in ``BASE_URL``.

    Args:
        movie_id: Identifier substituted into ``BASE_URL``.
        max_retries: Number of extra attempts made after an HTTP 429
            (rate limited) response before giving up.
        timeout: Seconds to wait on the server before the request is aborted.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (a message is printed describing the failure).
    """
    url = BASE_URL.format(movie_id=movie_id)
    params = {'api_key': API_KEY}
    # A bounded loop replaces the original self-recursion, so an API that keeps
    # answering 429 can no longer grow the call stack without limit.
    for attempt in range(max_retries + 1):
        try:
            # Without a timeout a stalled connection would block the download forever.
            response = requests.get(url, params=params, timeout=timeout)
        except requests.RequestException as error:
            # One network failure should cost a single movie, not the whole run.
            print(f"Failed to fetch data for movie ID {movie_id}: {error}")
            return None
        if response.status_code == 200:
            return response.json()
        if response.status_code != 429 or attempt == max_retries:
            break
        print("Rate limited. Waiting before retrying...")
        time.sleep(10)
    print(f"Failed to fetch data for movie ID {movie_id}.")
    return None

# Resume controls: last_index is the position in movie_ids where this run
# starts, and try_num is embedded in the name of the CSV this run writes.
last_index = 0
try_num = 0

# Checkpoint / pause cadence, counted in batches processed during this run.
SAVE_EVERY_BATCHES = 50
PAUSE_EVERY_BATCHES = 10
PAUSE_SECONDS = 60

output_file = f'crew_cast_data{try_num}.csv'
batch_starts = range(last_index, len(movie_ids), BATCH_SIZE)
for batch_number, i in enumerate(tqdm(batch_starts)):
    batch_ids = movie_ids[i:i + BATCH_SIZE]
    for movie_id in batch_ids:
        data = fetch_movie_credits(movie_id)
        if data:
            crew_cast_data.append({
                'movie_id': movie_id,
                'cast': data.get('cast', []),
                'crew': data.get('crew', [])
            })
        # Apply the configured per-request delay (it was defined but never used).
        time.sleep(DELAY)
    # The cadence is keyed on the batch counter rather than on the absolute
    # index i: with a last_index that is not a multiple of the batch size the
    # original modulo checks on i never matched, so a resumed run neither
    # checkpointed nor paused.
    if batch_number % SAVE_EVERY_BATCHES == 0:
        pd.DataFrame(crew_cast_data).to_csv(output_file, index=False)
        print(f"Saved batch {i // BATCH_SIZE + 1}")
    if batch_number % PAUSE_EVERY_BATCHES == 0:
        time.sleep(PAUSE_SECONDS)

# Final write so records gathered after the last checkpoint are not lost.
pd.DataFrame(crew_cast_data).to_csv(output_file, index=False)


In [None]:
# Concatenate the partial download files into a single CSV.
# NOTE(review): the input names and the relative output path are hard-coded;
# confirm they match the files the download runs actually produced and that
# the merged file ends up where later cells read it from.
file_names = ['crew_cast_data.csv', 'crew_cast_data2.csv', 'crew_cast_data3.csv', 'crew_cast_data4.csv']
partial_frames = list(map(pd.read_csv, file_names))
crew_cast_data = pd.concat(partial_frames, ignore_index=True)
crew_cast_data.to_csv('crew_cast_data_final.csv', index=False)

In [21]:
# Load the merged cast/crew table from the location configured under
# CONFIG['tmdb_path'].
crew_cast_path = CONFIG['tmdb_path'] / 'crew_cast_data_final.csv'
crew_cast_data = pd.read_csv(crew_cast_path)

In [36]:

def get_stars(casts, k=5):
    """Return the ``k`` highest cast popularity scores, padded with ``None``.

    Args:
        casts: String containing a Python-literal list of dicts, each with a
            'popularity' entry.
        k: Number of scores to return.

    Returns:
        A list of length ``k`` (empty for ``k <= 0``) holding the popularity
        values in descending order, followed by ``None`` for every missing
        position when fewer than ``k`` cast members are present.

    Raises:
        ValueError, SyntaxError: If ``casts`` is not a valid Python literal.
    """
    # ast.literal_eval only builds literals, so text read from a file can no
    # longer execute arbitrary code the way eval() allowed.
    members = ast.literal_eval(casts)
    ranked = sorted((member['popularity'] for member in members), reverse=True)
    top_scores = ranked[:max(k, 0)]
    return top_scores + [None] * (k - len(top_scores))

# Expand the per-movie top-5 cast popularity lists into one column per rank.
star_columns = ['star_1', 'star_2', 'star_3', 'star_4', 'star_5']
star_scores = crew_cast_data['cast'].apply(get_stars, k=5)
crew_cast_data[star_columns] = star_scores.apply(pd.Series)

In [44]:
def get_best_crew(crew):
    """Return the highest popularity among directors, writers and producers.

    Args:
        crew: String containing a Python-literal list of dicts, each with
            'job' and 'popularity' entries.

    Returns:
        A ``(director, writer, producer)`` tuple with the largest popularity
        found for the jobs 'Director', 'Writer' and 'Producer' respectively;
        a position is ``None`` when no member with a popularity value holds
        that job.

    Raises:
        ValueError, SyntaxError: If ``crew`` is not a valid Python literal.
    """
    # ast.literal_eval only builds literals, so text read from a file can no
    # longer execute arbitrary code the way eval() allowed.
    members = ast.literal_eval(crew)
    best = {'Director': None, 'Writer': None, 'Producer': None}
    for member in members:
        job = member['job']
        if job not in best:
            continue
        score = member['popularity']
        if score is None:
            continue
        # Keep the maximum: the original overwrote on every match and thus
        # reported whichever member happened to be listed last.
        if best[job] is None or score > best[job]:
            best[job] = score
    return best['Director'], best['Writer'], best['Producer']

# Expand the per-movie (director, writer, producer) tuples into three columns.
crew_columns = ['Director', 'Writer', 'Producer']
crew_scores = crew_cast_data['crew'].apply(get_best_crew)
crew_cast_data[crew_columns] = crew_scores.apply(pd.Series)

In [49]:
# Remove the raw 'cast'/'crew' columns and save the derived features.
# Only columns that are still present are dropped and the result is rebound
# instead of mutated in place, so re-running this cell no longer raises a
# KeyError once the columns are gone.
raw_columns = [column for column in ('cast', 'crew') if column in crew_cast_data.columns]
crew_cast_data = crew_cast_data.drop(columns=raw_columns)
crew_cast_data.to_csv(CONFIG['tmdb_path'] / 'crew_cast_best.csv', index=False)