In [1]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

In [19]:
def extract_names(data):
    """Flatten a decoded JSON value into something that fits a table cell.

    Parameters
    ----------
    data : Any
        A value taken from a decoded JSON payload.

    Returns
    -------
    Any
        * ``'N/A'`` when ``data`` is ``None``.
        * For a list: the ``'name'`` values of its dict items that have that
          key, joined with ``', '`` (an empty string if none qualify).
        * For a dict: its ``'name'`` value, or ``'N/A'`` if the key is absent.
        * Any other value is returned unchanged.
    """
    # Missing values become an explicit placeholder.
    if data is None:
        return 'N/A'

    if isinstance(data, list):
        # Only dict items can carry a 'name'. Checking the type first keeps
        # scalar items (ints, None, strings) from raising TypeError.
        return ', '.join(
            item['name']
            for item in data
            if isinstance(item, dict) and 'name' in item
        )

    if isinstance(data, dict):
        return data.get('name', 'N/A')

    # Scalars (numbers, strings, booleans) pass through untouched.
    return data

def get_details(m_id):
    """Fetch one movie's details from the TMDB API and flatten nested fields.

    Parameters
    ----------
    m_id : int or str
        TMDB movie ID; interpolated into the request URL.

    Returns
    -------
    dict
        The decoded JSON object with every value passed through
        ``extract_names``.

    Raises
    ------
    RuntimeError
        If the ``TMDB_API_TOKEN`` environment variable is not set.
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    """
    # The bearer token is read from the environment so the secret never
    # lives in the notebook source or its history.
    token = os.environ.get("TMDB_API_TOKEN")
    if not token:
        raise RuntimeError("TMDB_API_TOKEN environment variable is not set")

    url = f"https://api.themoviedb.org/3/movie/{m_id}?language=en-US"
    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {token}",
    }

    # A timeout keeps one stalled connection from hanging a long fetch loop.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on HTTP errors instead of flattening the API's error
    # payload and returning it as if it were a movie record.
    response.raise_for_status()

    return {key: extract_names(value) for key, value in response.json().items()}

def get_credit(m_id):
    """Fetch one movie's credits from the TMDB API and summarise them as one row.

    Parameters
    ----------
    m_id : int or str
        TMDB movie ID; interpolated into the request URL.

    Returns
    -------
    pandas.DataFrame or None
        A one-row frame with columns ``casts``, ``director``,
        ``director_of_photography``, ``producer``, ``writer`` and
        ``music_composer``. Each cell is a list of names, or the string
        ``'N/A'`` when the corresponding cast/crew list is empty (or, for
        ``music_composer``, when no matching job is found).
        ``None`` is returned when the payload has no ``cast`` or no ``crew``
        key.

    Raises
    ------
    RuntimeError
        If the ``TMDB_API_TOKEN`` environment variable is not set.
    """
    # The bearer token is read from the environment so the secret never
    # lives in the notebook source or its history.
    token = os.environ.get("TMDB_API_TOKEN")
    if not token:
        raise RuntimeError("TMDB_API_TOKEN environment variable is not set")

    url = f"https://api.themoviedb.org/3/movie/{m_id}/credits?language=en-US"
    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {token}",
    }

    # A timeout keeps one stalled connection from hanging a long fetch loop.
    response = requests.get(url, headers=headers, timeout=30)
    data = response.json()

    # Payloads without both sections cannot be summarised; callers treat
    # None as "no credits for this ID".
    if 'cast' not in data or 'crew' not in data:
        return None

    cast_df = pd.DataFrame(data['cast'])
    if cast_df.empty:
        casts = 'N/A'
    else:
        # Billing order first, then at most the first ten distinct names.
        ordered_cast = cast_df.sort_values(by='order')
        casts = ordered_cast['name'].drop_duplicates().to_list()[:10]

    crew_df = pd.DataFrame(data['crew'])
    if crew_df.empty:
        director = dop = producer = writers = music_composer = 'N/A'
    else:
        job = crew_df['job']
        director = _unique_names(crew_df, job == 'Director')
        dop = _unique_names(crew_df, job == 'Director of Photography')
        producer = _unique_names(crew_df, job == 'Producer')
        writers = _unique_names(crew_df, crew_df['department'] == 'Writing')

        # Composer credit: prefer the most specific job title and fall back
        # to looser ones; 'N/A' when none of them appears.
        music_composer = 'N/A'
        for job_title in ('Original Music Composer', 'Music', 'Sound'):
            names = _unique_names(crew_df, job == job_title)
            if names:
                music_composer = names
                break

    mv_credits = pd.DataFrame(
        [[casts, director, dop, producer, writers, music_composer]],
        columns=['casts', 'director', 'director_of_photography', 'producer', 'writer', 'music_composer'],
    )
    return mv_credits


def _unique_names(people, mask):
    """Return the distinct ``'name'`` values of the rows selected by ``mask``, in row order."""
    return people.loc[mask, 'name'].drop_duplicates().to_list()
    

In [24]:
from tqdm import tqdm


In [26]:
# Movie IDs to fetch in this batch.
temp_list = [
    17048, 301739, 392817, 473647, 573245, 705916, 708736, 709008,
    709811, 805561, 854643, 865395, 884771, 887550, 889886, 1049324,
    1060666, 1134214, 1183566, 1192412, 1223746, 1226125, 1236453, 1237326,
    1244144, 1245311, 1246063, 1246578, 1246749, 1248433, 1250069, 1253599,
    1254236, 1254614, 1257084, 1257716, 1257754, 522966, 534048, 545745,
    1180314, 1208668,
]

# Collect one frame per movie and concatenate once after the loop; calling
# pd.concat on the running result inside the loop re-copies every earlier row.
details_frames = []
credits_frames = []

for m_id in tqdm(temp_list, total=len(temp_list)):
    try:
        details = get_details(m_id)
        mv_credits_df = get_credit(m_id)
        details_df = pd.DataFrame([details])
    except Exception as e:
        # Report and skip this ID so one failure does not abort the batch.
        print(f"Error processing ID {m_id}: {e}")
        continue
    details_frames.append(details_df)
    # get_credit may return None; pd.concat would drop it anyway.
    if mv_credits_df is not None:
        credits_frames.append(mv_credits_df)

# pd.concat raises on an empty list, so fall back to an empty frame.
df1 = pd.concat(details_frames, axis=0) if details_frames else pd.DataFrame()
df2 = pd.concat(credits_frames, axis=0) if credits_frames else pd.DataFrame()

df1.to_csv('data/tmdb_divided/tmdb_movies_details_7.csv')
df2.to_csv('data/tmdb_divided/tmdb_movies_casts_7.csv')

100%|██████████| 42/42 [00:14<00:00,  2.97it/s]


In [6]:
# Load the local TMDB movie table from CSV into `df`.
df = pd.read_csv('data/TMDB_all_movies.csv')

In [7]:
# Keep the `id` column; the fetch loops further down iterate over it.
tmdb_id_list = df['id']

In [43]:
# Number of IDs available to fetch.
len(tmdb_id_list)

904594

In [5]:
# Scratch cell: step through the credit extraction for a single movie so the
# intermediate frames can be inspected.
m_id = 27
url = f"https://api.themoviedb.org/3/movie/{m_id}/credits?language=en-US"

headers = {
    "accept": "application/json",
    # Token comes from the environment so the secret is not stored in the notebook.
    "Authorization": f"Bearer {os.environ['TMDB_API_TOKEN']}",
}

response = requests.get(url, headers=headers, timeout=30)
data = response.json()

cast_df = pd.DataFrame(data['cast'])
if cast_df.size != 0:
    # First ten names in billing order.
    casts = cast_df.sort_values(by='order')
    casts = casts['name'].to_list()[:10]
else:
    casts = 'N/A'

crew_df = pd.DataFrame(data['crew'])
if crew_df.size != 0:
    director = crew_df[crew_df['job'] == 'Director']['name'].to_list()
    dop = crew_df[crew_df['job'] == 'Director of Photography']['name'].to_list()
    producer = crew_df[crew_df['job'] == 'Producer']['name'].to_list()
    writers = crew_df[crew_df['department'] == 'Writing']['name'].to_list()
    if crew_df[crew_df['job'] == 'Original Music Composer'].size == 0:
        if crew_df[crew_df['job'] == 'Music'].size != 0:
            music_composer = crew_df[crew_df['job'] == 'Music']['name'].to_list()
        elif crew_df[crew_df['job'] == 'Sound'].size != 0:
            music_composer = crew_df[crew_df['job'] == 'Sound']['name'].to_list()
        else:
            # No composer-like job found: show the crew for inspection, and
            # still bind the name so a stale value from an earlier run is
            # never picked up.
            print(crew_df)
            music_composer = 'N/A'
    else:
        music_composer = crew_df[crew_df['job'] == 'Original Music Composer']['name'].to_list()
else:
    director = 'N/A'
    dop = 'N/A'
    producer = 'N/A'
    writers = 'N/A'
    music_composer = 'N/A'
crew_df


Unnamed: 0,adult,gender,id,known_for_department,name,original_name,popularity,profile_path,credit_id,department,job
0,False,2,172,Directing,Michael Winterbottom,Michael Winterbottom,9.979,/r4cpuukDMG3xLS8tEaTV7X2IxnO.jpg,52fe4210c3a36847f8001217,Directing,Director
1,False,2,172,Directing,Michael Winterbottom,Michael Winterbottom,9.979,/r4cpuukDMG3xLS8tEaTV7X2IxnO.jpg,52fe4210c3a36847f8001243,Editing,Editor
2,False,2,172,Directing,Michael Winterbottom,Michael Winterbottom,9.979,/r4cpuukDMG3xLS8tEaTV7X2IxnO.jpg,52fe4210c3a36847f800121d,Production,Producer
3,False,2,172,Directing,Michael Winterbottom,Michael Winterbottom,9.979,/r4cpuukDMG3xLS8tEaTV7X2IxnO.jpg,52fe4210c3a36847f8001229,Writing,Author
4,False,2,173,Production,Andrew Eaton,Andrew Eaton,7.168,/8NLEP2n3lvdvXCeWqeuopjPix9B.jpg,52fe4210c3a36847f8001223,Production,Producer
5,False,2,179,Camera,Marcel Zyskind,Marcel Zyskind,5.014,,52fe4210c3a36847f8001237,Camera,Director of Photography
6,False,2,182,Directing,Mat Whitecross,Mat Whitecross,5.573,/kuo7QsaWWrO4CfV5BOPKUsFoKgs.jpg,52fe4210c3a36847f800123d,Editing,Editor
7,False,0,3697,Production,Steve Daly,Steve Daly,2.766,,52fe4210c3a36847f8001261,Production,Casting
8,False,0,3698,Production,Julie Dunne,Julie Dunne,2.937,,52fe4210c3a36847f8001267,Production,Casting


In [4]:
# Spot-check get_details on a single movie ID.
get_details(27)

{'adult': False,
 'backdrop_path': '/qU7tNIMpRqkizIObXfkJY3haTqh.jpg',
 'belongs_to_collection': 'N/A',
 'budget': 1000000,
 'genres': 'Drama, Music, Romance',
 'homepage': '',
 'id': 27,
 'imdb_id': 'tt0411705',
 'original_language': 'en',
 'original_title': '9 Songs',
 'overview': "Matt, a young glaciologist, soars across the vast, silent, icebound immensities of the South Pole as he recalls his love affair with Lisa. They meet at a mobbed rock concert in a vast music hall - London's Brixton Academy. They are in bed at night's end. Together, over a period of several months, they pursue a mutual sexual passion whose inevitable stages unfold in counterpoint to nine live-concert songs.",
 'popularity': 31.126,
 'poster_path': '/91O7z0vo7MiNWd5xD2BoivwbQsb.jpg',
 'production_companies': 'Revolution Films',
 'production_countries': 'United Kingdom',
 'release_date': '2004-07-16',
 'revenue': 1574623,
 'runtime': 69,
 'spoken_languages': 'English',
 'status': 'Released',
 'tagline': '2 lov

In [8]:
# Detail fields kept, in output order, when building the combined table.
out_columns = [
    'id',
    'title',
    'budget',
    'genres',
    'imdb_id',
    'overview',
    'popularity',
    'poster_path',
    'production_companies',
    'production_countries',
    'release_date',
    'revenue',
    'runtime',
    'status',
    'tagline',
    'vote_average',
    'vote_count',
]


In [None]:
!pip install tqdm

In [None]:
from tqdm import tqdm
import pandas as pd

# Assuming get_details and get_credit are defined elsewhere
# and tmdb_id_list is a list of movie IDs

# Collect one credits frame per movie and concatenate once at the end; calling
# pd.concat on the running result inside the loop re-copies every earlier row.
credit_frames = []

try:
    for m_id in tqdm(tmdb_id_list, total=len(tmdb_id_list)):
        try:
            mv_credits_df = get_credit(m_id)
        except Exception as e:
            # Report and move on so one bad ID does not abort the whole run.
            print(f"Error processing ID {m_id}: {e}")
            continue
        # get_credit may return None; pd.concat would drop it anyway.
        if mv_credits_df is not None:
            credit_frames.append(mv_credits_df)
finally:
    # Build the result even if the run is interrupted, so partial progress
    # is still available in `final_df`.
    final_df = pd.concat(credit_frames, axis=0) if credit_frames else pd.DataFrame()

In [9]:
# Build one combined (details + credits) row per movie and concatenate once at
# the end instead of growing `final_df` inside the loop.
movie_frames = []

try:
    for m_id in tmdb_id_list:
        try:
            details = get_details(m_id)
            mv_credits_df = get_credit(m_id)
            details_df = pd.DataFrame([details])[out_columns]
            curr_mv_df = pd.concat([details_df, mv_credits_df], axis=1)
        except Exception as e:
            # Stop at the first failing ID, but say why it failed. Catching
            # Exception (not a bare except) lets Ctrl-C interrupt the run.
            print(f"Error processing ID {m_id}: {e}")
            break
        movie_frames.append(curr_mv_df)
finally:
    # Keep whatever was fetched, even after a failure or an interrupt.
    final_df = pd.concat(movie_frames, axis=0) if movie_frames else pd.DataFrame()



48797


In [46]:
# Display the combined details + credits table.
final_df

Unnamed: 0,id,title,budget,genres,imdb_id,overview,popularity,poster_path,production_companies,production_countries,...,status,tagline,vote_average,vote_count,casts,director,director_of_photography,producer,writer,music_composer
0,2,Ariel,0,"Drama, Comedy, Romance, Crime",tt0094675,After the coal mine he works at closes and his...,9.625,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,Villealfa Filmproductions,Finland,...,Released,,7.092,305,"[Turo Pajala, Susanna Haavisto, Matti Pellonpä...",[Aki Kaurismäki],[Timo Salminen],[Aki Kaurismäki],[Aki Kaurismäki],"[Melrose, Rauli Somerjoki, Olavi Virta, Taisto..."
0,3,Shadows in Paradise,0,"Drama, Comedy, Romance",tt0092149,"Nikander, a rubbish collector and would-be ent...",9.372,/nj01hspawPof0mJmlgfjuLyJuRN.jpg,Villealfa Filmproductions,Finland,...,Released,,7.287,345,"[Matti Pellonpää, Kati Outinen, Sakari Kuosman...",[Aki Kaurismäki],[Timo Salminen],[Mika Kaurismäki],[Aki Kaurismäki],[Jouko Lumme]
0,5,Four Rooms,4000000,Comedy,tt0113101,It's Ted the Bellhop's first night on the job....,18.32,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg,"Miramax, A Band Apart",United States of America,...,Released,Twelve outrageous guests. Four scandalous requ...,5.82,2535,"[Tim Roth, Jennifer Beals, Antonio Banderas, V...","[Allison Anders, Alexandre Rockwell, Robert Ro...","[Guillermo Navarro, Andrzej Sekula, Phil Parme...",[Lawrence Bender],"[Allison Anders, Alexandre Rockwell, Robert Ro...",[Combustible Edison]
0,6,Judgment Night,21000000,"Action, Crime, Thriller",tt0107286,"Four young friends, while taking a shortcut en...",14.115,/3rvvpS9YPM5HB2f4HYiNiJVtdam.jpg,"Largo Entertainment, JVC, Universal Pictures",United States of America,...,Released,Don't move. Don't whisper. Don't even breathe.,6.525,317,"[Emilio Estevez, Cuba Gooding Jr., Denis Leary...",[Stephen Hopkins],[Peter Levy],[Gene Levy],"[Lewis Colick, Jere Cunningham]",[Alan Silvestri]


In [116]:
# Inspect the raw credits payload for one movie ID.
url = "https://api.themoviedb.org/3/movie/2203/credits?language=en-US"

headers = {
    "accept": "application/json",
    # Token comes from the environment so the secret is not stored in the notebook.
    "Authorization": f"Bearer {os.environ['TMDB_API_TOKEN']}",
}

response = requests.get(url, headers=headers, timeout=30)
data = response.json()
data
# crew_df = pd.DataFrame(data['crew'])
# crew_df
# crew_df[crew_df['department'] == 'Sound']
# if crew_df[crew_df['job'] == 'Original Music Composer'].size == 0:
#     music_composer = crew_df[crew_df['job'] == 'Music']
# else:
#     music_composer = crew_df[crew_df['job'] == 'Original Music Composer']
# if music_composer.size == 0:
#     print(m_id)

{'id': 2203, 'cast': [], 'crew': []}

In [None]:
# NOTE(review): bare integer literals with no assignment — presumably movie IDs
# jotted down while debugging; confirm their purpose or remove this cell.
3
27
80
92
94
125
132
133
136
138
139
145

In [72]:
# Flatten every field of `data` in place via extract_names.
for key in list(data):
    data[key] = extract_names(data[key])

In [73]:
# Show the flattened record as a one-row DataFrame.
pd.DataFrame([data])

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,The Avengers Collection,220000000,"Science Fiction, Action, Adventure",https://www.marvel.com/movies/the-avengers,24428,tt0848228,en,The Avengers,...,2012-04-25,1518815515,143,"English, हिन्दी, Pусский",Released,Some assembly required.,The Avengers,False,7.713,29810
