In [1]:
import http.client
import json
from settings import API_KEY
import time
from pandas.io.json import json_normalize
import pandas as pd
import  csv
import ast

## Funções

In [24]:
def get_url_credits(movie_id, api_key):
    return("/3/movie/" + str(movie_id) + "/credits" + "?api_key=" + str(api_key))

def get_json(conn, url):
	payload = "{}"
	conn.request("GET", url, payload)
	res = conn.getresponse()
	data = res.read()
	try:
		decoded = json.loads(data.decode("utf-8"))
	except ValueError:
		print("Decode Error")
		return(None,0)        
        
	header = res.getheader('Retry-After')
	return(decoded, header)

In [26]:
def get_movie_credits(ids):
    
	conn = http.client.HTTPSConnection("api.themoviedb.org")    
	movie_credits_df = pd.DataFrame()
    
	for idx,movie_id in enumerate(ids,1):
		url = get_url_credits(movie_id, API_KEY)
		movie_data, header = get_json(conn, url)
        
		if idx % 1000 == 1:
			print("Iteration: ", idx, "Movie ID: ", movie_id)
            
		if movie_data == None:
			print("%s - Movie data == None" % time.strftime("%H:%M",time.localtime()))
        
		if ('status_code' in movie_data):
			time.sleep(int(header)+1)  
			movie_data, header = get_json(conn, url)        
        
		df = json_normalize(movie_data)
		movie_credits_df = movie_credits_df.append(df)

	return movie_credits_df

In [95]:
def get_cast(ids, movie_credits_df):

    movie_cast_df = pd.DataFrame()
    movie_crew_df = pd.DataFrame()    

    for idx,movie_id in enumerate(ids,1):
        movie_item = pd.DataFrame()
        
        if idx % 1000 == 1:
            print("Iteration: ", idx, "Movie_id: ", movie_id)
            
        movie_item = movie_credits_df[movie_credits_df['id']==movie_id]#.reset_index()
        
        for cast_id in movie_item['cast'][0]:
            if cast_id:                        
                try:
                    cast_id.update({'movie_id': int(movie_id)})
                    cast_id.pop('profile_path', None)            
                    movie_cast_df = movie_cast_df.append(cast_id, ignore_index=True)
                except AttributeError:
                    print("Cast ERROR in Movie_id: ", movie_id)
                    print("")
                    print(cast_id)
                    print("")
                    break
                    
        for crew_id in movie_item['crew'][0]:
            if crew_id: 
                try:
                    crew_id.update({'movie_id': int(movie_id)})
                    crew_id.pop('profile_path', None)            
                    movie_crew_df = movie_crew_df.append(crew_id, ignore_index=True)
                except AttributeError:
                    print("Crew ERROR in Movie_id: ", movie_id)
                    print("")
                    print(crew_id)
                    print("")
                    break
    
    return(movie_cast_df, movie_crew_df)

## Testes

In [13]:
#with open('out_sample.csv', 'r') as f:
#with open('out_all.csv', 'r') as f:   
with open('../data/us/out.csv', 'r') as f:
    reader = csv.reader(f, skipinitialspace=True, delimiter=',')
    ids_list = list(reader)
    
ids = [item for sublist in ids_list for item in sublist]

In [14]:
len(ids)

65347

In [15]:
# Transformando ids em numérico
ids_num = list(map(int, ids))

In [28]:
start_time = time.time()
print("Starting time: %s" % time.strftime("%H:%M", time.localtime(start_time)))
print("")
movie_credits_df = get_movie_credits(ids_num)
print("--- Movie Credits: %s ---" % (time.time() - start_time))
print("")
print("Ending time: %s" % time.strftime("%H:%M", time.localtime()))
print("")

Starting time: 20:55

Iteration:  1 Movie ID:  98
Iteration:  1001 Movie ID:  115258
Iteration:  2001 Movie ID:  12103
Iteration:  3001 Movie ID:  298977
Iteration:  4001 Movie ID:  49941
Iteration:  5001 Movie ID:  56744
Iteration:  6001 Movie ID:  275773
Iteration:  7001 Movie ID:  40113
Iteration:  8001 Movie ID:  383968
Iteration:  9001 Movie ID:  245052
Iteration:  10001 Movie ID:  43655
Iteration:  11001 Movie ID:  330311
Iteration:  12001 Movie ID:  82607
Iteration:  13001 Movie ID:  108864
Iteration:  14001 Movie ID:  226876
Iteration:  15001 Movie ID:  458448
Iteration:  16001 Movie ID:  46247
Iteration:  17001 Movie ID:  43764
Iteration:  18001 Movie ID:  129904
Iteration:  19001 Movie ID:  438979
Iteration:  20001 Movie ID:  22057
Iteration:  21001 Movie ID:  119607
Iteration:  22001 Movie ID:  442075
Iteration:  23001 Movie ID:  272704
Iteration:  24001 Movie ID:  20313
Iteration:  25001 Movie ID:  18034
Iteration:  26001 Movie ID:  138477
Iteration:  27001 Movie ID:  12484

In [29]:
movie_credits_df.head()

Unnamed: 0,cast,crew,id
0,"[{'gender': 2, 'character': 'Maximus', 'id': 9...","[{'gender': 2, 'id': 578, 'name': 'Ridley Scot...",98
0,"[{'gender': 2, 'character': 'Benjamin Martin',...","[{'gender': 2, 'id': 6046, 'name': 'Roland Emm...",2024
0,"[{'gender': 1, 'character': 'Drew Decker', 'id...","[{'gender': 2, 'id': 35689, 'name': 'Keenen Iv...",4247
0,"[{'gender': 2, 'character': 'Leonard', 'id': 5...","[{'gender': 2, 'id': 525, 'name': 'Christopher...",77
0,"[{'gender': 2, 'character': 'João Grilo', 'id'...","[{'gender': 2, 'id': 102560, 'name': 'Guel Arr...",40096


In [30]:
movie_credits_df.shape

(65347, 3)

In [31]:
movie_credits_df['id'].isnull().sum()

0

In [32]:
movie_credits_df['id'].duplicated().sum()

1569

In [35]:
pd.Series(ids_num).duplicated().sum()

1569

In [107]:
movie_credits_df_dedup = movie_credits_df.drop_duplicates('id', keep='first')

In [108]:
movie_credits_df_dedup.shape

(63778, 3)

In [50]:
#ids_num_sample = ids_num[0:250]
#movie_credits_df_sample = movie_credits_df.head(250)

In [109]:
start_time = time.time()
print("Starting time: %s" % time.strftime("%H:%M", time.localtime(start_time)))
print("")
#movie_cast_df, movie_crew_df = get_cast(ids_num_sample,movie_credits_df_sample)
#movie_cast_df, movie_crew_df = get_cast(ids_num,movie_credits_df)
movie_cast_df, movie_crew_df = get_cast(ids_num,movie_credits_df_dedup)
print("--- Movie Cast/Crew: %s ---" % (time.time() - start_time))
print("")
print("Ending time: %s" % time.strftime("%H:%M", time.localtime()))
print("")

Starting time: 09:25

Iteration:  1 Movie_id:  98
Iteration:  1001 Movie_id:  115258
Iteration:  2001 Movie_id:  12103
Iteration:  3001 Movie_id:  298977
Iteration:  4001 Movie_id:  49941
Iteration:  5001 Movie_id:  56744
Iteration:  6001 Movie_id:  275773
Iteration:  7001 Movie_id:  40113
Iteration:  8001 Movie_id:  383968
Iteration:  9001 Movie_id:  245052
Iteration:  10001 Movie_id:  43655
Iteration:  11001 Movie_id:  330311
Iteration:  12001 Movie_id:  82607
Iteration:  13001 Movie_id:  108864
Iteration:  14001 Movie_id:  226876
Iteration:  15001 Movie_id:  458448
Iteration:  16001 Movie_id:  46247
Iteration:  17001 Movie_id:  43764
Iteration:  18001 Movie_id:  129904
Iteration:  19001 Movie_id:  438979
Iteration:  20001 Movie_id:  22057
Iteration:  21001 Movie_id:  119607
Iteration:  22001 Movie_id:  442075
Iteration:  23001 Movie_id:  272704
Iteration:  24001 Movie_id:  20313
Iteration:  25001 Movie_id:  18034
Iteration:  26001 Movie_id:  138477
Iteration:  27001 Movie_id:  12484

In [110]:
movie_cast_df.head()

Unnamed: 0,cast_id,character,credit_id,gender,id,movie_id,name,order
0,8.0,Maximus,52fe4217c3a36847f8003435,2.0,934.0,98.0,Russell Crowe,0.0
1,9.0,Commodus,52fe4217c3a36847f8003439,2.0,73421.0,98.0,Joaquin Phoenix,1.0
2,10.0,Lucilla,52fe4217c3a36847f800343d,1.0,935.0,98.0,Connie Nielsen,2.0
3,11.0,Proximo,52fe4217c3a36847f8003441,2.0,936.0,98.0,Oliver Reed,3.0
4,12.0,Marcus Aurelius,52fe4217c3a36847f8003445,2.0,194.0,98.0,Richard Harris,4.0


In [111]:
movie_crew_df.head()

Unnamed: 0,credit_id,department,gender,id,job,movie_id,name
0,52fe4217c3a36847f800340d,Directing,2.0,578.0,Director,98.0,Ridley Scott
1,52fe4217c3a36847f8003413,Production,2.0,929.0,Producer,98.0,David Franzoni
2,52fe4217c3a36847f8003419,Production,2.0,930.0,Producer,98.0,Branko Lustig
3,52fe4217c3a36847f800341f,Production,2.0,931.0,Producer,98.0,Douglas Wick
4,52fe4217c3a36847f8003425,Writing,2.0,929.0,Screenplay,98.0,David Franzoni


In [113]:
movie_cast_df.to_csv('../data/movie_cast_df.csv')
movie_crew_df.to_csv('../data/movie_crew_df.csv')