In [None]:
from tmdbv3api import TMDb, Movie, Discover, TV, Person, Season
from tmdbv3api import Account
from tmdbv3api import Authentication

##Import the rest of required libraries
import time, random
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set(palette='icefire') # set the default Seaborn style for graphics

## Own functions
import functions_1115 as func

##TMDB details
##Account details for getting movie recommendations
##Run this block only once during a session
import os

## Credentials are read from the environment so that no secret is stored in
## the notebook. Set TMDB_API_KEY, TMDB_USERNAME and TMDB_PASSWORD before
## starting the session; a missing variable raises KeyError on the line that
## reads it.
tmdb = TMDb()
tmdb.api_key = os.environ['TMDB_API_KEY']
tmdb.language = 'en'

USERNAME = os.environ['TMDB_USERNAME']
PASSWORD = os.environ['TMDB_PASSWORD']
auth = Authentication(username=USERNAME, password=PASSWORD)
account = Account()
details = account.details()
movie = Movie()
## discover_2_csv looks up a global named `discover`
discover = Discover()

# List of functions
#### 1. `full_mov_db_2_csv(date)`
    TMDB provides daily exports of its entire database.
    However, there are not many attributes to this list.
    Takes a date param as string (YYYY-MM-DD), and writes the export for that day to a CSV file in the local folder.
#### 2. `consec_dict_2_df(dictionary_in)`
    Converts a list of dictionaries with identical keys into a Pandas DataFrame.
    E.g. each dictionary becomes one row and each key becomes a column.
    **still a work in progress**
#### 3. `financialstat_counter(data)`
    Sends API requests by passing a list of movie IDs
    Running tally of the number of API calls
    Shows number of successful and unsuccessful API calls after executing
#### 4. `discover_2_csv(dict_param, output)`
    Searches TMDB database based on given search parameters
    Returns a maximum of 500*20 movie entries (10000)
    Exports the dataframe to csv 

In [None]:
## Extract daily exports of entire TMDB database
## Function accepts ISO 8601 spec for dates
## e.g. full_mov_db_2_csv('YYYY-MM-DD')
def full_mov_db_2_csv(date):
    """Download the TMDB daily movie-ID export for one day and save it as CSV.

    Args:
        date: Day of the export as a 'YYYY-MM-DD' string.

    Returns:
        None. On success the export is written to 'FullDB_<date>.csv' in the
        working directory. If the date is malformed or the export cannot be
        read, a message is printed and nothing is written.
    """
    import datetime

    ## reject malformed dates up front instead of slicing them into a bogus url
    try:
        export_day = datetime.datetime.strptime(date, '%Y-%m-%d').date()
    except (TypeError, ValueError):
        print('Error, date must be a string formatted as YYYY-MM-DD')
        return

    ## format the fetch url (the export files are named MM_DD_YYYY)
    db_online_path = ('http://files.tmdb.org/p/exports/movie_ids_'
                      + export_day.strftime('%m_%d_%Y') + '.json.gz')
    print("Export link:", db_online_path)

    ## reading fails if the url does not exist (e.g. export not published yet)
    try:
        data_raw = pd.read_json(db_online_path, compression='gzip', lines=True)
    except Exception:
        print('Error in extracting file from link...\nFile may not yet exist on TMDB.')
        return
    print("Writing to file...")

    ## format the local file name
    db_path = 'FullDB_' + export_day.isoformat() + '.csv'
    data_raw.to_csv(db_path)
    print("Exported to local folder:", db_path)

In [None]:
## input param must be a list/array containing multiple dictionaries that have identically formatted keys
## Returns a properly formatted dataframe with the keys at column positions
def consec_dict_2_df(dict_in):
    """Stack an iterable of dictionaries into one DataFrame, one row each.

    Args:
        dict_in: Iterable of dictionaries; the keys become the columns.

    Returns:
        A DataFrame with a fresh 0..n-1 index whose columns have been passed
        through `convert_dtypes()`. An empty input gives an empty DataFrame.
    """
    ## one single-row frame per dictionary: keys along the columns
    rows = [pd.DataFrame.from_dict(item, orient='index').T for item in dict_in]
    if not rows:
        return pd.DataFrame()

    ## a lone row is passed through untouched (as before); several rows are
    ## joined in a single concat instead of growing the frame row by row
    combined = rows[0] if len(rows) == 1 else pd.concat(rows, sort=True)

    ## reindex dataframe, auto-convert to most suitable datatype
    return combined.convert_dtypes().reset_index(drop=True)

## testing
#consec_dict_2_df(stats['casts']['crew'])

In [None]:
## pass the 'id' column from daily export CSV into function
## take the first 10 values and increment by an order of magnitude for each iteration

def financialstat_counter(data):
    """Tally how many movies report both a budget and a revenue.

    Calls `movie.details` (the module-level `movie` object) once per ID and
    prints a running progress line followed by a summary.

    Args:
        data: Iterable of movie IDs, e.g. the 'id' column of the daily export.

    Returns:
        Percentage (0-100) of successful calls whose 'budget' and 'revenue'
        are both truthy. Returns 0.0 when no call succeeded.
    """
    ids = list(data)  # materialise once so any iterable can be counted
    top = len(ids)
    hit = 0
    total_count = 0
    total_uncount = 0

    for call_no, item_id in enumerate(ids, start=1):
        ## visual indication of progress
        time.sleep(0.001)
        if call_no == top:
            print('API call complete:   '+str(call_no)+'/'+str(top))
        else:
            print('API calls: '+str(call_no)+'/'+str(top), end="\r")

        ## a failed lookup is counted and skipped; Ctrl-C still interrupts
        try:
            mov = movie.details(item_id)
            if mov['budget'] and mov['revenue']:
                hit += 1
            total_count += 1
        except Exception:
            total_uncount += 1

    ## avoid dividing by zero when every call failed or no IDs were given
    value = hit/total_count*100 if total_count else 0.0
    print("Successful calls:   ", str(total_count))
    print("Unsuccessful calls: ", str(total_uncount))
    print("Percentage of movies with financial statistics: ", str(value)+'%\n')
    return value

In [None]:
## Template of search parameters for discover_2_csv.
## Every value is an empty string except 'page'; fill in the ones to search on.
search_params = {
    ## Comment out params that are not required for search
    'sort_by':'',
    ## 'popularity.asc/desc','release_date.asc/desc','revenue.asc/desc','original_title.asc/desc'
    
    ## discover_2_csv overwrites 'page' with the page it is currently requesting
    'page':'1', ## 1-500, default 1. Do not modify this parameter.
    'primary_release_year':'', ## YYYY
    'primary_release_date.gte':'', ## 'YYYY-MM-DD'
    'primary_release_date.lte':'', ## 'YYYY-MM-DD'
    'release_date.gte':'', ## 'YYYY-MM-DD'
    'release_date.lte':'', ## 'YYYY-MM-DD'
    'year':'', ## YYYY
    
    ## Comma-separated list of person IDs
    'with_cast':'',
    'with_crew':'',
    'with_people':'',
    ## Comma-separated list of company IDs
    'with_companies':'',
    ## Comma-separated list of genre IDs
    'with_genres':'',
    'without_genres':'',
    ## Comma-separated list of keyword IDs
    'with_keywords':'',
    
    ## Integer minutes
    'with_runtime.gte':'',
    'with_runtime.lte':''  
}

def discover_2_csv(dict_param, output):
    """Run a TMDB discover search page by page and export the results to CSV.

    Relies on the module-level `discover` object for the API calls and on
    `func.consec_dict_2_df` to turn each page into a DataFrame.

    Args:
        dict_param: Dictionary of discover search parameters. It is copied,
            and 'page' is set on the copy for every request, so the caller's
            dictionary is left untouched.
        output: Path of the CSV file to write.

    Returns:
        DataFrame with every movie collected (empty, and no file written, if
        no page could be collected), or None when the inputs fail validation.
    """
    ## Input validation
    if not isinstance(dict_param, dict) or not isinstance(output, str):
        print('Error, inputs must be a dictionary and a string')
        return

    params = dict(dict_param)
    page_frames = []
    call_fail = 0      ## pages that could not be converted to a dataframe
    movies_found = 0   ## entries actually returned by the API

    ## up to 500 pages available to extract, depending on how broad search is. So up to 20*500=10000 movies possible
    for page_no in range(1, 501):  ## for debugging purposes, change to max=10

        ## Set the current page
        params['page'] = page_no

        ## Get the list
        movie_list = discover.discover_movies(params)

        ## Terminate search if most recent page has nothing
        if len(movie_list) == 0:
            break
        movies_found += len(movie_list)

        ## Collect the page; the pages are joined once after the loop
        try:
            page_frames.append(func.consec_dict_2_df(movie_list))
        except Exception:
            call_fail += 1

        ## Output statements to keep track of things
        print('Page number: '+str(page_no)+'/500', end='\r')

    print('\nSearch complete\n'+str(movies_found)+' movies found')

    ## Nothing usable came back: report it instead of failing on an empty result
    if not page_frames:
        print('0 movies returned')
        print(str(call_fail)+' calls did not get through')
        print('Nothing to export')
        return pd.DataFrame()

    ## a single page is kept as-is; several pages are joined in one concat
    if len(page_frames) == 1:
        df_out = page_frames[0]
    else:
        df_out = pd.concat(page_frames, sort=True)

    ## Resetting the index to count normally
    df_out = df_out.reset_index(drop=True)
    print(str(len(df_out))+' movies returned')
    print(str(call_fail)+' calls did not get through')
    df_out.to_csv(output)
    print("Exported to local folder:", output)
    return df_out

# Cleaning up the dataset 
#### Remove entries that are:
- movies released before 2000 (only 21st century movies are kept in the dataset)
- movies with no financial information (0 box office and 0 budget)
- 


In [None]:
## Create local CSV file
## hashed out this one as the local file for 2021-03-15 has already been created
#### full_mov_db_2_csv('2021-03-15')

In [None]:
## Load the locally saved export, then show its summary statistics and first rows
data_raw = pd.read_csv('FullDB_2021-03-15.csv')
for preview in (data_raw.describe(), data_raw.head()):
    display(preview)

In [None]:
## Fetch the details of one movie to see which column headers are present
stats = movie.details(10000)
print(pd.DataFrame(stats), end=' \n\n')

In [None]:
## Print a hand-picked set of attributes for one movie
## The lookup key is the 'id' field, NOT the IMDB id
## (presumably TMDB's own identifier, i.e. the 'id' column of the daily export - confirm)
## 805900 is just a randomly chosen movie id
stats = movie.details(805900)
movie_attributes = [
    'title', 'original_title', 'id', 'imdb_id', 'release_date', 'genres',
    'budget', 'revenue', 'overview', 'popularity', 'spoken_languages',
    'vote_average', 'production_companies', 'production_countries',
]
for attribute in movie_attributes:
    print(f"{attribute} : {stats[attribute]}")
## 'casts' holds too many items to print here; the next code box looks at it
## 'production_countries' may also be long for large movies

In [None]:
## Print selected fields for every actor listed under the movie's cast,
## with a blank line between actors
cast_attributes = ['original_name', 'gender', 'popularity', 'adult']
for member in stats['casts']['cast']:
    for field in cast_attributes:
        print(f"{field} : {member[field]}")
    print()

In [None]:
## looking at crew members in a specific movie
crew_attributes = ['original_name','id','gender','adult', 'job','department','known_for_department']
crew_count = len(stats['casts']['crew'])
cast_count = crew_count  ## old (misleading) name kept in case another cell reads it
## the list being counted is the crew, so label it as such
print('Number of crew members:', str(crew_count),'\n')
for item in stats['casts']['crew']:
    for subitem in crew_attributes:
        print(subitem,':',item[subitem])
    print()

In [None]:
## A single crew dictionary as a one-column frame (keys along the index)
test_df = pd.DataFrame.from_dict(stats['casts']['crew'][0], orient='index')
#print(test_df,'\n')
#test_df.info()

## Dictionary to DF process:
## Extract dictionary, place into DataFrame
## Concatenate consec dictionaries together
## Recast columns into the appropriate data types
## These steps are what consec_dict_2_df does, so reuse it here instead of
## repeating the loop. It also resets the index with drop=True, so the old
## all-zero 'index' column no longer ends up in the result.
new_df = consec_dict_2_df(stats['casts']['crew'])
new_df.info()
new_df

In [None]:
## Work on a small slice so the API-heavy cells do not take forever to run
## (10K entries turned out to be too many); keep only the first 10 rows for now
data_short = data_raw.iloc[:10]
data_short

In [None]:
## checking the number of movies with financial data
## only counts movies that have both budget and box office numbers (not zero)
## Looks like the call rate is heavily limited after approximately 5000 calls
## API calls after 6000 are reduced to around 4 per second
## Call API in batches of 5000 or less to get past this bottleneck
## Need to check the off time in between batches
## (same tally as financialstat_counter, run inline on data_short)

hit=0
total_count=0
total_uncount=0
top=len(data_short['id'])

for call_no, item_id in enumerate(data_short['id'], start=1):
    ## visual indication of progress
    time.sleep(0.001)
    if call_no == top:
        print('API call complete:   '+str(call_no)+'/'+str(top))
    else:
        print('API calls: '+str(call_no)+'/'+str(top),end = "\r")

    ## a failed lookup is counted and skipped; Ctrl-C still interrupts
    try:
        mov = movie.details(item_id)
        if mov['budget'] and mov['revenue']:
            hit += 1
        total_count += 1
    except Exception:
        total_uncount += 1

print("Successful calls:   ",str(total_count))
print("Unsuccessful calls: ",str(total_uncount))
## report 0.0 instead of dividing by zero when no call succeeded
percentage = hit/total_count*100 if total_count else 0.0
print("Percentage of movies with financial statistics: ",str(percentage)+'%')

In [None]:
%%time 
## must be placed at the TOP of the cell, including above comments
## Checking api call rate/success rate
## Takes the list of movie IDs and samples a set number of them randomly
## Sample sizes to try, one financialstat_counter run per size
rep_list = [1,10,100]
## build the ID list once rather than on every iteration
all_ids = list(data_raw['id'])
## financialstat_counter returns a percentage; 0.01 scales it to a 0-1 fraction
x = [0.01 * financialstat_counter(random.sample(all_ids, rep)) for rep in rep_list]
print(x)