# Wikidata query

In [1]:
import os

import pandas as pd

from src.apiQueries import getLabelFromFBID, getInfoFromTMDB, multiprocess_query



# Root folder holding every dataset used by this notebook.
DATA_PATH = './datasets/'
# Sub-folder containing the MovieSummaries files.
MOVIE_DATASET_PATH = f'{DATA_PATH}MovieSummaries/'

# Full paths of the individual files inside the MovieSummaries folder.
char_datapath = f'{MOVIE_DATASET_PATH}character.metadata.tsv'
movie_metadata = f'{MOVIE_DATASET_PATH}movie.metadata.tsv'
name_clusters = f'{MOVIE_DATASET_PATH}name.clusters.txt'
plot_summaries = f'{MOVIE_DATASET_PATH}plot_summaries.txt'
tvtropes_clusters = f'{MOVIE_DATASET_PATH}tvtropes.clusters.txt'

In [2]:
# Column labels for the character metadata table. The file is read with
# header=None, so these names are assigned to the columns positionally.
char_col_names = [
    'Wikipedia_movie_ID',
    'Freebase_move_ID',
    'movie_release_date',
    'Character_name',
    'Actor_DOB',
    'Actor_gender',
    'Actor_height',
    'Actor_ethnicity',
    'Actor_name',
    'Actor_age_release',
    'Freebase_character_map',
    'Freebase_character_map2',
    'Freebase_character_map3',
]

# Load the tab-separated character metadata file into a DataFrame.
char_metadata_df = pd.read_csv(
    char_datapath,
    sep='\t',
    header=None,
    names=char_col_names,
)

In [3]:
# Collect the distinct, non-null Freebase IDs found in the 'Actor_ethnicity'
# column, then resolve them to labels with getLabelFromFBID
# (described in this notebook as a query on the Wikidata API).
FBids = pd.unique(char_metadata_df['Actor_ethnicity'].dropna())
df = getLabelFromFBID(
    id_list=FBids,
    verbose=True,
)

Querying the id_list to api...


In [4]:
# Save the labels for later use (if needed).
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs('./generated', exist_ok=True)
df.to_csv('./generated/ethnicities_labels.csv')

In [5]:
# Keep only the character rows that carry an ethnicity Freebase ID, then
# left-join the label table on that ID. Rows whose ID has no entry in the
# label table are kept, with NaN in the columns coming from `df`.
ethn_df = pd.merge(
    char_metadata_df.dropna(subset=['Actor_ethnicity']),
    df,
    how='left',
    left_on='Actor_ethnicity',
    right_on='freebaseID.value',
)

In [6]:
# Count the rows for which the left join found no label, i.e. rows whose
# 'freebaseID.value' column is still NaN after the merge.
ethn_df['freebaseID.value'].isnull().sum()

3155

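Some ethnicity labels are still missing: the rows counted above carry a Freebase ID for which no label was returned. **TODO:** assess whether the remaining coverage is sufficient for the analysis.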

## TMDB Queries

In [7]:
# Column labels for the movie metadata table. The file is read with
# header=None, so these names are assigned to the columns positionally.
col_names = ['Wikipedia_movie_ID', 'Freebase_move_ID',
             'Movie_name', 'Movie_release_date', 'Movie_box_office_revenue',
             'Movie_runtime', 'Movie_languages', 'Movie_countries', 'Movie_genres']

# Load the tab-separated movie metadata file into a DataFrame.
movie_metadata_df = pd.read_csv(movie_metadata, header=None, sep='\t',
                                names=col_names)

# Keep only the movie name and runtime columns; this is the frame that is
# later passed to the TMDB lookup to find matching entries.
movie_list = movie_metadata_df[['Movie_name', 'Movie_runtime']]

In [8]:
# Read the TMDB API key from a local file so it is not hard-coded in the notebook.
# readline() keeps the line terminator, so strip surrounding whitespace;
# otherwise a trailing newline would be sent along as part of the key.
with open('api_keys/tmdbkey.txt', 'r') as file:
    key = file.readline().strip()

In [11]:
# Query TMDB for the whole movie list through multiprocess_query, which is
# handed `task` as the per-chunk worker function.
# Careful, takes 6 hours to run
def task(chunk, key):
    """Fetch the TMDB info for one chunk of the movie list, with verbose output off."""
    chunk_info = getInfoFromTMDB(movie_list=chunk, key=key, verbose=False)
    return chunk_info


movie_info_df = multiprocess_query(
    task=task,
    nb_workers=1,
    api_key=key,
    movie_list=movie_list,
)

In [None]:
# Save the dataframe as csv.
# to_csv does not create missing parent directories; create the output folder
# first so the result of the long-running TMDB query is not lost to an OSError.
os.makedirs('./generated', exist_ok=True)
movie_info_df.to_csv('./generated/TMDB_extra_info.csv')