In [1]:
import pandas as pd
import numpy as np

In [2]:
from utils.freebase import *
from utils.data_initial import *
from utils.data_generated import *

# Ethnicity values

In [3]:
# Load the character metadata via the project helper (brought in by the utils
# star-imports); the following cell reads its `actor_ethnicity` column.
df = load_cmu_character_metadata()

In [4]:
# Distinct, non-missing values of the actor_ethnicity column, as a plain list.
ethnicity_column = df["actor_ethnicity"]
etnicities = list(ethnicity_column.dropna().unique())

In [5]:
# Look up every ethnicity id with the project's Wikidata helper.
#   mappings  : id -> value returned by the lookup
#   not_found : ids for which the lookup returned None
mappings = {}
not_found = []
for etn in etnicities:
    val = query_freebase_id_from_wikidata(etn)
    # Identity test: None is a singleton, and `==` could be overridden by
    # whatever object the helper returns.
    if val is None:
        not_found.append(etn)
    else:
        mappings[etn] = val

weird result for /m/019kn7
[{'s': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q161652'}, 'sLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Japanese people'}}, {'s': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q31340083'}, 'sLabel': {'type': 'literal', 'value': 'Q31340083'}}]

weird result for /m/0j6x8
[{'s': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q170355'}, 'sLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Indigenous Australians'}}, {'s': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q12060728'}, 'sLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Aboriginal Australians'}}]

weird result for /m/062_25
[{'s': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q1065371'}, 'sLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Italian Brazilians'}}, {'s': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q54864438'}, 'sLabel': {'type': 'literal', 'value': 'Q54864438'}}]

weird result for /m/0640_7

In [6]:
# manually correct the mappings that were under an unexpected format
# NOTE(review): the lookup output above reports "/m/0640_7", while the key set
# here is "/m/0640_7q" - confirm which id is the intended one.
mappings.update({
    '/m/019kn7': 'Japanese',
    '/m/0j6x8': 'Aboriginal Australians',
    '/m/062_25': 'Italian Brazilians',
    '/m/0640_7q': 'Moroccan Jews',
    '/m/0180zw': 'Kikuyu',
    '/m/09snp5': 'Muhajir',
    '/m/03x1x': 'Haudenosaunee Confederacy',
    '/m/06bkf': 'Quebeckers',
})

In [7]:
# create dataframe
# Ids the lookup could not resolve get a NaN placeholder. `setdefault` only
# fills ids that are still absent from `mappings`, so a label that was entered
# by hand for an id listed in `not_found` is kept instead of being overwritten
# with NaN.
for freebase_id in not_found:
    # np.nan rather than np.NaN: the NaN alias was removed in NumPy 2.0
    mappings.setdefault(freebase_id, np.nan)

# One row per id; the default RangeIndex already runs 0..len(mappings)-1.
df = pd.DataFrame(list(mappings.items()), columns=['freebase_id', 'ethnicity'])

In [8]:
# save dataframe
# Pickles `df` to the path formed by concatenating PATH_DATA_GEN and
# FILENAME_ETHNICITIES (both constants come from the utils star-imports).
df.to_pickle(PATH_DATA_GEN + FILENAME_ETHNICITIES)

# Combine CMU with IMDB

In [3]:
# Load the IMDb title basics via the project helper; later cells use its
# `primary_title`, `original_title`, `type` and `start_year` columns.
df_imdb = load_imdb_title_basics()

In [4]:
# Load the CMU movie metadata via the project helper; it is the left side of
# the title merge below (joined on its `movie_name` column).
df_movie_md = load_cmu_movie_metadata()

In [5]:
# drop original title to avoid double entries
# Rebind to a new frame instead of mutating the loader's return value in place,
# so any other reference to that object is left untouched.
df_imdb = df_imdb.drop(columns='original_title')

In [6]:
# Left outer join of the CMU movies with the IMDb titles: CMU `movie_name`
# is matched against IMDb `primary_title`.
merged = pd.merge(
    df_movie_md,
    df_imdb,
    how='left',
    left_on='movie_name',
    right_on='primary_title',
)

In [7]:
# Restrict to rows whose IMDb `type` is 'movie'.
merged = merged.loc[merged['type'] == 'movie']

In [8]:
# Titles are not unique, so keep only the rows where the year of the CMU
# release date equals the IMDb start year.
release_years = merged['release_date'].apply(lambda date: date.year)
merged = merged[release_years == merged['start_year']]

In [9]:
# Some Freebase movie ids still have several candidate rows: count the rows
# per id and keep only the ids that occur more than once.
duplicates = merged['movie_id_freebase'].value_counts().loc[lambda counts: counts > 1]
duplicates

/m/0h1z21s    5
/m/0bbx0_     4
/m/09vq1kn    4
/m/064p159    4
/m/051wx4f    3
             ..
/m/03h5vh9    2
/m/0hgppgh    2
/m/0gvskl2    2
/m/04zxvt1    2
/m/099qr9     2
Name: movie_id_freebase, Length: 465, dtype: Int64

In [10]:
# Load the IMDb ratings via the project helper; the next cell joins it on
# `title_id` and the de-duplication loop ranks rows by its `num_votes` column.
df_imdb_rating = load_imdb_title_rating()

In [11]:
# Attach the IMDb ratings to every row of a duplicated movie. The entry with
# the most votes is assumed to be the one carrying the most information.
is_duplicated = merged['movie_id_freebase'].isin(duplicates.index)
df = pd.merge(merged[is_duplicated], df_imdb_rating, how='left', on='title_id')

In [12]:
# For each Freebase movie id that has several IMDb candidates, rank the
# candidates by vote count (descending) and mark every one except the first
# for removal. `to_drop` ends up holding the IMDb title ids to discard.
to_drop = []
# The loop variable is deliberately not called `id`: at notebook top level that
# name would shadow the builtin for every later cell.
for freebase_id in duplicates.index:
    candidates = df[df.movie_id_freebase == freebase_id]
    ranked = candidates.sort_values('num_votes', ascending=False)
    # everything after the top-ranked row is a surplus match
    to_drop.extend(ranked.iloc[1:].title_id.values)

In [13]:
# Discard every row whose IMDb title id was marked for removal.
merged = merged.loc[~merged['title_id'].isin(to_drop)]

In [14]:
# Re-run the duplicate count: no Freebase movie id should occur more than
# once any more.
duplicates = merged['movie_id_freebase'].value_counts().loc[lambda counts: counts > 1]
duplicates

Series([], Name: movie_id_freebase, dtype: Int64)

In [15]:
# Where the CMU runtime is missing, fall back to the IMDb `runtime_minutes`,
# then store the column with the nullable Float32 dtype.
merged['runtime'] = (
    merged['runtime']
    .fillna(merged['runtime_minutes'])
    .astype(pd.Float32Dtype())
)

In [16]:
# Remove the helper columns that are no longer needed.
# Both genre columns are kept on purpose: their granularity is very different
# (as seen in the EDA), which could be useful later.
merged = merged.drop(
    columns=['start_year', 'end_year', 'type', 'primary_title', 'runtime_minutes']
)

In [17]:
# reorder columns
# The positions index the column layout left by the preceding drop, so this
# list has to be revisited whenever the set or order of columns changes.
column_positions = [0, 1, 9, 2, 3, 4, 5, 6, 7, 10, 8, 11]
merged = merged.iloc[:, column_positions]

In [18]:
# Give the IMDb id and the two genre columns names that state their origin
# (the _x/_y suffixes come from the merge).
merged = merged.rename(
    columns={
        'title_id': 'title_id_imdb',
        'genres_x': 'genres_cmu',
        'genres_y': 'genres_imdb',
    }
)

In [20]:
# save dataframe
# Pickles the combined CMU/IMDb frame to the path formed by concatenating
# PATH_DATA_GEN and FILENAME_MOVIE_METADATA (constants from the utils star-imports).
merged.to_pickle(PATH_DATA_GEN + FILENAME_MOVIE_METADATA)