In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import math
import json
import ast
import requests
import os
import statsmodels.formula.api as smf


In [5]:
# Column labels applied to the character metadata file, which is read
# without a header row (header=None below).
character_columns = [
    "WikiID",
    "FreebaseID",
    "Release",
    "Character name",
    "Birth",
    "Sex",
    "Height (m)",
    "Ethnicity",
    "Name",
    "Age at movie release",
    "Freebase character/actor map ID",
    "Freebase character ID",
    "Freebase actor ID",
]

# Load the tab-separated character metadata and preview the first rows.
characters = pd.read_csv(
    './Data/MovieSummaries/character_metadata.tsv',
    sep='\t',
    header=None,
    names=character_columns,
)
characters.head()

Unnamed: 0,WikiID,FreebaseID,Release,Character name,Birth,Sex,Height (m),Ethnicity,Name,Age at movie release,Freebase character/actor map ID,Freebase character ID,Freebase actor ID
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg


In [3]:
# Collect every distinct, non-missing value of the "Ethnicity" column
# as a plain Python list.
ethnicities = pd.unique(characters['Ethnicity'].dropna()).tolist()

In [8]:
# Pickle cache written at the end of this cell; when it exists the remote
# lookups below are skipped entirely.
CHARACTERS_CACHE = "./Data/characters"
# Number of IDs sent to the Wikidata SPARQL endpoint per request.
WIKIDATA_BATCH_SIZE = 200


def get_ethnicities(ids):
    """Look up labels for a batch of Freebase IDs via the Wikidata SPARQL service.

    Parameters
    ----------
    ids : iterable of str
        Values matched against the ``wdt:P646`` property in the query.

    Returns
    -------
    dict
        The decoded JSON response. Result rows are found under
        ``["results"]["bindings"]``; each row exposes ``"ethnicities"``
        (the matched ID) and ``"sLabel"`` (the English label).
        An empty ``ids`` returns an empty bindings list without any request.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with an error status.
    """
    ids = list(ids)
    if not ids:
        return {"results": {"bindings": []}}

    # One quoted literal per ID, whitespace separated, for the VALUES clause.
    values = " ".join('"' + freebase_id + '"' for freebase_id in ids)

    url = 'https://query.wikidata.org/sparql'
    query = '''
    SELECT  ?ethnicities ?sLabel WHERE {
    VALUES ?ethnicities {''' + values + '''}
    ?s wdt:P646 ?ethnicities .

        SERVICE wikibase:label {
        bd:serviceParam wikibase:language "en" .
        }
    }
    '''
    r = requests.post(url, params={'format': 'json', 'query': query}, timeout=60)
    # Fail loudly instead of trying to JSON-decode an error page.
    r.raise_for_status()
    return r.json()


def _get_knowledge_graph_labels(ids, api_key):
    """Return ``(code, label)`` pairs for ``ids`` from Google's Knowledge Graph API.

    Entries of the response that lack a name or an ``@id`` are skipped.
    Each resolved entry is printed as ``name (id)``.
    """
    # Import the submodules explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` / ``urllib.request`` are loaded.
    import json
    import urllib.parse
    import urllib.request

    service_url = 'https://kgsearch.googleapis.com/v1/entities:search'
    # Repeated ``ids`` parameters first, then the fixed options; urlencode
    # percent-encodes the "/" characters contained in the IDs.
    params = [('ids', freebase_id) for freebase_id in ids]
    params += [('limit', 500), ('indent', True), ('key', api_key)]
    url = service_url + '?' + urllib.parse.urlencode(params)

    with urllib.request.urlopen(url) as http_response:
        response = json.load(http_response)

    rows = []
    for element in response.get('itemListElement', []):
        result = element.get('result', {})
        name = result.get('name')
        kg_id = result.get('@id')
        if name is None or kg_id is None:
            continue
        print(name + ' (' + str(kg_id) + ')')
        # Strip the "kg:" prefix so the code compares equal to the dataset's IDs.
        rows.append((kg_id.replace("kg:", ''), name))
    return rows


if os.path.isfile(CHARACTERS_CACHE):
    # To accelerate code execution we import the pickle file if it exists.
    # NOTE(review): unpickling can execute arbitrary code; only load a cache
    # file that this notebook itself produced.
    characters = pd.read_pickle(CHARACTERS_CACHE)
else:
    # Query Wikidata for every ethnicity code, batch by batch, so that no
    # index is skipped regardless of how many codes there are.
    wikidata_rows = []
    for start in range(0, len(ethnicities), WIKIDATA_BATCH_SIZE):
        batch = get_ethnicities(ethnicities[start:start + WIKIDATA_BATCH_SIZE])
        for el in batch["results"]["bindings"]:
            wikidata_rows.append((el["sLabel"]["value"], el["ethnicities"]["value"]))

    # Table associating the freebase id ("code") with the ethnicity label.
    table_eth = pd.DataFrame(wikidata_rows, columns=["Ethnicities", "code"])

    # Code for querying using Google's knowledge graph.
    # We only use Google's KG for the remaining codes that we couldn't find in WikiData.
    ids = list(set(ethnicities) - set(table_eth["code"]))
    # Private api key for using Google's API (optional: skipped when absent).
    if ids and os.path.isfile("api_key.txt"):
        with open("api_key.txt") as key_file:
            # Strip the trailing newline so it does not end up in the URL.
            api_key = key_file.read().strip()
        kg_rows = _get_knowledge_graph_labels(ids, api_key)
        kg_table = pd.DataFrame(
            [(label, code) for code, label in kg_rows],
            columns=["Ethnicities", "code"],
        )
        table_eth = pd.concat([table_eth, kg_table], ignore_index=True)

    # Replace ethnicity code by value. A dict lookup replaces the per-row scan
    # of table_eth; keeping the first occurrence of each code matches the
    # previous "first match wins" behaviour. Unknown codes and missing values
    # are left untouched.
    first_matches = table_eth.drop_duplicates(subset="code", keep="first")
    code_to_label = dict(zip(first_matches["code"], first_matches["Ethnicities"]))
    characters["Ethnicity"] = characters["Ethnicity"].map(
        lambda code: code_to_label.get(code, code)
    )
    characters.to_pickle(CHARACTERS_CACHE)
characters.head()

Unnamed: 0,WikiID,FreebaseID,Release,Character name,Birth,Sex,Height (m),Ethnicity,Name,Age at movie release,Freebase character/actor map ID,Freebase character ID,Freebase actor ID
1,975900,/m/03vyhn,2001,Lieutenant Melanie Ballard,1974,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001,Desolation Williams,1969,M,1.727,African Americans,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
5,975900,/m/03vyhn,2001,Commander Helena Braddock,1949,F,1.727,African Americans,Pam Grier,52.0,/m/02vdcfp,/m/0bgchnd,/m/0418ft
11,975900,/m/03vyhn,2001,Tres,1959,M,,Omaha people,Rodney A. Grant,42.0,/m/0bgchrs,/m/0bgchrw,/m/03ydsb
27,3196793,/m/08yl5d,2000,,1937,M,,African Americans,Albert Hall,62.0,/m/0lr37dy,,/m/01lntp
