In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import math
import json
import ast
import requests
import os
import statsmodels.formula.api as smf


In [6]:
# Load the character metadata table (tab-separated, no header row) and preview it.
# Column labels are supplied here because the file itself carries none.
character_columns = [
    "WikiID", "FreebaseID", "Release", "Character name", "Birth",
    "Sex", "Height (m)", "Ethnicity ID", "Name", "Age at movie release",
    "Freebase character/actor map ID", "Freebase character ID", "Freebase actor ID",
]
characters = pd.read_csv(
    '../Data/MovieSummaries/character.metadata.tsv',
    sep='\t',
    header=None,
    names=character_columns,
)
characters.head()

Unnamed: 0,WikiID,FreebaseID,Release,Character name,Birth,Sex,Height (m),Ethnicity ID,Name,Age at movie release,Freebase character/actor map ID,Freebase character ID,Freebase actor ID
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg


In [7]:
# Distinct Freebase ethnicity IDs present in the dataset (missing values excluded),
# in order of first appearance
has_ethnicity = characters['Ethnicity ID'].notna()
ethnicities = characters.loc[has_ethnicity, 'Ethnicity ID'].unique().tolist()
print(ethnicities)

['/m/044038p', '/m/0x67', '/m/064b9n', '/m/041rx', '/m/033tf_', '/m/04gfy7', '/m/0222qb', '/m/01qhm_', '/m/0dryh9k', '/m/048sp5', '/m/04mvp8', '/m/0bzkm2', '/m/02p1pl6', '/m/0bjbszh', '/m/022fdt', '/m/0cqgdq', '/m/0ffkb4', '/m/075dhf0', '/m/01hwt', '/m/0xnvg', '/m/0dqqwy', '/m/048z7l', '/m/07bch9', '/m/09v5bdn', '/m/02w7gg', '/m/03bkbh', '/m/02vsw1', '/m/09kr66', '/m/09vc4s', '/m/0g0x7_', '/m/042gtr', '/m/0cm7w1', '/m/046cwm', '/m/04dbw3', '/m/02ctzb', '/m/0g8_vp', '/m/092h2qt', '/m/0g6ff', '/m/0278pqj', '/m/0301y_', '/m/019kn7', '/m/0cnvdq1', '/m/03295l', '/m/065b6q', '/m/03pqwy', '/m/01xhh5', '/m/03ts0c', '/m/06gbnc', '/m/07hwkr', '/m/0bpjh3', '/m/0fpjs3j', '/m/04nrnz', '/m/09k5jvk', '/m/07mqps', '/m/08hpk0', '/m/03ttfc', '/m/0d9q7j', '/m/075_n6', '/m/0dllcfn', '/m/04kbvpz', '/m/03ftx7', '/m/0747611', '/m/025rpb0', '/m/06mvq', '/m/047l_90', '/m/029f2r', '/m/01rv7x', '/m/05sf2x', '/m/01336l', '/m/0bh91q8', '/m/01g7zj', '/m/0cn68', '/m/02sch9', '/m/0fqp6zk', '/m/02y_9mh', '/m/0d7wh', '

In [10]:
''' code by CLEMENT (inspired by stackoverflow)
https://stackoverflow.com/questions/74277269/wikidata-query-service-how-do-i-search-by-freebase
'''

def get_freebase_label(freebase_ids):
    '''
    Look up the English Wikidata label of each Freebase ID.

    One SPARQL query per ID is sent to the Wikidata Query Service, matching
    items on property P646 (Freebase ID). The label of the first matching
    item is used.

    - Input: iterable of Freebase IDs (strings such as '/m/0x67')
    - Output: list of labels, in the same order as the input; the string 'NaN'
      is used for an ID that matches no Wikidata item
    - Raises: requests.HTTPError on a non-success HTTP status (e.g. rate
      limiting), requests.Timeout if the service does not answer in time
    '''
    freebase_ids = list(freebase_ids)
    if not freebase_ids:
        # nothing to resolve: do not touch the network at all
        return []

    url = 'https://query.wikidata.org/sparql'
    # Wikimedia asks clients to send a descriptive User-Agent; adapt it to your project.
    headers = {'User-Agent': 'freebase-label-lookup/1.0 (Jupyter notebook; python-requests)'}
    query_template = '''
    SELECT ?s ?sLabel ?freebaseID
    WHERE {
      VALUES ?freebaseID { "%s" }
      ?s wdt:P646 ?freebaseID .
      SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
    }
    '''

    labels = []
    # A single session reuses the HTTP connection across all the per-ID requests.
    with requests.Session() as session:
        session.headers.update(headers)
        for freebase_id in freebase_ids:
            # escape backslashes and double quotes so the ID stays a valid SPARQL string literal
            literal = freebase_id.replace('\\', '\\\\').replace('"', '\\"')
            response = session.post(
                url,
                params={'format': 'json', 'query': query_template % literal},
                timeout=60,
            )
            # fail loudly on 4xx/5xx instead of crashing later while decoding a non-JSON error page
            response.raise_for_status()
            bindings = response.json()['results']['bindings']
            if bindings:
                labels.append(bindings[0]['sLabel']['value'])
            else:
                # placeholder kept as the literal string 'NaN' for compatibility with saved data
                labels.append('NaN')
    return labels

# Smoke test: resolve two Freebase IDs and print the returned labels.
# Note: this sends live requests to the Wikidata query endpoint used by get_freebase_label.
a = ['/m/0x67', '/m/064b9n']
labels = get_freebase_label(a)
print(labels)


['African Americans', 'Omaha people']


In [13]:
# Cache guard: the label lookup sends one web request per ID, so reuse the saved
# table when it is already on disk.
# The same path is used for the existence check and for saving. Previously the check
# looked in the parent directory while the file was written to the current directory,
# so a freshly saved file was never found again.
ethnicities_path = os.path.join(os.getcwd(), '..', 'ethnicities_data.tsv')
if os.path.exists(ethnicities_path):
    # Load the cached table so `ethnicities_data` is defined on both branches.
    # keep_default_na=False keeps the 'NaN' placeholder labels as plain strings,
    # matching what the lookup branch produces in memory.
    ethnicities_data = pd.read_csv(ethnicities_path, sep='\t', keep_default_na=False)
    print('File ethnicities_data.tsv already exists')
else:
    # get labels and save to a tab-separated file
    labels = get_freebase_label(ethnicities)
    ethnicities_data = pd.DataFrame({'Ethnicities ID': ethnicities, 'Ethnicities': labels})
    ethnicities_data.to_csv(ethnicities_path, sep='\t', index=False)
    print('Saved file ethnicities_data.tsv')

File ethnicities_data.tsv already exists
