# Queries and scraping: Additional features obtained through wikidata

In [1]:
import time

import numpy as np
import pandas as pd
import requests
# scraping libraries
from bs4 import BeautifulSoup
import wikidata
from wikidata.client import Client

This notebook contains the code used to obtain additional features for the dataset. The features are obtained through the wikidata API. The features are:
- Actors' occupation
- Actors' Country of citizenship

In order to obtain the features, we first need to obtain the wikidata ID of the actors. First we attempted to obtain the wikidata ID by scraping the search results of the actor's freebase ID on wikidata and then extracting the wikidata ID from the first result. However, this method was not reliable as the first result was not always the correct one. Therefore, we decided to use the wikidata API to obtain the wikidata ID of the actors. This was done by querying the wikidata API with the actor's freebase ID. The wikidata ID was then extracted from the result.

The following code exemplifies the described process using an old version of our data that we have since dropped. The obtained results are therefore for illustrative purposes only, but the pipeline remains valid.

**Note:** Throughout the notebook we frequently saved the obtained results to a CSV file and loaded them again. This was done to avoid having to re-run the whole notebook, as querying the wikidata API takes a long time.

## Wikidata ID extraction

In [None]:
# Load the actor table from the "before scraping" CSV snapshot
input_csv = "CSV_files/top_20_communities_before_scraping.csv"
df = pd.read_csv(input_csv)

In [None]:
### dropped method ###

# function to get the wikidata id from the freebase id 
def get_wikidata_id_soup(freebase_id):
    """Return the Wikidata ID of the first Special:Search hit for a Freebase ID.

    Dropped approach, kept for reference: the first search result is not
    always the correct entity, so the SPARQL-based lookup is preferred.

    Parameters
    ----------
    freebase_id : str
        Freebase identifier to search for, e.g. ``"/m/043gj"``.

    Returns
    -------
    str or None
        The last path segment of the first result's link (the Wikidata ID),
        or ``None`` when the page contains no usable search result.
    """
    # Let requests build the query string so the search term is URL-encoded,
    # and bound the wait so one stalled request cannot hang the whole run.
    response = requests.get(
        "https://www.wikidata.org/w/index.php",
        params={"search": freebase_id, "title": "Special:Search", "go": "Go"},
        timeout=30,
    )
    soup = BeautifulSoup(response.text, "html.parser")

    # Look the first result heading up once and reuse it.
    heading = soup.find("div", {"class": "mw-search-result-heading"})
    if heading is None:
        return None

    # A heading without a link would otherwise fail with a TypeError.
    link = heading.find("a")
    if link is None or not link.get("href"):
        return None

    return link["href"].split("/")[-1]
        
### dropped method ###

In [4]:
# make a function that takes a freebase ID and returns the wikidata ID
def get_wikidata_id_query(freebase_id, max_retries=5, backoff=1.0):
    """Look up the Wikidata ID of the item that carries a given Freebase ID.

    Sends a SPARQL query to the Wikidata Query Service selecting the items
    whose ``P646`` statement equals ``freebase_id`` and returns the ID of the
    first match.

    Parameters
    ----------
    freebase_id : str
        Freebase identifier such as ``"/m/043gj"``. Surrounding whitespace is
        ignored. Non-string values (``None``, ``NaN``) and blank strings
        return ``None`` without contacting the server.
    max_retries : int, default 5
        Number of additional attempts after a failed request (non-200 status
        or a network error).
    backoff : float, default 1.0
        Base delay in seconds between attempts; it doubles after each failure.

    Returns
    -------
    str or None
        The Wikidata ID (last path segment of the item URI, e.g. ``"Q40531"``),
        or ``None`` when no item matches.

    Raises
    ------
    RuntimeError
        If every attempt fails. This replaces the previous unbounded recursive
        retry, which re-sent the request immediately and could only end in a
        ``RecursionError``.
    """
    # Missing values in the dataframe arrive as None/NaN, which have no .strip().
    if not isinstance(freebase_id, str):
        return None
    freebase_id = freebase_id.strip()
    if not freebase_id:
        return None

    # Escape backslashes and quotes so the ID cannot break out of the literal.
    escaped_id = freebase_id.replace('\\', '\\\\').replace('"', '\\"')

    # Only ?item is selected, so the label service is not needed.
    query = '''
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>

    SELECT ?item
    WHERE {
      ?item wdt:P646 "%s" .
    }
    ''' % escaped_id

    # Identify the client to the query service; adapt the string (ideally with
    # contact details) to your own project.
    headers = {'User-Agent': 'actor-wikidata-features-notebook/1.0 (python-requests)'}

    last_failure = None
    for attempt in range(max_retries + 1):
        try:
            r = requests.get(
                'https://query.wikidata.org/sparql',
                params={'format': 'json', 'query': query},
                headers=headers,
                timeout=60,
            )
        except requests.exceptions.RequestException as exc:
            last_failure = repr(exc)
        else:
            if r.status_code == 200:
                bindings = r.json()['results']['bindings']
                if not bindings:
                    # the query succeeded but nothing carries this Freebase ID
                    return None
                return bindings[0]['item']['value'].split("/")[-1]
            last_failure = 'HTTP %s' % r.status_code

        # Wait before retrying so a struggling or rate-limiting server is not
        # hit again immediately.
        if attempt < max_retries:
            time.sleep(backoff * 2 ** attempt)

    raise RuntimeError(
        'Wikidata query for Freebase ID %r failed after %d attempts (last failure: %s)'
        % (freebase_id, max_retries + 1, last_failure)
    )

In [6]:
# Resolve each actor's Freebase ID to its Wikidata ID (the lookup is called once per row)
freebase_ids = df['Freebase_actor_ID']
df['Wikidata_actor_id'] = freebase_ids.apply(get_wikidata_id_query)

In [11]:
# Display the dataframe to inspect the Wikidata_actor_id column
df

Unnamed: 0,Actor_name,connectivity,actor_gender,Actor_date_of_birth,Actor_ethnicity,Freebase_actor_ID,community,Wikidata_actor_id
0,John Wayne,286,M,1907-05-26,/m/063k3h,/m/043gj,1,Q40531
1,Roy Rogers,242,M,1911-11-05,/m/07hwkr,/m/01mc6h1,1,Q367129
2,George 'Gabby' Hayes,130,M,1885-05-07,,/m/03l6jx,1,Q1277973
3,Basil Rathbone,121,M,1892-06-13,,/m/0hwd8,1,Q336865
4,John Carradine,114,M,1906-02-05,,/m/021mlp,1,Q312878
...,...,...,...,...,...,...,...,...
6069,Pushpavalli,3,F,,,/m/080hxfv,20,Q7261853
6070,Sarathi,3,M,1942-06-26,,/m/0463_8t,20,Q7423331
6071,Ragini,3,F,1937,,/m/06w4wmq,20,Q7283094
6072,Nirmalamma,3,F,1920,,/m/07kj3f9,20,Q7040057


In [8]:
# Checkpoint: the dataframe with Wikidata_actor_id was saved once (the save call is kept
# commented out) and is reloaded here instead of being recomputed.
# NOTE(review): the commented-out save writes to the working directory, while the load
# below reads from CSV_files/ — confirm the file was moved there.
# df.to_csv("top_20_communities_all_actors_wikidata.csv", index=False)
# load the intermediate data frame with Wikidata_actor_id
df = pd.read_csv("CSV_files/top_20_communities_all_actors_wikidata.csv")

## Occupation and Country of citizenship extraction

Once we have the wikidata ID of the actors, we use it to obtain the occupation and country of citizenship of the actors. For this purpose, the wikidata library came in handy as it provides a structured framework to query the wikidata API.

In [10]:
# create a single wikidata Client; the lookup functions below use it for every request
client = Client()

In [11]:
# function that gets the actor country_of_citizenship / occupation from the wikidata id (if an error occurs, it returns 'NaN')
def get_actor_country_of_citizenship(wikidata_id):
    """Return the label of an actor's country of citizenship (Wikidata property P27).

    Parameters
    ----------
    wikidata_id : str
        Wikidata entity ID of the actor, e.g. ``"Q40531"``.

    Returns
    -------
    str or float
        The label of the value stored under P27 for the entity, or ``np.nan``
        when the lookup fails for any ordinary reason (for example an invalid
        ID or an entity without a P27 statement).
    """
    try:
        actor = client.get(wikidata_id, load=True)
        citizenship_prop = client.get('P27')
        citizenship = actor[citizenship_prop]
        return str(citizenship.label)
    # Best-effort scraping: an ordinary failure becomes NaN so one bad row does
    # not abort a long `.apply` run. Catching `Exception` instead of using a bare
    # `except:` lets KeyboardInterrupt/SystemExit through, so the run can still
    # be stopped by hand.
    except Exception:
        return np.nan
    
def get_actor_occupation(wikidata_id):
    """Return the labels of all occupations (Wikidata property P106) of an actor.

    Parameters
    ----------
    wikidata_id : str
        Wikidata entity ID of the actor, e.g. ``"Q40531"``.

    Returns
    -------
    list of str or float
        One label per value returned by ``getlist`` for P106, or ``np.nan``
        when the lookup fails for any ordinary reason.
    """
    try:
        actor = client.get(wikidata_id, load=True)
        occupation_prop = client.get('P106')
        # getlist yields every occupation statement, not just a single value
        return [str(c.label) for c in actor.getlist(occupation_prop)]
    # Best-effort scraping: an ordinary failure becomes NaN so one bad row does
    # not abort a long `.apply` run. Catching `Exception` instead of using a bare
    # `except:` lets KeyboardInterrupt/SystemExit through, so the run can still
    # be stopped by hand.
    except Exception:
        return np.nan
 

In [12]:
# Smoke test: run both lookups on a small random sample before scraping every actor.
# A fixed random_state keeps the sampled rows the same on every run, so the check
# is reproducible.
df_Sample = df.sample(10, random_state=42)
df_Sample['actor_country_of_citizenship'] = df_Sample['Wikidata_actor_id'].apply(get_actor_country_of_citizenship)
df_Sample['actor_occupation'] = df_Sample['Wikidata_actor_id'].apply(get_actor_occupation)
df_Sample

Unnamed: 0,Actor_name,connectivity,actor_gender,Actor_date_of_birth,Actor_ethnicity,Freebase_actor_ID,community,Wikidata_actor_id,actor_country_of_citizenship,actor_occupation
3422,Gundu Hanmantha Rao,44,M,,/m/0dryh9k,/m/0gbx0xx,6,Q5618679,India,"[actor, comedian, stage actor]"
1381,Guru Dutt,13,M,1925-07-09,/m/0dryh9k,/m/06kxs3,2,Q149136,British Raj,"[film actor, film director, film producer, cho..."
122,George Brent,29,M,1899-03-15,,/m/033htq,1,Q1124735,Republic of Ireland,"[actor, stage actor, television actor, film ac..."
2999,Courtney B. Vance,10,M,1960-03-12,/m/0x67,/m/0337t1,5,Q710169,United States of America,"[film actor, television actor, stage actor, fi..."
4615,Natalia Tena,124,F,1984-11-01,/m/03ttfc,/m/0b7ct_,10,Q232163,United Kingdom,"[actor, musician, stage actor, film actor, sin..."
5621,Kenji Sahara,86,M,1932-05-14,,/m/04byn,16,Q977656,Japan,"[actor, film actor, lyricist]"
3550,Rohini,9,F,1962-02-02,,/m/04m_1zf,6,Q277662,India,"[voice actor, actor, film director, screenwrit..."
6002,Pauline Bush,3,F,1886-05-22,,/m/0ksvc0,19,Q2440503,United States of America,"[actor, film actor, stage actor]"
2286,John Rhys-Davies,80,M,1944-05-05,/m/06gbnc,/m/01846t,4,Q16455,United Kingdom,"[actor, television actor, film actor, stage ac..."
2924,Mike Judge,18,M,1962-10-17,,/m/01p8r8,5,Q434585,United States of America,"[film director, actor, voice actor, screenwrit..."


In [None]:
# Scrape country of citizenship and occupation for all actors in the top 20 communities
for scraped_column, scrape in (
    ('actor_country_of_citizenship', get_actor_country_of_citizenship),
    ('actor_occupation', get_actor_occupation),
):
    df[scraped_column] = df['Wikidata_actor_id'].apply(scrape)

In [14]:
# Checkpoint: the scraped dataframe was saved once (the save call is kept commented out)
# and is reloaded here instead of being scraped again.
# NOTE(review): the file loaded below has a different name and directory from the one in
# the commented-out save — presumably it was renamed and moved; confirm they match.
# df.to_csv('top_20_communities_all_actors_wikidata_with_country_occupation.csv', index=False)
df=pd.read_csv("CSV_files/top_20_communities_after_scraping.csv")

In [15]:
# Number of actors per country of citizenship
df['actor_country_of_citizenship'].value_counts()

United States of America    2798
India                       1130
United Kingdom               722
Japan                        233
Canada                       131
                            ... 
Colombia                       1
Argentina                      1
Greece                         1
Malta                          1
Lithuania                      1
Name: actor_country_of_citizenship, Length: 71, dtype: int64

In [16]:
# Frequency of each distinct occupation entry. Entries are compared as whole values, so
# the same occupations listed in a different order are counted as separate entries.
df['actor_occupation'].value_counts()

['actor']                                                                        615
['actor', 'film actor']                                                          298
['actor', 'television actor']                                                    293
['actor', 'film actor', 'television actor']                                      112
['actor', 'television actor', 'film actor']                                      103
                                                                                ... 
['actor', 'film actor', 'screenwriter', 'television actor']                        1
['actor', 'film actor', 'film director', 'television actor', 'film producer']      1
['voice actor', 'comedian']                                                        1
['actor', 'painter', 'television actor', 'film actor', 'screenwriter']             1
['author', 'actor', 'journalist']                                                  1
Name: actor_occupation, Length: 2794, dtype: int64

In [17]:
# Count the missing entries in the two scraped columns
scraped_nan_counts = df[['actor_country_of_citizenship', 'actor_occupation']].isna().sum()
print('Number of nan values in country of citizenship column =', scraped_nan_counts['actor_country_of_citizenship'])
print('Number of nan values in occupation column =', scraped_nan_counts['actor_occupation'])

Number of nan values in country of citizenship column = 310
Number of nan values in occupation column = 271
