In [1]:
import pandas as pd
import numpy as np
import wikidata

# scraping libraries
from bs4 import BeautifulSoup
from wikidata.client import Client

In [2]:
# Load the actor table from CSV. The Wikidata lookups in the later cells read
# the ids from its 'Wikidata_actor_id' column.
df = pd.read_csv('top_20_communities_all_actors_wikidata.csv')

In [3]:
# Display the loaded frame to check its columns and row count.
df

Unnamed: 0,Actor_name,connectivity,actor_gender,Actor_date_of_birth,Actor_ethnicity,Freebase_actor_ID,community,Wikidata_actor_id
0,John Wayne,286,M,1907-05-26,/m/063k3h,/m/043gj,1,Q40531
1,Roy Rogers,242,M,1911-11-05,/m/07hwkr,/m/01mc6h1,1,Q367129
2,George 'Gabby' Hayes,130,M,1885-05-07,,/m/03l6jx,1,Q1277973
3,Basil Rathbone,121,M,1892-06-13,,/m/0hwd8,1,Q336865
4,John Carradine,114,M,1906-02-05,,/m/021mlp,1,Q312878
...,...,...,...,...,...,...,...,...
6069,Pushpavalli,3,F,,,/m/080hxfv,20,Q7261853
6070,Sarathi,3,M,1942-06-26,,/m/0463_8t,20,Q7423331
6071,Ragini,3,F,1937,,/m/06w4wmq,20,Q7283094
6072,Nirmalamma,3,F,1920,,/m/07kj3f9,20,Q7040057


In [4]:
# Create the Wikidata client. The get_actor_* lookup functions defined below
# read this object as a module-level global, so this cell must run before them.
client = Client()

In [5]:
def get_actor_description(wikidata_id):
    """Return the English Wikidata description of an actor.

    Parameters
    ----------
    wikidata_id : str
        Wikidata entity id of the actor (e.g. 'Q40531').

    Returns
    -------
    str or float
        The English description text, or ``np.nan`` when the id is missing
        or the lookup fails (entity cannot be loaded, no English
        description available, ...).
    """
    # Missing ids (NaN, None, empty string) cannot be resolved, so skip the
    # lookup instead of sending a request that is bound to fail.
    if not isinstance(wikidata_id, str) or not wikidata_id:
        return np.nan
    try:
        actor = client.get(wikidata_id, load=True)
        return actor.description.texts['en']
    except Exception:
        # Best-effort lookup: any ordinary failure maps to NaN so that a
        # DataFrame-wide .apply keeps going. KeyboardInterrupt / SystemExit
        # are deliberately not caught, so a long run can still be interrupted.
        return np.nan
 

In [6]:
def get_actor_country_of_citizenship(wikidata_id):
    """Return the label of an actor's country of citizenship (Wikidata P27).

    Parameters
    ----------
    wikidata_id : str
        Wikidata entity id of the actor.

    Returns
    -------
    str or float
        ``str()`` of the label of the value stored under property P27, or
        ``np.nan`` when the id is missing or the lookup fails (entity cannot
        be loaded, no P27 statement, ...).
    """
    # Missing ids (NaN, None, empty string) cannot be resolved, so skip the
    # lookup instead of sending a request that is bound to fail.
    if not isinstance(wikidata_id, str) or not wikidata_id:
        return np.nan
    try:
        actor = client.get(wikidata_id, load=True)
        citizenship_prop = client.get('P27')
        citizenship = actor[citizenship_prop]
        return str(citizenship.label)
    except Exception:
        # Best-effort lookup: any ordinary failure maps to NaN so that a
        # DataFrame-wide .apply keeps going. KeyboardInterrupt / SystemExit
        # are deliberately not caught, so a long run can still be interrupted.
        return np.nan
 

In [7]:
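# Manual spot check on a single entity (intentionally left commented out).
# NOTE: get_actor_occupation is defined in a later cell; run that cell first
# before uncommenting the last line.
# actor = client.get('Q1971717', load=True)
# citizenship_prop = client.get('P27')
# citizenship = actor[citizenship_prop]
# str(citizenship.label)
# get_actor_occupation('Q1971717')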

In [8]:
def get_actor_occupation(wikidata_id):
    """Return the labels of all occupations (Wikidata P106) of an actor.

    Parameters
    ----------
    wikidata_id : str
        Wikidata entity id of the actor.

    Returns
    -------
    list of str or float
        One ``str()``-converted label per value returned by
        ``actor.getlist`` for property P106, or ``np.nan`` when the id is
        missing or the lookup fails.
    """
    # Missing ids (NaN, None, empty string) cannot be resolved, so skip the
    # lookup instead of sending a request that is bound to fail.
    if not isinstance(wikidata_id, str) or not wikidata_id:
        return np.nan
    try:
        actor = client.get(wikidata_id, load=True)
        occupation_prop = client.get('P106')
        # getlist (rather than actor[prop]) so that every occupation is kept,
        # not just a single one.
        return [str(occupation.label) for occupation in actor.getlist(occupation_prop)]
    except Exception:
        # Best-effort lookup: any ordinary failure maps to NaN so that a
        # DataFrame-wide .apply keeps going. KeyboardInterrupt / SystemExit
        # are deliberately not caught, so a long run can still be interrupted.
        return np.nan
 

In [9]:
# Smoke test: run all three lookups on a random sample of 10 actors before
# committing to the full frame. The columns are added in the order listed.
df_Sample = df.sample(10).assign(
    actor_description=lambda rows: rows['Wikidata_actor_id'].apply(get_actor_description),
    actor_country_of_citizenship=lambda rows: rows['Wikidata_actor_id'].apply(get_actor_country_of_citizenship),
    actor_occupation=lambda rows: rows['Wikidata_actor_id'].apply(get_actor_occupation),
)
df_Sample

Unnamed: 0,Actor_name,connectivity,actor_gender,Actor_date_of_birth,Actor_ethnicity,Freebase_actor_ID,community,Wikidata_actor_id,actor_description,actor_country_of_citizenship,actor_occupation
4449,Hung Yan Yan,9,M,1965-02-25,,/m/064nh_q,9,Q3787623,actor,People's Republic of China,"[actor, film director]"
3489,Poornam Vishwanathan,20,M,1921,,/m/04mxlwb,6,Q7228866,Indian actor,India,[actor]
3048,Morris Chestnut,8,M,1969-01-01,/m/0x67,/m/0hqyy,5,Q472053,American actor,United States of America,"[actor, film producer, television actor, film ..."
4661,Georgie Henley,18,F,1995-07-09,,/m/097c18,10,Q228875,English actress,United Kingdom,"[actor, child actor, stage actor, film actor, ..."
3124,Mariel Hemingway,6,F,1961-11-22,,/m/02ldxr,5,Q234101,American actress and author,United States of America,"[actor, film actor, television actor, screenwr..."
2380,David Tomlinson,25,M,1917-05-07,,/m/02w3kb,4,Q933980,English actor (1917-2000),United Kingdom,"[actor, stage actor, film actor, television ac..."
4893,Rachel Weisz,7,F,1970-03-07,/m/013b6_,/m/014x77,11,Q134077,British actress,United Kingdom,"[television actor, film actor, film director, ..."
4107,Angelina Jolie,19,F,1975-06-04,/m/027n1m6,/m/0f4vbz,8,Q13909,American actress (born 1975),United States of America,"[film actor, film director, film producer, mod..."
842,Ruth Chatterton,3,F,1892-12-24,,/m/0173p_,1,Q287713,actress (1892-1961),United States of America,"[stage actor, film actor, television actor, no..."
3846,Millie Perkins,6,F,1938-05-12,,/m/05v_f6,7,Q266512,actress,United States of America,"[actor, model, television actor, film actor]"


In [10]:
# Enrich the full frame in place: one new column per lookup function, filled
# in the order listed (each lookup runs over every row before the next starts).
for new_column, lookup in (
    ('actor_description', get_actor_description),
    ('actor_country_of_citizenship', get_actor_country_of_citizenship),
    ('actor_occupation', get_actor_occupation),
):
    df[new_column] = df['Wikidata_actor_id'].apply(lookup)

In [11]:
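# Save the enriched data to a new CSV file.
# NOTE: the write below is commented out, so nothing is persisted until it is re-enabled.
# df.to_csv('top_20_communities_all_actors_wikidata_with_country_occupation.csv', index=False)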