## Collecting data from Wikidata

Methodology: For each movie in our database, we send a query to Wikidata using its Freebase ID.
If the query returns exactly one result, we record its Wikidata URI, send a GET request to that URI, and save the full response as a JSON file.
We also build a dataframe, indexed by Freebase ID, containing the Wikidata URI, Wikidata ID and IMDb ID.

Here we show the resulting dataframes only. You can find the processed dataset [here](https://drive.google.com/drive/folders/1FycaszmTdI2UjO06tgsg5nqvtpLG_z4s?usp=sharing).

### Libraries used and imports

In [133]:
import json
import shutil
from pathlib import Path
from typing import Optional, Tuple

import numpy as np
import pandas as pd
import requests
from tqdm.notebook import tqdm

### Loading data

In [116]:
# Directory containing the raw MovieSummaries files (relative path);
# movie.metadata.tsv is read from here below.
cmu_path = Path('../data/raw/MovieSummaries/')

In [5]:
# Column headers applied to movie.metadata.tsv, in file order; they are
# supplied to read_csv through `names`.
col_names = [
    'Wikipedia movie ID', 'Freebase movie ID', 'Movie name',
    'Movie release date', 'Movie box office revenue', 'Movie runtime',
    'Movie languages (Freebase ID:name tuples)',
    'Movie countries (Freebase ID:name tuples)',
    'Movie genres (Freebase ID:name tuples)',
]

# Tab-separated movie metadata, one row per movie.
df_movie = pd.read_csv(
    cmu_path / 'movie.metadata.tsv',
    sep='\t',
    names=col_names,
)

### Variables

In [87]:
# SPARQL endpoint that the queries below are posted to.
WIKIPEDIA_QUERY_URL = 'https://query.wikidata.org/sparql'

# SPARQL template selecting the items whose P646 statement equals a given
# Freebase ID. `{0}` is filled in with str.format, which is why every
# literal brace of the query is doubled. The pieces are concatenated
# without newlines.
QUERY_PAYLOAD = (
    'SELECT DISTINCT ?item ?itemLabel WHERE {{'
    '  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}'
    '  {{'
    '    SELECT DISTINCT ?item WHERE {{'
    '      ?item p:P646 ?statement0.'
    '      ?statement0 (ps:P646) "{0}".'
    '    }} LIMIT 100'
    '  }}'
    '}}'
)


# Destination directory (as a plain string) for the raw Wikidata data.
DST_PATH = '../data/raw/wikidata/'

# Value sent in the User-Agent header of every request.
USER_AGENT = 'akgokce'

### Helper functions

In [None]:
def post_query(freebase_id: str, timeout: float = 120.0) -> dict:
    '''
    Posts a query to Wikidata and returns the response as a dict.

    Args:
        freebase_id: Freebase ID substituted into QUERY_PAYLOAD.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a single stalled connection would block forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status.
        ValueError: If the response body is not valid JSON.
    '''
    query = QUERY_PAYLOAD.format(freebase_id)
    r = requests.post(
        WIKIPEDIA_QUERY_URL,
        data={'format': 'json', 'query': query},
        headers={'User-Agent': USER_AGENT},
        timeout=timeout,
    )
    # Surface 4xx/5xx responses as errors instead of trying to parse
    # whatever error page came back.
    r.raise_for_status()
    return r.json()

def get_wikidata(wikidata_uri: str, timeout: float = 120.0) -> dict:
    '''
    Sends a GET request to a Wikidata URI and returns the response as a dict.

    Args:
        wikidata_uri: URI to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a single stalled connection would block forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status.
        ValueError: If the response body is not valid JSON.
    '''
    r = requests.get(
        wikidata_uri,
        headers={'User-Agent': USER_AGENT},
        timeout=timeout,
    )
    # Surface 4xx/5xx responses as errors instead of trying to parse
    # whatever error page came back.
    r.raise_for_status()
    return r.json()

def get_imdb_id(wikidata: dict, wikidata_id: str) -> Optional[str]:
    '''
    Read a Wikidata JSON response and return IMDb ID.

    The ID is read from the first P345 claim of the entity, at
    entities -> <wikidata_id> -> claims -> P345[0] -> mainsnak ->
    datavalue -> value.

    Args:
        wikidata: Decoded JSON response for the entity.
        wikidata_id: Key of the entity inside wikidata['entities'].

    Returns:
        The IMDb ID, or None when any step of that path is missing or
        has an unexpected type.
    '''
    try:
        claims = wikidata['entities'][wikidata_id]['claims']
        return claims['P345'][0]['mainsnak']['datavalue']['value']
    except (AttributeError, IndexError, KeyError, TypeError):
        # Only lookup failures mean "no IMDb ID"; anything else (e.g.
        # KeyboardInterrupt) must propagate.
        return None

def get_wikidata_ids(freebase_id: str) -> Optional[str]:
    '''
    Using SPARQL statements, posts a query to Wikidata and returns URI.

    Args:
        freebase_id: Freebase ID of the movie to look up.

    Returns:
        The Wikidata URI when the query yields exactly one result.
        None when there are zero or several results, or when the request
        or the parsing of its response fails; a message is printed in
        each of these cases.
    '''
    try:
        response = post_query(freebase_id)
        results = response['results']['bindings']
        if not results:
            print(f'No movies found for {freebase_id}')
            return None
        if len(results) >= 2:
            # An ambiguous match is treated as a failure rather than guessed.
            print(f'Multiple movies found for {freebase_id}')
            return None
        return results[0]['item']['value']
    except Exception:
        # Network errors, non-JSON bodies and unexpected response shapes
        # end up here. `Exception` (not a bare except) keeps Ctrl-C able
        # to stop a long-running loop over many IDs.
        print(f'Failed to process the response for {freebase_id}')
        return None


def process_wikidata(
    wikidata_uri: str, save_json: Optional[bool] = None
) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    '''
    Given a Wikidata URI, send a GET request and process the response, returning Wikidata URI, Wikidata ID and IMDb ID if possible.

    Args:
        wikidata_uri: URI of the entity; its last path segment is used as
            the Wikidata ID.
        save_json: When truthy, the raw response is also written to
            DST_PATH/<wikidata_id>.json.

    Returns:
        (wikidata_uri, wikidata_id, imdb_id) on success, where imdb_id may
        be None. (None, None, None) when the request, the optional save,
        or the processing fails; a message is printed in that case.
    '''
    try:
        wikidata_id = wikidata_uri.split('/')[-1]
        response = get_wikidata(wikidata_uri)
        if save_json:
            # DST_PATH is a plain string, so wrap it in Path before joining.
            dst_dir = Path(DST_PATH)
            # Create the destination on first use so the write cannot fail
            # merely because the directory is missing.
            dst_dir.mkdir(parents=True, exist_ok=True)
            with open(dst_dir / f'{wikidata_id}.json', 'w', encoding='utf-8') as file:
                json.dump(response, file, indent=2)
        imdb_id = get_imdb_id(response, wikidata_id)
        return (wikidata_uri, wikidata_id, imdb_id)
    except Exception:
        # `Exception` (not a bare except) lets KeyboardInterrupt propagate.
        print(f'Failed to a response for {wikidata_uri}')
        return (None, None, None)

### Get data from Wikidata

In [None]:
# Freebase ID -> (Wikidata URI, Wikidata ID, IMDb ID) for every movie whose
# URI lookup succeeded; IDs whose lookup failed are collected separately.
wikipedia_data = {}
failed_list = []
for freebase_id in tqdm(df_movie['Freebase movie ID']):
    wikidata_uri = get_wikidata_ids(freebase_id)
    if wikidata_uri is not None:
        # The URI returned by process_wikidata is discarded; the one from
        # the lookup above is stored instead.
        _, wikidata_id, imdb_id = process_wikidata(wikidata_uri, True)
        wikipedia_data[freebase_id] = (wikidata_uri, wikidata_id, imdb_id)
    else:
        failed_list.append(freebase_id)

### Convert to DataFrame and save as CSV

In [None]:
# One row per Freebase ID (used as the index), then persist it as CSV.
df_wiki = pd.DataFrame.from_dict(
    wikipedia_data,
    orient='index',
    columns=['Wikidata URI', 'Wikidata ID', 'IMDb ID'],
)
df_wiki.to_csv('../data/processed/wikipedia_ids.csv')

In [85]:
# Reload the saved table from disk and preview its first five rows.
df_wiki = pd.read_csv('../data/processed/wikipedia_ids.csv')
df_wiki.head()

Unnamed: 0.1,Unnamed: 0,Wikidata URI,Wikidata ID,IMDb ID
0,/m/03vyhn,http://www.wikidata.org/entity/Q261700,Q261700,tt0228333
1,/m/08yl5d,http://www.wikidata.org/entity/Q16250726,Q16250726,tt0245916
2,/m/0crgdbh,http://www.wikidata.org/entity/Q4978832,Q4978832,tt0094806
3,/m/0285_cd,http://www.wikidata.org/entity/Q7995657,Q7995657,tt0094320
4,/m/01mrr1,http://www.wikidata.org/entity/Q869644,Q869644,tt0083949


### FreebaseID to Ethnicity

In [88]:
# Adapted from https://edstem.org/eu/courses/134/discussion/8415

# Single SPARQL query returning, for every item that is an instance of
# "ethnic group" (Q41710), its Freebase ID and English label.
query = '''SELECT ?item ?freebaseID ?name WHERE {
  ?item p:P646 [ps:P646 ?freebaseID]. #get the freebaseID
  ?item rdfs:label ?name.             #get the name of the enthnic group
  ?item p:P31 [ps:P31 wd:Q41710].     #get only the items whose "instance of" is "ethnic group"
  filter(lang(?name) = "en")          #get the names in english
}'''

r = requests.post(
    WIKIPEDIA_QUERY_URL,
    data={'format': 'json', 'query': query},
    headers={'User-Agent': USER_AGENT},
    timeout=120,  # never wait indefinitely on a stalled connection
)
# Stop here on a 4xx/5xx status instead of parsing an error page below.
r.raise_for_status()
response = r.json()

In [89]:
# Freebase ID -> (English label, Wikidata URI), one entry per result row.
# If a Freebase ID occurs in several rows, the last row wins.
freebaseID_ethnicity = {
    row['freebaseID']['value']: (row['name']['value'], row['item']['value'])
    for row in response['results']['bindings']
}

In [92]:
# One row per Freebase ID (used as the index), then persist it as CSV.
df_ethnicity = pd.DataFrame.from_dict(
    freebaseID_ethnicity,
    orient='index',
    columns=['Ethnicity', 'Wikidata URI'],
)
df_ethnicity.to_csv('../data/processed/ethnicity.csv')

In [93]:
# Reload the saved table from disk and preview its first five rows.
df_ethnicity = pd.read_csv('../data/processed/ethnicity.csv')
df_ethnicity.head()

Unnamed: 0.1,Unnamed: 0,Ethnicity,Wikidata URI
0,/m/03c8vzn,Czech Canadians,http://www.wikidata.org/entity/Q3498126
1,/m/04g1jdt,Chagatai people,http://www.wikidata.org/entity/Q3501965
2,/m/05q747b,Croatian Chilean,http://www.wikidata.org/entity/Q3503021
3,/m/0g6k0p,Croatian Peruvian,http://www.wikidata.org/entity/Q3503027
4,/m/047d03w,Croatian Brazilian,http://www.wikidata.org/entity/Q3507533
