# Wikidata Scraping with SPARQL

In this notebook, we retrieve data from [Wikidata](https://www.wikidata.org) through the **Wikidata Query Service**: batches of identifiers are sent as **SPARQL** queries and the results are downloaded directly as CSV files.

In [4]:
import pandas as pd
import numpy as np
import time
import requests

from wikidata_scraping import download_wikidata_sparql_csv

# Root folder (relative path) that every input file is read from and every output/temporary file is written to.
DATA_PATH = "./../../Data/"

# Wikidata scraping - Movies

In [3]:
# Define query to search for movies based on their imdbID.
#
# The "|" line inside VALUES is a placeholder: the download cell replaces it with a
# newline-separated list of quoted imdbIDs via generic_query.replace("|", ...).
# The query returns one row per imdbID (GROUP BY ?imdbID): single-valued columns are picked
# with SAMPLE, multi-valued columns are joined with "," by GROUP_CONCAT after commas inside
# each individual value have been removed, so the separator stays unambiguous.
# NOTE: the English rdfs:label pattern binding ?title is not OPTIONAL, so an item without an
# English label produces no row at all.

generic_query = """
SELECT ?imdbID
       (SAMPLE(?wikidataID) AS ?wikidataID)
       (SAMPLE(?freebaseID) AS ?freebaseID)
       (SAMPLE(?wikipediaLink) AS ?wikipediaLink)
       (SAMPLE(?title) AS ?title)
       (SAMPLE(?description) AS ?description)
       (GROUP_CONCAT(DISTINCT REPLACE(STR(?runtimes), ",", ""); SEPARATOR=",") AS ?runtime)
       (GROUP_CONCAT(DISTINCT REPLACE(STR(?releaseDates), ",", ""); SEPARATOR=",") AS ?releaseDate)
       (GROUP_CONCAT(DISTINCT REPLACE(?genreLabel, ",", ""); SEPARATOR=",") AS ?genres)
       (GROUP_CONCAT(DISTINCT REPLACE(?countryLabel, ",", ""); SEPARATOR=",") AS ?countries)
       (GROUP_CONCAT(DISTINCT REPLACE(?languageLabel, ",", ""); SEPARATOR=",") AS ?languages)
       (GROUP_CONCAT(DISTINCT REPLACE(?companyLabel, ",", ""); SEPARATOR=",") AS ?productionCompanies)
WHERE {
  VALUES ?imdbID {
|
  }
  
  ?item wdt:P345 ?imdbID .
  BIND(STR(?item) AS ?wikidataID)

  OPTIONAL { ?item wdt:P646 ?freebaseID }
  OPTIONAL { ?item wdt:P136 ?genre . ?genre rdfs:label ?genreLabel FILTER(LANG(?genreLabel) = "en") }
  OPTIONAL { ?item wdt:P495 ?country . ?country rdfs:label ?countryLabel FILTER(LANG(?countryLabel) = "en") }
  OPTIONAL { ?item wdt:P364 ?language . ?language rdfs:label ?languageLabel FILTER(LANG(?languageLabel) = "en") }
  OPTIONAL { ?item wdt:P272 ?company . ?company rdfs:label ?companyLabel FILTER(LANG(?companyLabel) = "en") }
  OPTIONAL { ?item wdt:P577 ?releaseDates }
  OPTIONAL { ?item wdt:P2047 ?runtimes }
  
  OPTIONAL {
    ?wikipediaLink schema:about ?item ;
                   schema:inLanguage "en" ;
                   schema:isPartOf <https://en.wikipedia.org/> .
  }

  ?item rdfs:label ?title FILTER(LANG(?title) = "en") .
  OPTIONAL { ?item schema:description ?description FILTER(LANG(?description) = "en") }
}
GROUP BY ?imdbID
"""

In [5]:
# We run the above defined SPARQL query on groups of 120 imdbIDs, the results are saved in temporary files.
#
# A batch is retried until it succeeds. After every failure the pause before the next request is
# doubled (capped at max_delay seconds) and it drops back to base_delay after a success, so a
# rate-limited endpoint (HTTP 429) is not queried again every 0.25 s.

imdb_ids = pd.read_csv(DATA_PATH + "movies_imdb_tmdb.tsv", sep='\t')['imdb_id_movie'].values

temp_files_name = "tempFiles/wikidata_imdbID_temp_"

n = len(imdb_ids)
step = 120

base_delay = 0.25   # seconds between two requests when everything works
max_delay = 60.0    # upper bound for the pause after repeated failures

c = 1                # 1-based batch counter, also used as the temporary file number
low = step*(c-1)     # index of the first ID of the current batch
delay = base_delay
while low < n:
    up = min(low + step, n)

    # One quoted ID per line, substituted for the "|" placeholder of the VALUES clause
    ids_string = "\n".join(f"\"{imdb_id}\"" for imdb_id in imdb_ids[low:up])
    query = generic_query.replace("|", ids_string)

    success = download_wikidata_sparql_csv(query, f"{DATA_PATH}{temp_files_name}{c:06d}.csv")

    if success:
        low = up
        c += 1
        delay = base_delay
        print(f"{up}/{n} done")
    else:
        # Same batch will be requested again on the next iteration, after a longer pause
        delay = min(2 * delay, max_delay)
        print(f"{low} - {up} to redo")

    time.sleep(delay)

120/381216 done
240/381216 done
360/381216 done
480/381216 done
600/381216 done
720/381216 done
840/381216 done
960/381216 done
1080/381216 done
1200/381216 done
1320/381216 done
1440/381216 done
1560/381216 done
1680/381216 done
1800/381216 done
1920/381216 done
2040/381216 done
2160/381216 done
2280/381216 done
2400/381216 done
2520/381216 done
2640/381216 done
2760/381216 done
2880/381216 done
3000/381216 done
3120/381216 done
3240/381216 done
3360/381216 done
3480/381216 done
3600/381216 done
3720/381216 done
3840/381216 done
3960/381216 done
4080/381216 done
4200/381216 done
4320/381216 done
4440/381216 done
4560/381216 done
4680/381216 done
4800/381216 done
4920/381216 done
5040/381216 done
5160/381216 done
5280/381216 done
5400/381216 done
5520/381216 done
5640/381216 done
5760/381216 done
5880/381216 done
6000/381216 done
6120/381216 done
6240/381216 done
6360/381216 done
6480/381216 done
6600/381216 done
6720/381216 done
6840/381216 done
6960/381216 done
7080/381216 done
7200/

In [8]:
# Define query to search for movies based on their freebaseID.
#
# Same structure as the imdbID query, but the lookup key is the freebaseID: the "|" line inside
# VALUES is replaced by the download cell with a newline-separated list of quoted freebaseIDs,
# and one row is returned per freebaseID (GROUP BY ?freebaseID).
# Multi-valued columns are joined with "," after commas inside each value have been removed.
# NOTE: the English rdfs:label pattern binding ?title is not OPTIONAL, so an item without an
# English label produces no row at all.

generic_query = """
SELECT ?freebaseID
       (SAMPLE(?wikidataID) AS ?wikidataID)
       (SAMPLE(?imdbID) AS ?imdbID)
       (SAMPLE(?wikipediaLink) AS ?wikipediaLink)
       (SAMPLE(?title) AS ?title)
       (SAMPLE(?description) AS ?description)
       (GROUP_CONCAT(DISTINCT REPLACE(STR(?runtimes), ",", ""); SEPARATOR=",") AS ?runtime)
       (GROUP_CONCAT(DISTINCT REPLACE(STR(?releaseDates), ",", ""); SEPARATOR=",") AS ?releaseDate)
       (GROUP_CONCAT(DISTINCT REPLACE(?genreLabel, ",", ""); SEPARATOR=",") AS ?genres)
       (GROUP_CONCAT(DISTINCT REPLACE(?countryLabel, ",", ""); SEPARATOR=",") AS ?countries)
       (GROUP_CONCAT(DISTINCT REPLACE(?languageLabel, ",", ""); SEPARATOR=",") AS ?languages)
       (GROUP_CONCAT(DISTINCT REPLACE(?companyLabel, ",", ""); SEPARATOR=",") AS ?productionCompanies)
WHERE {
  VALUES ?freebaseID {
|
  }
  
  ?item wdt:P646 ?freebaseID .
  BIND(STR(?item) AS ?wikidataID)
  
  OPTIONAL { ?item wdt:P136 ?genre . ?genre rdfs:label ?genreLabel FILTER(LANG(?genreLabel) = "en") }
  OPTIONAL { ?item wdt:P495 ?country . ?country rdfs:label ?countryLabel FILTER(LANG(?countryLabel) = "en") }
  OPTIONAL { ?item wdt:P364 ?language . ?language rdfs:label ?languageLabel FILTER(LANG(?languageLabel) = "en") }
  OPTIONAL { ?item wdt:P272 ?company . ?company rdfs:label ?companyLabel FILTER(LANG(?companyLabel) = "en") }
  OPTIONAL { ?item wdt:P577 ?releaseDates }
  OPTIONAL { ?item wdt:P2047 ?runtimes }
  
  OPTIONAL {
    ?wikipediaLink schema:about ?item ;
                   schema:inLanguage "en" ;
                   schema:isPartOf <https://en.wikipedia.org/> .
  }

  ?item rdfs:label ?title FILTER(LANG(?title) = "en") .
  OPTIONAL { ?item schema:description ?description FILTER(LANG(?description) = "en") }
}
GROUP BY ?freebaseID
"""

In [12]:
# We run the above defined SPARQL query on groups of 120 freebaseIDs, the results are saved in temporary files.
#
# Only the freebaseIDs that the imdbID-based run did not already return are queried.
# NOTE(review): "wikidata_imdbID.csv" is not written by any cell visible in this notebook -
# presumably it is the concatenation of the imdbID batch files; confirm before a fresh run.
#
# A batch is retried until it succeeds. After every failure the pause before the next request is
# doubled (capped at max_delay seconds) and it drops back to base_delay after a success, so a
# rate-limited endpoint (HTTP 429) is not queried again every 0.25 s.

metadata_freebase_ids = set(pd.read_csv(DATA_PATH + 'movie.metadata.tsv', sep='\t', header=None)[1].values)
already_found_ids = set(pd.read_csv(DATA_PATH + "wikidata_imdbID.csv")['freebaseID'].values)

# Sorted (by string form, so a stray NaN cannot break the comparison) to make the batch
# partition identical in every kernel session; plain set iteration order is not stable.
freebase_ids = sorted(metadata_freebase_ids - already_found_ids, key=str)

# Same prefix as the imdbID run on purpose: both sets of files are concatenated together afterwards.
temp_files_name = "tempFiles/wikidata_imdbID_temp_"

n = len(freebase_ids)
step = 120

base_delay = 0.25   # seconds between two requests when everything works
max_delay = 60.0    # upper bound for the pause after repeated failures

c = 1                # 1-based batch counter
low = step*(c-1)     # index of the first ID of the current batch
delay = base_delay
while low < n:
    up = min(low + step, n)

    # One quoted ID per line, substituted for the "|" placeholder of the VALUES clause
    ids_string = "\n".join(f"\"{freebase_id}\"" for freebase_id in freebase_ids[low:up])
    query = generic_query.replace("|", ids_string)

    # File numbers continue after the imdbID batches: 381216 IDs / 120 per batch = files 1..3177
    success = download_wikidata_sparql_csv(query, f"{DATA_PATH}{temp_files_name}{3178+c-1:06d}.csv")

    if success:
        low = up
        c += 1
        delay = base_delay
        print(f"{up}/{n} done")
    else:
        # Same batch will be requested again on the next iteration, after a longer pause
        delay = min(2 * delay, max_delay)
        print(f"{low} - {up} to redo")

    time.sleep(delay)

120/21998 done
240/21998 done
360/21998 done
480/21998 done
600/21998 done
720/21998 done
840/21998 done
960/21998 done
1080/21998 done
1200/21998 done
1320/21998 done
1440/21998 done
1560/21998 done
1680/21998 done
1800/21998 done
1920/21998 done
2040/21998 done
2160/21998 done
2280/21998 done
2400/21998 done
2520/21998 done
2640/21998 done
2760/21998 done
2880/21998 done
3000/21998 done
3120/21998 done
3240/21998 done
3360/21998 done
3480/21998 done
3600/21998 done
3720/21998 done
3840/21998 done
3960/21998 done
4080/21998 done
4200/21998 done
4320/21998 done
4440/21998 done
4560/21998 done
4680/21998 done
4800/21998 done
4920/21998 done
5040/21998 done
5160/21998 done
5280/21998 done
5400/21998 done
5520/21998 done
5640/21998 done
5760/21998 done
5880/21998 done
6000/21998 done
6120/21998 done
6240/21998 done
6360/21998 done
6480/21998 done
6600/21998 done
6720/21998 done
6840/21998 done
6960/21998 done
7080/21998 done
7200/21998 done
7320/21998 done
7440/21998 done
7560/21998 done


In [16]:
# Merge every temporary batch file (imdbID batches followed by freebaseID batches,
# numbered 000001 to 003361) into one csv file

movie_batch_paths = [f"{DATA_PATH}{temp_files_name}{i:06d}.csv" for i in range(1,3362)]
movie_batches = [pd.read_csv(batch_path) for batch_path in movie_batch_paths]

movies_wikidata = pd.concat(movie_batches, ignore_index=True)
movies_wikidata.to_csv(DATA_PATH + "wikidata_freebaseID_imdbID.csv", index=False)

# Wikidata scraping - Casts

In [None]:
# Define query to search for casts and directors based on movies' imdbID.
#
# The "|" line inside VALUES is a placeholder: the download cell replaces it with a
# newline-separated list of quoted imdbIDs via generic_query.replace("|", ...).
# There is no GROUP BY here: the result has one row per (film, person, role) combination,
# where ?role is the literal "director" (P57 branch) or "actor" (P161 branch).
# For actors, ?characterName is read from the P4633 qualifier of the P161 statement when present.
# ?filmLabel and ?personLabel are not bound by any triple pattern; they are expected to be
# filled in by the wikibase:label service (English).

generic_query = """
SELECT ?film ?freebaseFilmID ?imdbID ?filmLabel ?person ?freebasePersonID ?personIMDbID ?personLabel ?role ?characterName
WHERE {
  VALUES ?imdbID {
|
  }

  ?film wdt:P345 ?imdbID .
  OPTIONAL { ?film wdt:P646 ?freebaseFilmID }
  {
    ?film wdt:P57 ?person .
    BIND("director" AS ?role)
  }
  UNION
  {
    ?film wdt:P161 ?person .
    BIND("actor" AS ?role)
    OPTIONAL {
      ?film p:P161 ?castStatement .
      ?castStatement ps:P161 ?person ;
                     pq:P4633 ?characterName .
    }
  }

  OPTIONAL { ?person wdt:P646 ?freebasePersonID }
  OPTIONAL { ?person wdt:P345 ?personIMDbID }

  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
}
"""

In [None]:
# We run the above defined SPARQL query on groups of 200 imdbIDs, the results are saved in temporary files.
#
# A batch is retried until it succeeds. After every failure the pause before the next request is
# doubled (capped at max_delay seconds) and it drops back to base_delay after a success, so a
# rate-limited endpoint (HTTP 429) is not queried again every 0.25 s.

movies_complete = pd.read_csv(DATA_PATH + "movies_complete.tsv", sep='\t')
imdb_ids_movie = movies_complete["imdb_id_movie"].dropna().values

temp_files_name = "tempFiles/wikidata_cast_temp_"

n = len(imdb_ids_movie)
step = 200

base_delay = 0.25   # seconds between two requests when everything works
max_delay = 60.0    # upper bound for the pause after repeated failures

c = 1                # 1-based batch counter, also used as the temporary file number
low = step*(c-1)     # index of the first ID of the current batch
delay = base_delay
while low < n:
    up = min(low + step, n)

    # One quoted ID per line, substituted for the "|" placeholder of the VALUES clause
    ids_string = "\n".join(f"\"{imdb_id}\"" for imdb_id in imdb_ids_movie[low:up])
    query = generic_query.replace("|", ids_string)

    success = download_wikidata_sparql_csv(query, f"{DATA_PATH}{temp_files_name}{c:06d}.csv")

    if success:
        low = up
        c += 1
        delay = base_delay
        print(f"{up}/{n} done")
    else:
        # Same batch will be requested again on the next iteration, after a longer pause
        delay = min(2 * delay, max_delay)
        print(f"{low} - {up} to redo")

    time.sleep(delay)

200/65454 done
400/65454 done
600/65454 done
800/65454 done
1000/65454 done
1200/65454 done
1400/65454 done
1600/65454 done
1800/65454 done
2000/65454 done
2200/65454 done
2400/65454 done
2600/65454 done
2800/65454 done
3000/65454 done
3200/65454 done
3400/65454 done
3600/65454 done
3800/65454 done
4000/65454 done
4200/65454 done
4400/65454 done
4600/65454 done
4800/65454 done
5000/65454 done
5200/65454 done
5400/65454 done
5600/65454 done
5800/65454 done
6000/65454 done
6200/65454 done
6400/65454 done
6600/65454 done
6800/65454 done
7000/65454 done
7200/65454 done
7400/65454 done
7600/65454 done
7800/65454 done
8000/65454 done
8200/65454 done
8400/65454 done
8600/65454 done
8800/65454 done
9000/65454 done
9200/65454 done
9400/65454 done
9600/65454 done
9800/65454 done
10000/65454 done
10200/65454 done
10400/65454 done
10600/65454 done
10800/65454 done
11000/65454 done
11200/65454 done
11400/65454 done
11600/65454 done
11800/65454 done
12000/65454 done
12200/65454 done
12400/65454 done

In [None]:
# Concatenate all the temporary files and use movies_complete to translate imdbIDs into freebaseIDs, save the result in a csv file

df = pd.concat([pd.read_csv(f"{DATA_PATH}{temp_files_name}{i:06d}.csv") for i in range(1,329)], ignore_index=True)

# imdbID -> freebaseID lookup built from movies_complete.
# Duplicate imdbIDs are reduced to their first occurrence: with a non-unique index a label lookup
# returns a whole Series instead of one value, and Series.map refuses to work at all.
translator = pd.Series(movies_complete.freebase_id_movie.values, index=movies_complete.imdb_id_movie.values)
translator = translator[~translator.index.duplicated(keep="first")]

# Only rows for which Wikidata returned no freebaseID are filled in; an imdbID that is missing
# from the lookup simply leaves the value empty instead of raising a KeyError.
df["freebaseFilmID"] = df["freebaseFilmID"].fillna(df["imdbID"].map(translator))

df.to_csv(DATA_PATH + "wikidata_cast_imdb.csv", index=False)

In [None]:
# Define query to search for casts and directors based on movies' freebaseID.
#
# Same structure as the imdbID-based cast query, but the lookup key is the freebaseID: the "|"
# line inside VALUES is replaced by the download cell with a newline-separated list of quoted
# freebaseIDs, and the film's imdbID becomes the OPTIONAL field.
# There is no GROUP BY: the result has one row per (film, person, role) combination, where ?role
# is the literal "director" (P57 branch) or "actor" (P161 branch).
# ?filmLabel and ?personLabel are not bound by any triple pattern; they are expected to be
# filled in by the wikibase:label service (English).

generic_query = """
SELECT ?film ?freebaseFilmID ?imdbID ?filmLabel ?person ?freebasePersonID ?personIMDbID ?personLabel ?role ?characterName
WHERE {
  VALUES ?freebaseFilmID {
|
  }

  ?film wdt:P646 ?freebaseFilmID .
  OPTIONAL { ?film wdt:P345 ?imdbID }
  {
    ?film wdt:P57 ?person .
    BIND("director" AS ?role)
  }
  UNION
  {
    ?film wdt:P161 ?person .
    BIND("actor" AS ?role)
    OPTIONAL {
      ?film p:P161 ?castStatement .
      ?castStatement ps:P161 ?person ;
                     pq:P4633 ?characterName .
    }
  }

  OPTIONAL { ?person wdt:P646 ?freebasePersonID }
  OPTIONAL { ?person wdt:P345 ?personIMDbID }

  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
}
"""

In [None]:
# We run the above defined SPARQL query on groups of 200 freebaseIDs, the results are saved in temporary files.
#
# Only the films whose freebaseID does not appear in the imdbID-based cast file are queried.
#
# A batch is retried until it succeeds. After every failure the pause before the next request is
# doubled (capped at max_delay seconds) and it drops back to base_delay after a success, so a
# rate-limited endpoint (HTTP 429) is not queried again every 0.25 s.

known_freebase_ids = set(movies_complete["freebase_id_movie"].dropna().values)
covered_freebase_ids = set(pd.read_csv(DATA_PATH + "wikidata_cast_imdb.csv")["freebaseFilmID"].dropna().values)

# Sorted so that the batch partition is identical in every kernel session;
# plain set iteration order is not stable for strings.
freebase_ids = sorted(known_freebase_ids - covered_freebase_ids, key=str)

temp_files_name = "tempFiles/wikidata_cast_temp_"

n = len(freebase_ids)
step = 200

base_delay = 0.25   # seconds between two requests when everything works
max_delay = 60.0    # upper bound for the pause after repeated failures

c = 1                # 1-based batch counter
low = step*(c-1)     # index of the first ID of the current batch
delay = base_delay
while low < n:
    up = min(low + step, n)

    # One quoted ID per line, substituted for the "|" placeholder of the VALUES clause
    ids_string = "\n".join(f"\"{freebase_id}\"" for freebase_id in freebase_ids[low:up])
    query = generic_query.replace("|", ids_string)

    # File numbers continue after the imdbID cast batches (files 1..328)
    success = download_wikidata_sparql_csv(query, f"{DATA_PATH}{temp_files_name}{329+c-1:06d}.csv")

    if success:
        low = up
        c += 1
        delay = base_delay
        print(f"{up}/{n} done")
    else:
        # Same batch will be requested again on the next iteration, after a longer pause
        delay = min(2 * delay, max_delay)
        print(f"{low} - {up} to redo")

    time.sleep(delay)

200/15949 done
400/15949 done
600/15949 done
800/15949 done
1000/15949 done
1200/15949 done
1400/15949 done
1600/15949 done
1800/15949 done
2000/15949 done
Error: Failed to retrieve data. Status code 429
2000 - 2200 to redo
Error: Failed to retrieve data. Status code 429
2000 - 2200 to redo
2200/15949 done
Error: Failed to retrieve data. Status code 429
2200 - 2400 to redo
Error: Failed to retrieve data. Status code 429
2200 - 2400 to redo
2400/15949 done
Error: Failed to retrieve data. Status code 429
2400 - 2600 to redo
Error: Failed to retrieve data. Status code 429
2400 - 2600 to redo
2600/15949 done
Error: Failed to retrieve data. Status code 429
2600 - 2800 to redo
Error: Failed to retrieve data. Status code 429
2600 - 2800 to redo
2800/15949 done
Error: Failed to retrieve data. Status code 429
2800 - 3000 to redo
3000/15949 done
Error: Failed to retrieve data. Status code 429
3000 - 3200 to redo
3200/15949 done
Error: Failed to retrieve data. Status code 429
3200 - 3400 to redo


In [None]:
# Sanity check on the freebaseID-based cast batches (files 000329 to 000408):
# once concatenated they contain no rows, i.e. nothing is added to the imdbID-based result

freebase_batch_paths = [f"{DATA_PATH}{temp_files_name}{i:06d}.csv" for i in range(329,409)]
df = pd.concat([pd.read_csv(batch_path) for batch_path in freebase_batch_paths], ignore_index=True)

print(df.shape[0])

0


# Wikidata scraping - People

In [None]:
# Define query to search for people based on their imdbID.
#
# The "|" line inside VALUES is a placeholder: the download cell replaces it with a
# newline-separated list of quoted imdbIDs via generic_query.replace("|", ...).
# The query returns one row per imdbID (GROUP BY ?imdbID): single-valued columns are picked
# with SAMPLE, multi-valued columns are joined with "," by GROUP_CONCAT after commas inside
# each individual value have been removed, so the separator stays unambiguous.
# NOTE: the English rdfs:label pattern binding ?itemLabel is not OPTIONAL, so a person without
# an English label produces no row at all.

generic_query = """
SELECT ?imdbID
       (SAMPLE(?wikidataID) AS ?wikidataID)
       (SAMPLE(?freebaseID) AS ?freebaseID)
       (SAMPLE(?wikipediaLink) AS ?wikipediaLink)
       (SAMPLE(?itemLabel) AS ?nameSurname)
       (GROUP_CONCAT(DISTINCT REPLACE(STR(?givenNameLabel), ",", ""); SEPARATOR=",") AS ?givenName)
       (SAMPLE(?familyNameLabel) AS ?familyName)
       (GROUP_CONCAT(DISTINCT REPLACE(STR(?birthDates), ",", ""); SEPARATOR=",") AS ?birthDate)
       (GROUP_CONCAT(DISTINCT REPLACE(STR(?genderLabel), ",", ""); SEPARATOR=",") AS ?gender)
       (GROUP_CONCAT(DISTINCT REPLACE(STR(?citizenshipLabel), ",", ""); SEPARATOR=",") AS ?citizenship)
       (SAMPLE(?placeOfBirthLabel) AS ?placeOfBirth)
       (GROUP_CONCAT(DISTINCT REPLACE(STR(?nativeLanguageLabel), ",", ""); SEPARATOR=",") AS ?nativeLanguage)
       (GROUP_CONCAT(DISTINCT REPLACE(STR(?deathDates), ",", ""); SEPARATOR=",") AS ?deathDate)
WHERE {
  VALUES ?imdbID {
|
  }

  ?item wdt:P345 ?imdbID .
  BIND(STR(?item) AS ?wikidataID)

  OPTIONAL { ?item wdt:P646 ?freebaseID }

  OPTIONAL { ?item wdt:P735 ?givenName . ?givenName rdfs:label ?givenNameLabel FILTER(LANG(?givenNameLabel) = "en") }
  OPTIONAL { ?item wdt:P734 ?familyName . ?familyName rdfs:label ?familyNameLabel FILTER(LANG(?familyNameLabel) = "en") }

  OPTIONAL { ?item wdt:P569 ?birthDates }
  OPTIONAL { ?item wdt:P21 ?gender . ?gender rdfs:label ?genderLabel FILTER(LANG(?genderLabel) = "en") }
  OPTIONAL { ?item wdt:P27 ?citizenship . ?citizenship rdfs:label ?citizenshipLabel FILTER(LANG(?citizenshipLabel) = "en") }
  OPTIONAL { ?item wdt:P19 ?placeOfBirth . ?placeOfBirth rdfs:label ?placeOfBirthLabel FILTER(LANG(?placeOfBirthLabel) = "en") }
  OPTIONAL { ?item wdt:P103 ?nativeLanguage . ?nativeLanguage rdfs:label ?nativeLanguageLabel FILTER(LANG(?nativeLanguageLabel) = "en") }
  OPTIONAL { ?item wdt:P570 ?deathDates }

  OPTIONAL {
    ?wikipediaLink schema:about ?item ;
                   schema:inLanguage "en" ;
                   schema:isPartOf <https://en.wikipedia.org/> .
  }

  ?item rdfs:label ?itemLabel FILTER(LANG(?itemLabel) = "en") .
}
GROUP BY ?imdbID
"""

In [None]:
# We run the above defined SPARQL query on groups of 200 imdbIDs, the results are saved in temporary files.
#
# A batch is retried until it succeeds. After every failure the pause before the next request is
# doubled (capped at max_delay seconds) and it drops back to base_delay after a success, so a
# rate-limited endpoint (HTTP 429) is not queried again every 0.25 s.

movie_actor_complete = pd.read_csv(DATA_PATH + "movie_actor_complete.tsv", sep='\t')

# Unique, sorted imdbIDs. Sorting matters for resuming: the iteration order of a set of strings
# changes from one Python process to the next, so without it "batch 233" would not cover the
# same IDs after a kernel restart.
imdb_ids = sorted(set(movie_actor_complete.imdb_id_actor.dropna()), key=str)

# Actors without an imdbID: their freebaseIDs are queried by the freebaseID download cell.
freebase_ids = movie_actor_complete.freebase_id_actor[movie_actor_complete.imdb_id_actor.isna()].values

temp_files_name = "tempFiles/wikidata_people_temp_"

n = len(imdb_ids)
step = 200

base_delay = 0.25   # seconds between two requests when everything works
max_delay = 60.0    # upper bound for the pause after repeated failures

# Resume point: batches 1..232 are assumed to be on disk already, produced with the SAME ordering
# of imdb_ids. Set c = 1 to download everything from scratch.
c = 233
low = step*(c-1)     # index of the first ID of the current batch
delay = base_delay
while low < n:
    up = min(low + step, n)

    # One quoted ID per line, substituted for the "|" placeholder of the VALUES clause
    ids_string = "\n".join(f"\"{imdb_id}\"" for imdb_id in imdb_ids[low:up])
    query = generic_query.replace("|", ids_string)

    success = False
    try:
        success = download_wikidata_sparql_csv(query, f"{DATA_PATH}{temp_files_name}{c:06d}.csv")
    except Exception as err:
        # Treated like a failed download (the batch is retried). Unlike a bare `except:`, this
        # lets KeyboardInterrupt through, so the cell can still be stopped by hand.
        print(f"Error: {err}")
        time.sleep(1)

    if success:
        low = up
        c += 1
        delay = base_delay
        print(f"{up}/{n} done")
    else:
        # Same batch will be requested again on the next iteration, after a longer pause
        delay = min(2 * delay, max_delay)
        print(f"{low} - {up} to redo")

    time.sleep(delay)

46600/103877 done
46800/103877 done
47000/103877 done
47200/103877 done
47400/103877 done
47600/103877 done
47800/103877 done
48000/103877 done
48200/103877 done
48400/103877 done
48600/103877 done
48800/103877 done
49000/103877 done
49200/103877 done
49400/103877 done
49600/103877 done
49800/103877 done
50000/103877 done
50200/103877 done
50400/103877 done
50600/103877 done
50800/103877 done
51000/103877 done
51200/103877 done
51400/103877 done
51600/103877 done
51800/103877 done
52000/103877 done
52200/103877 done
52400/103877 done
52600/103877 done
52800/103877 done
53000/103877 done
53200/103877 done
53400/103877 done
53600/103877 done
53800/103877 done
54000/103877 done
54200/103877 done
54400/103877 done
54600/103877 done
54800/103877 done
55000/103877 done
55200/103877 done
55400/103877 done
55600/103877 done
55800/103877 done
56000/103877 done
56200/103877 done
56400/103877 done
56600/103877 done
56800/103877 done
57000/103877 done
57200/103877 done
57400/103877 done
57600/1038

In [None]:
# Define query to search for people based on their freebaseID.
#
# Same structure and same output columns as the imdbID-based people query, but the lookup key is
# the freebaseID: the "|" line inside VALUES is replaced by the download cell with a
# newline-separated list of quoted freebaseIDs, and one row is returned per freebaseID
# (GROUP BY ?freebaseID) with the imdbID as an OPTIONAL, SAMPLEd field.
# NOTE: the English rdfs:label pattern binding ?itemLabel is not OPTIONAL, so a person without
# an English label produces no row at all.

generic_query = """
SELECT (SAMPLE(?imdbID) AS ?imdbID)
       (SAMPLE(?wikidataID) AS ?wikidataID)
       ?freebaseID
       (SAMPLE(?wikipediaLink) AS ?wikipediaLink)
       (SAMPLE(?itemLabel) AS ?nameSurname)
       (GROUP_CONCAT(DISTINCT REPLACE(STR(?givenNameLabel), ",", ""); SEPARATOR=",") AS ?givenName)
       (SAMPLE(?familyNameLabel) AS ?familyName)
       (GROUP_CONCAT(DISTINCT REPLACE(STR(?birthDates), ",", ""); SEPARATOR=",") AS ?birthDate)
       (GROUP_CONCAT(DISTINCT REPLACE(STR(?genderLabel), ",", ""); SEPARATOR=",") AS ?gender)
       (GROUP_CONCAT(DISTINCT REPLACE(STR(?citizenshipLabel), ",", ""); SEPARATOR=",") AS ?citizenship)
       (SAMPLE(?placeOfBirthLabel) AS ?placeOfBirth)
       (GROUP_CONCAT(DISTINCT REPLACE(STR(?nativeLanguageLabel), ",", ""); SEPARATOR=",") AS ?nativeLanguage)
       (GROUP_CONCAT(DISTINCT REPLACE(STR(?deathDates), ",", ""); SEPARATOR=",") AS ?deathDate)
WHERE {
  VALUES ?freebaseID {
|
  }

  ?item wdt:P646 ?freebaseID .
  BIND(STR(?item) AS ?wikidataID)

  OPTIONAL { ?item wdt:P345 ?imdbID }

  OPTIONAL { ?item wdt:P735 ?givenName . ?givenName rdfs:label ?givenNameLabel FILTER(LANG(?givenNameLabel) = "en") }
  OPTIONAL { ?item wdt:P734 ?familyName . ?familyName rdfs:label ?familyNameLabel FILTER(LANG(?familyNameLabel) = "en") }

  OPTIONAL { ?item wdt:P569 ?birthDates }
  OPTIONAL { ?item wdt:P21 ?gender . ?gender rdfs:label ?genderLabel FILTER(LANG(?genderLabel) = "en") }
  OPTIONAL { ?item wdt:P27 ?citizenship . ?citizenship rdfs:label ?citizenshipLabel FILTER(LANG(?citizenshipLabel) = "en") }
  OPTIONAL { ?item wdt:P19 ?placeOfBirth . ?placeOfBirth rdfs:label ?placeOfBirthLabel FILTER(LANG(?placeOfBirthLabel) = "en") }
  OPTIONAL { ?item wdt:P103 ?nativeLanguage . ?nativeLanguage rdfs:label ?nativeLanguageLabel FILTER(LANG(?nativeLanguageLabel) = "en") }
  OPTIONAL { ?item wdt:P570 ?deathDates }

  OPTIONAL {
    ?wikipediaLink schema:about ?item ;
                   schema:inLanguage "en" ;
                   schema:isPartOf <https://en.wikipedia.org/> .
  }

  ?item rdfs:label ?itemLabel FILTER(LANG(?itemLabel) = "en") .
}
GROUP BY ?freebaseID
"""

In [None]:
# We run the above defined SPARQL query on groups of 200 freebaseIDs, the results are saved in temporary files.
#
# freebase_ids and temp_files_name are the ones defined in the imdbID-based people download cell,
# which therefore has to be executed first.
#
# A batch is retried until it succeeds. After every failure the pause before the next request is
# doubled (capped at max_delay seconds) and it drops back to base_delay after a success, so a
# rate-limited endpoint (HTTP 429) is not queried again every 0.25 s.

n = len(freebase_ids)
step = 200

base_delay = 0.25   # seconds between two requests when everything works
max_delay = 60.0    # upper bound for the pause after repeated failures

c = 1                # 1-based batch counter
low = step*(c-1)     # index of the first ID of the current batch
delay = base_delay
while low < n:
    up = min(low + step, n)

    # One quoted ID per line, substituted for the "|" placeholder of the VALUES clause
    ids_string = "\n".join(f"\"{freebase_id}\"" for freebase_id in freebase_ids[low:up])
    query = generic_query.replace("|", ids_string)

    # File numbers continue after the imdbID people batches: 103877 IDs / 200 per batch = files 1..520
    success = download_wikidata_sparql_csv(query, f"{DATA_PATH}{temp_files_name}{521+c-1:06d}.csv")

    if success:
        low = up
        c += 1
        delay = base_delay
        print(f"{up}/{n} done")
    else:
        # Same batch will be requested again on the next iteration, after a longer pause
        delay = min(2 * delay, max_delay)
        print(f"{low} - {up} to redo")

    time.sleep(delay)

200/58275 done
400/58275 done
600/58275 done
800/58275 done
1000/58275 done
1200/58275 done
1400/58275 done
1600/58275 done
1800/58275 done
2000/58275 done
2200/58275 done
2400/58275 done
2600/58275 done
2800/58275 done
3000/58275 done
3200/58275 done
3400/58275 done
3600/58275 done
3800/58275 done
4000/58275 done
4200/58275 done
4400/58275 done
4600/58275 done
4800/58275 done
5000/58275 done
5200/58275 done
5400/58275 done
5600/58275 done
5800/58275 done
6000/58275 done
6200/58275 done
6400/58275 done
6600/58275 done
6800/58275 done
7000/58275 done
7200/58275 done
7400/58275 done
7600/58275 done
7800/58275 done
8000/58275 done
8200/58275 done
8400/58275 done
8600/58275 done
8800/58275 done
9000/58275 done
9200/58275 done
9400/58275 done
9600/58275 done
9800/58275 done
10000/58275 done
10200/58275 done
10400/58275 done
10600/58275 done
10800/58275 done
11000/58275 done
11200/58275 done
11400/58275 done
11600/58275 done
11800/58275 done
12000/58275 done
12200/58275 done
12400/58275 done

In [None]:
# Merge every temporary people batch file (imdbID batches followed by freebaseID batches,
# numbered 000001 to 000812) into one csv file

people_batch_paths = [f"{DATA_PATH}{temp_files_name}{i:06d}.csv" for i in range(1,813)]
people_batches = [pd.read_csv(batch_path) for batch_path in people_batch_paths]

people_wikidata = pd.concat(people_batches, ignore_index=True)
people_wikidata.to_csv(DATA_PATH + "wikidata_people.csv", index=False)

In [None]:
# Define query to search for people's place of birth based on their wikidataID.
#
# The "|" line inside VALUES is a placeholder: the download cell replaces it with a
# newline-separated list of "wd:Q..." entity references (not quoted strings).
# One row is returned per wikidataID (GROUP BY ?wikidataID), every column picked with SAMPLE.
# ?country is the country (P17) of the place of birth (P19) when available, otherwise the
# item's own P495 value (COALESCE); ?continent is read through P30 of whichever country matched.
# NOTE: the English rdfs:label pattern binding ?itemLabel is not OPTIONAL, so an item without
# an English label produces no row at all.

generic_query = """
SELECT 
  ?wikidataID
  (SAMPLE(?itemLabel) AS ?nameSurname)
  (SAMPLE(?placeOfBirthLabel) AS ?placeOfBirth)
  (SAMPLE(COALESCE(?countryLabel, ?originCountryLabel)) AS ?country)
  (SAMPLE(?continentLabel) AS ?continent)
WHERE {
  VALUES ?wikidataID {
|
  }

  ?wikidataID rdfs:label ?itemLabel FILTER(LANG(?itemLabel) = "en") .
  
  OPTIONAL { 
    ?wikidataID wdt:P19 ?placeOfBirth .
    ?placeOfBirth rdfs:label ?placeOfBirthLabel FILTER(LANG(?placeOfBirthLabel) = "en") .
    OPTIONAL { 
      ?placeOfBirth wdt:P17 ?country .
      ?country rdfs:label ?countryLabel FILTER(LANG(?countryLabel) = "en") .
      OPTIONAL { 
        ?country wdt:P30 ?continent .
        ?continent rdfs:label ?continentLabel FILTER(LANG(?continentLabel) = "en") 
      }
    }
  }

  OPTIONAL {
    ?wikidataID wdt:P495 ?originCountry .
    ?originCountry rdfs:label ?originCountryLabel FILTER(LANG(?originCountryLabel) = "en") .
    OPTIONAL { 
        ?originCountry wdt:P30 ?continent .
        ?continent rdfs:label ?continentLabel FILTER(LANG(?continentLabel) = "en") 
      }
  }
}
GROUP BY ?wikidataID

"""

In [None]:
# We run the above defined SPARQL query on groups of 200 wikidataIDs, the results are saved in temporary files

def is_valid_format(string):
    """Return True if `string` looks like a Wikidata entity ID: "Q" followed by ASCII digits.

    Only the characters 0-9 are accepted after the leading "Q". `str.isdigit()` is deliberately
    not used because it also accepts other Unicode digits (for example "²" or "٣"), which would
    end up in the query as an invalid `wd:` name.

    Parameters
    ----------
    string : str
        Candidate identifier, e.g. the last path segment of an entity URI.

    Returns
    -------
    bool
        True for values such as "Q42"; False for "", "Q", "P42", "Q4x" or "Q²".
    """
    digits = string[1:]
    return string.startswith('Q') and len(digits) > 0 and all(ch in "0123456789" for ch in digits)

# Collect the Wikidata entity IDs of all people, then download their place-of-birth data in batches.
#
# A batch is retried until it succeeds. After every failure the pause before the next request is
# doubled (capped at max_delay seconds) and it drops back to base_delay after a success, so a
# rate-limited endpoint (HTTP 429) is not queried again every 0.25 s.

people_complete = pd.read_csv(DATA_PATH + "people_complete.tsv", sep='\t')

# Keep only the last path segment of each value (the part after the final "/")
# and discard anything that is not a well-formed "Q<digits>" identifier.
entity_ids = people_complete.wikidata_id_actor.dropna().map(lambda uri: uri.split("/")[-1])
wikidata_ids = list(entity_ids[entity_ids.map(is_valid_format)].values)

temp_files_name = "tempFiles/wikidata_people_country_temp_"

n = len(wikidata_ids)
step = 200

base_delay = 0.25   # seconds between two requests when everything works
max_delay = 60.0    # upper bound for the pause after repeated failures

c = 1                # 1-based batch counter, also used as the temporary file number
low = step*(c-1)     # index of the first ID of the current batch
delay = base_delay
while low < n:
    up = min(low + step, n)

    # One "wd:Q..." reference per line, substituted for the "|" placeholder of the VALUES clause
    ids_string = "\n".join(f"wd:{wikidata_id}" for wikidata_id in wikidata_ids[low:up])
    query = generic_query.replace("|", ids_string)

    success = False
    try:
        success = download_wikidata_sparql_csv(query, f"{DATA_PATH}{temp_files_name}{c:06d}.csv")
    except Exception as err:
        # Treated like a failed download (the batch is retried). Unlike a bare `except:`, this
        # lets KeyboardInterrupt through, so the cell can still be stopped by hand.
        print(f"Error: {err}")
        time.sleep(1)

    if success:
        low = up
        c += 1
        delay = base_delay
        print(f"{up}/{n} done")
    else:
        # Same batch will be requested again on the next iteration, after a longer pause
        delay = min(2 * delay, max_delay)
        print(f"{low} - {up} to redo")

    time.sleep(delay)

200/111500 done
400/111500 done
600/111500 done
800/111500 done
1000/111500 done
1200/111500 done
1400/111500 done
1600/111500 done
1800/111500 done
2000/111500 done
2200/111500 done
2400/111500 done
2600/111500 done
2800/111500 done
3000/111500 done
3200/111500 done
3400/111500 done
3600/111500 done
3800/111500 done
4000/111500 done
4200/111500 done
4400/111500 done
4600/111500 done
4800/111500 done
5000/111500 done
5200/111500 done
5400/111500 done
5600/111500 done
5800/111500 done
6000/111500 done
6200/111500 done
6400/111500 done
6600/111500 done
6800/111500 done
7000/111500 done
7200/111500 done
7400/111500 done
7600/111500 done
7800/111500 done
8000/111500 done
8200/111500 done
8400/111500 done
8600/111500 done
8800/111500 done
9000/111500 done
9200/111500 done
9400/111500 done
9600/111500 done
9800/111500 done
10000/111500 done
10200/111500 done
10400/111500 done
10600/111500 done
10800/111500 done
11000/111500 done
11200/111500 done
11400/111500 done
11600/111500 done
11800/111

In [None]:
# Merge every temporary place-of-birth batch file (numbered 000001 to 000558) into one csv file

country_batch_paths = [f"{DATA_PATH}{temp_files_name}{i:06d}.csv" for i in range(1,559)]
country_batches = [pd.read_csv(batch_path) for batch_path in country_batch_paths]

people_country = pd.concat(country_batches, ignore_index=True)
people_country.to_csv(DATA_PATH + "wikidata_people_country.csv", index=False)