# Knowledge Imbalance Analysis - Ranker (Wikidata)

Compiled by Fariz Darari, Ph.D. (Fasilkom UI)

## Task: Given a (Wikidata) RDF TTL file, rank the entities from the richest to the poorest, where richness is the number of distinct direct properties an entity has

In [6]:
# installation from Anaconda Prompt:
# conda install -c conda-forge rdflib=5.0.0

# Note: rdflib v6.0.1 has a bug, so the version is pinned to 5.0.0 above

# also install tabulate lib for pretty printing Pandas data frames:
# conda install -c conda-forge tabulate

In [1]:
# import libs
import pandas as pd
import rdflib
import time

In [2]:
# Parse the Turtle file into an in-memory rdflib graph and report its size.
file_name = "20211218-0002-dataGraph.ttl"  # Indonesian computer scientists
g = rdflib.Graph()
g.parse(file_name, format="turtle")
triple_count = len(g)  # len() of a graph is its number of triples
print(f"Number of triples: {triple_count}")

Number of triples: 5032


In [3]:
# Smoke test: fetch any five triples to confirm that the graph can be queried.
q = """
SELECT *
WHERE {
  ?s ?p ?o
} LIMIT 5
"""

q_res = g.query(q)
for triple in q_res:
    # each result row exposes the SELECT variables as attributes
    subject, predicate, value = triple.s, triple.p, triple.o
    print(f"{subject} {predicate} {value}")

http://www.wikidata.org/entity/P6552 http://wikiba.se/ontology#reference http://www.wikidata.org/prop/reference/P6552
http://www.wikidata.org/entity/statement/Q57167805-87d611d5-4854-c8f3-b29e-c92b20ba9218 http://www.wikidata.org/prop/qualifier/value/P3744 http://www.wikidata.org/value/b79f6c964d53f062c99c37ac066ee9b4
http://www.wikidata.org/entity/P106 http://www.w3.org/2000/01/rdf-schema#label occupation
http://www.wikidata.org/entity/P549 http://wikiba.se/ontology#claim http://www.wikidata.org/prop/P549
http://www.wikidata.org/entity/Q63346808 http://www.w3.org/2004/02/skos/core#prefLabel Adila Alfa Krisnadhi


In [4]:
# Smoke test: list every direct property/value pair of one entity.
# A property shows up once per value, so the same property may repeat.
entity_uri = "http://www.wikidata.org/entity/Q61852199"

# doubled braces are literal SPARQL braces; {0} is filled with the entity URI
q_template = """
PREFIX wd: <http://www.wikidata.org/entity/>

SELECT *
WHERE {{
  <{0}> ?p ?o .
  FILTER(CONTAINS(STR(?p),"prop/direct/"))
}}
"""
q = q_template.format(entity_uri)

q_res = g.query(q)
print(len(q_res))
for statement in q_res:
    print(f"{statement.p} {statement.o}")

27
http://www.wikidata.org/prop/direct/P21 http://www.wikidata.org/entity/Q6581097
http://www.wikidata.org/prop/direct/P184 http://www.wikidata.org/entity/Q102302240
http://www.wikidata.org/prop/direct/P2888 https://scigraph.springernature.com/person.013155230077.37
http://www.wikidata.org/prop/direct/P496 0000-0001-6025-609X
http://www.wikidata.org/prop/direct/P69 http://www.wikidata.org/entity/Q158158
http://www.wikidata.org/prop/direct/P106 http://www.wikidata.org/entity/Q1622272
http://www.wikidata.org/prop/direct/P2456 134/6913
http://www.wikidata.org/prop/direct/P106 http://www.wikidata.org/entity/Q82594
http://www.wikidata.org/prop/direct/P4174 Fadirra
http://www.wikidata.org/prop/direct/P6634 farizdarari
http://www.wikidata.org/prop/direct/P31 http://www.wikidata.org/entity/Q5
http://www.wikidata.org/prop/direct/P106 http://www.wikidata.org/entity/Q1650915
http://www.wikidata.org/prop/direct/P184 http://www.wikidata.org/entity/Q51903108
http://www.wikidata.org/prop/direct/P6178

In [5]:
# Smoke test: list the distinct direct properties of one entity
# (each property is reported once, however many values it has).
entity_uri = "http://www.wikidata.org/entity/Q61852199"

# doubled braces are literal SPARQL braces; {0} is filled with the entity URI
q_template = """
PREFIX wd: <http://www.wikidata.org/entity/>

SELECT DISTINCT ?p
WHERE {{
  <{0}> ?p ?o .
  FILTER(CONTAINS(STR(?p),"prop/direct/"))
}}
"""
q = q_template.format(entity_uri)

q_res = g.query(q)
print(len(q_res))
for binding in q_res:
    print(f"{binding.p}")

22
http://www.wikidata.org/prop/direct/P21
http://www.wikidata.org/prop/direct/P184
http://www.wikidata.org/prop/direct/P2888
http://www.wikidata.org/prop/direct/P496
http://www.wikidata.org/prop/direct/P69
http://www.wikidata.org/prop/direct/P106
http://www.wikidata.org/prop/direct/P2456
http://www.wikidata.org/prop/direct/P4174
http://www.wikidata.org/prop/direct/P6634
http://www.wikidata.org/prop/direct/P31
http://www.wikidata.org/prop/direct/P6178
http://www.wikidata.org/prop/direct/P1412
http://www.wikidata.org/prop/direct/P1153
http://www.wikidata.org/prop/direct/P2037
http://www.wikidata.org/prop/direct/P2038
http://www.wikidata.org/prop/direct/P103
http://www.wikidata.org/prop/direct/P2002
http://www.wikidata.org/prop/direct/P549
http://www.wikidata.org/prop/direct/P1960
http://www.wikidata.org/prop/direct/P166
http://www.wikidata.org/prop/direct/P108
http://www.wikidata.org/prop/direct/P27


In [6]:
def get_en_label(entity_uri, g):
    """Look up the English ``rdfs:label`` of an entity.

    Args:
        entity_uri: Full URI of the entity; it is inserted between ``<`` and
            ``>`` in the SPARQL query text.
        g: Graph-like object whose ``query`` method accepts the SPARQL text
            and returns an iterable of rows exposing a ``label`` attribute.

    Returns:
        The label of the first row yielded by the query, converted to ``str``,
        or ``None`` when the query yields no rows.
    """
    label_query = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?label
WHERE {{
  <{0}> rdfs:label ?label .
  FILTER(LANG(?label)="en")
}}
""".format(entity_uri)
    # only the first match is of interest; the remaining rows are ignored
    first_match = next(iter(g.query(label_query)), None)
    if first_match is None:
        return None
    return str(first_match.label)

In [39]:
# Core analysis: build `df_sum`, a property-existence matrix with one row per
# retrieved entity and one column per distinct direct property. A cell holds
# 1.0 when the entity has the property and 0.0 otherwise.

# retrieve entities based on class and filters
entity_query = """
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>

SELECT * WHERE {
    ?x wdt:P31 wd:Q5 . # instance-of human
    ?x wdt:P27 wd:Q252 . # citizenship Indonesia
    ?x wdt:P106 wd:Q82594 .  # occupation computer-scientist
}
"""

# distinct direct properties (with English labels) of a single entity;
# defined once because only the entity URI changes between iterations
property_query_template = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX wd: <http://www.wikidata.org/entity/>

SELECT DISTINCT ?p ?pLabel
WHERE {{
  <{0}> ?p ?o .
  FILTER(CONTAINS(STR(?p),"prop/direct/"))
  ?fullP wikibase:directClaim ?p .
  ?fullP rdfs:label ?pLabel .
  FILTER(LANG(?pLabel)="en")
}}
"""

ENTITY_NS = "http://www.wikidata.org/entity/"
DIRECT_PROP_NS = "http://www.wikidata.org/prop/direct/"

entity_rows = g.query(entity_query)
print(len(entity_rows)) # number of retrieved entities

meta_columns = ["META-QID", "META-Name"]
property_columns = []   # property columns, in order of first appearance
known_columns = set()   # same content as property_columns, for fast lookup
entity_records = []     # one (qid, name, set of property columns) per entity

# Distinct loop variables are used for the outer and inner query results so
# that the inner query never rebinds a name the outer loop is iterating with.
for entity_row in entity_rows:
    entity_uri = str(entity_row.x)
    entity_name = get_en_label(entity_uri, g)  # None when no English label

    # normalize and collect the properties of this entity
    entity_properties = set()
    for property_row in g.query(property_query_template.format(entity_uri)):
        p_id = str(property_row.p).replace(DIRECT_PROP_NS, "")
        p_label_normalized = str(property_row.pLabel).replace(" ", "-")
        column = f"{p_id}-{p_label_normalized}"
        entity_properties.add(column)
        if column not in known_columns:
            known_columns.add(column)
            property_columns.append(column)

    entity_records.append(
        (entity_uri.replace(ENTITY_NS, ""), entity_name, entity_properties)
    )

# Build the frame once instead of concatenating inside the loop. Every property
# cell is written explicitly, so no frame-wide fillna is needed and a missing
# META-Name stays None instead of being turned into 0. When no entity matches,
# df_sum is an empty frame with the META columns rather than None.
df_sum = pd.DataFrame(
    [
        {
            "META-QID": qid,
            "META-Name": name,
            **{col: float(col in props) for col in property_columns},
        }
        for qid, name, props in entity_records
    ],
    columns=meta_columns + property_columns,
)

df_sum

18


Unnamed: 0,META-QID,META-Name,P1960-Google-Scholar-author-ID,P21-sex-or-gender,P69-educated-at,P2038-ResearchGate-profile-ID,P106-occupation,P27-country-of-citizenship,"P1412-languages-spoken,-written-or-signed",P31-instance-of,...,P2578-studies,P735-given-name,P856-official-website,P1416-affiliation,P244-Library-of-Congress-authority-ID,P569-date-of-birth,P213-ISNI,P214-VIAF-ID,P7859-WorldCat-Identities-ID,P1006-Nationale-Thesaurus-voor-Auteurs-ID
0,Q57167398,Ayu Purwarianti,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Q101542203,Radityo Eko Prasojo,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Q20426405,Budi Rahardjo,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Q61912217,Benhard Sitohang,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Q61852199,Fariz Darari,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Q57167805,Peb Ruswono Aryan,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Q61911973,Dwi Hendratmo Widyantoro,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Q97382926,Mirna Adriani,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Q63346808,Adila Alfa Krisnadhi,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Q67625696,Mohammad Yani,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
# Count, per entity, how many property columns are set. Property columns are
# the ones whose name starts with "P"; the META-* columns are excluded.
counted_columns = [col for col in df_sum.columns if col.startswith("P")]
property_counters = df_sum[counted_columns].sum(axis=1)
if "META-NumOfProps" in df_sum.columns:
    # This cell was already run on this frame: refresh the counts in place
    # rather than letting DataFrame.insert raise ValueError on the duplicate.
    df_sum["META-NumOfProps"] = property_counters
else:
    df_sum.insert(loc=0, column="META-NumOfProps", value=property_counters)
df_sum

Unnamed: 0,META-NumOfProps,META-QID,META-Name,P1960-Google-Scholar-author-ID,P21-sex-or-gender,P69-educated-at,P2038-ResearchGate-profile-ID,P106-occupation,P27-country-of-citizenship,"P1412-languages-spoken,-written-or-signed",...,P2578-studies,P735-given-name,P856-official-website,P1416-affiliation,P244-Library-of-Congress-authority-ID,P569-date-of-birth,P213-ISNI,P214-VIAF-ID,P7859-WorldCat-Identities-ID,P1006-Nationale-Thesaurus-voor-Auteurs-ID
0,11.0,Q57167398,Ayu Purwarianti,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15.0,Q101542203,Radityo Eko Prasojo,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8.0,Q20426405,Budi Rahardjo,1.0,1.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8.0,Q61912217,Benhard Sitohang,1.0,1.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,22.0,Q61852199,Fariz Darari,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,23.0,Q57167805,Peb Ruswono Aryan,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,14.0,Q61911973,Dwi Hendratmo Widyantoro,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,9.0,Q97382926,Mirna Adriani,1.0,1.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,10.0,Q63346808,Adila Alfa Krisnadhi,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,16.0,Q67625696,Mohammad Yani,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# top-5 richest: entities with the highest META-NumOfProps first
df_sum.sort_values("META-NumOfProps", ascending=False).head(5)

Unnamed: 0,META-NumOfProps,META-QID,META-Name,P1960-Google-Scholar-author-ID,P21-sex-or-gender,P69-educated-at,P2038-ResearchGate-profile-ID,P106-occupation,P27-country-of-citizenship,"P1412-languages-spoken,-written-or-signed",...,P2578-studies,P735-given-name,P856-official-website,P1416-affiliation,P244-Library-of-Congress-authority-ID,P569-date-of-birth,P213-ISNI,P214-VIAF-ID,P7859-WorldCat-Identities-ID,P1006-Nationale-Thesaurus-voor-Auteurs-ID
5,23.0,Q57167805,Peb Ruswono Aryan,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,22.0,Q61852199,Fariz Darari,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,16.0,Q67625696,Mohammad Yani,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,16.0,Q12501375,Onno W. Purbo,0.0,1.0,1.0,0.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
13,16.0,Q106657765,Kabul Kurniawan,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
# top-5 poorest: entities with the lowest META-NumOfProps first
df_sum.sort_values("META-NumOfProps", ascending=True).head(5)

Unnamed: 0,META-NumOfProps,META-QID,META-Name,P1960-Google-Scholar-author-ID,P21-sex-or-gender,P69-educated-at,P2038-ResearchGate-profile-ID,P106-occupation,P27-country-of-citizenship,"P1412-languages-spoken,-written-or-signed",...,P2578-studies,P735-given-name,P856-official-website,P1416-affiliation,P244-Library-of-Congress-authority-ID,P569-date-of-birth,P213-ISNI,P214-VIAF-ID,P7859-WorldCat-Identities-ID,P1006-Nationale-Thesaurus-voor-Auteurs-ID
10,7.0,Q61913638,Oskar Riandi,0.0,1.0,0.0,0.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8.0,Q20426405,Budi Rahardjo,1.0,1.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8.0,Q61912217,Benhard Sitohang,1.0,1.0,0.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,8.0,Q61912381,Inggriani Liem,0.0,1.0,1.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,9.0,Q61913857,Wikan Danar Sunindyo,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


References:

*   https://en.wikipedia.org/wiki/RDFLib
*   https://rdflib.readthedocs.io/en/stable/
*   https://www.youtube.com/watch?v=iYs8l-Z1tZE

