# Knowledge Imbalance Analysis - Ranker (Wikidata)

Compiled by Fariz Darari, Ph.D. (Fasilkom UI)

## Task: Given a (Wikidata) RDF TTL file, rank the entities from the richest to poorest

In [6]:
# installation from Anaconda Prompt:
# conda install -c conda-forge rdflib=5.0.0

# ps: v6.0.1 has some bug so must downgrade the version

# also install tabulate lib for pretty printing Pandas data frames:
# conda install -c conda-forge tabulate

In [1]:
# import libs
import pandas as pd
import rdflib
import time

In [2]:
# load a TTL file
file_name = "20211218-0002-dataGraph.ttl" # Indonesian computer scientists
g = rdflib.Graph()
g.parse(file_name, format="turtle")
print(f"Number of triples: {len(g)}") # number of triples

Number of triples: 5032


In [3]:
# test query: get 5 triples
q = """
SELECT *
WHERE {
  ?s ?p ?o
} LIMIT 5
"""

q_res = g.query(q)
for row in q_res:
    print(f"{row.s} {row.p} {row.o}")

http://www.wikidata.org/prop/direct/P213 http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://www.w3.org/2002/07/owl#DatatypeProperty
http://www.wikidata.org/reference/e7bf8ee8d2822d625d5431de386162580bfbdaa2 http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://wikiba.se/ontology#Reference
http://www.wikidata.org/entity/statement/Q61911973-3676c654-4c47-61c9-85c0-dc5197b3e0c4 http://wikiba.se/ontology#rank http://wikiba.se/ontology#NormalRank
http://www.wikidata.org/entity/statement/Q61852199-b648c1e2-4720-5ebc-8dac-adfccb728569 http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://wikiba.se/ontology#Statement
http://www.wikidata.org/prop/reference/P103 http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://www.w3.org/2002/07/owl#ObjectProperty


In [4]:
# test query: get direct properties and values of specific entity (same properties may appear several times)
entity_uri = "http://www.wikidata.org/entity/Q61852199"

q = '''
PREFIX wd: <http://www.wikidata.org/entity/>

SELECT *
WHERE {{
  <{0}> ?p ?o .
  FILTER(CONTAINS(STR(?p),"prop/direct/"))
}}
'''.format(entity_uri)

q_res = g.query(q)
print(len(q_res))
for row in q_res:
    print(f"{row.p} {row.o}")

27
http://www.wikidata.org/prop/direct/P2037 fadirra
http://www.wikidata.org/prop/direct/P2038 Fariz-Darari
http://www.wikidata.org/prop/direct/P1412 http://www.wikidata.org/entity/Q9240
http://www.wikidata.org/prop/direct/P496 0000-0001-6025-609X
http://www.wikidata.org/prop/direct/P31 http://www.wikidata.org/entity/Q5
http://www.wikidata.org/prop/direct/P2456 134/6913
http://www.wikidata.org/prop/direct/P106 http://www.wikidata.org/entity/Q1622272
http://www.wikidata.org/prop/direct/P549 233319
http://www.wikidata.org/prop/direct/P184 http://www.wikidata.org/entity/Q51903108
http://www.wikidata.org/prop/direct/P69 http://www.wikidata.org/entity/Q158158
http://www.wikidata.org/prop/direct/P2002 mrlogix
http://www.wikidata.org/prop/direct/P6634 farizdarari
http://www.wikidata.org/prop/direct/P108 http://www.wikidata.org/entity/Q534515
http://www.wikidata.org/prop/direct/P106 http://www.wikidata.org/entity/Q1650915
http://www.wikidata.org/prop/direct/P4174 Fadirra
http://www.wikidata.or

In [5]:
# test query: get distinct direct properties of specific entity (unique properties)
entity_uri = "http://www.wikidata.org/entity/Q61852199"

q = '''
PREFIX wd: <http://www.wikidata.org/entity/>

SELECT DISTINCT ?p
WHERE {{
  <{0}> ?p ?o .
  FILTER(CONTAINS(STR(?p),"prop/direct/"))
}}
'''.format(entity_uri)

q_res = g.query(q)
print(len(q_res))
for row in q_res:
    print(f"{row.p}")

22
http://www.wikidata.org/prop/direct/P2037
http://www.wikidata.org/prop/direct/P2038
http://www.wikidata.org/prop/direct/P1412
http://www.wikidata.org/prop/direct/P496
http://www.wikidata.org/prop/direct/P31
http://www.wikidata.org/prop/direct/P2456
http://www.wikidata.org/prop/direct/P106
http://www.wikidata.org/prop/direct/P549
http://www.wikidata.org/prop/direct/P184
http://www.wikidata.org/prop/direct/P69
http://www.wikidata.org/prop/direct/P2002
http://www.wikidata.org/prop/direct/P6634
http://www.wikidata.org/prop/direct/P108
http://www.wikidata.org/prop/direct/P4174
http://www.wikidata.org/prop/direct/P1153
http://www.wikidata.org/prop/direct/P166
http://www.wikidata.org/prop/direct/P21
http://www.wikidata.org/prop/direct/P27
http://www.wikidata.org/prop/direct/P103
http://www.wikidata.org/prop/direct/P6178
http://www.wikidata.org/prop/direct/P2888
http://www.wikidata.org/prop/direct/P1960


In [7]:
def get_en_label(entity_uri, g):
    q = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?label
WHERE {{
  <{0}> rdfs:label ?label .
  FILTER(LANG(?label)="en")
}}
""".format(entity_uri)
    q_res = g.query(q)
    for row in q_res:
        return str(row.label)

In [8]:
# core code
# todo: clean up

# class+filter
q = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wb: <http://wikiba.se/ontology#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>

SELECT * WHERE {
    ?x wdt:P31 wd:Q5 . # instance-of human
    ?x wdt:P27 wd:Q252 . # citizenship Indonesia
    ?x wdt:P106 wd:Q82594 .  # occupation computer-scientist
}

"""

df_sum = "" # empty dataframe

q_res = g.query(q)
print(len(q_res))
for row in q_res:
    entity_uri = str(row.x)
    print(entity_uri)
    entity_name = get_en_label(entity_uri, g)
    print(entity_name)
    
    ### start
    # get direct properties of each entity
    q = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wb: <http://wikiba.se/ontology#>
PREFIX wd: <http://www.wikidata.org/entity/>

SELECT DISTINCT ?p ?pLabel
WHERE {
  $entity_id$ ?p ?o .
  FILTER(CONTAINS(STR(?p),"prop/direct/"))
  ?fullP wb:directClaim ?p .
  ?fullP rdfs:label ?pLabel .
  FILTER(LANG(?pLabel)="en")
}
"""
    q = q.replace("$entity_id$", "<" + entity_uri + ">")

    q_res = g.query(q)
    print(len(q_res))
    # normalize and collect attributes for each entity
    attributes = []
    for row in q_res:
        p_id = row.p.replace("http://www.wikidata.org/prop/direct/", "")
        p_label_normalized = row.pLabel.replace(" ", "-")
        print(f"{p_id}-{p_label_normalized}")
        attributes.append(f"{p_id}-{p_label_normalized}")
    
    # attributes for each entity to pandas dataframe
    # also unifies attributes for each entity
    lst = [[entity_uri.replace("http://www.wikidata.org/entity/", ""), entity_name] + [1] * len(q_res)]
    print(lst)
    df = pd.DataFrame(lst, columns=["QID", "Name"]+attributes)
    print(df.to_markdown()) 
    if isinstance(df_sum,str):
        df_sum = df
    else:
        df_sum = pd.concat([df_sum, df], sort=False, ignore_index=True).fillna(0)
    ### end
    
df_sum

18
http://www.wikidata.org/entity/Q61912217
Benhard Sitohang
8
P31-instance-of
P27-country-of-citizenship
P21-sex-or-gender
P1412-languages-spoken,-written-or-signed
P1960-Google-Scholar-author-ID
P2456-DBLP-author-ID
P106-occupation
P39-position-held
[['Q61912217', 'Benhard Sitohang', 1, 1, 1, 1, 1, 1, 1, 1]]
|    | QID       | Name             |   P31-instance-of |   P27-country-of-citizenship |   P21-sex-or-gender |   P1412-languages-spoken,-written-or-signed |   P1960-Google-Scholar-author-ID |   P2456-DBLP-author-ID |   P106-occupation |   P39-position-held |
|---:|:----------|:-----------------|------------------:|-----------------------------:|--------------------:|--------------------------------------------:|---------------------------------:|-----------------------:|------------------:|--------------------:|
|  0 | Q61912217 | Benhard Sitohang |                 1 |                            1 |                   1 |                                           1 |              

http://www.wikidata.org/entity/Q61912381
Inggriani Liem
8
P31-instance-of
P69-educated-at
P27-country-of-citizenship
P21-sex-or-gender
P1412-languages-spoken,-written-or-signed
P2456-DBLP-author-ID
P106-occupation
P39-position-held
[['Q61912381', 'Inggriani Liem', 1, 1, 1, 1, 1, 1, 1, 1]]
|    | QID       | Name           |   P31-instance-of |   P69-educated-at |   P27-country-of-citizenship |   P21-sex-or-gender |   P1412-languages-spoken,-written-or-signed |   P2456-DBLP-author-ID |   P106-occupation |   P39-position-held |
|---:|:----------|:---------------|------------------:|------------------:|-----------------------------:|--------------------:|--------------------------------------------:|-----------------------:|------------------:|--------------------:|
|  0 | Q61912381 | Inggriani Liem |                 1 |                 1 |                            1 |                   1 |                                           1 |                      1 |                 1 |       

Radityo Eko Prasojo
15
P31-instance-of
P69-educated-at
P27-country-of-citizenship
P21-sex-or-gender
P103-native-language
P108-employer
P1412-languages-spoken,-written-or-signed
P184-doctoral-advisor
P1960-Google-Scholar-author-ID
P101-field-of-work
P496-ORCID-iD
P2456-DBLP-author-ID
P6634-LinkedIn-personal-profile-ID
P106-occupation
P1153-Scopus-author-ID
[['Q101542203', 'Radityo Eko Prasojo', 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
|    | QID        | Name                |   P31-instance-of |   P69-educated-at |   P27-country-of-citizenship |   P21-sex-or-gender |   P103-native-language |   P108-employer |   P1412-languages-spoken,-written-or-signed |   P184-doctoral-advisor |   P1960-Google-Scholar-author-ID |   P101-field-of-work |   P496-ORCID-iD |   P2456-DBLP-author-ID |   P6634-LinkedIn-personal-profile-ID |   P106-occupation |   P1153-Scopus-author-ID |
|---:|:-----------|:--------------------|------------------:|------------------:|-----------------------------:|--------

Unnamed: 0,QID,Name,P31-instance-of,P27-country-of-citizenship,P21-sex-or-gender,"P1412-languages-spoken,-written-or-signed",P1960-Google-Scholar-author-ID,P2456-DBLP-author-ID,P106-occupation,P39-position-held,...,P166-award-received,P2037-GitHub-username,P101-field-of-work,P1066-student-of,P2578-studies,P17-country,P1344-participant-in,P734-family-name,P18-image,P2013-Facebook-ID
0,Q61912217,Benhard Sitohang,1,1,1,1,1.0,1.0,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Q12501375,Onno W. Purbo,1,1,1,1,0.0,0.0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Q61913794,Saiful Akbar,1,1,1,1,1.0,1.0,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Q57167254,Iping Supriana,1,1,1,1,1.0,1.0,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Q97382926,Mirna Adriani,1,1,1,1,1.0,1.0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Q106657765,Kabul Kurniawan,1,1,1,1,1.0,1.0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Q61912381,Inggriani Liem,1,1,1,1,0.0,1.0,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Q63346808,Adila Alfa Krisnadhi,1,1,1,1,1.0,0.0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Q61913857,Wikan Danar Sunindyo,1,1,1,1,1.0,1.0,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Q61913638,Oskar Riandi,1,1,1,1,0.0,1.0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [127]:
[x for x in list(df_sum.columns) if x not in ['QID','Name']]
df_sum['Meta-Counter'] = df_sum[[x for x in list(df_sum.columns) if x not in ['QID','Name']]].sum(axis=1)
df_sum.head()

Unnamed: 0,QID,Name,P1960-Google-Scholar-author-ID,P21-sex-or-gender,P106-occupation,P31-instance-of,P1153-Scopus-author-ID,P27-country-of-citizenship,P570-date-of-death,P2456-DBLP-author-ID,...,P1344-participant-in,P1066-student-of,P166-award-received,P549-Mathematics-Genealogy-Project-ID,P4174-Wikimedia-username,P2037-GitHub-username,P18-image,P2013-Facebook-ID,P734-family-name,Meta-Counter
0,Q97382926,Mirna Adriani,1.0,1,1,1,1.0,1,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0
1,Q57303205,Fajar J. Ekaputra,1.0,1,1,1,1.0,1,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0
2,Q101542203,Radityo Eko Prasojo,1.0,1,1,1,1.0,1,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0
3,Q61911973,Dwi Hendratmo Widyantoro,1.0,1,1,1,1.0,1,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0
4,Q61913794,Saiful Akbar,1.0,1,1,1,1.0,1,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0


In [128]:
df_sum.sort_values(by=['Meta-Counter'], ascending=False).head(5) # top-5 richest

Unnamed: 0,QID,Name,P1960-Google-Scholar-author-ID,P21-sex-or-gender,P106-occupation,P31-instance-of,P1153-Scopus-author-ID,P27-country-of-citizenship,P570-date-of-death,P2456-DBLP-author-ID,...,P1344-participant-in,P1066-student-of,P166-award-received,P549-Mathematics-Genealogy-Project-ID,P4174-Wikimedia-username,P2037-GitHub-username,P18-image,P2013-Facebook-ID,P734-family-name,Meta-Counter
16,Q57167805,Peb Ruswono Aryan,1.0,1,1,1,1.0,1,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,23.0
12,Q61852199,Fariz Darari,1.0,1,1,1,1.0,1,0.0,1.0,...,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,22.0
9,Q67625696,Mohammad Yani,1.0,1,1,1,1.0,1,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0
6,Q12501375,Onno W. Purbo,0.0,1,1,1,0.0,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0
7,Q106657765,Kabul Kurniawan,1.0,1,1,1,1.0,1,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0


In [87]:
df_sum.sort_values(by=['Meta-Counter']).head(5) # top-5 poorest

Unnamed: 0,QID,Name,P2456-DBLP-author-ID,P31-instance-of,"P1412-languages-spoken,-written-or-signed",P735-given-name,P106-occupation,P27-country-of-citizenship,P21-sex-or-gender,P69-educated-at,...,P17-country,P1344-participant-in,P1066-student-of,P1053-ResearcherID,P856-official-website,P1416-affiliation,P734-family-name,P18-image,P2013-Facebook-ID,Meta-Counter
0,Q61913638,Oskar Riandi,1.0,1,1,1.0,1,1,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
7,Q61912217,Benhard Sitohang,1.0,1,1,0.0,1,1,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
4,Q20426405,Budi Rahardjo,0.0,1,1,0.0,1,1,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
17,Q61912381,Inggriani Liem,1.0,1,1,0.0,1,1,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
11,Q61913857,Wikan Danar Sunindyo,1.0,1,1,0.0,1,1,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0


In [41]:
# querying for direct properties+labels of FD
entity_id = "Q61852199"
entity_name = get_en_label("http://www.wikidata.org/entity/" + entity_id, g)
print(entity_name)
q = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wb: <http://wikiba.se/ontology#>
PREFIX wd: <http://www.wikidata.org/entity/>

SELECT DISTINCT ?p ?pLabel
WHERE {
  $entity_id$ ?p ?o .
  FILTER(CONTAINS(STR(?p),"prop/direct/"))
  ?fullP wb:directClaim ?p .
  ?fullP rdfs:label ?pLabel .
  FILTER(LANG(?pLabel)="en")
}
"""
q = q.replace("$entity_id$", "<http://www.wikidata.org/entity/" + entity_id + ">")

q_res = g.query(q)
print(len(q_res))
attributes = []
for row in q_res:
    p_id = row.p.replace("http://www.wikidata.org/prop/direct/", "")
    p_label_normalized = row.pLabel.replace(" ", "-")
    print(f"{p_id}-{p_label_normalized}")
    attributes.append(f"{p_id}-{p_label_normalized}")
    
# to pandas dataframe
lst = [[entity_id, entity_name] + [1] * len(q_res)]
print(lst)
df = pd.DataFrame(lst, columns=["QID", "Name"]+attributes)
df


PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wb: <http://wikiba.se/ontology#>
PREFIX wd: <http://www.wikidata.org/entity/>

SELECT ?label
WHERE {
  <http://www.wikidata.org/entity/Q61852199> rdfs:label ?label .
  FILTER(LANG(?label)="en")
}

Fariz Darari
22
P4174-Wikimedia-username
P496-ORCID-iD
P1153-Scopus-author-ID
P69-educated-at
P2888-exact-match
P1960-Google-Scholar-author-ID
P103-native-language
P108-employer
P2038-ResearchGate-profile-ID
P2456-DBLP-author-ID
P2037-GitHub-username
P184-doctoral-advisor
P31-instance-of
P1412-languages-spoken,-written-or-signed
P549-Mathematics-Genealogy-Project-ID
P6634-LinkedIn-personal-profile-ID
P166-award-received
P6178-Dimensions-author-ID
P106-occupation
P2002-Twitter-username
P27-country-of-citizenship
P21-sex-or-gender
[['Q61852199', 'Fariz Darari', 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]


Unnamed: 0,QID,Name,P4174-Wikimedia-username,P496-ORCID-iD,P1153-Scopus-author-ID,P69-educated-at,P2888-exact-match,P1960-Google-Scholar-author-ID,P103-native-language,P108-employer,...,P31-instance-of,"P1412-languages-spoken,-written-or-signed",P549-Mathematics-Genealogy-Project-ID,P6634-LinkedIn-personal-profile-ID,P166-award-received,P6178-Dimensions-author-ID,P106-occupation,P2002-Twitter-username,P27-country-of-citizenship,P21-sex-or-gender
0,Q61852199,Fariz Darari,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [10]:
# save RDF graph to file
g.serialize(graph_file, format="turtle")
print("~~\nCompleted!")

~~
Completed!


References:

*   https://en.wikipedia.org/wiki/RDFLib
*   https://rdflib.readthedocs.io/en/stable/
*   https://www.youtube.com/watch?v=iYs8l-Z1tZE

