# Knowledge Imbalance Analysis - Ranker (Wikidata)

Compiled by Fariz Darari, Ph.D. (Fasilkom UI)

## Task: Given a (Wikidata) RDF TTL file, rank the entities from the richest to poorest

In [6]:
# installation from Anaconda Prompt:
# conda install -c conda-forge rdflib=5.0.0

# ps: v6.0.1 has some bug so must downgrade the version

# also install tabulate lib for pretty printing Pandas data frames:
# conda install -c conda-forge tabulate

In [9]:
# import libs
import pandas as pd
import rdflib
import time

In [10]:
# load a TTL file
file_name = "20211218-0002-dataGraph.ttl" # Indonesian computer scientists
g = rdflib.Graph()
g.parse(file_name, format="turtle")
print(f"Number of triples: {len(g)}") # number of triples

Number of triples: 5032


In [11]:
# test query: get 5 triples
q = """
SELECT *
WHERE {
  ?s ?p ?o
} LIMIT 5
"""

q_res = g.query(q)
for row in q_res:
    print(f"{row.s} {row.p} {row.o}")

http://www.wikidata.org/entity/P585 http://wikiba.se/ontology#referenceValue http://www.wikidata.org/prop/reference/value/P585
http://www.wikidata.org/entity/Q101542203 http://www.wikidata.org/prop/direct/P108 http://www.wikidata.org/entity/Q101542153
http://www.wikidata.org/entity/statement/Q106657765-d3f8081f-447f-d5a0-67e2-c2e9c3ea8057 http://www.w3.org/1999/02/22-rdf-syntax-ns#type http://wikiba.se/ontology#BestRank
http://www.wikidata.org/entity/Q61852199 http://schema.org/name Fariz Darari
http://www.wikidata.org/entity/P213 http://www.w3.org/2004/02/skos/core#prefLabel ISNI


In [12]:
# test query: get direct properties and values of specific entity (same properties may appear several times)
entity_uri = "http://www.wikidata.org/entity/Q61852199"

q = '''
PREFIX wd: <http://www.wikidata.org/entity/>

SELECT *
WHERE {{
  <{0}> ?p ?o .
  FILTER(CONTAINS(STR(?p),"prop/direct/"))
}}
'''.format(entity_uri)

q_res = g.query(q)
print(len(q_res))
for row in q_res:
    print(f"{row.p} {row.o}")

27
http://www.wikidata.org/prop/direct/P2002 mrlogix
http://www.wikidata.org/prop/direct/P31 http://www.wikidata.org/entity/Q5
http://www.wikidata.org/prop/direct/P184 http://www.wikidata.org/entity/Q102302240
http://www.wikidata.org/prop/direct/P1960 Q6KKCZ4AAAAJ
http://www.wikidata.org/prop/direct/P69 http://www.wikidata.org/entity/Q1231433
http://www.wikidata.org/prop/direct/P2038 Fariz-Darari
http://www.wikidata.org/prop/direct/P4174 Fadirra
http://www.wikidata.org/prop/direct/P6634 farizdarari
http://www.wikidata.org/prop/direct/P2037 fadirra
http://www.wikidata.org/prop/direct/P106 http://www.wikidata.org/entity/Q82594
http://www.wikidata.org/prop/direct/P106 http://www.wikidata.org/entity/Q1650915
http://www.wikidata.org/prop/direct/P184 http://www.wikidata.org/entity/Q51903108
http://www.wikidata.org/prop/direct/P496 0000-0001-6025-609X
http://www.wikidata.org/prop/direct/P6178 013155230077.37
http://www.wikidata.org/prop/direct/P69 http://www.wikidata.org/entity/Q158158
http:/

In [13]:
# test query: get distinct direct properties of specific entity (unique properties)
entity_uri = "http://www.wikidata.org/entity/Q61852199"

q = '''
PREFIX wd: <http://www.wikidata.org/entity/>

SELECT DISTINCT ?p
WHERE {{
  <{0}> ?p ?o .
  FILTER(CONTAINS(STR(?p),"prop/direct/"))
}}
'''.format(entity_uri)

q_res = g.query(q)
print(len(q_res))
for row in q_res:
    print(f"{row.p}")

22
http://www.wikidata.org/prop/direct/P2002
http://www.wikidata.org/prop/direct/P31
http://www.wikidata.org/prop/direct/P184
http://www.wikidata.org/prop/direct/P1960
http://www.wikidata.org/prop/direct/P69
http://www.wikidata.org/prop/direct/P2038
http://www.wikidata.org/prop/direct/P4174
http://www.wikidata.org/prop/direct/P6634
http://www.wikidata.org/prop/direct/P2037
http://www.wikidata.org/prop/direct/P106
http://www.wikidata.org/prop/direct/P496
http://www.wikidata.org/prop/direct/P6178
http://www.wikidata.org/prop/direct/P108
http://www.wikidata.org/prop/direct/P2456
http://www.wikidata.org/prop/direct/P1412
http://www.wikidata.org/prop/direct/P21
http://www.wikidata.org/prop/direct/P27
http://www.wikidata.org/prop/direct/P1153
http://www.wikidata.org/prop/direct/P103
http://www.wikidata.org/prop/direct/P166
http://www.wikidata.org/prop/direct/P549
http://www.wikidata.org/prop/direct/P2888


In [14]:
def get_en_label(entity_uri, g):
    q = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?label
WHERE {{
  <{0}> rdfs:label ?label .
  FILTER(LANG(?label)="en")
}}
""".format(entity_uri)
    q_res = g.query(q)
    for row in q_res:
        return str(row.label)

In [15]:
# core code
# todo: clean up

# class+filter
q = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wb: <http://wikiba.se/ontology#>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>

SELECT * WHERE {
    ?x wdt:P31 wd:Q5 . # instance-of human
    ?x wdt:P27 wd:Q252 . # citizenship Indonesia
    ?x wdt:P106 wd:Q82594 .  # occupation computer-scientist
}

"""

df_sum = "" # empty dataframe

q_res = g.query(q)
print(len(q_res))
for row in q_res:
    entity_uri = str(row.x)
    print(entity_uri)
    entity_name = get_en_label(entity_uri, g)
    print(entity_name)
    
    ### start
    # get direct properties of each entity
    q = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wb: <http://wikiba.se/ontology#>
PREFIX wd: <http://www.wikidata.org/entity/>

SELECT DISTINCT ?p ?pLabel
WHERE {
  $entity_id$ ?p ?o .
  FILTER(CONTAINS(STR(?p),"prop/direct/"))
  ?fullP wb:directClaim ?p .
  ?fullP rdfs:label ?pLabel .
  FILTER(LANG(?pLabel)="en")
}
"""
    q = q.replace("$entity_id$", "<" + entity_uri + ">")

    q_res = g.query(q)
    print(len(q_res))
    # normalize and collect attributes for each entity
    attributes = []
    for row in q_res:
        p_id = row.p.replace("http://www.wikidata.org/prop/direct/", "")
        p_label_normalized = row.pLabel.replace(" ", "-")
        print(f"{p_id}-{p_label_normalized}")
        attributes.append(f"{p_id}-{p_label_normalized}")
    
    # attributes for each entity to pandas dataframe
    # also unifies attributes for each entity
    lst = [[entity_uri.replace("http://www.wikidata.org/entity/", ""), entity_name] + [1] * len(q_res)]
    print(lst)
    df = pd.DataFrame(lst, columns=["QID", "Name"]+attributes)
    print(df.to_markdown()) 
    if isinstance(df_sum,str):
        df_sum = df
    else:
        df_sum = pd.concat([df_sum, df], sort=False, ignore_index=True).fillna(0)
    ### end
    
df_sum

18
http://www.wikidata.org/entity/Q61913638
Oskar Riandi
7
P2456-DBLP-author-ID
P106-occupation
P21-sex-or-gender
P31-instance-of
P27-country-of-citizenship
P735-given-name
P1412-languages-spoken,-written-or-signed
[['Q61913638', 'Oskar Riandi', 1, 1, 1, 1, 1, 1, 1]]
|    | QID       | Name         |   P2456-DBLP-author-ID |   P106-occupation |   P21-sex-or-gender |   P31-instance-of |   P27-country-of-citizenship |   P735-given-name |   P1412-languages-spoken,-written-or-signed |
|---:|:----------|:-------------|-----------------------:|------------------:|--------------------:|------------------:|-----------------------------:|------------------:|--------------------------------------------:|
|  0 | Q61913638 | Oskar Riandi |                      1 |                 1 |                   1 |                 1 |                            1 |                 1 |                                           1 |
http://www.wikidata.org/entity/Q57303205
Fajar J. Ekaputra
14
P496-ORCID-iD
P1

23
P6634-LinkedIn-personal-profile-ID
P496-ORCID-iD
P1153-Scopus-author-ID
P2013-Facebook-ID
P6178-Dimensions-author-ID
P2002-Twitter-username
P1053-ResearcherID
P2456-DBLP-author-ID
P18-image
P2037-GitHub-username
P734-family-name
P106-occupation
P2038-ResearchGate-profile-ID
P21-sex-or-gender
P69-educated-at
P31-instance-of
P27-country-of-citizenship
P1960-Google-Scholar-author-ID
P4174-Wikimedia-username
P1066-student-of
P2888-exact-match
P1412-languages-spoken,-written-or-signed
P39-position-held
[['Q57167805', 'Peb Ruswono Aryan', 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
|    | QID       | Name              |   P6634-LinkedIn-personal-profile-ID |   P496-ORCID-iD |   P1153-Scopus-author-ID |   P2013-Facebook-ID |   P6178-Dimensions-author-ID |   P2002-Twitter-username |   P1053-ResearcherID |   P2456-DBLP-author-ID |   P18-image |   P2037-GitHub-username |   P734-family-name |   P106-occupation |   P2038-ResearchGate-profile-ID |   P21-sex-or-gender | 

Kabul Kurniawan
16
P6634-LinkedIn-personal-profile-ID
P496-ORCID-iD
P1153-Scopus-author-ID
P1416-affiliation
P856-official-website
P2002-Twitter-username
P1053-ResearcherID
P2456-DBLP-author-ID
P106-occupation
P2038-ResearchGate-profile-ID
P21-sex-or-gender
P31-instance-of
P27-country-of-citizenship
P1960-Google-Scholar-author-ID
P2888-exact-match
P1412-languages-spoken,-written-or-signed
[['Q106657765', 'Kabul Kurniawan', 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
|    | QID        | Name            |   P6634-LinkedIn-personal-profile-ID |   P496-ORCID-iD |   P1153-Scopus-author-ID |   P1416-affiliation |   P856-official-website |   P2002-Twitter-username |   P1053-ResearcherID |   P2456-DBLP-author-ID |   P106-occupation |   P2038-ResearchGate-profile-ID |   P21-sex-or-gender |   P31-instance-of |   P27-country-of-citizenship |   P1960-Google-Scholar-author-ID |   P2888-exact-match |   P1412-languages-spoken,-written-or-signed |
|---:|:-----------|:----------------|------------

Unnamed: 0,QID,Name,P2456-DBLP-author-ID,P106-occupation,P21-sex-or-gender,P31-instance-of,P27-country-of-citizenship,P735-given-name,"P1412-languages-spoken,-written-or-signed",P496-ORCID-iD,...,P1006-Nationale-Thesaurus-voor-Auteurs-ID,P7859-WorldCat-Identities-ID,P569-date-of-birth,P2671-Google-Knowledge-Graph-ID,P213-ISNI,P214-VIAF-ID,P244-Library-of-Congress-authority-ID,P1416-affiliation,P856-official-website,P101-field-of-work
0,Q61913638,Oskar Riandi,1.0,1,1,1,1,1.0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Q57303205,Fajar J. Ekaputra,1.0,1,1,1,1,0.0,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Q97382926,Mirna Adriani,1.0,1,1,1,1,0.0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Q61852199,Fariz Darari,1.0,1,1,1,1,0.0,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Q61912217,Benhard Sitohang,1.0,1,1,1,1,0.0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Q61912381,Inggriani Liem,1.0,1,1,1,1,0.0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Q57167805,Peb Ruswono Aryan,1.0,1,1,1,1,0.0,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Q67625696,Mohammad Yani,0.0,1,1,1,1,0.0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Q61911973,Dwi Hendratmo Widyantoro,1.0,1,1,1,1,0.0,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Q12501375,Onno W. Purbo,0.0,1,1,1,1,1.0,1,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0


In [16]:
[x for x in list(df_sum.columns) if x not in ['QID','Name']]
df_sum['Meta-Counter'] = df_sum[[x for x in list(df_sum.columns) if x not in ['QID','Name']]].sum(axis=1)
df_sum.head()

Unnamed: 0,QID,Name,P2456-DBLP-author-ID,P106-occupation,P21-sex-or-gender,P31-instance-of,P27-country-of-citizenship,P735-given-name,"P1412-languages-spoken,-written-or-signed",P496-ORCID-iD,...,P7859-WorldCat-Identities-ID,P569-date-of-birth,P2671-Google-Knowledge-Graph-ID,P213-ISNI,P214-VIAF-ID,P244-Library-of-Congress-authority-ID,P1416-affiliation,P856-official-website,P101-field-of-work,Meta-Counter
0,Q61913638,Oskar Riandi,1.0,1,1,1,1,1.0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
1,Q57303205,Fajar J. Ekaputra,1.0,1,1,1,1,0.0,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0
2,Q97382926,Mirna Adriani,1.0,1,1,1,1,0.0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0
3,Q61852199,Fariz Darari,1.0,1,1,1,1,0.0,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0
4,Q61912217,Benhard Sitohang,1.0,1,1,1,1,0.0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0


In [17]:
df_sum.sort_values(by=['Meta-Counter'], ascending=False).head(5) # top-5 richest

Unnamed: 0,QID,Name,P2456-DBLP-author-ID,P106-occupation,P21-sex-or-gender,P31-instance-of,P27-country-of-citizenship,P735-given-name,"P1412-languages-spoken,-written-or-signed",P496-ORCID-iD,...,P7859-WorldCat-Identities-ID,P569-date-of-birth,P2671-Google-Knowledge-Graph-ID,P213-ISNI,P214-VIAF-ID,P244-Library-of-Congress-authority-ID,P1416-affiliation,P856-official-website,P101-field-of-work,Meta-Counter
6,Q57167805,Peb Ruswono Aryan,1.0,1,1,1,1,0.0,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0
3,Q61852199,Fariz Darari,1.0,1,1,1,1,0.0,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0
9,Q12501375,Onno W. Purbo,0.0,1,1,1,1,1.0,1,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,16.0
11,Q106657765,Kabul Kurniawan,1.0,1,1,1,1,0.0,1,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,16.0
7,Q67625696,Mohammad Yani,0.0,1,1,1,1,0.0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0


In [18]:
df_sum.sort_values(by=['Meta-Counter']).head(5) # top-5 poorest

Unnamed: 0,QID,Name,P2456-DBLP-author-ID,P106-occupation,P21-sex-or-gender,P31-instance-of,P27-country-of-citizenship,P735-given-name,"P1412-languages-spoken,-written-or-signed",P496-ORCID-iD,...,P7859-WorldCat-Identities-ID,P569-date-of-birth,P2671-Google-Knowledge-Graph-ID,P213-ISNI,P214-VIAF-ID,P244-Library-of-Congress-authority-ID,P1416-affiliation,P856-official-website,P101-field-of-work,Meta-Counter
0,Q61913638,Oskar Riandi,1.0,1,1,1,1,1.0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
12,Q20426405,Budi Rahardjo,0.0,1,1,1,1,0.0,1,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
4,Q61912217,Benhard Sitohang,1.0,1,1,1,1,0.0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
5,Q61912381,Inggriani Liem,1.0,1,1,1,1,0.0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
13,Q61913857,Wikan Danar Sunindyo,1.0,1,1,1,1,0.0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0


In [41]:
# querying for direct properties+labels of FD
entity_id = "Q61852199"
entity_name = get_en_label("http://www.wikidata.org/entity/" + entity_id, g)
print(entity_name)
q = """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wb: <http://wikiba.se/ontology#>
PREFIX wd: <http://www.wikidata.org/entity/>

SELECT DISTINCT ?p ?pLabel
WHERE {
  $entity_id$ ?p ?o .
  FILTER(CONTAINS(STR(?p),"prop/direct/"))
  ?fullP wb:directClaim ?p .
  ?fullP rdfs:label ?pLabel .
  FILTER(LANG(?pLabel)="en")
}
"""
q = q.replace("$entity_id$", "<http://www.wikidata.org/entity/" + entity_id + ">")

q_res = g.query(q)
print(len(q_res))
attributes = []
for row in q_res:
    p_id = row.p.replace("http://www.wikidata.org/prop/direct/", "")
    p_label_normalized = row.pLabel.replace(" ", "-")
    print(f"{p_id}-{p_label_normalized}")
    attributes.append(f"{p_id}-{p_label_normalized}")
    
# to pandas dataframe
lst = [[entity_id, entity_name] + [1] * len(q_res)]
print(lst)
df = pd.DataFrame(lst, columns=["QID", "Name"]+attributes)
df


PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wb: <http://wikiba.se/ontology#>
PREFIX wd: <http://www.wikidata.org/entity/>

SELECT ?label
WHERE {
  <http://www.wikidata.org/entity/Q61852199> rdfs:label ?label .
  FILTER(LANG(?label)="en")
}

Fariz Darari
22
P4174-Wikimedia-username
P496-ORCID-iD
P1153-Scopus-author-ID
P69-educated-at
P2888-exact-match
P1960-Google-Scholar-author-ID
P103-native-language
P108-employer
P2038-ResearchGate-profile-ID
P2456-DBLP-author-ID
P2037-GitHub-username
P184-doctoral-advisor
P31-instance-of
P1412-languages-spoken,-written-or-signed
P549-Mathematics-Genealogy-Project-ID
P6634-LinkedIn-personal-profile-ID
P166-award-received
P6178-Dimensions-author-ID
P106-occupation
P2002-Twitter-username
P27-country-of-citizenship
P21-sex-or-gender
[['Q61852199', 'Fariz Darari', 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]


Unnamed: 0,QID,Name,P4174-Wikimedia-username,P496-ORCID-iD,P1153-Scopus-author-ID,P69-educated-at,P2888-exact-match,P1960-Google-Scholar-author-ID,P103-native-language,P108-employer,...,P31-instance-of,"P1412-languages-spoken,-written-or-signed",P549-Mathematics-Genealogy-Project-ID,P6634-LinkedIn-personal-profile-ID,P166-award-received,P6178-Dimensions-author-ID,P106-occupation,P2002-Twitter-username,P27-country-of-citizenship,P21-sex-or-gender
0,Q61852199,Fariz Darari,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


References:

*   https://en.wikipedia.org/wiki/RDFLib
*   https://rdflib.readthedocs.io/en/stable/
*   https://www.youtube.com/watch?v=iYs8l-Z1tZE

