In [1]:
import numpy as np
import sqlite3
import wikidata_utils as wdutils
import pandas as pd
import ast
from sample_size import sample_size

import qwikidata
from qwikidata.json_dump import WikidataJsonDump
from qwikidata.linked_data_interface import get_entity_dict_from_api
from qwikidata.sparql import (get_subclasses_of_item,
                              return_sparql_query_results)
from qwikidata.entity import WikidataItem, WikidataProperty, WikidataLexeme

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

from tqdm.auto import tqdm
tqdm.pandas()

SEED=42

from IPython.display import clear_output

INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


  from pandas import Panel


In [2]:
# Wikidata API client from the project-local `wikidata_utils` module.
# NOTE(review): presumably entity lookups are cached in the pickle file at
# `cache_path`, and save_every_x_queries=np.inf disables periodic cache saves
# during the run — confirm against wikidata_utils.
wdAPI = wdutils.CachedWikidataAPI(
    cache_path = '../wikidata_entity_cache.p',
    save_every_x_queries=np.inf
)

In [3]:
# Open the parsed claims/references SQLite database. `cursor` is a module-level
# handle used by the query code in the following cells.
db = sqlite3.connect('../wikidata_claims_refs_parsed.db')
cursor = db.cursor()
# sqlite3 returns rows as plain tuples, so these lists supply the column names
# when rows from the `claims` and `refs` tables are turned into DataFrames.
claims_columns = ['entity_id','claim_id','rank','property_id','datatype','datavalue']
refs_columns = ['reference_id', 'reference_property_id', 'reference_index', 'reference_datatype', 'reference_value']

In [4]:
# Sanity check: look at the first few rows of the `refs` table.
cursor.execute('select * from refs limit 5;')
# Column names are passed at construction time: assigning `.columns` afterwards
# raises a length-mismatch error when the query returns no rows.
head_df = pd.DataFrame(cursor.fetchall(), columns=refs_columns)
head_df

Unnamed: 0,reference_id,reference_property_id,reference_index,reference_datatype,reference_value
0,7c6b234780b3ee79e341952bbe69fd0e93298176,P854,0,url,{'value': 'http://gwpapers.virginia.edu/histor...
1,7c6b234780b3ee79e341952bbe69fd0e93298176,P123,0,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'..."
2,dd6c0c287c2d4dfd674f577b65ac1d6b875005ae,P248,0,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'..."
3,dd6c0c287c2d4dfd674f577b65ac1d6b875005ae,P304,0,string,"{'value': '389', 'type': 'string'}"
4,21e05588017fe6569fd5fb71be9fc6b97812a49b,P248,0,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'..."


In [13]:
# Load the sampled references (one row per reference, including the fetched
# `html` column) from a CSV in the notebook's working directory.
text_reference_sampled_df_html = pd.read_csv('text_reference_sampled_df_html.csv')
text_reference_sampled_df_html

Unnamed: 0,reference_id,reference_property_id,reference_datatype,url,netloc,netloc_agg,error_msg,code,content-type,final_url,reason,language_crawl,language_crawl_score,sampling_weight_vb,sampling_weight,html
0,390d6c6e68a32e11f8d7b0883cda0557db529fe6,P4656,url,https://en.wikipedia.org/w/index.php?title=G._...,en.wikipedia.org,en.wikipedia.org,none,200,text/html; charset=UTF-8,https://en.wikipedia.org/w/index.php?title=G._...,OK,en,0.934613,321744|13,24749.538462,"<html class=""client-js ve-available"" lang=""en""..."
1,49b9aec8e10815611ff0379a34d5fd7c3830566e,P4656,url,https://en.wikipedia.org/w/index.php?title=Seb...,en.wikipedia.org,en.wikipedia.org,none,200,text/html; charset=UTF-8,https://en.wikipedia.org/w/index.php?title=Seb...,OK,en,0.646651,321744|13,24749.538462,"<html class=""client-js ve-available"" lang=""en""..."
2,ab3e9ada7246257ffbfb86fa90a54f25e45a704e,P4656,url,https://en.wikipedia.org/w/index.php?title=Ave...,en.wikipedia.org,en.wikipedia.org,none,200,text/html; charset=UTF-8,https://en.wikipedia.org/w/index.php?title=Ave...,OK,en,0.842498,321744|13,24749.538462,"<html class=""client-js ve-available"" lang=""en""..."
3,76b04346ad57869d9e5ae1007ba8343d708ab6f9,P4656,url,https://en.wikipedia.org/w/index.php?title=Yut...,en.wikipedia.org,en.wikipedia.org,none,200,text/html; charset=UTF-8,https://en.wikipedia.org/w/index.php?title=Yut...,OK,en,0.608352,321744|13,24749.538462,"<html class=""client-js ve-available"" lang=""en""..."
4,1ee71b39caf6df395c64b436fea4895692812d38,P4656,url,https://en.wikipedia.org/w/index.php?title=Dol...,en.wikipedia.org,en.wikipedia.org,none,200,text/html; charset=UTF-8,https://en.wikipedia.org/w/index.php?title=Dol...,OK,en,0.856581,321744|13,24749.538462,"<html class=""client-js ve-available"" lang=""en""..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
671,30bba624155e8c3a681add4b08967cfb23821a4b,P854,url,https://indiancine.ma/CCE,indiancine.ma,indiancine.ma,none,200,text/html; charset=utf-8,https://indiancine.ma/CCE,OK,en,0.612925,993|13,76.384615,"<html><head>\n <meta charset=""utf-8"">\n..."
672,fcfe8a0995f157f013786f42fb33dd928c3ebb9c,P854,url,https://indiancine.ma/ARC,indiancine.ma,indiancine.ma,none,200,text/html; charset=utf-8,https://indiancine.ma/ARC,OK,en,0.753929,993|13,76.384615,"<html><head>\n <meta charset=""utf-8"">\n..."
673,b39988975bcaf5f41ff4143e7624873d5dafba0d,P854,url,https://indiancine.ma/BQK,indiancine.ma,indiancine.ma,none,200,text/html; charset=utf-8,https://indiancine.ma/BQK,OK,en,0.576731,993|13,76.384615,"<html><head>\n <meta charset=""utf-8"">\n..."
674,eca1aeeae63d107ee1bc18b85fce09b1d51c3323,P854,url,https://indiancine.ma/BUR,indiancine.ma,indiancine.ma,none,200,text/html; charset=utf-8,https://indiancine.ma/BUR,OK,en,0.666638,993|13,76.384615,"<html><head>\n <meta charset=""utf-8"">\n..."


In [27]:
def reference_id_to_claim_id(reference_id):
    """Return the ids of every claim that cites the given reference.

    No sampling takes place: all matching claim ids are returned.

    Args:
        reference_id: Value matched against ``claims_refs.reference_id``.

    Returns:
        A 1-D numpy array with one claim id per matching row (empty when no
        row matches).

    Raises:
        Exception: Any error raised while querying or reshaping is re-raised
            after ``reference_id`` and the rows fetched so far are printed.
    """
    # Bound before the try so the error handler can always print it, even
    # when ``cursor.execute`` itself is what failed.
    sql_result = None
    try:
        # Kept from the original: reseeds NumPy's global RNG on every call.
        np.random.seed(SEED)
        # The value is bound as a parameter instead of being pasted into the
        # SQL text. A double-quoted token is parsed by SQLite as an identifier
        # first, so an id equal to a column name (or containing a quote)
        # would otherwise change the meaning of the query.
        cursor.execute(
            'select claim_id from claims_refs where reference_id=?',
            (reference_id,),
        )
        sql_result = cursor.fetchall()
        # Each fetched row is a 1-tuple; flatten to a 1-D array of claim ids.
        return np.array(sql_result).reshape(-1)
    except Exception:
        print(reference_id)
        print(sql_result)
        raise
        
def reference_id_to_claim_data(reference_id):
    """Fetch the full ``claims`` rows of every claim citing a reference.

    Args:
        reference_id: Reference id passed on to ``reference_id_to_claim_id``.

    Returns:
        A list of row tuples from the ``claims`` table, grouped in the order
        the claim ids were returned; empty when the reference has no claims.
    """
    claim_rows = []
    for claim_id in reference_id_to_claim_id(reference_id):
        # Bound parameter instead of string formatting, so quotes or other
        # special characters in the id cannot alter the query. str() mirrors
        # the text conversion the previous f-string performed.
        cursor.execute('select * from claims where claim_id=?;', (str(claim_id),))
        # extend() grows the list in place instead of copying it each time.
        claim_rows.extend(cursor.fetchall())
    return claim_rows

# For every sampled reference, collect the rows of all claims that cite it.
# Each claim row is prefixed with the reference id it was reached through.
claim_data = []
for reference_id in text_reference_sampled_df_html.reference_id:
    data = reference_id_to_claim_data(reference_id)
    data = [(reference_id,) + t for t in data]
    # extend() appends in place; `claim_data = claim_data + data` copied the
    # whole accumulated list on every iteration.
    claim_data.extend(data)

In [28]:
# One row per (reference, claim) pair: the reference id followed by the
# columns of the `claims` table.
claim_df = pd.DataFrame(claim_data, columns = ['reference_id'] + claims_columns)
claim_df

Unnamed: 0,reference_id,entity_id,claim_id,rank,property_id,datatype,datavalue
0,390d6c6e68a32e11f8d7b0883cda0557db529fe6,Q5512528,Q5512528$81E8AD02-28AF-4AE3-8ACD-047C30B40B01,normal,P20,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'..."
1,49b9aec8e10815611ff0379a34d5fd7c3830566e,Q12149940,Q12149940$C9FE5F0C-78FD-4ECE-B1C2-16A763B8ED4E,normal,P140,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'..."
2,ab3e9ada7246257ffbfb86fa90a54f25e45a704e,Q583556,Q583556$929DCD2F-3B4A-41B5-89E7-EEC549077834,normal,P740,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'..."
3,ab3e9ada7246257ffbfb86fa90a54f25e45a704e,Q583556,Q583556$2AD75C92-953E-47CB-93A6-D082674F4231,normal,P2031,time,"{'value': {'time': '+1972-00-00T00:00:00Z', 't..."
4,76b04346ad57869d9e5ae1007ba8343d708ab6f9,Q7586053,Q7586053$AD344012-A522-4A8A-AF64-16F859E2EA9A,normal,P2031,time,"{'value': {'time': '+1983-00-00T00:00:00Z', 't..."
...,...,...,...,...,...,...,...
1687,93c70463e9b27bd9a1d62a170b23cc55ced0f7a4,Q58481796,Q58481796$D8F8A626-D563-4960-B5B2-FB8B403247FD,normal,P31,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'..."
1688,93c70463e9b27bd9a1d62a170b23cc55ced0f7a4,Q58481796,Q58481796$D84041E5-3E60-4E3E-B8BF-AE64FE62FB71,normal,P5987,external-id,"{'value': 'BCY', 'type': 'string'}"
1689,93c70463e9b27bd9a1d62a170b23cc55ced0f7a4,Q58481796,Q58481796$9778A520-F082-4A6F-9CF2-80BFF3B90645,normal,P495,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'..."
1690,93c70463e9b27bd9a1d62a170b23cc55ced0f7a4,Q58481796,Q58481796$E2ECFB6D-5879-46E1-93D7-E54363EF7276,normal,P364,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'..."


In [29]:
#reference_id_to_claim_data('93c70463e9b27bd9a1d62a170b23cc55ced0f7a4')

#cursor.execute('select * from claims_refs where reference_id=\'ca5a493f7843383b127c7cff4f7c9c26af9b2f87\'')
#sql_result = cursor.fetchall()
#sql_result

In [31]:
def claim_id_to_claim_url(claim_id):
    """Build the Wikidata page URL that points at a specific claim.

    The text before the first ``$`` in ``claim_id`` is used as the page name
    and the complete claim id is appended as the URL fragment.
    """
    entity_id, _, _ = claim_id.partition('$')
    return f'https://www.wikidata.org/wiki/{entity_id}#{claim_id}'

In [32]:
#print(claim_id_to_claim_url('Q58481743$72999E39-348D-4DD3-8272-D4A24B9E7894'))

In [33]:
# Number of collected claims per datatype.
claim_df.datatype.value_counts()

wikibase-item       876
time                387
external-id         142
string               90
quantity             89
monolingualtext      78
url                  16
globe-coordinate     14
Name: datatype, dtype: int64

In [69]:
# Claim datatypes that are excluded from the rest of the notebook.
BAD_DATATYPES = ['external-id','commonsMedia','url', 'globe-coordinate', 'wikibase-lexeme', 'wikibase-property']

# Sanity check: once claims with an excluded datatype are set aside, every
# reference must still have at least one claim left.
all_reference_ids = claim_df.reference_id.unique()
kept_reference_ids = claim_df[~claim_df.datatype.isin(BAD_DATATYPES)].reference_id.unique()
assert kept_reference_ids.shape == all_reference_ids.shape

print(all_reference_ids.shape[0])

672


In [70]:
# Drop the claims whose datatype is in BAD_DATATYPES and renumber the rows
# so the index is a clean 0..n-1 range again.
claim_df = claim_df[~claim_df.datatype.isin(BAD_DATATYPES)].reset_index(drop=True)

In [82]:
# Look up a label for each claim's subject entity and property.
# NOTE(review): assumes wdAPI.get_label returns a 2-item (label, language)
# result per id — confirm against wikidata_utils.
# The helper frames are built on claim_df's own index: column assignment
# aligns on index labels, so a default 0..n-1 index would only line up when
# claim_df had been re-indexed beforehand.
claim_df[['entity_label','entity_label_lan']] = pd.DataFrame(
    claim_df.entity_id.apply(wdAPI.get_label, non_language_set=True).tolist(),
    index=claim_df.index
)
claim_df[['property_label','property_label_lan']] = pd.DataFrame(
    claim_df.property_id.apply(wdAPI.get_label, non_language_set=True).tolist(),
    index=claim_df.index
)

In [76]:
import ast
import pdb
from datetime import datetime
# strptime patterns for Wikidata time strings (sign already stripped). Month
# and/or day may be '00' when the precision is coarser than a day.
_TIME_FORMAT_DAY = '%Y-%m-%dT00:00:%SZ'
_TIME_FORMAT_MONTH = '%Y-%m-00T00:00:%SZ'
_TIME_FORMAT_YEAR = '%Y-00-00T00:00:%SZ'

# Precisions coarser than a millennium: precision -> (years per unit, unit name).
_TIME_MAGNITUDES = {
    4: (1e5, 'hundred thousand years'),
    3: (1e6, 'million years'),
    0: (1e9, 'billion years'),
}


def _parse_time(time, *formats):
    """Parse ``time`` with the first matching format; raise ValueError if none match."""
    last_error = None
    for fmt in formats:
        try:
            return datetime.strptime(time, fmt)
        except ValueError as error:
            last_error = error
    raise last_error


def _time_value_to_label(value):
    """Render the ``value`` dict of a time datavalue as a ``(text, 'en')`` pair."""
    time = value['time']
    precision = value['precision']
    assert value['after'] == 0 and value['before'] == 0

    # The first character is the sign; negative years get a 'BC' suffix.
    sufix = 'BC' if time[0] == '-' else ''
    time = time[1:]

    if precision in _TIME_MAGNITUDES:
        years_per_unit, unit_name = _TIME_MAGNITUDES[precision]
        # Years on these scales have more than four digits, which strptime's
        # %Y cannot parse, so the year field is read directly from the string.
        year = int(time.split('-', 1)[0])
        label = f'{round(year / years_per_unit, 1)} {unit_name}'
        return (f'{label} {sufix}' if sufix else label, 'en')

    if precision == 11:  # day
        return (_parse_time(time, _TIME_FORMAT_DAY).strftime('%d/%m/%Y') + sufix, 'en')
    if precision == 10:  # month
        parsed = _parse_time(time, _TIME_FORMAT_MONTH, _TIME_FORMAT_DAY)
        return (parsed.strftime("%B of %Y") + sufix, 'en')
    if precision in (9, 8, 7, 6):
        year = _parse_time(time, _TIME_FORMAT_YEAR, _TIME_FORMAT_DAY).strftime('%Y')
        if precision == 9:  # year
            return (year + sufix, 'en')
        if precision == 8:  # decade, e.g. 1985 -> '1980s'
            return (year[:-1] + '0s' + sufix, 'en')
        mode = 'C' if precision == 7 else 'M'  # century / millennium
        return (turn_to_century_or_millennium(year, mode=mode) + sufix, 'en')

    raise ValueError(f'Unsupported time precision: {precision}')


def get_object_label_given_datatype(row):
    """Turn a claim's datavalue into a human-readable ``(label, language)`` pair.

    Args:
        row: Mapping (e.g. a DataFrame row) with a ``'datatype'`` entry and a
            ``'datavalue'`` entry. ``'datavalue'`` is either the string
            ``'somevalue'``/``'novalue'`` or the ``repr`` of a dict that has a
            ``'value'`` key.

    Returns:
        A 2-tuple ``(label, language)``:
        - ``'somevalue'``/``'novalue'`` -> ``(datavalue, 'no_lan')``
        - ``wikibase-item`` -> whatever ``wdAPI.get_label`` returns for the id
        - ``monolingualtext`` -> the text and its language
        - ``quantity`` -> the amount (leading ``+`` removed), followed by the
          unit label when the unit is not ``'1'``
        - ``time`` -> an English rendering that depends on the precision
        - ``string`` -> ``(value, 'en')``

    Raises:
        ValueError: For an unsupported datatype or time precision, or when a
            time string cannot be parsed.
    """
    dt = row['datatype']
    dv = row['datavalue']

    if dv in ['somevalue', 'novalue']:
        return (dv, 'no_lan')
    if dt not in ('wikibase-item', 'monolingualtext', 'quantity', 'time', 'string'):
        raise ValueError(f'Unsupported datatype: {dt}')

    # The datavalue is stored as the repr of a dict; parse it once.
    value = ast.literal_eval(dv)['value']

    if dt == 'wikibase-item':
        return wdAPI.get_label(value['id'], True)
    if dt == 'monolingualtext':
        return (value['text'], value['language'])
    if dt == 'quantity':
        amount, unit = value['amount'], value['unit']
        if amount[0] == '+':
            amount = amount[1:]
        if str(unit) == '1':  # a unit of '1' is rendered as a bare number
            return (str(amount), 'en')
        # Otherwise the unit is a URI whose last path segment is looked up.
        unit_label = wdAPI.get_label(unit.split('/')[-1], True)
        return (' '.join([amount, unit_label[0]]), unit_label[1])
    if dt == 'time':
        return _time_value_to_label(value)
    # dt == 'string'
    return (value, 'en')
            
def turn_to_century_or_millennium(y, mode):
    """Name the century or millennium that a year falls in.

    Args:
        y: Non-negative year, as an int or a string of digits.
        mode: ``'C'`` for century, ``'M'`` for millennium.

    Returns:
        An ordinal followed by the period name, e.g. ``'20th century'`` for
        1972 or ``'2nd millennium'`` for 2000.

    Raises:
        ValueError: If ``mode`` is neither ``'C'`` nor ``'M'``.
    """
    if mode == 'C':
        div, mode_name = 100, 'century'
    elif mode == 'M':
        div, mode_name = 1000, 'millennium'
    else:
        raise ValueError('Use mode = C for century and M for millennium')

    # Ceiling division: a period ends on the year divisible by `div`, so
    # 1901-2000 is the 20th century and 2001 starts the 21st.
    group = -(-int(y) // div)

    # 11, 12 and 13 (and 111, 112, ...) take 'th', unlike other numbers
    # ending in 1, 2 or 3.
    if group % 100 in (11, 12, 13):
        group_suffix = 'th'
    else:
        group_suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(group % 10, 'th')

    return f'{group}{group_suffix} {mode_name}'

In [77]:
# Render every claim's value as a (label, language) pair.
# The helper frame is built on claim_df's own index: column assignment aligns
# on index labels, so a default 0..n-1 index would only line up when claim_df
# had been re-indexed beforehand.
claim_df[['object_label','object_label_lan']] = pd.DataFrame(
    claim_df.apply(get_object_label_given_datatype, axis=1).tolist(),
    index=claim_df.index
)

In [117]:
# Ids of the sampled references whose netloc is www.disease-ontology.org.
text_reference_sampled_df_html[text_reference_sampled_df_html['netloc'] == 'www.disease-ontology.org'].reference_id.tolist()

['2c190136876455fcda5619d61e22f50a45422cc2',
 '38b71d197b012e168326bf6ebe5ed1159850fc9f',
 '899ee9e4a795b714c940e63d1a850a383b51a88c',
 'ae1ef222087a9501699b6728d9f1e1f8297b3400',
 '1ab4a3c0edc80e3347c09d725196d9f27993d692',
 '216589ad290e07faa9771ffeb47250bf86c09025',
 '47d9a92158db0e056d690d6b426ac4b61254c267',
 '89eefe7731a8fb1fc19b2e6d21801eb5152bb8cb',
 '5ed7ac0179f23b1cc20da925cf5e022fba8d3279',
 'd9165440fff6db6ab2dcbddcc170ef8718cddb5d',
 '645a5e7f3426ef946ded265bed30414226442546',
 'c6400d7cc5d51c19f69c2973d3a27abcf65030ba',
 '5bd19975a36603f098b746856af1ce3013c4fbcb']

In [118]:
# Claims that cite one of the www.disease-ontology.org references.
claim_df[claim_df.reference_id.isin(
    text_reference_sampled_df_html[text_reference_sampled_df_html['netloc'] == 'www.disease-ontology.org'].reference_id.tolist()
)]

Unnamed: 0,reference_id,entity_id,claim_id,rank,property_id,datatype,datavalue,entity_label,entity_label_lan,property_label,property_label_lan,object_label,object_label_lan
846,2c190136876455fcda5619d61e22f50a45422cc2,Q18556388,Q18556388$FDE57475-321D-4523-A993-F2621F361E70,normal,P1748,string,"{'value': 'C27626', 'type': 'string'}",granulomatous endometritis,en,NCI Thesaurus ID,en,C27626,en
847,38b71d197b012e168326bf6ebe5ed1159850fc9f,Q848343,Q848343$905E7953-FA78-4FBF-9C01-8084C7CA9B7C,normal,P279,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'...",glucose-6-phosphate dehydrogenase deficiency,en,subclass of,en,carbohydrate metabolic disorder,en
848,899ee9e4a795b714c940e63d1a850a383b51a88c,Q18557497,Q18557497$B091AF1C-86C0-40A7-A595-7E55CF5F5F80,normal,P279,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'...",lung occult adenocarcinoma,en,subclass of,en,adenocarcinoma of the lung,en
849,ae1ef222087a9501699b6728d9f1e1f8297b3400,Q207133,Q207133$688D31EA-99C0-4AFE-B8D5-0092BC3D7DB3,normal,P279,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'...",Klinefelter's syndrome,en,subclass of,en,chromosomal duplication syndrome,en
850,1ab4a3c0edc80e3347c09d725196d9f27993d692,Q7204592,Q7204592$F8AC07D4-C4CF-4ED2-B206-D0459C4CC979,normal,P279,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'...",pleomorphic lipoma,en,subclass of,en,lipoma,en
851,216589ad290e07faa9771ffeb47250bf86c09025,Q298230,Q298230$FD77C4EB-D1D8-4C7C-9F74-DCCDF489998E,normal,P1748,string,"{'value': 'C9224', 'type': 'string'}",esophagitis,en,NCI Thesaurus ID,en,C9224,en
852,216589ad290e07faa9771ffeb47250bf86c09025,Q298230,Q298230$A6A1AFD4-4FF7-404C-81ED-9E766AC836F8,normal,P1692,string,"{'value': '530.10', 'type': 'string'}",esophagitis,en,ICD-9-CM,en,530.10,en
853,216589ad290e07faa9771ffeb47250bf86c09025,Q298230,Q298230$92B8CF28-1299-44B6-AF66-55C3DC3561EA,normal,P1692,string,"{'value': '530.1', 'type': 'string'}",esophagitis,en,ICD-9-CM,en,530.1,en
854,47d9a92158db0e056d690d6b426ac4b61254c267,Q1031536,Q1031536$F11CFC65-05A1-4FD6-87F5-F3C92D33FF7A,normal,P1748,string,"{'value': 'C84609', 'type': 'string'}",campomelic dysplasia,en,NCI Thesaurus ID,en,C84609,en
855,47d9a92158db0e056d690d6b426ac4b61254c267,Q1031536,Q1031536$6905FE68-2C51-4CED-ABDD-ADD3F8BCD16B,normal,P1748,string,"{'value': 'C120205', 'type': 'string'}",campomelic dysplasia,en,NCI Thesaurus ID,en,C120205,en


In [105]:
# Example: print the page URL for a single claim id.
print(claim_id_to_claim_url('Q1031536$F11CFC65-05A1-4FD6-87F5-F3C92D33FF7A'))

https://www.wikidata.org/wiki/Q1031536#Q1031536$F11CFC65-05A1-4FD6-87F5-F3C92D33FF7A


In [124]:
# Count how often each property, keyed as "<property_id>_<property_label>",
# occurs among the claims. Rows are printed one at a time, so the notebook
# prints every row rather than an abbreviated table.
for row in claim_df.apply(lambda x : x['property_id']+'_'+x['property_label'], axis=1)\
    .value_counts().reset_index().iterrows():
    print(row)

(0, index    P31_instance of
0                    161
Name: 0, dtype: object)
(1, index    P569_date of birth
0                       151
Name: 1, dtype: object)
(2, index    P570_date of death
0                       120
Name: 2, dtype: object)
(3, index    P39_position held
0                       65
Name: 3, dtype: object)
(4, index    P571_inception
0                    50
Name: 4, dtype: object)
(5, index    P217_inventory number
0                           44
Name: 5, dtype: object)
(6, index    P186_made from material
0                             43
Name: 6, dtype: object)
(7, index    P195_collection
0                     42
Name: 7, dtype: object)
(8, index    P1476_title
0                 42
Name: 8, dtype: object)
(9, index    P136_genre
0                39
Name: 9, dtype: object)
(10, index    P276_location
0                   36
Name: 10, dtype: object)
(11, index    P21_sex or gender
0                       35
Name: 11, dtype: object)
(12, index    P17_country
0         

In [90]:
# Rows whose object label is the literal string 'no-label'.
# NOTE(review): presumably the placeholder wdAPI.get_label returns when an
# entity has no usable label — confirm against wikidata_utils.
claim_df[claim_df.object_label == 'no-label']

Unnamed: 0,reference_id,entity_id,claim_id,rank,property_id,datatype,datavalue,entity_label,entity_label_lan,property_label,property_label_lan,object_label,object_label_lan
