In [1]:
import numpy as np
import sqlite3
import wikidata_utils as wdutils
import pandas as pd
import ast
from sample_size import sample_size

import qwikidata
from qwikidata.json_dump import WikidataJsonDump
from qwikidata.linked_data_interface import get_entity_dict_from_api
from qwikidata.sparql import (get_subclasses_of_item,
                              return_sparql_query_results)
from qwikidata.entity import WikidataItem, WikidataProperty, WikidataLexeme

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

from tqdm.auto import tqdm
tqdm.pandas()

SEED=42

from IPython.display import clear_output

INFO: Pandarallel will run on 24 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


  from pandas import Panel


In [2]:
# Wikidata API client from the project-local wikidata_utils module.
# NOTE(review): the entity cache presumably lives at cache_path, and
# save_every_x_queries=np.inf presumably means "never auto-save" -- confirm
# both against wikidata_utils.CachedWikidataAPI.
wdAPI = wdutils.CachedWikidataAPI(
    cache_path='../wikidata_entity_cache.p',
    save_every_x_queries=np.inf,
)

In [3]:
# Open the parsed claims/references SQLite database. `cursor` is the shared,
# notebook-level cursor that the query cells below run their SQL on.
db = sqlite3.connect('../wikidata_claims_refs_parsed.db')
cursor = db.cursor()

# Column labels applied to rows fetched from the `claims` table.
claims_columns = [
    'entity_id',
    'claim_id',
    'rank',
    'property_id',
    'datatype',
    'datavalue',
]

# Column labels applied to rows fetched from the `refs` table.
refs_columns = [
    'reference_id',
    'reference_property_id',
    'reference_index',
    'reference_datatype',
    'reference_value',
]

In [4]:
# Checking first few elements
cursor.execute('select * from refs limit 5;')
head_df = pd.DataFrame(cursor.fetchall())
head_df.columns = refs_columns
head_df

Unnamed: 0,reference_id,reference_property_id,reference_index,reference_datatype,reference_value
0,7c6b234780b3ee79e341952bbe69fd0e93298176,P854,0,url,{'value': 'http://gwpapers.virginia.edu/histor...
1,7c6b234780b3ee79e341952bbe69fd0e93298176,P123,0,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'..."
2,dd6c0c287c2d4dfd674f577b65ac1d6b875005ae,P248,0,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'..."
3,dd6c0c287c2d4dfd674f577b65ac1d6b875005ae,P304,0,string,"{'value': '389', 'type': 'string'}"
4,21e05588017fe6569fd5fb71be9fc6b97812a49b,P248,0,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'..."


In [5]:
# Load the sampled references table from CSV. Its `reference_id` column is what
# the claim-lookup loop later in this notebook iterates over.
text_reference_sampled_df_html = pd.read_csv('text_reference_sampled_df_html.csv')

In [75]:
def reference_id_to_claim_id(reference_id, db_cursor=None):
    """Return every claim_id linked to ``reference_id`` in ``claims_refs``.

    Parameters
    ----------
    reference_id : str
        Reference hash to look up.
    db_cursor : sqlite3.Cursor, optional
        Cursor to run the query on. When omitted, the notebook-level
        ``cursor`` is used, so existing one-argument calls keep working.

    Returns
    -------
    list of str
        Flat list of matching claim ids; empty when nothing matches.

    Raises
    ------
    Exception
        Any database error is re-raised after the offending ``reference_id``
        (and whatever rows were fetched so far) have been printed.

    Notes
    -----
    Earlier revisions returned the raw ``fetchall()`` rows (1-tuples) from an
    early ``return`` that left the flattening code unreachable, and reseeded
    NumPy's global RNG on every call even though nothing was sampled. Both
    are removed here.
    """
    # Only touch the notebook-level cursor when no explicit one was supplied.
    active_cursor = cursor if db_cursor is None else db_cursor
    # Bound before the try block so the except handler can always print it,
    # even when execute() itself is what failed.
    sql_result = None
    try:
        # Bound parameter instead of string formatting: immune to quoting
        # problems and SQL injection through reference_id.
        active_cursor.execute(
            'select claim_id from claims_refs where reference_id=?',
            (reference_id,),
        )
        sql_result = active_cursor.fetchall()
        # fetchall() yields 1-tuples; unwrap them into plain claim-id strings.
        return [row[0] for row in sql_result]
    except Exception:
        print(reference_id)
        print(sql_result)
        raise
        
def reference_id_to_claim_data(reference_id):
    """Fetch the full ``claims`` rows for every claim citing ``reference_id``.

    Parameters
    ----------
    reference_id : str
        Reference hash whose claims should be loaded.

    Returns
    -------
    list of tuple
        One tuple per matching row of the ``claims`` table, in the order the
        claim ids were returned by ``reference_id_to_claim_id``.
    """
    claim_rows = []
    for claim_id in reference_id_to_claim_id(reference_id):
        # The lookup helper may hand back raw fetchall() rows (1-tuples)
        # rather than bare ids; unwrap so the id itself is what gets bound.
        # Formatting the tuple into the SQL text would match nothing.
        if isinstance(claim_id, (tuple, list)):
            claim_id = claim_id[0]
        # Bound parameter: claim ids contain '$' and must never be spliced
        # into the SQL string.
        cursor.execute('select * from claims where claim_id=?;', (str(claim_id),))
        # extend() appends in place; `r = r + d` rebuilt the list every pass.
        claim_rows.extend(cursor.fetchall())
    return claim_rows

# Collect one (reference_id, *claim_row) tuple for every claim that cites each
# sampled reference. extend() grows the list in place; the previous
# `claim_data = claim_data + data` copied the whole list on every iteration.
claim_data = []
for reference_id in text_reference_sampled_df_html.reference_id:
    claim_data.extend(
        (reference_id,) + claim_row
        for claim_row in reference_id_to_claim_data(reference_id)
    )

In [76]:
# Spot-check the claim lookup for a single reference.
# The id previously passed here was 39 characters long: it had lost the leading
# 'c' that the other (40-character) reference ids in this notebook imply, so
# the lookup could only ever come back empty.
reference_id_to_claim_id('ca5a493f7843383b127c7cff4f7c9c26af9b2f87')

[]

In [48]:
# One row per (reference, claim) pair: the reference id followed by the
# columns of the matching `claims` row.
claim_df = pd.DataFrame(claim_data, columns=['reference_id', *claims_columns])
claim_df

Unnamed: 0,reference_id,entity_id,claim_id,rank,property_id,datatype,datavalue
0,390d6c6e68a32e11f8d7b0883cda0557db529fe6,Q5512528,Q5512528$81E8AD02-28AF-4AE3-8ACD-047C30B40B01,normal,P20,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'..."
1,49b9aec8e10815611ff0379a34d5fd7c3830566e,Q12149940,Q12149940$C9FE5F0C-78FD-4ECE-B1C2-16A763B8ED4E,normal,P140,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'..."
2,ab3e9ada7246257ffbfb86fa90a54f25e45a704e,Q583556,Q583556$929DCD2F-3B4A-41B5-89E7-EEC549077834,normal,P740,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'..."
3,ab3e9ada7246257ffbfb86fa90a54f25e45a704e,Q583556,Q583556$2AD75C92-953E-47CB-93A6-D082674F4231,normal,P2031,time,"{'value': {'time': '+1972-00-00T00:00:00Z', 't..."
4,76b04346ad57869d9e5ae1007ba8343d708ab6f9,Q7586053,Q7586053$AD344012-A522-4A8A-AF64-16F859E2EA9A,normal,P2031,time,"{'value': {'time': '+1983-00-00T00:00:00Z', 't..."
...,...,...,...,...,...,...,...
1608,ddebe4072a35e892ac423dc9837b8f3a5e022766,Q58434264,Q58434264$353DC8CF-6AFD-4707-95B1-E7373A8FE08B,normal,P5987,external-id,"{'value': 'SJ', 'type': 'string'}"
1609,78eb8ba32e77a9434161dabc2dee3b4dede3fa0b,Q58481743,Q58481743$387B7A64-8D7E-45F8-914B-D44856E7AA39,normal,P31,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'..."
1610,78eb8ba32e77a9434161dabc2dee3b4dede3fa0b,Q58481743,Q58481743$DE0D1567-2FD0-4537-8D53-8C7622579B5F,normal,P5987,external-id,"{'value': 'BCB', 'type': 'string'}"
1611,78eb8ba32e77a9434161dabc2dee3b4dede3fa0b,Q58481743,Q58481743$72999E39-348D-4DD3-8272-D4A24B9E7894,normal,P364,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'..."


In [58]:
# NOTE(review): claim_id_to_claim_url is defined in a cell that appears further
# down this notebook; on a fresh kernel that definition must be run before this
# cell, otherwise this line raises NameError.
print(claim_id_to_claim_url('Q58481743$72999E39-348D-4DD3-8272-D4A24B9E7894'))

https://www.wikidata.org/wiki/Q58481743#Q58481743$72999E39-348D-4DD3-8272-D4A24B9E7894


In [52]:
def claim_id_to_claim_url(claim_id):
    """Build the wikidata.org URL that points at a single claim.

    The text before the first ``$`` in ``claim_id`` is used as the page name,
    and the complete ``claim_id`` is appended as the URL fragment.
    """
    entity_id, _, _ = claim_id.partition('$')
    return f'https://www.wikidata.org/wiki/{entity_id}#{claim_id}'

In [49]:
# How many sampled claims fall under each Wikidata datatype.
claim_df['datatype'].value_counts()

wikibase-item       752
time                368
external-id         232
string               84
quantity             77
monolingualtext      59
url                  19
globe-coordinate     15
commonsMedia          7
Name: datatype, dtype: int64

In [56]:
# Number of distinct references that still have at least one claim left after
# dropping the external-id, commonsMedia and url datatypes.
excluded_datatypes = ['external-id', 'commonsMedia', 'url']
claim_df.loc[~claim_df['datatype'].isin(excluded_datatypes), 'reference_id'].unique().shape

(555,)

In [59]:
# Number of distinct references present in claim_df.
claim_df['reference_id'].unique().shape

(676,)

In [63]:
# Print a clickable wikidata.org link for every claim whose datatype is 'url'.
for url_claim_id in claim_df.loc[claim_df['datatype'] == 'url', 'claim_id']:
    print(claim_id_to_claim_url(url_claim_id))

https://www.wikidata.org/wiki/Q105087119#Q105087119$83048AD7-8176-4A67-AF23-4A5E2C502527
https://www.wikidata.org/wiki/Q1662094#Q1662094$DE4FBDB0-EFEF-4A7B-ACF6-8C90BAB5C062
https://www.wikidata.org/wiki/Q7016850#Q7016850$429B359C-7367-40F8-B3BC-1F0A9224FCF4
https://www.wikidata.org/wiki/Q2073209#Q2073209$FC7F78A6-DB7E-476F-A4C7-057556A7FB60
https://www.wikidata.org/wiki/Q2073209#Q2073209$83503759-1A55-4001-989D-D2A58445467D
https://www.wikidata.org/wiki/Q3612859#Q3612859$4DF49BC8-5FCF-4483-92A0-BBEE9B335754
https://www.wikidata.org/wiki/Q54619570#Q54619570$5CC320C6-AA28-47C4-BDC4-6534B4BDA4D0
https://www.wikidata.org/wiki/Q30287960#Q30287960$C256B756-3EB7-4D53-9F32-F18FC5BE1998
https://www.wikidata.org/wiki/Q54623675#Q54623675$55D1A6CA-8E9C-43BC-BC35-80243D845AAC
https://www.wikidata.org/wiki/Q54826499#Q54826499$EC23DFF0-8C28-471C-B2B2-E54F8E485484
https://www.wikidata.org/wiki/Q848343#Q848343$7F31C2D2-6BC9-4135-A4EC-88D8DC0F0E4E
https://www.wikidata.org/wiki/Q18557497#Q18557497$69C7A