In [1]:
import numpy as np
import sqlite3
import wikidata_utils as wdutils
import pandas as pd
import ast
from sample_size import sample_size

import qwikidata
from qwikidata.json_dump import WikidataJsonDump
from qwikidata.linked_data_interface import get_entity_dict_from_api
from qwikidata.sparql import (get_subclasses_of_item,
                              return_sparql_query_results)
from qwikidata.entity import WikidataItem, WikidataProperty, WikidataLexeme

# Set up pandarallel with progress bars enabled (the worker count is
# reported in the log lines printed by this cell).
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

# Hook tqdm into pandas so that progress-reporting apply variants are available.
from tqdm.auto import tqdm
tqdm.pandas()

# Seed passed to np.random.seed inside reference_id_to_claim_id.
SEED=42

from IPython.display import clear_output

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
#from importlib import reload
#wdAPI.save_entity_cache(force=True)
#reload(wdutils)

In [None]:
# Location of the pickled entity cache handed to the cached Wikidata client.
# NOTE(review): absolute, user-specific path; adjust when running elsewhere.
ENTITY_CACHE_PATH = '/home/k20036346/Repos/RefSupPipeline/wikidata_claim_data/wikidata_entity_cache.p'

# Client used below for every label / alias / description lookup.
# NOTE(review): save_every_x_queries=np.inf presumably disables periodic
# automatic cache saves; confirm against wikidata_utils.
wdAPI = wdutils.CachedWikidataAPI(cache_path=ENTITY_CACHE_PATH, save_every_x_queries=np.inf)

In [None]:
# SQLite database queried below through its `refs`, `claims_refs` and
# `claims` tables.
# NOTE(review): absolute path on a shared archive; adjust when running elsewhere.
CLAIMS_DB_PATH = '/archive/group_simperl/wikidata/wikidata_claims_refs_parsed.db'
db = sqlite3.connect(CLAIMS_DB_PATH)
cursor = db.cursor()

# Column names applied to rows fetched with `select *` from the `claims`
# and `refs` tables respectively.
claims_columns = [
    'entity_id',
    'claim_id',
    'rank',
    'property_id',
    'datatype',
    'datavalue',
]
refs_columns = [
    'reference_id',
    'reference_property_id',
    'reference_index',
    'reference_datatype',
    'reference_value',
]

In [None]:
# Checking first few elements
cursor.execute('select * from refs limit 5;')
head_df = pd.DataFrame(cursor.fetchall())
head_df.columns = refs_columns
head_df

In [None]:
# Load the sampled references from CSV (path is relative to the working
# directory); its `reference_id` column drives the claim lookup below.
text_reference_sampled_df_html = pd.read_csv('text_reference_sampled_df_html.csv')
text_reference_sampled_df_html

In [None]:
def reference_id_to_claim_id(reference_id):
    """Return the claim ids linked to ``reference_id`` in ``claims_refs``.

    Uses the module-level ``cursor``. The id is bound as a query parameter
    rather than interpolated into the SQL text, so ids containing quote
    characters are handled and the value can never be parsed as an identifier.

    Parameters
    ----------
    reference_id : str
        Reference id to look up (converted with ``str`` before binding).

    Returns
    -------
    numpy.ndarray
        1-D array with one entry per matching ``claim_id``; empty when the
        reference has no linked claims.

    Raises
    ------
    Exception
        Any error from the query or the reshape is re-raised after the
        reference id and the rows fetched so far (``None`` if the query itself
        failed) have been printed.
    """
    # Pre-bind so the except branch can always print it, even when
    # execute() itself fails.
    sql_result = None
    try:
        # NOTE(review): nothing below draws random numbers; this re-seeding of
        # the global NumPy RNG looks vestigial but is kept so that any later
        # random draws in the notebook are unaffected by this edit.
        np.random.seed(SEED)
        cursor.execute(
            'select claim_id from claims_refs where reference_id=?',
            (str(reference_id),),
        )
        sql_result = cursor.fetchall()
        # fetchall() yields 1-tuples; flatten them to a 1-D array of ids.
        return np.array(sql_result).reshape(-1)
    except Exception:
        print(reference_id)
        print(sql_result)
        raise
        
def reference_id_to_claim_data(reference_id):
    """Return the full ``claims`` rows for every claim citing ``reference_id``.

    Claim ids are resolved with ``reference_id_to_claim_id``; each one is then
    looked up in the ``claims`` table through the module-level ``cursor``. The
    claim id is bound as a query parameter instead of being interpolated into
    the SQL text.

    Parameters
    ----------
    reference_id : str
        Reference id whose claims should be fetched.

    Returns
    -------
    list of tuple
        Rows of ``select * from claims`` for the linked claims, in the order
        the claim ids were returned; empty when there are none.
    """
    rows = []
    for claim_id in reference_id_to_claim_id(reference_id):
        # str() turns NumPy string scalars into plain Python strings for binding.
        cursor.execute('select * from claims where claim_id=?;', (str(claim_id),))
        # extend() grows the list in place instead of rebuilding it per claim.
        rows.extend(cursor.fetchall())
    return rows

# Gather the claim rows for every sampled reference, prefixing each row with
# the reference id it was found through.
claim_data = []
for reference_id in text_reference_sampled_df_html.reference_id:
    data = [(reference_id,) + t for t in reference_id_to_claim_data(reference_id)]
    # extend() appends in place; `claim_data = claim_data + data` copied the
    # whole accumulated list on every iteration.
    claim_data.extend(data)

In [None]:
# One row per (reference, claim) pair: the reference id followed by the
# columns of the `claims` table.
claim_df = pd.DataFrame(claim_data, columns=['reference_id', *claims_columns])
claim_df

In [10]:
#reference_id_to_claim_data('93c70463e9b27bd9a1d62a170b23cc55ced0f7a4')

#cursor.execute('select * from claims_refs where reference_id=\'ca5a493f7843383b127c7cff4f7c9c26af9b2f87\'')
#sql_result = cursor.fetchall()
#sql_result

In [11]:
def claim_id_to_claim_url(claim_id):
    """Build the wikidata.org URL that points at a single claim.

    The text before the first ``$`` of ``claim_id`` is used as the page name
    and the complete claim id becomes the URL fragment.
    """
    entity_id, _sep, _rest = claim_id.partition('$')
    return f'https://www.wikidata.org/wiki/{entity_id}#{claim_id}'

In [12]:
#print(claim_id_to_claim_url('Q58481743$72999E39-348D-4DD3-8272-D4A24B9E7894'))

In [13]:
# Number of collected claims per Wikidata datatype, most frequent first.
claim_df.datatype.value_counts()

wikibase-item       926
time                389
external-id         149
string              101
quantity             91
monolingualtext      82
url                  15
globe-coordinate     12
commonsMedia          1
Name: datatype, dtype: int64

In [14]:
# Datatypes excluded from the claim set.
# NOTE(review): presumably these have no usable natural-language object label; confirm.
BAD_DATATYPES = ['external-id','commonsMedia','url', 'globe-coordinate', 'wikibase-lexeme', 'wikibase-property']

# Excluding those datatypes must not lose any reference: every reference_id
# has to keep at least one claim of a remaining datatype.
is_bad_datatype = claim_df.datatype.isin(BAD_DATATYPES)
n_references_total = len(claim_df.reference_id.unique())
n_references_kept = len(claim_df[~is_bad_datatype].reference_id.unique())
assert n_references_kept == n_references_total

print(n_references_total)

676


In [15]:
# Keep only claims whose datatype is not listed in BAD_DATATYPES and
# renumber the rows from zero.
keep_mask = ~claim_df.datatype.isin(BAD_DATATYPES)
claim_df = claim_df.loc[keep_mask].reset_index(drop=True)

In [16]:
# Attach label, alias and description (each with its `_lan` companion column)
# for the entity and the property of every claim. Each lookup returns a pair,
# which is spread over two new columns. The iteration order reproduces the
# column order: entity/property label, then alias, then desc.
for text_kind, lookup in (
    ('label', wdAPI.get_label),
    ('alias', wdAPI.get_alias),
    ('desc', wdAPI.get_desc),
):
    for prefix in ('entity', 'property'):
        looked_up = claim_df[f'{prefix}_id'].apply(lookup, non_language_set=True)
        claim_df[[f'{prefix}_{text_kind}', f'{prefix}_{text_kind}_lan']] = pd.DataFrame(
            looked_up.tolist()
        )

In [140]:
# Attach label, alias and description (each with its `_lan` companion column)
# for the object of every claim; the lookups receive the claim's datatype
# and datavalue.
for text_kind, lookup in (
    ('label', wdAPI.get_object_label_given_datatype),
    ('alias', wdAPI.get_object_alias_given_datatype),
    ('desc', wdAPI.get_object_desc_given_datatype),
):
    looked_up = claim_df.apply(
        # Bind `lookup` as a default so each lambda keeps its own resolver.
        lambda claim, lookup=lookup: lookup(claim['datatype'], claim['datavalue']),
        axis=1,
    )
    claim_df[[f'object_{text_kind}', f'object_{text_kind}_lan']] = pd.DataFrame(
        looked_up.tolist()
    )

Wikidata redirect detected.  Input entity id=Q20085892. Returned entity id=Q484188.


In [53]:
# Summarise row count, columns, non-null counts and dtypes of the enriched frame.
claim_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1589 entries, 0 to 1588
Data columns (total 25 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   reference_id        1589 non-null   object
 1   entity_id           1589 non-null   object
 2   claim_id            1589 non-null   object
 3   rank                1589 non-null   object
 4   property_id         1589 non-null   object
 5   datatype            1589 non-null   object
 6   datavalue           1589 non-null   object
 7   entity_label        1589 non-null   object
 8   entity_label_lan    1589 non-null   object
 9   property_label      1589 non-null   object
 10  property_label_lan  1589 non-null   object
 11  entity_alias        1589 non-null   object
 12  entity_alias_lan    1589 non-null   object
 13  property_alias      1589 non-null   object
 14  property_alias_lan  1589 non-null   object
 15  entity_desc         1589 non-null   object
 16  entity_desc_lan     1589

In [54]:
# Drop claims whose object label language is the 'no_lan' marker, renumber the
# rows, then report how many distinct references remain.
has_object_label = claim_df['object_label_lan'].ne('no_lan')
claim_df = claim_df[has_object_label].reset_index(drop=True)
print(len(claim_df.reference_id.unique()))

676


In [55]:
# Print how often each property occurs, most frequent first, tagging every
# property as "<property_id>_<property_label>".
property_tags = claim_df.apply(
    lambda claim: claim['property_id']+'_'+claim['property_label'], axis=1
)
property_counts = property_tags.value_counts().reset_index()
for row in property_counts.iterrows():
    print(row)

(0, index    P31_instance of
0                    158
Name: 0, dtype: object)
(1, index    P569_date of birth
0                       155
Name: 1, dtype: object)
(2, index    P570_date of death
0                       118
Name: 2, dtype: object)
(3, index    P39_position held
0                       65
Name: 3, dtype: object)
(4, index    P1476_title
0                 44
Name: 4, dtype: object)
(5, index    P571_inception
0                    43
Name: 5, dtype: object)
(6, index    P186_made from material
0                             43
Name: 6, dtype: object)
(7, index    P217_inventory number
0                           42
Name: 7, dtype: object)
(8, index    P195_collection
0                     40
Name: 8, dtype: object)
(9, index    P21_sex or gender
0                       37
Name: 9, dtype: object)
(10, index    P136_genre
0                37
Name: 10, dtype: object)
(11, index    P276_location
0                   34
Name: 11, dtype: object)
(12, index    P17_country
0         

In [56]:
# Persist the enriched claims to CSV in the working directory; index=None
# leaves the row index out of the file.
claim_df.to_csv('text_reference_claims_df.csv', index=None)

In [2]:
# Reload the claims from the CSV file.
# NOTE(review): the execution counter restarts here, so this cell was
# presumably run in a fresh kernel that needs `import pandas as pd` first.
claim_df = pd.read_csv('text_reference_claims_df.csv')

In [3]:
# Display the reloaded claims frame.
claim_df

Unnamed: 0,reference_id,entity_id,claim_id,rank,property_id,datatype,datavalue,entity_label,entity_label_lan,property_label,property_label_lan,object_label,object_label_lan
0,390d6c6e68a32e11f8d7b0883cda0557db529fe6,Q5512528,Q5512528$81E8AD02-28AF-4AE3-8ACD-047C30B40B01,normal,P20,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'...",G. V. Raja,en,place of death,en,Kullu Valley,en
1,49b9aec8e10815611ff0379a34d5fd7c3830566e,Q12149940,Q12149940$C9FE5F0C-78FD-4ECE-B1C2-16A763B8ED4E,normal,P140,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'...",Sebastian Sabol,en,religion,en,Greek catholic church,en
2,ab3e9ada7246257ffbfb86fa90a54f25e45a704e,Q583556,Q583556$929DCD2F-3B4A-41B5-89E7-EEC549077834,normal,P740,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'...",Average White Band,en,location of formation,en,Dundee,en
3,ab3e9ada7246257ffbfb86fa90a54f25e45a704e,Q583556,Q583556$2AD75C92-953E-47CB-93A6-D082674F4231,normal,P2031,time,"{'value': {'time': '+1972-00-00T00:00:00Z', 't...",Average White Band,en,work period (start),en,1972,en
4,76b04346ad57869d9e5ae1007ba8343d708ab6f9,Q7586053,Q7586053$AD344012-A522-4A8A-AF64-16F859E2EA9A,normal,P2031,time,"{'value': {'time': '+1983-00-00T00:00:00Z', 't...",Yutaka Higuchi,en,work period (start),en,1983,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1567,8bb93e520f7ca8208f24078434b8f5f2ba8b520b,Q58484448,Q58484448$DB944078-CA87-4617-B942-5B290F2144ED,normal,P31,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'...",Gul Sanobar,en,instance of,en,film,en
1568,8bb93e520f7ca8208f24078434b8f5f2ba8b520b,Q58484448,Q58484448$9611AF89-1BD2-4D49-9CBB-E3EC52C7765D,normal,P364,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'...",Gul Sanobar,en,original language of film or TV show,en,Hindi,en
1569,8bb93e520f7ca8208f24078434b8f5f2ba8b520b,Q58484448,Q58484448$E03C8ED8-0593-4D47-B002-6371900A238E,normal,P462,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'...",Gul Sanobar,en,color,en,black and white,en
1570,6e6204011de4f4176362a6f4067486a94fc6ac3e,Q58435606,Q58435606$474B340F-3E0A-451C-8844-C02092719DD0,normal,P31,wikibase-item,"{'value': {'entity-type': 'item', 'numeric-id'...",Jai Bhawani,en,instance of,en,film,en
