In [58]:
import pywikibot
import requests
import sys
import os 
from pathlib import Path
import json
import re 
import time
from datetime import date
from os.path import exists
from pywikibot.data import api


# https://stackoverflow.com/a/66303932
# "__file__" is deliberately a string literal: realpath() resolves it against
# the current working directory, so the project root is the parent of the
# directory the notebook is run from.
project_path = Path(os.path.realpath("__file__")).parent.parent
# Make both the project root and its scripts/ folder importable.
sys.path.extend([str(project_path), str(project_path / 'scripts')])

import scripts.utils.wiki_serialization as ws
import scripts.utils.wikidata_utils as wd
import scripts.utils.wiki_queries as wq


In [59]:
# Connect to the "wikidata" site and obtain its data repository; both objects
# are reused by later cells.
site = pywikibot.Site("wikidata", "wikidata")
repo = site.data_repository()

# Base URLs for wikidata.org and query.wikidata.org.
WIKI_BASE_URL = "https://www.wikidata.org"
WIKI_QUERY_URL = "https://query.wikidata.org"


# Q154287 record with lots of multimedia in test.wikidata 

# Sample item ids mapped to a human-readable name.
records = {
    'Q5': 'human',
    'Q30': 'United States',
    'Q487604': 'Martha Graham',
    'Q16973731': 'Dianne McIntyre',
    'Q753828': 'Essex',
    'Q76': 'Barack Obama',
    'Q50602076': 'Karl Hirlmeier' # no alias
}

def print_json(data):
    """Pretty-print ``data`` as JSON with a two-space indent, keeping non-ASCII characters as-is."""
    rendered = json.dumps(data, ensure_ascii=False, indent=2)
    print(rendered)

## get item

In [60]:
# Item that the following cells inspect (listed in `records` as 'Karl Hirlmeier').
new_item = pywikibot.ItemPage(repo, title="Q50602076")
# Shown as the cell output.
new_item.exists()

True

In [61]:
# JSON form of the item; later cells pass it to the wd helpers together with new_item.
new_item_json = new_item.toJSON()

# print_json(new_item_json)

## format item; this is what will be returned by the api

In [5]:

# Wall-clock timing of a single format_display_item call; prints the elapsed seconds.
start = time.time()
item_data = wd.format_display_item(new_item, site)
stop = time.time()
print(stop-start)



0.9287600517272949


In [6]:
# print_json(item_data)

## test how long it takes to get labels for all properties and items in an item

In [7]:
# Collect the ids referenced by the item three ways: with both include flags on,
# with only include_qids, and with only include_pids.
all_ids = wd.get_ids_for_item(new_item, new_item_json, include_pids=True, include_qids=True)
q_ids = wd.get_ids_for_item(new_item, new_item_json, include_pids=False, include_qids=True)
p_ids = wd.get_ids_for_item(new_item, new_item_json, include_pids=True, include_qids=False)


Connect to the Wikimedia API

In [8]:
# Time the label lookup for each id collection and print "<tag> <seconds> <count>".
label_lookups = {}
for tag, id_batch in (('all', all_ids), ('qids', q_ids), ('pids', p_ids)):
    start = time.time()
    label_lookups[tag] = wq.fetch_and_format_labels_for_ids(id_batch, lang="en")
    stop = time.time()
    print(tag, stop - start, len(id_batch))

# Keep the per-collection results available under their own names.
all_ids_dict = label_lookups['all']
q_ids_dict = label_lookups['qids']
p_ids_dict = label_lookups['pids']

all 1.5953569412231445 70
qids 0.8146529197692871 28
pids 0.7379720211029053 42


Use a SPARQL query

In [9]:
# Time the *_sqarql label lookup for each id collection and print
# "<tag> <seconds> <count>", tagging every line with the collection it measured.
# `results` is rebound on each pass, so after the cell it holds the lookup for
# p_ids (the last collection).
for tag, id_batch in (('all_ids', all_ids), ('q_ids', q_ids), ('p_ids', p_ids)):
    start = time.time()
    results = wq.fetch_and_format_labels_for_ids_sqarql(id_batch)
    stop = time.time()
    print(tag, stop - start, len(id_batch))

all_ids 0.4774019718170166 70
q_ids 0.2958390712738037 28
q_ids 0.25041985511779785 42


In [10]:
# Wall-clock timing of create_id_label_dictionary; prints "ids <seconds> <count>".
start = time.time()

ids = wd.create_id_label_dictionary(new_item, new_item_json)

stop = time.time()
print('ids', stop-start, len(ids))

ids 0.46909403800964355 70


## how to call dir methods

In [11]:
# getattr with a string name returns the method itself, without calling it.
getattr('ab', 'upper')

<function str.upper>

In [12]:
# callable() tells whether the looked-up member can be invoked.
callable(getattr('ab', 'upper'))

True

In [13]:
# Walk dir() of a string and invoke the one member named 'upper' by name.
foo = 'ab'
for attr in dir(foo):
    if attr != 'upper':
        continue
    bar = getattr(foo, attr)()

bar
        

'AB'

## call every method and attribute in dir() for the claims in an item

In [14]:
def test_attribute(attr, claim):
    if not attr.startswith('_'):
        print('-------')

        try:
            if callable(getattr(claim.target, attr)):
                print(attr, 'method')
                try:
                    print(getattr(claim.target, attr)())
                except:
                    pass
            else:
                print(attr, 'attr')
                try:
                    print(getattr(claim.target, attr))
                except:
                    pass
        except:
            pass



In [15]:
# NOTE(review): this rebinds `site` and `repo` (first bound to the "wikidata"
# site) to the "test" site. Other cells that pass `site` to wq helpers will use
# whichever binding ran last — confirm that is intended.
site = pywikibot.Site("test", "wikidata")
repo = site.data_repository()

# Q154287 record with multimedia test.wikidata 
test_item = pywikibot.ItemPage(repo, title="Q154287")

# `count` caps the work at one claim: both loops break as soon as it is above zero.
count = 0
for prop, claims in test_item.claims.items():
    if count > 0:
        break
    for claim in claims:
        if count > 0:
            break
        count += 1
        
        print(claim.target)
        # print(dir(claim.target))

        # Currently a no-op; the per-attribute inspection call is commented out.
        for attr in dir(claim.target):
            pass
            # test_attribute(attr, claim)

# Shown as the cell output: the number of claims processed.
count

[[commons:File:Hum Hum Waterfall.jpg]]


1

## inspect each claim with a given property

In [16]:
# Placeholder: visits every claim filed under property P214 without doing anything with it yet.
for prop, claims in new_item.claims.items():
    if prop == 'P214':
        for claim in claims:
            continue
        


## check if the fetch commons media function works on all file types

In [17]:
# File titles grouped by kind of media, covering a range of file extensions.
files = [
    # image
    'File:Оробко Василь 1916.tif',
    'File:Deepika Prasain.png',
    'File:Human-body-gif.gif',
    'File:Gingerbread_House_Essex_CT.jpg', 
    'File:Yousef Jadallah.webp',
    'File:AddictionDependence-de.xcf',
    # drawing
    'File:Pattern example.svg',
    # audio
    'File:Connectiong people - morse code.mid',
    'File:Universal Declaration of Human Rights - sot.flac',
    'File:Human fart.wav',
    'File:Perros jugando.mp3',
    'File:Columba_palumbus_birdsong.ogg',
    # video
    'File:Saving Oliver from the Dog Meat Trade - -YulinIsEverywhere.webm',
    'File:TRAPS-Brao people.mpg',
    'File:A domestic dog snoring.ogv',
    # office
    'File:The complete dog book (IA completedogbook00brue).pdf',
    'File:The Dog in Health and Disease.Djvu',
    # 3d
    'File:Human hip bone.stl'
]

# NOTE(review): `site` is bound twice in this notebook ("wikidata" and "test");
# which one is passed here depends on cell execution order — confirm.
results = wq.fetch_commons_media_metadata(site, files)

# Compare the number of results returned with the number of files requested.
print(len(results), len(files))

18 18


In [18]:
# results[0]

In [19]:

# For every result, print a separator and then the media type, MIME type and
# key count of its first imageinfo entry.
for res in results:
    print('----')
    info = res['imageinfo'][0]
    print(info['mediatype'], info['mime'], len(info.keys()))

----
BITMAP image/tiff 14
----
BITMAP image/png 13
----
BITMAP image/gif 14
----
BITMAP image/jpeg 13
----
BITMAP image/webp 13
----
BITMAP image/x-xcf 13
----
DRAWING image/svg+xml 13
----
AUDIO audio/midi 12
----
AUDIO audio/x-flac 13
----
AUDIO audio/wav 13
----
AUDIO audio/mpeg 13
----
AUDIO application/ogg 13
----
VIDEO video/webm 14
----
VIDEO video/mpeg 14
----
VIDEO application/ogg 14
----
OFFICE application/pdf 14
----
OFFICE image/vnd.djvu 14
----
3D application/sla 13


In [20]:
# Run the raw metadata results through the project's formatter; the next cell
# iterates over data.values().
data = wq.format_commons_media_metadata_results(results)

# print(json.dumps(data, indent=2, ensure_ascii=False))


In [21]:

# One line per formatted entry: media type, MIME type and number of keys kept.
for res in data.values():
    print( res['mediatype'],  res['mime'], len(res.keys()))
    


BITMAP image/tiff 12
BITMAP image/png 12
BITMAP image/gif 12
BITMAP image/jpeg 12
BITMAP image/webp 12
BITMAP image/x-xcf 12
DRAWING image/svg+xml 12
AUDIO audio/midi 6
AUDIO audio/x-flac 7
AUDIO audio/wav 7
AUDIO audio/mpeg 7
AUDIO application/ogg 7
VIDEO video/webm 13
VIDEO video/mpeg 13
VIDEO application/ogg 13
OFFICE application/pdf 13
OFFICE image/vnd.djvu 13
3D application/sla 12


In [39]:
# Response of the languages lookup; a later cell reads
# foo['query']['wbcontentlanguages'] from it.
# NOTE(review): `foo` was used earlier for a demo string — a descriptive name would be clearer.
foo = wq.fetch_wikidata_languages(site)
# results = foo['wbcontentlanguages']
# results


In [40]:
# NOTE(review): rebinds `results`, which other cells use for the media metadata
# and label lookups; from here on it is the 'wbcontentlanguages' entry.
results = foo['query']['wbcontentlanguages']

In [62]:
# Every language code under which the item has a label, a description or an alias.
item_lang_codes = set()
for terms in (new_item.labels, new_item.descriptions, new_item.aliases):
    # Only the language-code keys matter here; the values are not needed.
    item_lang_codes.update(terms.keys())

len(item_lang_codes)

4

In [67]:
def format_get_item_languages(results, item_lang_codes):
    """Map language code to language name for the codes an item uses.

    Args:
        results: Mapping whose values are dicts with 'code' and 'name' keys.
        item_lang_codes: Collection of language codes to keep.

    Returns:
        A dict of ``{code: name}`` for every entry of ``results`` whose code is
        in ``item_lang_codes``, in the iteration order of ``results``.
    """
    return {
        entry['code']: entry['name']
        for entry in results.values()
        if entry['code'] in item_lang_codes
    }
    


{'ast': 'Asturian', 'de': 'German', 'en': 'English', 'nl': 'Dutch'}