In [1]:
import requests
import time

In [27]:
import pandas as pd
import numpy as np

## Get wikidata IDs for publishers

In [3]:
email = "jportenoy@ourresearch.org"

In [4]:
cursor = '*'

select = ",".join((
    'ids',
))

endpoint = "publishers"

# loop through pages
publisher_ids = []
loop_index = 0
while cursor:
    
    # set cursor value and request page from OpenAlex
    url = f'https://api.openalex.org/{endpoint}?select={select}&cursor={cursor}'
    if email:
        url += f'&mailto={email}'
    r = requests.get(url)
    page_with_results = r.json()
    if loop_index == 0:
        print(f"meta count property is {page_with_results['meta']['count']}")
    
    results = page_with_results['results']
    publisher_ids.extend(results)

    # update cursor to meta.next_cursor
    cursor = page_with_results['meta']['next_cursor']
    loop_index += 1
    if loop_index in [5, 10, 20, 50, 100] or loop_index % 500 == 0:
        print(f'{loop_index} api requests made so far')
print(f'done. made {loop_index} api requests. collected {len(publisher_ids)} works')

meta count property is 7910
5 api requests made so far
10 api requests made so far
20 api requests made so far
50 api requests made so far
100 api requests made so far
done. made 318 api requests. collected 7910 works


In [5]:
publisher_ids

[{'ids': {'openalex': 'https://openalex.org/P4310311775',
   'ror': 'https://ror.org/055j8ya05',
   'wikidata': 'https://www.wikidata.org/entity/Q998470'}},
 {'ids': {'openalex': 'https://openalex.org/P4310320990',
   'ror': 'https://ror.org/02scfj030',
   'wikidata': 'https://www.wikidata.org/entity/Q746413'}},
 {'ids': {'openalex': 'https://openalex.org/P4310319965',
   'ror': 'https://ror.org/0117jxy09',
   'wikidata': 'https://www.wikidata.org/entity/Q21096327'}},
 {'ids': {'openalex': 'https://openalex.org/P4310320595',
   'ror': 'https://ror.org/02xe89706',
   'wikidata': 'https://www.wikidata.org/entity/Q1479654'}},
 {'ids': {'openalex': 'https://openalex.org/P4310319900',
   'wikidata': 'https://www.wikidata.org/entity/Q176916'}},
 {'ids': {'openalex': 'https://openalex.org/P4310320503',
   'wikidata': 'https://www.wikidata.org/entity/Q767319'}},
 {'ids': {'openalex': 'https://openalex.org/P4310311647',
   'ror': 'https://ror.org/052gg0110',
   'wikidata': 'https://www.wikidata

In [6]:
data = []
for item in publisher_ids:
    data.append({
        'openalex_id': item['ids']['openalex'],
        'wikidata_id': item['ids'].get('wikidata', None),
    })
df_publishers_wikidata_ids = pd.DataFrame(data)

In [7]:
df_publishers_wikidata_ids['wikidata_id'].isna().value_counts().sort_index()

False    7580
True      330
Name: wikidata_id, dtype: int64

In [8]:
url = 'https://query.wikidata.org/sparql'
wikidata_ids = df_publishers_wikidata_ids['wikidata_id'].dropna().apply(lambda x: x.split('/')[-1])
size = 500
loop_index = 0
wikidata_results = []
for list_index in range(0, len(wikidata_ids), size):
    subset = wikidata_ids.iloc[list_index:list_index+size]
    wikidata_ids_str = " ".join([f"wd:{item}" for item in subset.values])
    query = f"""
    SELECT ?item ?logoImage ?image WHERE {{
      VALUES ?item {{ {wikidata_ids_str} }}
      OPTIONAL {{?item wdt:P154 ?logoImage}}
      OPTIONAL {{?item wdt:P18 ?image}}
    }}
    """
    r = requests.get(url, params = {'format': 'json', 'query': query})
    this_wikidata_results = r.json()
    wikidata_results.extend(this_wikidata_results['results']['bindings'])
    loop_index += 1
    time.sleep(1)
print(f"collected {len(wikidata_results)} records using {loop_index} api calls")

collected 7746 records using 16 api calls


In [61]:
data = []
for item in wikidata_results:
    logo_image = item['logoImage']['value'] if 'logoImage' in item else None
    image = item['image']['value'] if 'image' in item else None
    data.append({
        'wikidata_id': item['item']['value'],
        'logoImageUri': logo_image,
        'imageUri': image,
    })
df_publishers_wikidata_images = pd.DataFrame(data).drop_duplicates()

In [62]:
df_publishers_wikidata_images['wikidata_id'] = df_publishers_wikidata_images['wikidata_id'].str.replace('http://', 'https://')

In [63]:
df = df_publishers_wikidata_ids.merge(df_publishers_wikidata_images, how='left', on='wikidata_id')

In [64]:
df['logoImageUri'].notna().value_counts().sort_index()

False    6885
True     1187
Name: logoImageUri, dtype: int64

In [65]:
df['imageUri'].notna().value_counts().sort_index()

False    6024
True     2048
Name: imageUri, dtype: int64

In [66]:
df[(df['logoImageUri'].isna())&df['imageUri'].notna()]['wikidata_id']

0         https://www.wikidata.org/entity/Q998470
12        https://www.wikidata.org/entity/Q912887
16        https://www.wikidata.org/entity/Q247556
24       https://www.wikidata.org/entity/Q2420769
34       https://www.wikidata.org/entity/Q1508259
                          ...                    
7924     https://www.wikidata.org/entity/Q5542217
7927     https://www.wikidata.org/entity/Q9170880
7928     https://www.wikidata.org/entity/Q9379713
7932    https://www.wikidata.org/entity/Q18391894
7940     https://www.wikidata.org/entity/Q2370088
Name: wikidata_id, Length: 1248, dtype: object

In [67]:
df.iloc[0]['imageUri']

'http://commons.wikimedia.org/wiki/Special:FilePath/Reed%20Elsevier%2C%20Radarweg%2029%20Amsterdam.jpg'

Instructions on https://commons.wikimedia.org/wiki/Commons:Reusing_content_outside_Wikimedia/technical

In [68]:
url_thumbnail_template = "https://commons.wikimedia.org/w/index.php?title=Special:Redirect/file/{filename}&width=300"

In [89]:
data = []
for _, row in df.dropna(subset='wikidata_id').iterrows():
    if not pd.isna(row['logoImageUri']):
        img_uri = row['logoImageUri']
        wikidata_prop = 'logo image'
    elif not pd.isna(row['imageUri']):
        img_uri = row['imageUri']
        wikidata_prop = 'image'
    else:
        img_uri = None
        wikidata_prop = None
    if img_uri:
        img_uri = img_uri.split('/')[-1]
        img_uri = url_thumbnail_template.format(filename=img_uri)
        data.append({
            'openalex_id': row['openalex_id'],
            'wikidata_id': row['wikidata_id'],
            'thumbnail_url': img_uri,
            'wikidata_prop': wikidata_prop,
        })
df_out = pd.DataFrame(data).drop_duplicates()

In [91]:
df_out.to_csv('../output/publishers_image_thumbnail_urls.csv', index=False)