## Build main dataset of descriptives from CORE.ac.uk API

Generate the JSON objects for the descriptive data of each repository:

In [1]:
import requests
import json
from pprint import pprint
import pandas as pd

pd.options.display.max_columns = None

API_ENDPOINT = "https://api.core.ac.uk/v3/"

'''Functions to retrieve data from CORE API v3. Authorised using the ./apikey from core.ac.uk.
Based on examples provided by CORE at https://github.com/oacore/apiv3-webinar/
'''

def get_API_Key() -> str:
    '''Retrieve the API key from project root folder.'''
    with open("./apikey", "r") as apikey_file:
        api_key = apikey_file.readlines()[0].strip()
    return api_key

def get_core_providers_details(country_code, api_key) -> list:
    """ Gets all descriptive details for all Core.ac.uk UK-based data providers"""
    results = base_query_api("search/data-providers", "location.countryCode:" + country_code, api_key)
    list_of_dicts = []
    for provider in results['results']:
        list_of_dicts.append(provider)

    return list_of_dicts

def strip_http(df_in: pd.DataFrame) -> pd.DataFrame:
    df_in['URL'] = df_in['URL'].str.replace('http://', '')

    return df_in


def strip_https(df_in: pd.DataFrame) -> pd.DataFrame:
    df_in['URL'] = df_in['URL'].str.replace('https://', '')

    return df_in

def base_query_api(url_fragment: str, query: str, api_key: str, limit=300):
    ''''''
    headers = {"Authorization": "Bearer " + api_key}
    query = {"q": query, "limit": limit}
    response = requests.post(f"{API_ENDPOINT}{url_fragment}", data=json.dumps(query), headers=headers)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error code {response.status_code}, {response.content}")
        
api_key = get_API_Key()
data = get_core_providers_details('GB', get_API_Key())

[Optional] Print the data:

In [2]:
display(data)

[{'id': 2,
  'openDoarId': 1589,
  'name': 'Abertay Research Portal',
  'email': 'repository@abertay.ac.uk',
  'uri': None,
  'oaiPmhUrl': 'https://rke.abertay.ac.uk/ws/oai',
  'homepageUrl': 'http://repository.abertay.ac.uk',
  'source': None,
  'software': 'pure',
  'metadataFormat': 'oai_dc',
  'createdDate': '2011-05-09T22:12:10+01:00',
  'location': {'countryCode': 'GB', 'latitude': 56.4599, 'longitude': -2.9888},
  'logo': 'https://api.core.ac.uk/data-providers/2/logo',
  'type': 'REPOSITORY',
  'stats': None},
 {'id': 6,
  'openDoarId': 1850,
  'name': 'Anglia Ruskin Research Online',
  'email': 'arro@anglia.ac.uk',
  'uri': None,
  'oaiPmhUrl': 'https://arro.anglia.ac.uk/cgi/oai2',
  'homepageUrl': 'https://arro.anglia.ac.uk',
  'source': None,
  'software': 'eprints',
  'metadataFormat': 'rioxx',
  'createdDate': '2011-05-09T22:15:53+01:00',
  'location': {'countryCode': 'GB', 'latitude': 51.7413, 'longitude': 0.4743},
  'logo': 'https://api.core.ac.uk/data-providers/6/logo',


-Create a dataframe from this data  
-Rename oaiPmhUrl to URL  
-Remove http/s from urls  
-Set index column to be URL  


In [23]:
df_all_provider_details = pd.DataFrame.from_dict(get_core_providers_details('gb', get_API_Key()))
df_all_provider_details.rename(columns= {'oaiPmhUrl':'URL'},inplace=True)
df_all_provider_details.rename(columns= {'software':'ris_software'},inplace=True)
strip_http(df_all_provider_details)
strip_https(df_all_provider_details)
df_all_provider_details.set_index(keys='URL', inplace=True)

[Optional] Print the dataframe

In [24]:
pd.options.display.max_columns = None
pd.options.display.max_rows = 300
display(df_all_provider_details)

Unnamed: 0_level_0,id,openDoarId,name,email,uri,homepageUrl,source,ris_software,metadataFormat,createdDate,location,logo,type,stats
URL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
rke.abertay.ac.uk/ws/oai,2,1589.0,Abertay Research Portal,repository@abertay.ac.uk,,http://repository.abertay.ac.uk,,pure,oai_dc,2011-05-09T22:12:10+01:00,"{'countryCode': 'GB', 'latitude': 56.4599, 'lo...",https://api.core.ac.uk/data-providers/2/logo,REPOSITORY,
arro.anglia.ac.uk/cgi/oai2,6,1850.0,Anglia Ruskin Research Online,arro@anglia.ac.uk,,https://arro.anglia.ac.uk,,eprints,rioxx,2011-05-09T22:15:53+01:00,"{'countryCode': 'GB', 'latitude': 51.7413, 'lo...",https://api.core.ac.uk/data-providers/6/logo,REPOSITORY,
www.repository.cam.ac.uk/oai/request,27,109.0,Apollo,support@repository.cam.ac.uk,,https://www.repository.cam.ac.uk,,dspace,oai_dc,2011-05-13T09:01:27+01:00,"{'countryCode': 'gb', 'latitude': 52.208, 'lon...",https://api.core.ac.uk/data-providers/27/logo,REPOSITORY,
ecrystals.chem.soton.ac.uk/cgi/oai2,36,289.0,eCrystals - Southampton,,,http://ecrystals.chem.soton.ac.uk,,eprints,oai_dc,2011-05-09T22:33:25+01:00,"{'countryCode': 'GB', 'latitude': 50.9342, 'lo...",https://api.core.ac.uk/data-providers/36/logo,REPOSITORY,
gala.gre.ac.uk/cgi/oai2,51,1756.0,Greenwich Academic Literature Archive,gala@gre.ac.uk,,http://gala.gre.ac.uk,,eprints,rioxx,2011-07-07T16:59:10+01:00,"{'countryCode': 'GB', 'latitude': 51.482038, '...",https://api.core.ac.uk/data-providers/51/logo,REPOSITORY,
repository.jisc.ac.uk/cgi/oai2,55,1063.0,Jisc Repository,repository@jisc.ac.uk,,http://repository.jisc.ac.uk,,eprints,oai_dc,2011-07-07T16:59:03+01:00,"{'countryCode': 'GB', 'latitude': 51.3814, 'lo...",https://api.core.ac.uk/data-providers/55/logo,REPOSITORY,
kar.kent.ac.uk/cgi/oai2,57,1328.0,Kent Academic Repository,ResearchSupport@kent.ac.uk,,https://kar.kent.ac.uk,,eprints,rioxx,2011-07-06T08:30:37+01:00,"{'countryCode': 'GB', 'latitude': 51.2923, 'lo...",https://api.core.ac.uk/data-providers/57/logo,REPOSITORY,
eprints.nottingham.ac.uk/cgi/oai2,80,226.0,Nottingham ePrints,eprints@nottingham.ac.uk,,http://eprints.nottingham.ac.uk,,eprints,rioxx,2011-07-07T16:03:40+01:00,"{'countryCode': 'GB', 'latitude': 52.9382, 'lo...",https://api.core.ac.uk/data-providers/80/logo,REPOSITORY,
radar.brookes.ac.uk/radar/oai,93,1680.0,Oxford Brookes University: RADAR,radar@brookes.ac.uk,,https://radar.brookes.ac.uk,,,oai_dc,2011-07-07T16:11:00+01:00,"{'countryCode': 'GB', 'latitude': 51.793, 'lon...",https://api.core.ac.uk/data-providers/93/logo,REPOSITORY,
epubs.stfc.ac.uk/repub/oai,106,57.0,ePubs: the open archive for STFC research publ...,alastair.duncan@stfc.ac.uk,,https://epubs.stfc.ac.uk,,,oai_dc,2011-07-07T16:11:06+01:00,"{'countryCode': 'GB', 'latitude': 51.5668, 'lo...",https://api.core.ac.uk/data-providers/106/logo,REPOSITORY,


## Additional data

**Russell_member** was manually added to the dataset by compiling a list of RG institutions from https://russellgroup.ac.uk/.


russell_members = [
        "University of Birmingham",
        "University of Bristol",
        "University of Cambridge",
        "Cardiff University",
        "Durham University",
        "University of Edinburgh",
        "University of Exeter",
        "University of Glasgow",
        "Imperial College London",
        "King's College London",
        "University of Leeds",
        "University of Liverpool",
        "London School of Economics & Political Science",
        "University of Manchester",
        "Newcastle University",
        "University of Nottingham",
        "University of Oxford",
        "Queen Mary, University of London",
        "Queen's University Belfast",
        "University of Sheffield",
        "University of Southampton",
        "University College London",
        "University of Warwick",
        "University of York",
    ]

**RSE_group** was added manually from data provided at https://github.com/socrse/rse-groups/blob/master/groups.toml

**ris_software_enum** was created 


In [41]:
import numpy as np
df_all_provider_details['ris_software'] = df_all_provider_details['ris_software'].str.lower()

ris_conditions = [
    (df_all_provider_details['ris_software'].str.contains('pure')),
    (df_all_provider_details['ris_software'].str.contains('eprints')),
    (df_all_provider_details['ris_software'].str.contains('dspace')),
    (df_all_provider_details['ris_software'].str.contains('worktribe')),
    (df_all_provider_details['ris_software'].str.contains('figshare')),
    (df_all_provider_details['ris_software'].str.contains('haplo')),
    (df_all_provider_details['ris_software'].str.contains('esploro')),
    df_all_provider_details['ris_software'] == None
    ]

# create a list of the values we want to assign for each condition
values = ['pure', 'eprints', 'dspace', 'worktribe', 'figshare', 'hjaplo', 'esploro', None]

# create a new column and use np.select to assign values to it using our lists as arguments
df_all_provider_details = np.select(ris_conditions, values)

# display updated DataFrame
display(df_all_provider_details)


TypeError: invalid entry 0 in condlist: should be boolean ndarray

Manual_Num_sw_records

Category