# Reading and analysing the ORCID public profiles
This notebook describes the process of extracting and analyising data from the 2017 public data release. The analysis uses the activities extract of the profiles in JSON format (https://doi.org/10.6084/m9.figshare.5479792.v1).

The method is based on the one used by Bohannon (2017, https://doi.org.10.1126/science.aal1189) for which dataset and scripts can be found here: http://dx.doi.org/10.5061/dryad.48s16.

Here, the complete ORCID profiles are used to add information about researcher urls to the dataset for analysis of current affiliations.


In [None]:
# python tarfile module is too memory expensive for reading the uncompressed archive. 
# Use command line to extract the archive onto an external hard drive

#tar -xzvf public_profiles_API2.0_2017_10_json.tar.gz -C ~/destination

In [3]:
import json, os, sys
import pandas as pd

## Setup
Load a couple of profiles to adapt the functions to the new ORCID message schema and the use of whole profiles:

In [6]:
#load my own ORCID profile to check contents
json.load(open("/media/eva/Eva-passport/ORCIDpubData2017/public_profiles_API-2.0_2017_10_json/9/0000-0003-4965-2969.json"))


{'activities-summary': {'educations': {'education-summary': [{'created-date': {'value': 1499540565501},
     'department-name': 'Computer Science',
     'end-date': None,
     'last-modified-date': {'value': 1499540565501},
     'organization': {'address': {'city': 'St Andrews',
       'country': 'GB',
       'region': None},
      'disambiguated-organization': None,
      'name': 'University of St Andrews'},
     'path': '/0000-0003-4965-2969/education/4229471',
     'put-code': 4229471,
     'role-title': 'Management and Information Technology',
     'source': {'source-client-id': None,
      'source-name': {'value': 'Eva Borger'},
      'source-orcid': {'host': 'orcid.org',
       'path': '0000-0003-4965-2969',
       'uri': 'http://orcid.org/0000-0003-4965-2969'}},
     'start-date': {'day': {'value': '12'},
      'month': {'value': '09'},
      'year': {'value': '2016'}},
     'visibility': 'public'},
    {'created-date': {'value': 1499540208691},
     'department-name': 'Medicine

In [None]:
#load an empty ORCID profile to check contents
json.load(open("/media/eva/Eva-passport/ORCIDpubData2017/public_profiles_API-2.0_2017_10_json/x/0000-0003-2914-115X.json"))


## The functions needed to load the profiles

In [52]:
#the original file generator enumerated each file. Needed a workaround as we are iterating through subfolders. 
#running just the for-loop results in the same strucutre.
def file_generator(json_dir):
    ''' Using a generator allows pausing and restarting
    without having to figure out where you left off. '''
    n = 0
    for root, directories, files in os.walk(json_dir):
            item = None
            for filename in files:
                m = n
                item = m, os.path.join(root, filename)
                n += 1
                yield (item)
        
def get_profiles(affiliation_data, url_data, json_files, stop = None):
    ''' Iterate over JSON files and process them '''
    for n, filepath in json_files:
        # keep track of progress
        sys.stdout.flush()
        sys.stdout.write('\r{}'.format(filepath))
        # terminate if stop is specified and reached
        if stop is not None and n >= stop:
            return
        # process this JSON file and harvest the data
        if filepath.endswith(".json"):
            with open(filepath) as f:
                profile = json.load(f)
                for row in get_affiliations(profile):
                    affiliation_data.append(row)
                for row in get_urls(profile):
                    url_data.append(row)

def has_education(profile):
    ''' This tests whether the profile has any educatino affiliations '''
    try:
        if profile["activities-summary"]["educations"]["education-summary"] != None:
            return True
    except:
        return False

def has_employment(profile):
    ''' This tests whether the profile has any employment affiliations '''
    try:
        if profile["activities-summary"]["employments"]["employment-summary"] != None:
            return True
    except:
        return False
    
def has_url(profile):
    ''' This tests whether the profile has any urls '''
    try:
        if profile["person"]["researcher-urls"]["researcher-url"] != None:
            return True
    except:
        return False

def get_affiliations(profile):
    ''' For each profile, extract all affiliations and metadata '''
    profile_data = []
    orcid_id = None
    if has_education(profile):
        orcid_id = profile["orcid-identifier"]["path"]
        for edu in profile["activities-summary"]["educations"]["education-summary"]:
            row = [orcid_id]
            row.append(edu["organization"]["address"]["country"])
            try:
                row.append(edu["organization"]["name"])
            except:
                row.append(None)
            try:
                row.append(edu["organization"]["disambiguated-organization"]["disambiguated-organization-identifier"])
            except:
                row.append(None)
            try:
                row.append(edu["start-date"]["year"]["value"])
            except:
                row.append(None)
            try:
                row.append(edu["end-date"]["year"]["value"])
            except:
                row.append(None)
            try:
                row.append(edu["role-title"])
            except:
                row.append(None)
            row.append(edu["path"][21:30]) #Add education keyword
            try:
                row.append(edu["soure"]["source-name"])
            except:
                row.append(None)
            profile_data.append(row)
    if has_education(profile):
        if orcid_id is None:
            orcid_id = profile["orcid-identifier"]["path"]
        for empl in profile["activities-summary"]["employments"]["employment-summary"]:
            row = [orcid_id]
            row.append(empl["organization"]["address"]["country"])
            try:
                row.append(empl["organization"]["name"])
            except:
                row.append(None)
            try:
                row.append(empl["organization"]["disambiguated-organization"]["disambiguated-organization-identifier"])
            except:
                row.append(None)
            try:
                row.append(empl["start-date"]["year"]["value"])
            except:
                row.append(None)
            try:
                row.append(empl["end-date"]["year"]["value"])
            except:
                row.append(None)
            try:
                row.append(empl["role-title"])
            except:
                row.append(None)
            row.append(empl["path"][21:31]) #Add employement keyword
            try:
                row.append(empl["source"]["source-name"]["value"])
            except:
                row.append(None)
            profile_data.append(row)
    return profile_data

def get_urls(profile):
    ''' For each profile, extract all affiliations and metadata '''
    profile_data = []
    orcid_id = None
    if has_url(profile):
        orcid_id = profile["orcid-identifier"]["path"]
        for url in profile["person"]["researcher-urls"]["researcher-url"]:
            row = [orcid_id]
            row.append(url["url-name"])
            try:
                row.append(url["url"]["value"])
            except:
                row.append(None)
            try:
                row.append(url["source"]["source-name"]["value"])
            except:
                row.append(None)
            profile_data.append(row)
    return profile_data

### Testing

In [53]:
json_dir = "/media/eva/Eva-passport/ORCIDpubData2017/public_profiles_API-2.0_2017_10_json/0"
json_files = file_generator(json_dir)

In [54]:
affiliation_data = []
url_data = []

In [55]:
%%time
get_profiles(affiliation_data, url_data, json_files, stop=25)

/media/eva/Eva-passport/ORCIDpubData2017/public_profiles_API-2.0_2017_10_json/0/0000-0002-1319-0750.jsonCPU times: user 907 ms, sys: 202 ms, total: 1.11 s
Wall time: 1.66 s


In [57]:
aff = pd.DataFrame(affiliation_data, columns = ["orcid_id", "country", "organization_name", 
                              "organization_id", "start_year", "end_year", "affiliation_role" ,"affiliation_type","source"])

url = pd.DataFrame(url_data, columns =["orcid_id", "url_name", "url", "source"])

In [58]:
print ("affiliations:", len(aff), "; unique profiles:", aff.orcid_id.nunique())
print ("urls:", len(url), "; unique profiles:", url.orcid_id.nunique())

affiliations: 5 ; unique profiles: 2
urls: 8 ; unique profiles: 3


In [59]:
aff.head()

Unnamed: 0,orcid_id,country,organization_name,organization_id,start_year,end_year,affiliation_role,affiliation_type,source
0,0000-0001-5000-1640,KR,Sogang University Graduate School of Internati...,92200.0,2005.0,2016.0,Ph.D,education,
1,0000-0001-5000-1640,KR,Citizens' Alliance for North Korean Human Rights,,2004.0,,Deputy Director General,employment,Joanna Hosaniak
2,0000-0001-5000-4390,IN,University of Delhi,28742.0,,1986.0,PhD,education,
3,0000-0001-5000-4390,IN,University of Delhi,28742.0,,1981.0,,education,
4,0000-0001-5000-4390,GB,King's College London,,,,Reader,employment,Sanjukta Deb


In [60]:
url.head()

Unnamed: 0,orcid_id,url_name,url,source
0,0000-0001-5000-2520,UCL IRIS Profile,http://iris.ucl.ac.uk/iris/browse/profile?upi=...,UCL ORCID Registration
1,0000-0001-5001-2390,hoc tieng anh moi ngay,http://www.hoctienganhmoingay.biz/2015/05/hoc-...,Ngo Nguyet
2,0000-0001-5001-4070,Web Address,http://www.cosmeticdentistrycenter.com/,Dmitriy Epelboym
3,0000-0001-5001-4070,Google+,https://plus.google.com/+Cosmeticdentistrycenter,Dmitriy Epelboym
4,0000-0001-5001-4070,Facebook,https://www.facebook.com/nycosmeticdentistryce...,Dmitriy Epelboym


### Reading in all data
After successful testing of the setup, the code can now be run with all data files

In [None]:
json_dir = "/media/eva/Eva-passport/ORCIDpubData2017/public_profiles_API-2.0_2017_10_json"
json_files = file_generator(json_dir)

In [None]:
#data = [] #commenting this out, so we don't accidentally reset the data frame!

In [None]:
%%time
get_profiles(data, json_files)

In [None]:
df = pd.DataFrame(data, columns = ["orcid_id", "country", "organization_name", 
                              "Ringgold_id", "start_year", "end_year", "affiliation_role"])
df.head()

In [None]:
len(df), df.orcid_id.nunique()

In [None]:
affiliation_without_dates = df[((df["start_year"].isnull()) & (df["end_year"].isnull()))]
len(affiliation_without_dates), affiliation_without_dates.orcid_id.nunique()

In [None]:
UStA_all = df[(df.organization_name == "University of St Andrews")]
len(UStA_all), UStA_all.orcid_id.nunique()

In [None]:
df.dtypes

In [None]:
df[df.orcid_id =="0000-0003-4965-2969"]