# Getting Researcher Focused Demographics from the Dimensions API


By using the API and processing the `author_affiliations` field, it is possible to derive quite a number of researcher demographics.

In [None]:
import pandas as pd
from dimcli.shortcuts import dslquery_json as dslquery

## 1 Start with a publication selection

In [None]:
def searchPubs(limit=1000, skip=0):
    """Build the DSL query that selects the seed publications.

    The query filters on ``year in [2017:2018]``, journal ``jour.1048844``
    and type ``article``, and returns each publication's id and
    author_affiliations.

    limit -- value substituted into the ``limit`` clause
    skip  -- value substituted into the ``skip`` clause
    """
    template = """search publications 
          where year in [2017:2018]
          and journal.id = "jour.1048844"
          and type="article"
          return publications[id+author_affiliations]
          limit {} skip {}
    """
    return template.format(limit, skip)

In [None]:
def dslsearchpublications(page_size=1000):
    """Run the searchPubs query page by page and return every publication.

    page_size -- number of records requested per call (default 1000, the
                 value the loop previously hard-coded).

    Returns a list holding the 'publications' entries of all pages.
    """
    total_pubs = []
    skip = 0
    while True:
        # Pass the page size explicitly so the request and the stop test
        # below can never disagree.
        page = dslquery(searchPubs(limit=page_size, skip=skip)).get('publications', [])
        total_pubs += page
        # A short (or empty) page means the result set is exhausted.
        if len(page) < page_size:
            break
        skip += page_size

    return total_pubs

In [None]:
# Fetch the seed publications (all pages) that the rest of the notebook uses.
pubs = dslsearchpublications()

## 2 Extract researcher ids from the publications

In [None]:
# Collect the researcher_id of every author that has one. Only the first
# element of author_affiliations is read; a missing, None or empty
# author_affiliations value is treated as "no authors" instead of raising.
researchers = [ auth.get('researcher_id')
                for p in pubs
                for auth in (p.get('author_affiliations') or [[]])[0]
                   if auth.get('researcher_id') is not None ]



In [None]:
# Number of distinct researcher ids (the list above has one entry per authorship).
len(set(researchers))

## 3 Get the publication histories for each of the researchers retrieved

Have a cup of tea whilst it completes...

In [None]:
def publicationsfromresearchers(researcherids,limit=1000,skip=0):
    """Build (and echo) the DSL query for publications of the given researchers.

    researcherids -- iterable of researcher ids; each id is wrapped in double
                     quotes and the ids are comma-joined into the
                     ``researchers.id in [...]`` filter
    limit, skip   -- values placed in the ``limit`` / ``skip`` clauses

    The query text is printed before it is returned.
    """
    quoted_ids = ",".join('"{}"'.format(rid) for rid in researcherids)
    template = """
    search publications
       where
        researchers.id in [{}]
    return publications[id+year+author_affiliations] limit {} skip {}
    """
    query = template.format(quoted_ids, limit, skip)
    print(query)
    return query

In [None]:
def dslsearchpublicationsR(ids, page_size=1000):
    """Return every publication authored by any of the researchers in *ids*.

    ids       -- researcher ids forwarded to publicationsfromresearchers
                 (which also prints each query it builds)
    page_size -- number of records requested per call (default 1000, the
                 value the loop previously hard-coded)

    Returns a list holding the 'publications' entries of all pages.
    """
    total_pubs = []
    skip = 0
    while True:
        # Pass the page size explicitly so the request and the stop test
        # below can never disagree.
        query = publicationsfromresearchers(ids, limit=page_size, skip=skip)
        page = dslquery(query).get('publications', [])
        total_pubs += page
        # A short (or empty) page means the result set is exhausted.
        if len(page) < page_size:
            break
        skip += page_size

    return total_pubs

In [None]:
def publicationsfromresearcherlist(researcherids):
    """Fetch the publications of all given researchers, 500 ids per query.

    researcherids -- sliceable sequence of researcher ids

    After each batch the running total of collected publications is printed.
    Returns the concatenated results of dslsearchpublicationsR over all
    batches (an empty list when no ids are given).
    """
    collected = []
    for start in range(0, len(researcherids), 500):
        batch = researcherids[start:start + 500]
        collected.extend(dslsearchpublicationsR(batch))
        print(len(collected))
    return collected

In [None]:
# De-duplicate the researcher ids before querying so each id is sent only once.
respubs = publicationsfromresearcherlist(list(set(researchers)))

## 4 Extract researcher activity from the output

In [None]:
# Build the lookup set once: rebuilding list(set(researchers)) for every
# author made each membership test a fresh linear scan.
known_researcher_ids = set(researchers)

# One record per (publication, author) for the researchers identified earlier.
# Only the first element of author_affiliations is read; a missing, None or
# empty author_affiliations value yields no records instead of raising.
full_researchers = [ dict( researcher_id = auth.get('researcher_id'),
                      first_name = auth.get('first_name'),
                      last_name = auth.get('last_name'),
                      year = p.get('year')
                    )
                for p in respubs
                for auth in (p.get('author_affiliations') or [[]])[0]
                   if auth.get('researcher_id') in known_researcher_ids ]

In [None]:
full_researchers[40]

In [None]:
# Tabulate the author records and drop rows that are exact duplicates across
# all columns (researcher_id, first_name, last_name, year).
fridf = pd.DataFrame(full_researchers).drop_duplicates()
# Last expression of the cell: preview the first rows.
fridf.head()

## 5 Derive demographic details for the authors

* Based on first name, guess the gender of the author
* Based on the number of distinct years in which they have published, calculate a publication age (years without publications are not counted)

In [None]:
import numpy as np
import gender_guesser.detector as gender_guesser

In [None]:
# Shared name-to-gender detector used by getgender; created with
# case_sensitive=False.
d = gender_guesser.Detector(case_sensitive=False)

In [None]:
# define an aggregate function for the gender guesser

In [None]:
def getgender(series, detector=None):
    """Aggregate the first names in *series* into a single gender label.

    The first whitespace-separated token of each name is passed to
    ``detector.get_gender``; 'unknown' answers are discarded. If exactly one
    distinct label remains it is returned, otherwise 'unknown' is returned.

    series   -- pandas Series of first names; entries that are not strings
                (None, NaN) or that are blank are skipped
    detector -- object exposing ``get_gender(name)``; defaults to the
                notebook-level detector ``d``
    """
    if detector is None:
        detector = d

    labels = set()
    for name in series.tolist():
        # Missing names arrive as None/NaN, which have no .split().
        if not isinstance(name, str):
            continue
        tokens = name.split()
        if tokens:
            labels.add(detector.get_gender(tokens[0]))

    labels.discard('unknown')
    if len(labels) == 1:
        return labels.pop()
    return 'unknown'

In [None]:
# Per researcher: the 'max' of the first names seen, the guessed gender, and
# the earliest year, latest year and number of distinct years published.
frdf = (
    fridf.groupby('researcher_id')
    .agg({
        'first_name': ['max', getgender],
        'year': ['min', 'max', pd.Series.nunique],
    })
    .reset_index()
)

In [None]:
# Give the aggregated columns plain names. The order follows the aggregation
# above: researcher_id, first_name max, gender guess, year min / max / nunique.
frdf.columns = ['researcher_id', 'first_name', 'gender', 'min_year', 'max_year', 'publication_age']
# Index by researcher so later cells can join on researcher_id.
frdf = frdf.set_index('researcher_id')

In [None]:
frdf.head()

## 6 Produce a gender profile for the researchers identified

In [None]:
# Count the researchers under each guessed gender label.
frdf[['gender','first_name']].groupby(['gender']).count()

## 7 Create a Histogram of researchers by publication age

In [None]:
frdf[['publication_age']].hist(bins=5)

## ..By publication age and gender

In [None]:
frdf.loc[frdf['gender'] == 'male'][['publication_age']].hist(bins=5)

In [None]:
frdf.loc[frdf['gender'] == 'female'][['publication_age']].hist(bins=5)

In [None]:
# 'andy' is not a typo: it is the label gender_guesser returns for
# androgynous names.
frdf.loc[frdf['gender'] == 'andy'][['publication_age']].hist(bins=5)

## 8 Combine Researcher Demographic details with Publication properties

* Get the number of authors per paper

In [None]:
# One row per (seed publication, author with a researcher_id).
# pub_id is the publication's position in pubs; author_position is the
# author's index within the first author_affiliations entry, counted before
# authors without a researcher_id are filtered out.
# A missing, None or empty author_affiliations value yields no rows instead
# of raising.
publications = [ dict(
                      pub_id = i,
                      author_position = j,
                      researcher_id = auth.get('researcher_id')
                    )
                for i,p in enumerate(pubs)
                for j, auth in enumerate((p.get('author_affiliations') or [[]])[0])
                    if auth.get('researcher_id') is not None
                   ]

In [None]:
pub_df = pd.DataFrame(publications)

In [None]:
pub_df.head()

## Authors per paper

In [None]:
# Count rows per publication. pub_df only holds authors that have a
# researcher_id, so both counts equal the number of identified authors on the
# paper, not necessarily every listed author.
pub_df_grouped = pub_df.groupby('pub_id').count()
# NOTE(review): 'count authors' (with a space) is the name the histogram cell
# below selects; rename both places together if this is ever normalised.
pub_df_grouped.columns = ['count authors', 'count_researchers']
pub_df_grouped.head()

In [None]:
pub_df_grouped[['count authors']].hist(bins=11)

## Senior publication age per paper

In [None]:
# Index by researcher_id so the frame can be joined with frdf, which uses the
# same index.
pub_df = pub_df.set_index('researcher_id')

In [None]:
pub_df.head()

In [None]:
# Attach each author's demographics to their authorship rows. pub_df and frdf
# are both indexed by researcher_id, so join() aligns them on that index.
pubsjoined = pub_df.join(frdf)
pubsjoined.head()

In [None]:
# Histogram of the highest publication age among each paper's authors.
pubsjoined[['pub_id','publication_age']].groupby('pub_id').max().hist()

In [None]:
# Per paper: the largest publication_age among its authors (indexed by pub_id).
pmaxage = pubsjoined[['pub_id','publication_age']].groupby('pub_id').max()

In [None]:
# Put the per-paper maximum next to every author row. Both inputs carry a
# publication_age column; the merged frame exposes them as publication_age_x
# (the author's own) and publication_age_y (the paper's maximum), which the
# filter in the next section compares.
pmerged = pd.merge(pubsjoined, pmaxage, on='pub_id', how='left')
pmerged.head(5)

# Create a histogram of author position of the senior author (by publication age)

In [None]:
# Keep the rows whose author has the paper's maximum publication age, i.e. the
# "senior" author(s). Authors tied for the maximum are all kept, so a paper
# can contribute more than one row.
pmfiltered = pmerged.loc[pmerged['publication_age_x'] == pmerged['publication_age_y']]

In [None]:
# Where in the author list (0 = first author) the senior author appears.
pmfiltered[['author_position']].hist(bins=100)

# Get the gender of the senior author (by publication age) on the paper

In [None]:
# Number of senior-author rows per guessed gender label.
pmfiltered[['gender','pub_id']].groupby('gender').count()

---
# Want to learn more?

Check out the [Dimensions API Lab](https://digital-science.github.io/dimensions-api-lab/) website, which contains many tutorials and reusable Jupyter notebooks for scholarly data analytics. 