<a href="https://colab.research.google.com/github/digital-science/dimensions-api-lab" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open Dimensions API Lab In Google Colab"/></a>

# Part 1: Extracting a Journal's Publications+Reseachers Dataset

## Install Dimensions Library and login

In [None]:
try:
  from google.colab import files
  %load_ext google.colab.data_table
  COLAB_ENV = True
  !pip install dimcli plotly_express  -U
  !mkdir data # to save temp data 
except:
  COLAB_ENV = False


# common libraries
import pandas as pd
from pandas.io.json import json_normalize
import time
from tqdm import tqdm_notebook as tqdm
import plotly_express as px
from getpass import getpass
# FINALLY..
import dimcli
from dimcli.shortcuts import *

# set up for exports
if not COLAB_ENV:
  from plotly.offline import init_notebook_mode # needed for exports 
  init_notebook_mode(connected=True)

##
# LOG IN 
##

USERNAME = "m.pasin@digital-science.com"  #@param {type: "string"}

if not USERNAME:
  print("====\nERROR: Please enter a valid Dimensions API username")
else:
  password = getpass('====\nEnter password here')
  print('=> username is', USERNAME)
  print('=> password is', "*" * len(password))
  dimcli.login(USERNAME, password)
  dsl = dimcli.Dsl()


## Select a Journal and Extract All Publications Metadata

In [2]:
#@title Select a journal from the dropdown
#@markdown If the journal isn't there, you can try type in the exact name instead.

journal_title = "Nature Genetics" #@param ['Nature', 'The Science of Nature', 'Nature Communications', 'Nature Biotechnology', 'Nature Medicine', 'Nature Genetics', 'Nature Neuroscience', 'Nature Structural & Molecular Biology', 'Nature Methods', 'Nature Cell Biology', 'Nature Immunology', 'Nature Reviews Drug Discovery', 'Nature Materials', 'Nature Physics', 'Nature Reviews Neuroscience', 'Nature Nanotechnology', 'Nature Reviews Genetics', 'Nature Reviews Urology', 'Nature Reviews Molecular Cell Biology', 'Nature Precedings', 'Nature Reviews Cancer', 'Nature Photonics', 'Nature Reviews Immunology', 'Nature Reviews Cardiology', 'Nature Reviews Gastroenterology & Hepatology', 'Nature Reviews Clinical Oncology', 'Nature Reviews Endocrinology', 'Nature Reviews Neurology', 'Nature Chemical Biology', 'Nature Reviews Microbiology', 'Nature Geoscience', 'Nature Reviews Rheumatology', 'Nature Climate Change', 'Nature Reviews Nephrology', 'Nature Chemistry', 'Nature Digest', 'Nature Protocols', 'Nature Middle East', 'Nature India', 'Nature China', 'Nature Plants', 'Nature Microbiology', 'Nature Ecology & Evolution', 'Nature Astronomy', 'Nature Energy', 'Nature Human Behaviour', 'AfCS-Nature Molecule Pages', 'Human Nature', 'Nature Reviews Disease Primers', 'Nature Biomedical Engineering', 'Nature Reports Stem Cells', 'Nature Reviews Materials', 'Nature Sustainability', 'Nature Catalysis', 'Nature Electronics', 'Nature Reviews Chemistry', 'Nature Metabolism', 'Nature Reviews Physics', 'Nature Machine Intelligence', 'NCI Nature Pathway Interaction Database', 'Nature Reports: Climate Change'] {allow-input: true}
start_year = 2015  #@param {type: "number"}
#@markdown ---

# PS 
# To get titles from the API one can do this:
# > %dsldf search publications where journal.title~"Nature" and publisher="Springer Nature" return journal limit 100
# > ", ".join([f"'{x}'" for x in list(dsl_last_results.title)]) 
#

pubs = dslqueryall(f"""search publications where 
    journal.title="{journal_title}" and 
    year>={start_year} 
    return publications[basics+altmetric+times_cited]""")

# save the data as well
dfpubs = pubs.as_dataframe()
dfpubs.to_csv("data/1.pubs_metadata_with_metrics.csv")

1000 / 1371
1371 / 1371


In [3]:
# preview the publications 
pubs.as_dataframe()

Output hidden; open in https://colab.research.google.com to view.

In [9]:
# preview the authors data 
authors = pubs.as_dataframe_authors()
authors.to_csv("data/1.publications_authors.csv", index=False)
authors

Unnamed: 0,affiliations,corresponding,current_organization_id,first_name,is_bogus,last_name,orcid,researcher_id,pub_id
0,"[{'id': 'grid.493090.7', 'name': 'Université B...",True,,Pierre,,Vabres,,,pub.1121383028
1,"[{'id': 'grid.493090.7', 'name': 'Université B...",,,Arthur,,Sorlin,,,pub.1121383028
2,"[{'id': 'grid.5386.8', 'name': 'Cornell Univer...",,,Stanislav S.,,Kholmanskikh,,,pub.1121383028
3,"[{'id': 'grid.134996.0', 'name': 'Centre Hospi...",,,Bénédicte,,Demeer,,,pub.1121383028
4,"[{'id': 'grid.5613.1', 'name': 'University of ...",,,Judith,,St-Onge,,,pub.1121383028
5,"[{'id': 'grid.5613.1', 'name': 'University of ...",,,Yannis,,Duffourd,,,pub.1121383028
6,"[{'id': 'grid.5613.1', 'name': 'University of ...",,,Paul,,Kuentz,,,pub.1121383028
7,"[{'id': 'grid.493090.7', 'name': 'Université B...",,,Jean-Benoît,,Courcet,,,pub.1121383028
8,"[{'id': 'grid.493090.7', 'name': 'Université B...",,,Virginie,,Carmignac,,,pub.1121383028
9,"[{'id': 'grid.5613.1', 'name': 'University of ...",,,Philippine,,Garret,,,pub.1121383028


In [12]:
# preview the affiliations data 
affiliations = pubs.as_dataframe_authors_affiliations()
affiliations.to_csv("data/1.publications_authors_affiliations.csv", index=False)
affiliations

Unnamed: 0,aff_city,aff_city_id,aff_country,aff_country_code,aff_id,aff_name,aff_state,aff_state_code,pub_id,researcher_id,first_name,last_name
0,Besançon,3033123.0,France,FR,grid.493090.7,Université Bourgogne Franche-Comté,,,pub.1121383028,,Pierre,Vabres
1,Dijon,3021372.0,France,FR,grid.5613.1,University of Burgundy,,,pub.1121383028,,Pierre,Vabres
2,Besançon,3033123.0,France,FR,grid.493090.7,Université Bourgogne Franche-Comté,,,pub.1121383028,,Arthur,Sorlin
3,Dijon,3021372.0,France,FR,grid.5613.1,University of Burgundy,,,pub.1121383028,,Arthur,Sorlin
4,Ithaca,5122432.0,United States,US,grid.5386.8,Cornell University,New York,US-NY,pub.1121383028,,Stanislav S.,Kholmanskikh
5,Amiens,3037854.0,France,FR,grid.134996.0,Centre Hospitalier Universitaire D' Amiens,,,pub.1121383028,,Bénédicte,Demeer
6,Dijon,3021372.0,France,FR,grid.5613.1,University of Burgundy,,,pub.1121383028,,Judith,St-Onge
7,Besançon,3033123.0,France,FR,grid.493090.7,Université Bourgogne Franche-Comté,,,pub.1121383028,,Judith,St-Onge
8,Montreal,6077243.0,Canada,CA,grid.63984.30,McGill University Health Centre,Quebec,CA-QC,pub.1121383028,,Judith,St-Onge
9,Dijon,3021372.0,France,FR,grid.5613.1,University of Burgundy,,,pub.1121383028,,Yannis,Duffourd


## Some stats about authors

* count how many authors in total 
* count how many authors have a researcher ID
* count how many unique researchers IDs we have in total

In [11]:
researchers = authors.query("researcher_id!=''")
#
df = pd.DataFrame({
    'measure' : ['Authors in total (non unique)', 'Authors with a researcher ID', 'Authors with a researcher ID (unique)'],
    'count' : [len(authors), len(researchers), researchers['researcher_id'].nunique()],
})
px.bar(df, x="measure", y="count", title=f"Author stats for {journal_title} (from {start_year})")

In [None]:
# save the researchers data to a file
researchers.to_csv("data/1.authors_with_researchers_id.csv")

## Apprendix: A quick look at authors *without a Researcher ID*

We're not going to try to disambiguate them here, but still it's good to have a quick look at them... 

Looks like the most common surname is `Wang`, while the most common first name is an empty value

In [23]:
authors_without_id = authors.query("researcher_id==''")
authors_without_id[['first_name', 'last_name']].describe()


Wang     38
Li       24
Zhang    23
Liu      18
Chen     18
Xu       15
Smith    10
Huang    10
Kim      10
Zhou      9
Name: last_name, dtype: int64

Top Ten surnames seem all Chinese.. 

In [24]:
authors_without_id['last_name'].value_counts()[:10]

Wang     38
Li       24
Zhang    23
Liu      18
Chen     18
Xu       15
Smith    10
Huang    10
Kim      10
Zhou      9
Name: last_name, dtype: int64

### Any common patterns? 

If we try to group the data by name+surname we can see some interesting patterns 

* some entries are things which are not persons (presumably the results of bad source data in Dimensions, eg from the publisher) 
* there are some apparently meaningful name+surname combinations with a lot of hits
* not many Chinese names in the top ones 



In [26]:
test = authors_without_id.groupby(["first_name", "last_name"]).size()
test.sort_values(ascending=False, inplace=True)
test.head(50)

first_name    last_name                
              the 23andMe Research Team    4
Qing          Xu                           3
John H        Reynolds                     3
Hao           Wang                         3
David         Fitzpatrick                  3
Runpeng       Liu                          3
Robin         Tremblay                     3
Harriet       de Wit                       3
James         MacKillop                    3
Bin           Zhang                        3
Jianhua       Chu                          3
Yan           Jiang                        3
Lihua         Guo                          3
Michael A     Long                         3
Michael C     Avery                        3
Michael A.    Wheeler                      3
Mohammad S    Rashid                       3
Guoping       Feng                         2
Luke          Healy                        2
Andrew        McKenzie                     2
Joachim       Hallmayer                    2
David         S

## Conclusion and next steps

For the next tasks, we will **focus on the disambiguated authors** as the ID-links will let us carry out useful analyses.

We can save these results though and try to do some manual disambiguation later. Also, adding a simple google-search URL can help in making sense of these data quickly.

In [27]:
from urllib.parse import quote

out = []
for index, value in test.items():
    # compose a simple URL of the form 'https://www.google.com/search?q=tonu+esko'
    if index[0] or index[1]:
        n, s = quote(index[0]), quote(index[1])
        url = f"https://www.google.com/search?q={n}+{s}"
    else:
        url = ""
    d = {'name': index[0] , 'surname' : index[1] , 'frequency' : value , 'search_url' : url }
    out.append(d)

dftest = pd.DataFrame.from_dict(out)
# set order of columns
dftest = dftest[['name', 'surname', 'frequency', 'search_url']]

dftest.head(20)

Unnamed: 0,name,surname,frequency,search_url
0,,the 23andMe Research Team,4,https://www.google.com/search?q=+the%2023andMe...
1,Qing,Xu,3,https://www.google.com/search?q=Qing+Xu
2,John H,Reynolds,3,https://www.google.com/search?q=John%20H+Reynolds
3,Hao,Wang,3,https://www.google.com/search?q=Hao+Wang
4,David,Fitzpatrick,3,https://www.google.com/search?q=David+Fitzpatrick
5,Runpeng,Liu,3,https://www.google.com/search?q=Runpeng+Liu
6,Robin,Tremblay,3,https://www.google.com/search?q=Robin+Tremblay
7,Harriet,de Wit,3,https://www.google.com/search?q=Harriet+de%20Wit
8,James,MacKillop,3,https://www.google.com/search?q=James+MacKillop
9,Bin,Zhang,3,https://www.google.com/search?q=Bin+Zhang


In [None]:
# save the data
#
dftest.to_csv("data/1.authors_not_disambiguated_frequency.csv", header=True)

In [None]:
if COLAB_ENV:
  files.download("data/1.authors_not_disambiguated_frequency.csv")
  files.download("data/1.authors_with_researchers_id.csv")
  files.download("data/1.publications_authors.csv")
  files.download("data/1.publications_authors_affiliations.csv")
  files.download("data/1.pubs_metadata_with_metrics.csv")

That's it! 

Now let's go and open this in [Google Sheets](https://docs.google.com/spreadsheets/)...