# Dissecting PubMed
## Which content is covered by the Library? and Open Access?
#### Floriane Muller & Pablo Iriarte, Geneva University Library, Switzerland


## Merge PubMed metadata with DOAJ

The DOAJ metadata was downloaded on 04.05.2018 from https://doaj.org/faq#metadata

CSV file: doaj_20180504_0830_utf8.csv


In [3]:
# Display the full content of every cell (no column-width truncation).
import pandas as pd

try:
    # pandas >= 1.0: None means "no limit". The old -1 sentinel was deprecated
    # in pandas 1.0 and is rejected by later releases.
    pd.set_option('display.max_colwidth', None)
except ValueError:
    # Older pandas releases only accept an integer here and use -1 for "no limit".
    pd.set_option('display.max_colwidth', -1)

# Load only the DOAJ columns used downstream: the journal title, both ISSNs
# and the first year the journal provided Open Access content.
doaj = pd.read_csv(
    'data/sources/doaj/doaj_20180504_0830_utf8.csv',
    sep=',',
    header=0,
    usecols=[
        'Journal title',
        'Journal ISSN (print version)',
        'Journal EISSN (online version)',
        'First calendar year journal provided online Open Access content',
    ],
)
doaj

Unnamed: 0,Journal title,Journal ISSN (print version),Journal EISSN (online version),First calendar year journal provided online Open Access content
0,Revista de Microbiologia,0001-3714,1678-9881,1998
1,Anais da Academia Brasileira de Ciências,0001-3765,1678-2690,2000
2,ACME,0001-494X,2282-0035,2014
3,Acta Dermato-Venereologica,0001-5555,1651-2057,1998
4,Acta Mycologica,0001-625X,2353-074X,2006
5,Acta Societatis Botanicorum Poloniae,0001-6977,2083-9480,2011
6,Acta Stomatologica Croatica,0001-7019,1846-0410,1966
7,Acta Veterinaria Brno,0001-7213,1801-7576,1978
8,Africa Spectrum,0002-0397,1868-6869,2009
9,Revista Alergia México,0002-5151,2448-9190,2014


In [4]:
# Replace the verbose DOAJ headers with short names in a single rename pass
doaj = doaj.rename(columns={
    'Journal ISSN (print version)': 'pissn',
    'Journal EISSN (online version)': 'eissn',
    'First calendar year journal provided online Open Access content': 'syear',
})
doaj

Unnamed: 0,Journal title,pissn,eissn,syear
0,Revista de Microbiologia,0001-3714,1678-9881,1998
1,Anais da Academia Brasileira de Ciências,0001-3765,1678-2690,2000
2,ACME,0001-494X,2282-0035,2014
3,Acta Dermato-Venereologica,0001-5555,1651-2057,1998
4,Acta Mycologica,0001-625X,2353-074X,2006
5,Acta Societatis Botanicorum Poloniae,0001-6977,2083-9480,2011
6,Acta Stomatologica Croatica,0001-7019,1846-0410,1966
7,Acta Veterinaria Brno,0001-7213,1801-7576,1978
8,Africa Spectrum,0002-0397,1868-6869,2009
9,Revista Alergia México,0002-5151,2448-9190,2014


In [5]:
# Sanity check: list the journals that have no start year
doaj[doaj['syear'].isna()]

Unnamed: 0,Journal title,pissn,eissn,syear


In [14]:
# Data type pandas assigned to the start-year column
doaj.dtypes['syear']

dtype('int64')

In [15]:
# Working ISSN: the print ISSN, falling back to the e-ISSN where the print one is missing
doaj['issn'] = doaj['pissn'].fillna(doaj['eissn'])
doaj

Unnamed: 0,Journal title,pissn,eissn,syear,issn
0,Revista de Microbiologia,0001-3714,1678-9881,1998,0001-3714
1,Anais da Academia Brasileira de Ciências,0001-3765,1678-2690,2000,0001-3765
2,ACME,0001-494X,2282-0035,2014,0001-494X
3,Acta Dermato-Venereologica,0001-5555,1651-2057,1998,0001-5555
4,Acta Mycologica,0001-625X,2353-074X,2006,0001-625X
5,Acta Societatis Botanicorum Poloniae,0001-6977,2083-9480,2011,0001-6977
6,Acta Stomatologica Croatica,0001-7019,1846-0410,1966,0001-7019
7,Acta Veterinaria Brno,0001-7213,1801-7576,1978,0001-7213
8,Africa Spectrum,0002-0397,1868-6869,2009,0002-0397
9,Revista Alergia México,0002-5151,2448-9190,2014,0002-5151


In [16]:
# Journals that still have no ISSN after the fallback
doaj[doaj['issn'].isna()]

Unnamed: 0,Journal title,pissn,eissn,syear,issn


In [17]:
# Load the ISSN -> ISSN-L (linking ISSN) lookup table used to enrich the DOAJ data.
# The table was obtained from www.issn.org (free to download, but only after filling in a request on their web site).
issns = pd.read_csv(
    'data/sources/issnl/issn2issnl.csv',
    sep='\t',
    names=['issn', 'issnl'],
    header=None,
)
issns

Unnamed: 0,issn,issnl
0,0000-0019,0000-0019
1,0000-0027,0000-0027
2,0000-0043,0000-0043
3,0000-0051,0000-0051
4,0000-006X,0000-006X
5,0000-0078,0000-0078
6,0000-0094,0000-0094
7,0000-0108,0000-0108
8,0000-0140,0000-0140
9,0000-0159,0000-0159


In [18]:
# Attach the ISSN-L to every DOAJ journal (left join on 'issn'),
# then list the journals for which no ISSN-L was found.
doaj = doaj.merge(issns, how='left', on='issn')
doaj[doaj['issnl'].isna()]

Unnamed: 0,Journal title,pissn,eissn,syear,issn,issnl
2007,Gaziantep University Journal of Social Sciences,1303-0094,2149-5459,2007,1303-0094,
2017,Sigma Journal of Engineering and Natural Sciences,1304-7191,1304-7205,2004,1304-7191,
2026,Anadolu BİL Meslek Yüksekokulu Dergisi,1306-3375,2148-9998,2014,1306-3375,
2032,Duzce Universitesi Tip Fakültesi Dergisi,,1307-671X,2006,1307-671X,
2039,Turkish Studies,1308-2140,,2006,1308-2140,
2062,International Journal of Business and Management Studies,,1309-8047,2009,1309-8047,
2063,International Journal of Economics and Finance Studies,,1309-8055,2009,1309-8055,
2117,International Journal of Educational Technology,1327-7308,2476-0730,2017,1327-7308,
3331,Saudi Journal of Anaesthesia,1658-354X,0975-3125,2009,1658-354X,
3333,International Journal of Health Sciences,1658-3639,1658-7774,2007,1658-3639,


In [19]:
# Save the journals lacking an ISSN-L to a tab-separated file
doaj[doaj['issnl'].isna()].to_csv(
    'data/sources/doaj/doaj_not_issnl.csv',
    sep='\t',
    encoding='utf-8',
    index=False,
)

In [20]:
# Fall back to the plain ISSN wherever the ISSN-L is empty,
# then confirm that no ISSN-L is left empty.
doaj['issnl'] = doaj['issnl'].fillna(doaj['issn'])
doaj[doaj['issnl'].isna()]

Unnamed: 0,Journal title,pissn,eissn,syear,issn,issnl


In [21]:
# Number of journals per Open Access start year
doaj.loc[:, 'syear'].value_counts()

2013    1069
2014    1016
2012    955 
2015    883 
2011    855 
2010    772 
2016    733 
2009    665 
2008    589 
2007    515 
2006    390 
2017    369 
2005    350 
2004    262 
2001    241 
2002    231 
2003    227 
2000    215 
1999    135 
1998    121 
1997    109 
1996    83  
1995    49  
1994    44  
1991    37  
1990    34  
1993    33  
1992    29  
1983    25  
1989    22  
        ..  
1947    3   
1965    2   
1961    2   
1953    2   
1950    2   
1963    2   
1946    2   
1954    2   
1962    2   
1949    2   
1960    2   
1952    1   
1936    1   
1928    1   
1920    1   
1943    1   
1942    1   
1929    1   
1937    1   
1945    1   
1934    1   
1918    1   
1874    1   
1930    1   
1938    1   
1909    1   
1948    1   
1932    1   
1939    1   
1904    1   
Name: syear, dtype: int64

In [22]:
# Write the prepared DOAJ table to a tab-separated file for the PubMed merge
doaj.to_csv(
    'data/sources/doaj/doaj_ready_to_pubmed_merge.csv',
    sep='\t',
    encoding='utf-8',
    index=False,
)

## Merge PubMed and DOAJ journals information

In [23]:
# Restart the kernel here; pandas is imported again for the fresh session
import pandas as pd

# Load the PubMed records with explicit column dtypes
pubmed = pd.read_csv(
    'data/sources/pubmed/pmid_issn_issnl_year_clean.csv.gz',
    sep='\t',
    header=0,
    dtype={'pmid': 'int', 'issn': 'object', 'issnl': 'object', 'year': 'int'},
)
pubmed

Unnamed: 0,issn,issnl,pmid,year
0,1090-2104,0006-291x,2,1975
1,0006-2944,0006-2944,1,1975
2,0006-291X,0006-291x,3,1975
3,1090-2104,0006-291x,4,1975
4,0006-291X,0006-291x,6,1975
5,1873-2968,0006-2952,7,1975
6,1873-2968,0006-2952,8,1975
7,1090-2104,0006-291x,5,1975
8,0006-2952,0006-2952,9,1975
9,0006-2952,0006-2952,11,1975


In [24]:
# Reload the prepared DOAJ table, keeping only the ISSN-L and the start year
doaj = pd.read_csv(
    'data/sources/doaj/doaj_ready_to_pubmed_merge.csv',
    sep='\t',
    header=0,
    usecols=['issnl', 'syear'],
    dtype={'issnl': 'object', 'syear': 'int'},
)
doaj

Unnamed: 0,syear,issnl
0,1998,0001-3714
1,2000,0001-3765
2,2014,0001-494X
3,1998,0001-5555
4,2006,0001-625X
5,2011,0001-6977
6,1966,0001-7019
7,1978,0001-7213
8,2009,0002-0397
9,2014,0002-5151


In [25]:
# Normalize the ISSN-L: trim surrounding blanks, then convert to lower case
doaj['issnl'] = doaj['issnl'].str.strip().str.lower()

In [26]:
# Prefix the start-year column with its source (DOAJ)
doaj = doaj.rename({'syear': 'doaj_syear'}, axis='columns')

In [27]:
# Left-join the DOAJ start year onto the PubMed records via the ISSN-L
pubmed = pd.merge(pubmed, doaj, how='left', on='issnl')
pubmed

Unnamed: 0,issn,issnl,pmid,year,doaj_syear
0,1090-2104,0006-291x,2,1975,
1,0006-2944,0006-2944,1,1975,
2,0006-291X,0006-291x,3,1975,
3,1090-2104,0006-291x,4,1975,
4,0006-291X,0006-291x,6,1975,
5,1873-2968,0006-2952,7,1975,
6,1873-2968,0006-2952,8,1975,
7,1090-2104,0006-291x,5,1975,
8,0006-2952,0006-2952,9,1975,
9,0006-2952,0006-2952,11,1975,


In [28]:
# DOAJ flag: 1 when the publication year is at or after the journal's DOAJ start year, else 0.
# Rows with a missing doaj_syear compare as False and therefore get 0.
pubmed['DOAJ'] = (pubmed['year'] >= pubmed['doaj_syear']).astype('int64')
pubmed

Unnamed: 0,issn,issnl,pmid,year,doaj_syear,DOAJ
0,1090-2104,0006-291x,2,1975,,0
1,0006-2944,0006-2944,1,1975,,0
2,0006-291X,0006-291x,3,1975,,0
3,1090-2104,0006-291x,4,1975,,0
4,0006-291X,0006-291x,6,1975,,0
5,1873-2968,0006-2952,7,1975,,0
6,1873-2968,0006-2952,8,1975,,0
7,1090-2104,0006-291x,5,1975,,0
8,0006-2952,0006-2952,9,1975,,0
9,0006-2952,0006-2952,11,1975,,0


In [29]:
# Order the rows by DOAJ (descending) then pmid (ascending), so that rows
# flagged 1 come before rows flagged 0 ahead of the duplicate removal
pubmed = pubmed.sort_values(['DOAJ', 'pmid'], ascending=[False, True])
pubmed

Unnamed: 0,issn,issnl,pmid,year,doaj_syear,DOAJ
6707,0044-6025,0044-6025,7923,1975,1956.0,1
12141,0091-6765,0091-6765,13988,1976,1972.0,1
14578,0091-6765,0091-6765,16744,1976,1972.0,1
17744,0091-6765,0091-6765,20303,1976,1972.0,1
22643,0091-6765,0091-6765,25762,1977,1972.0,1
22645,0091-6765,0091-6765,25763,1977,1972.0,1
24802,0091-6765,0091-6765,28220,1978,1972.0,1
27566,0091-6765,0091-6765,31276,1978,1972.0,1
27570,0091-6765,0091-6765,31277,1978,1972.0,1
27571,0091-6765,0091-6765,31278,1978,1972.0,1


In [30]:
# Keep one row per pmid: the first one in the current row order
pubmed = pubmed.drop_duplicates(subset=['pmid'], keep='first')

In [31]:
# display the table after the duplicate pmid rows have been dropped
pubmed

Unnamed: 0,issn,issnl,pmid,year,doaj_syear,DOAJ
6707,0044-6025,0044-6025,7923,1975,1956.0,1
12141,0091-6765,0091-6765,13988,1976,1972.0,1
14578,0091-6765,0091-6765,16744,1976,1972.0,1
17744,0091-6765,0091-6765,20303,1976,1972.0,1
22643,0091-6765,0091-6765,25762,1977,1972.0,1
22645,0091-6765,0091-6765,25763,1977,1972.0,1
24802,0091-6765,0091-6765,28220,1978,1972.0,1
27566,0091-6765,0091-6765,31276,1978,1972.0,1
27570,0091-6765,0091-6765,31277,1978,1972.0,1
27571,0091-6765,0091-6765,31278,1978,1972.0,1


In [32]:
# Shape of the subset flagged DOAJ == 1 (its row count is the number of PMIDs covered by DOAJ journals)
pubmed[pubmed['DOAJ'].eq(1)].shape

(1479551, 6)

In [33]:
# Export the full result table, gzip-compressed and tab-separated
pubmed.to_csv(
    'data/results/pubmed_doaj.csv.gz',
    sep='\t',
    encoding='utf-8',
    index=False,
    compression='gzip',
)
# Export a short version with only the pmid and the DOAJ flag
pubmed.loc[:, ['pmid', 'DOAJ']].to_csv(
    'data/results/pubmed_doaj_short.csv.gz',
    sep='\t',
    encoding='utf-8',
    index=False,
    compression='gzip',
)

In [34]:
# Export only the rows flagged DOAJ == 1, with all columns
pubmed[pubmed['DOAJ'] == 1].to_csv(
    'data/results/pubmed_doaj_positives.csv.gz',
    sep='\t',
    encoding='utf-8',
    index=False,
    compression='gzip',
)
# Same rows, restricted to the pmid and the DOAJ flag
pubmed.loc[pubmed['DOAJ'] == 1, ['pmid', 'DOAJ']].to_csv(
    'data/results/pubmed_doaj_positives_short.csv.gz',
    sep='\t',
    encoding='utf-8',
    index=False,
    compression='gzip',
)