# Dissecting PubMed
## Which content is covered by the Library? and Open Access?
#### Floriane Muller & Pablo Iriarte, Geneva University Library, Switzerland

# Matching PubMed with PMC datafile (Keys: PMC-PMID)

Source and description on metadata available: https://www.ncbi.nlm.nih.gov/pmc/pmctopmid/#ftp

On 02.05.2018 we download the file "PMC-ids.csv.gz" available here: ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/PMC-ids.csv.gz (90 Mb csv file Gzipped)


In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', -1)

pmc = pd.read_csv('data/sources/pmc/PMC-ids.csv.gz', delimiter=',', header=0, compression='gzip')
pmc

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Journal Title,ISSN,eISSN,Year,Volume,Issue,Page,DOI,PMCID,PMID,Manuscript Id,Release Date
0,Breast Cancer Res,1465-5411,1465-542X,2000,3,1,55,,PMC13900,11250746.0,,live
1,Breast Cancer Res,1465-5411,1465-542X,2000,3,1,61,,PMC13901,11250747.0,,live
2,Breast Cancer Res,1465-5411,1465-542X,2000,3,1,66,,PMC13902,11250748.0,,live
3,Breast Cancer Res,1465-5411,1465-542X,1999,2,1,59,10.1186/bcr29,PMC13911,11056684.0,,live
4,Breast Cancer Res,1465-5411,1465-542X,1999,2,1,64,,PMC13912,11400682.0,,live
5,Breast Cancer Res,1465-5411,1465-542X,1999,1,1,73,10.1186/bcr16,PMC13913,11056681.0,,live
6,Breast Cancer Res,1465-5411,1465-542X,1999,1,1,81,10.1186/bcr17,PMC13914,11056682.0,,live
7,Breast Cancer Res,1465-5411,1465-542X,1999,1,1,88,10.1186/bcr18,PMC13915,11056683.0,,live
8,Breast Cancer Res,1465-5411,1465-542X,2000,2,2,139,10.1186/bcr45,PMC13916,11056686.0,,live
9,Breast Cancer Res,1465-5411,1465-542X,2000,2,3,222,10.1186/bcr57,PMC13917,11056687.0,,live


In [2]:
# state te number of lignes and columns in the dataframe
pmc.shape

(4966742, 12)

In [3]:
# count the number of values in each column
pmc.count()

Journal Title    4966529
ISSN             3853474
eISSN            4277149
Year             4966742
Volume           4942351
Issue            4188496
Page             4966491
DOI              3297912
PMCID            4966742
PMID             4408531
Manuscript Id    656489 
Release Date     4966742
dtype: int64

In [4]:
# show which lines have neither PMID, nor DOI
pmc[pmc.PMID.isnull()& pmc.DOI.isnull()]

Unnamed: 0,Journal Title,ISSN,eISSN,Year,Volume,Issue,Page,DOI,PMCID,PMID,Manuscript Id,Release Date
10200,Proc Natl Acad Sci U S A,0027-8424,1091-6490,1996,93,25,14315,,PMC26127,,,live
11534,BMJ,0959-8138,1756-1833,2000,321,7276,1556,,PMC27559,,,live
13838,Bull Med Libr Assoc,0025-7338,,2001,89,1,85,,PMC31712,,,live
13839,Bull Med Libr Assoc,0025-7338,,2001,89,1,86,,PMC31713,,,live
13840,Bull Med Libr Assoc,0025-7338,,2001,89,1,87,,PMC31714,,,live
13841,Bull Med Libr Assoc,0025-7338,,2001,89,1,88,,PMC31715,,,live
13842,Bull Med Libr Assoc,0025-7338,,2001,89,1,89,,PMC31716,,,live
13843,Bull Med Libr Assoc,0025-7338,,2001,89,1,91,,PMC31717,,,live
13844,Bull Med Libr Assoc,0025-7338,,2001,89,1,93,,PMC31718,,,live
13845,Bull Med Libr Assoc,0025-7338,,2001,89,1,95,,PMC31719,,,live


In [5]:
# note to self: some (not all) of those are in PubMed nonetheless, ex. PMC5703221 is in fact PMID 29188191 
# We will need to match by PMID and PMCID

In [6]:
pmc['Release Date'].value_counts()

live          4889620
2018-06-01    5215   
2018-11-01    5009   
2018-09-01    4610   
2018-07-01    4552   
2018-10-01    4382   
2018-12-01    4337   
2018-08-01    4215   
2019-01-01    4209   
2019-02-01    3723   
2019-03-01    3531   
2019-04-01    3070   
2019-05-01    1487   
2019-06-01    342    
2018-08-15    313    
2018-06-15    303    
2018-07-15    292    
2018-09-15    289    
2018-05-15    283    
2018-11-15    251    
2018-12-15    230    
2018-10-15    226    
2019-01-15    222    
2019-02-15    218    
2019-03-15    212    
2018-09-05    200    
2018-07-02    184    
2018-06-07    174    
2018-07-05    161    
2018-06-19    155    
             ...     
2019-08-06    1      
2019-10-14    1      
2019-09-02    1      
2018-02-07    1      
2020-03-26    1      
2019-11-11    1      
2020-03-10    1      
2019-07-15    1      
2019-12-17    1      
2020-03-18    1      
2019-08-26    1      
2019-09-16    1      
2018-03-02    1      
2020-02-15    1      
2020-02-19

### Sort DOIs stuck in Page Column 

In [7]:
# Look at data where DOI column is empty to check whether DOI is available in another colmn
pmc[pmc.DOI.isnull()]

Unnamed: 0,Journal Title,ISSN,eISSN,Year,Volume,Issue,Page,DOI,PMCID,PMID,Manuscript Id,Release Date
0,Breast Cancer Res,1465-5411,1465-542X,2000,3,1,55,,PMC13900,11250746.0,,live
1,Breast Cancer Res,1465-5411,1465-542X,2000,3,1,61,,PMC13901,11250747.0,,live
2,Breast Cancer Res,1465-5411,1465-542X,2000,3,1,66,,PMC13902,11250748.0,,live
4,Breast Cancer Res,1465-5411,1465-542X,1999,2,1,64,,PMC13912,11400682.0,,live
16,Breast Cancer Res,1465-5411,1465-542X,2001,3,2,134,,PMC13924,11250760.0,,live
208,Proc Natl Acad Sci U S A,0027-8424,1091-6490,2000,98,3,1182,,PMC14729,11252306.0,,live
231,Mol Biol Cell,1059-1524,1939-4586,2000,11,1,1,,PMC14752,10637286.0,,live
232,Mol Biol Cell,1059-1524,1939-4586,2000,11,1,13,,PMC14753,10637287.0,,live
233,Mol Biol Cell,1059-1524,1939-4586,2000,11,1,23,,PMC14754,10637288.0,,live
234,Mol Biol Cell,1059-1524,1939-4586,2000,11,1,39,,PMC14755,10637289.0,,live


In [8]:
# Find dois that are hidden in page coloumn
import re
pat_doi = (r'(10\.(\d)+/(\S)+)')

In [9]:
# Test the patterns and see how many DOIs appear in the page column
pmc.loc[(pmc['DOI'].isnull()) & (pmc['Page'].str.contains(pat_doi))].shape

  from ipykernel import kernelapp as app


(272, 12)

In [10]:
# create a new column and store those DOI 
pmc.loc[(pmc['DOI'].isnull()) & (pmc['Page'].str.contains(pat_doi)), 'doi_in_pages'] = pmc['Page']

  from ipykernel import kernelapp as app


In [11]:
# Show lines with dois found hidden in pages
pmc.loc[pmc['doi_in_pages'].notnull()]

Unnamed: 0,Journal Title,ISSN,eISSN,Year,Volume,Issue,Page,DOI,PMCID,PMID,Manuscript Id,Release Date,doi_in_pages
2540391,Biochim Biophys Acta,0006-3002,1878-2434,2012,,,10.1016/j.bbadis.2012.01.002,,PMC3340479,,NIHMS349375,live,10.1016/j.bbadis.2012.01.002
2715222,J Complement Integr Med,,1553-3840,2011,8,,10.2202/1553-3840.1157,,PMC3518418,22754942.0,NIHMS425650,live,10.2202/1553-3840.1157
3003374,Obstet Gynecol,0029-7844,1873-233X,2013,121,1,http://10.1097/AOG.0b013e318278ce86,,PMC3811068,23262934.0,NIHMS491223,live,http://10.1097/AOG.0b013e318278ce86
3056029,Obstet Gynecol,0029-7844,1873-233X,2013,121,1,http://10.1097/AOG.0b013e31827499a9,,PMC3864642,23262937.0,NIHMS533995,live,http://10.1097/AOG.0b013e31827499a9
3066501,Obstet Gynecol,0029-7844,1873-233X,2013,121,1,http://10.1097/AOG.0b013e31827a072c,,PMC3875219,23262925.0,NIHMS499188,live,http://10.1097/AOG.0b013e31827a072c
3071927,Obstet Gynecol,0029-7844,1873-233X,2013,121,2 0 1,http://10.1097/AOG.0b013e31827d8ad5,,PMC3880675,23344286.0,NIHMS539499,live,http://10.1097/AOG.0b013e31827d8ad5
3247965,Mol Imaging,1535-3508,1536-0121,2013,12,8,10.2310/7290.2013.00064,,PMC4060516,24447615.0,NIHMS583842,live,10.2310/7290.2013.00064
3290678,Mol Imaging,1535-3508,1536-0121,2013,12,8,10.2310/7290.2013.00065,,PMC4103900,24447617.0,NIHMS574576,live,10.2310/7290.2013.00065
3384362,Mol Imaging,1535-3508,1536-0121,2014,13,,10.2310/7290.2014.00015,,PMC4199087,24825818.0,NIHMS633251,live,10.2310/7290.2014.00015
3449653,Mol Imaging,1535-3508,1536-0121,2014,13,,DOI 10.2310/7290.2014.00020,,PMC4265553,25060207.0,NIHMS646694,live,DOI 10.2310/7290.2014.00020


In [12]:
pmc['doi_in_pages'] = pmc['doi_in_pages'].str.replace('http://10', '10', case=False)
pmc['doi_in_pages'] = pmc['doi_in_pages'].str.replace('https://10', '10', case=False)
pmc['doi_in_pages'] = pmc['doi_in_pages'].str.replace('http://dx.doi.org/10', '10', case=False)
pmc['doi_in_pages'] = pmc['doi_in_pages'].str.replace('https://dx.doi.org/10', '10', case=False)
pmc['doi_in_pages'] = pmc['doi_in_pages'].str.replace('dx.doi.org/10', '10', case=False)
pmc['doi_in_pages'] = pmc['doi_in_pages'].str.replace('http://doi.org/10', '10', case=False)
pmc['doi_in_pages'] = pmc['doi_in_pages'].str.replace('https://doi.org/10', '10', case=False)
pmc['doi_in_pages'] = pmc['doi_in_pages'].str.replace('doi:', '', case=False)
pmc['doi_in_pages'] = pmc['doi_in_pages'].str.replace('doi: ', '', case=False)
pmc['doi_in_pages'] = pmc['doi_in_pages'].str.replace('DOI ', '', case=False)

In [13]:
pmc.loc[pmc['doi_in_pages'].notnull()].sort_values(by='doi_in_pages')

Unnamed: 0,Journal Title,ISSN,eISSN,Year,Volume,Issue,Page,DOI,PMCID,PMID,Manuscript Id,Release Date,doi_in_pages
3688761,J Aging Health,0898-2643,1552-6887,2012,24,2,doi: 10.1177/0898264312436513,,PMC4512172,,NIHMS589409,live,10.1177/0898264312436513
4884224,J Appl Polym Sci,0021-8995,1097-4628,2015,132,23,10.1002/APP.42054,,PMC5823705,29479115.0,NIHMS943018,live,10.1002/APP.42054
2540391,Biochim Biophys Acta,0006-3002,1878-2434,2012,,,10.1016/j.bbadis.2012.01.002,,PMC3340479,,NIHMS349375,live,10.1016/j.bbadis.2012.01.002
4925711,Psychol Men Masc,1524-9220,1939-151X,2017,2017,,10.1037/men0000132.,,PMC5868744,,NIHMS950329,2018-09-11,10.1037/men0000132.
4584101,Protoc exch,,2043-0116,2016,2016,,10.1038/protex.2016.037,,PMC5501460,28690779.0,NIHMS836609,live,10.1038/protex.2016.037
4287651,Dlib Mag,,1082-9873,2016,22,9-10,10.1045/september2016-mishra,,PMC5142764,27942200.0,NIHMS821086,live,10.1045/september2016-mishra
4455799,J Environ Eng (New York),0733-9372,1943-7870,2016,142,10,10.1061/(ASCE)EE.1943-7870.0001141,,PMC5364726,28348455.0,NIHMS847435,live,10.1061/(ASCE)EE.1943-7870.0001141
4755287,J Eng Mech,0733-9399,1943-7889,2012,138,10,10.1061/(ASCE)EM.1943-7889.0000430,,PMC5685556,29147066.0,NIHMS887851,live,10.1061/(ASCE)EM.1943-7889.0000430
4369466,ASCE ASME J Risk Uncertain Eng Syst A Civ Eng,,2376-7642,2016,2,4,10.1061/AJRUA6.0000867,,PMC5267494,28133626.0,NIHMS809665,live,10.1061/AJRUA6.0000867
4898507,Work Stress,0267-8373,,2018,2018,,10.1080/02678373.2018.1436615,,PMC5839332,,NIHMS944522,live,10.1080/02678373.2018.1436615


In [14]:
# (if required) search DOIs in page for all the dataframe and check whether dois found in pages are the same a DOI in DOI columns.

In [15]:
# (if chosen) decide to copy DOI present in doi_in_pages column to the DOI column, when pmc[pmc.DOI.isnull()& pmc.doi_in_pages.notnull()] 
pmc.loc[(pmc['DOI'].isnull()) & (pmc['doi_in_pages'].notnull()), 'DOI'] = pmc['doi_in_pages']
pmc.loc[pmc['doi_in_pages'].notnull()]

Unnamed: 0,Journal Title,ISSN,eISSN,Year,Volume,Issue,Page,DOI,PMCID,PMID,Manuscript Id,Release Date,doi_in_pages
2540391,Biochim Biophys Acta,0006-3002,1878-2434,2012,,,10.1016/j.bbadis.2012.01.002,10.1016/j.bbadis.2012.01.002,PMC3340479,,NIHMS349375,live,10.1016/j.bbadis.2012.01.002
2715222,J Complement Integr Med,,1553-3840,2011,8,,10.2202/1553-3840.1157,10.2202/1553-3840.1157,PMC3518418,22754942.0,NIHMS425650,live,10.2202/1553-3840.1157
3003374,Obstet Gynecol,0029-7844,1873-233X,2013,121,1,http://10.1097/AOG.0b013e318278ce86,10.1097/AOG.0b013e318278ce86,PMC3811068,23262934.0,NIHMS491223,live,10.1097/AOG.0b013e318278ce86
3056029,Obstet Gynecol,0029-7844,1873-233X,2013,121,1,http://10.1097/AOG.0b013e31827499a9,10.1097/AOG.0b013e31827499a9,PMC3864642,23262937.0,NIHMS533995,live,10.1097/AOG.0b013e31827499a9
3066501,Obstet Gynecol,0029-7844,1873-233X,2013,121,1,http://10.1097/AOG.0b013e31827a072c,10.1097/AOG.0b013e31827a072c,PMC3875219,23262925.0,NIHMS499188,live,10.1097/AOG.0b013e31827a072c
3071927,Obstet Gynecol,0029-7844,1873-233X,2013,121,2 0 1,http://10.1097/AOG.0b013e31827d8ad5,10.1097/AOG.0b013e31827d8ad5,PMC3880675,23344286.0,NIHMS539499,live,10.1097/AOG.0b013e31827d8ad5
3247965,Mol Imaging,1535-3508,1536-0121,2013,12,8,10.2310/7290.2013.00064,10.2310/7290.2013.00064,PMC4060516,24447615.0,NIHMS583842,live,10.2310/7290.2013.00064
3290678,Mol Imaging,1535-3508,1536-0121,2013,12,8,10.2310/7290.2013.00065,10.2310/7290.2013.00065,PMC4103900,24447617.0,NIHMS574576,live,10.2310/7290.2013.00065
3384362,Mol Imaging,1535-3508,1536-0121,2014,13,,10.2310/7290.2014.00015,10.2310/7290.2014.00015,PMC4199087,24825818.0,NIHMS633251,live,10.2310/7290.2014.00015
3449653,Mol Imaging,1535-3508,1536-0121,2014,13,,DOI 10.2310/7290.2014.00020,10.2310/7290.2014.00020,PMC4265553,25060207.0,NIHMS646694,live,10.2310/7290.2014.00020


In [16]:
# Export PMID-PMCID and PMID live lists
pmc['PMC'] = 0
pmc.loc[pmc['Release Date'] == 'live', 'PMC'] = 1
pmc.loc[pmc['PMC'] == 1]

Unnamed: 0,Journal Title,ISSN,eISSN,Year,Volume,Issue,Page,DOI,PMCID,PMID,Manuscript Id,Release Date,doi_in_pages,PMC
0,Breast Cancer Res,1465-5411,1465-542X,2000,3,1,55,,PMC13900,11250746.0,,live,,1
1,Breast Cancer Res,1465-5411,1465-542X,2000,3,1,61,,PMC13901,11250747.0,,live,,1
2,Breast Cancer Res,1465-5411,1465-542X,2000,3,1,66,,PMC13902,11250748.0,,live,,1
3,Breast Cancer Res,1465-5411,1465-542X,1999,2,1,59,10.1186/bcr29,PMC13911,11056684.0,,live,,1
4,Breast Cancer Res,1465-5411,1465-542X,1999,2,1,64,,PMC13912,11400682.0,,live,,1
5,Breast Cancer Res,1465-5411,1465-542X,1999,1,1,73,10.1186/bcr16,PMC13913,11056681.0,,live,,1
6,Breast Cancer Res,1465-5411,1465-542X,1999,1,1,81,10.1186/bcr17,PMC13914,11056682.0,,live,,1
7,Breast Cancer Res,1465-5411,1465-542X,1999,1,1,88,10.1186/bcr18,PMC13915,11056683.0,,live,,1
8,Breast Cancer Res,1465-5411,1465-542X,2000,2,2,139,10.1186/bcr45,PMC13916,11056686.0,,live,,1
9,Breast Cancer Res,1465-5411,1465-542X,2000,2,3,222,10.1186/bcr57,PMC13917,11056687.0,,live,,1


In [17]:
# export to CSV
pmc.loc[(pmc['PMC'] == 1) & (pmc['PMID'].notnull()) & (pmc['PMCID'].notnull())][['PMID', 'PMCID']].to_csv('data/sources/pmc/pmc_pmid_pmcid.csv.gz', sep='\t', index=False, encoding='utf-8', compression='gzip')
pmc.loc[(pmc['PMC'] == 1) & (pmc['PMID'].notnull()) & (pmc['PMCID'].notnull()) & (pmc['DOI'].notnull())][['PMID', 'PMCID', 'DOI']].to_csv('data/sources/pmc/pmc_pmid_pmcid_doi.csv.gz', sep='\t', index=False, encoding='utf-8', compression='gzip')
pmc.loc[(pmc['PMC'] == 1) & (pmc['PMID'].notnull()) & (pmc['PMCID'].notnull())][['PMID', 'PMC']].to_csv('data/sources/pmc/pmc_pmids.csv.gz', sep='\t', index=False, encoding='utf-8', compression='gzip')

In [18]:
pmc.loc[(pmc['PMC'] == 1) & (pmc['PMID'].notnull()) & (pmc['PMCID'].notnull()) & (pmc['DOI'].notnull())][['PMID', 'PMCID', 'DOI']].to_csv('data/sources/pmc/pmc_pmid_pmcid_doi.csv.gz', sep='\t', index=False, encoding='utf-8', compression='gzip')

In [1]:
# restart kernel
# open PMID list exported before
import pandas as pd
pmc = pd.read_csv('data/sources/pmc/pmc_pmids.csv.gz', 
                  dtype={'PMID': 'int', 'PMC': 'int'},
                  sep='\t', header=0, encoding='utf-8', compression='gzip')
# rename column to merge
pmc = pmc.rename(columns = {'PMID': 'pmid'})
pmc

Unnamed: 0,pmid,PMC
0,11250746,1
1,11250747,1
2,11250748,1
3,11056684,1
4,11400682,1
5,11056681,1
6,11056682,1
7,11056683,1
8,11056686,1
9,11056687,1


In [2]:
# dedup by pmid
pmc = pmc.drop_duplicates(subset='pmid')
pmc

Unnamed: 0,pmid,PMC
0,11250746,1
1,11250747,1
2,11250748,1
3,11056684,1
4,11400682,1
5,11056681,1
6,11056682,1
7,11056683,1
8,11056686,1
9,11056687,1


In [3]:
pmc.shape

(4334111, 2)

In [4]:
# open PubMed data merged with library journals coverage and unpaywall data
import pandas as pd
pubmed = pd.read_csv('data/results/pubmed_dois_library_unpaywall_pmids_year.csv.gz',
                   sep='\t', header=0, encoding='utf-8', compression='gzip')
pubmed

Unnamed: 0,pmid,year,PJ,EJ,is_free,unpaywall_not_matched,unpaywall_oa_false,unpaywall_oa_green,unpaywall_oa_gold,doi_eupmc,doi_apd,doi_pubmed
0,1,1975,1,1,0,,1.0,,,1.0,1.0,0.0
1,2,1975,1,1,0,,1.0,,,1.0,1.0,0.0
2,3,1975,1,1,0,,1.0,,,1.0,1.0,0.0
3,4,1975,1,1,0,,1.0,,,1.0,1.0,0.0
4,5,1975,1,1,0,,1.0,,,1.0,1.0,0.0
5,6,1975,1,1,0,,1.0,,,1.0,1.0,0.0
6,7,1975,1,0,0,,1.0,,,1.0,1.0,0.0
7,8,1975,1,0,0,,1.0,,,1.0,1.0,0.0
8,9,1975,1,0,0,,1.0,,,1.0,1.0,0.0
9,10,1975,1,0,0,,,1.0,,1.0,1.0,0.0


In [6]:
# merge PMC by PMID
pubmed = pubmed.merge(pmc, on='pmid', how='left')
pubmed

Unnamed: 0,pmid,year,PJ,EJ,is_free,unpaywall_not_matched,unpaywall_oa_false,unpaywall_oa_green,unpaywall_oa_gold,doi_eupmc,doi_apd,doi_pubmed,PMC
0,1,1975,1,1,0,,1.0,,,1.0,1.0,0.0,
1,2,1975,1,1,0,,1.0,,,1.0,1.0,0.0,
2,3,1975,1,1,0,,1.0,,,1.0,1.0,0.0,
3,4,1975,1,1,0,,1.0,,,1.0,1.0,0.0,
4,5,1975,1,1,0,,1.0,,,1.0,1.0,0.0,
5,6,1975,1,1,0,,1.0,,,1.0,1.0,0.0,
6,7,1975,1,0,0,,1.0,,,1.0,1.0,0.0,
7,8,1975,1,0,0,,1.0,,,1.0,1.0,0.0,
8,9,1975,1,0,0,,1.0,,,1.0,1.0,0.0,
9,10,1975,1,0,0,,,1.0,,1.0,1.0,0.0,


In [7]:
# replance NaN by 0
pubmed = pubmed.fillna(0)

In [8]:
# export final file
# export to final CSV
pubmed.to_csv('data/results/pubmed_library_unpaywall_pmc_pmids_year.csv.gz', sep='\t', index=False, encoding='utf-8', compression='gzip')
pubmed

Unnamed: 0,pmid,year,PJ,EJ,is_free,unpaywall_not_matched,unpaywall_oa_false,unpaywall_oa_green,unpaywall_oa_gold,doi_eupmc,doi_apd,doi_pubmed,PMC
0,1,1975,1,1,0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
1,2,1975,1,1,0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
2,3,1975,1,1,0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
3,4,1975,1,1,0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
4,5,1975,1,1,0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
5,6,1975,1,1,0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
6,7,1975,1,0,0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
7,8,1975,1,0,0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
8,9,1975,1,0,0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
9,10,1975,1,0,0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0


## Compare PMC and Unpaywall

In [15]:
# count PMIDs with PMCID and OA in unpaywall
pubmed.loc[(pubmed['PMC'] == 1) & ((pubmed['unpaywall_oa_green'] == 1) | (pubmed['unpaywall_oa_gold'] == 1))].shape

(3439659, 13)

In [16]:
# count PMIDs with PMCID and OA gold in unpaywall
pubmed.loc[(pubmed['PMC'] == 1) & (pubmed['unpaywall_oa_gold'] == 1)].shape

(2503629, 13)

In [17]:
# count PMIDs with PMCID and OA green in unpaywall
pubmed.loc[(pubmed['PMC'] == 1) & (pubmed['unpaywall_oa_green'] == 1)].shape

(941720, 13)

In [20]:
# count PMIDs with PMCID and not OA in unpaywall
pubmed.loc[(pubmed['PMC'] == 1) & (pubmed['unpaywall_oa_false'] == 1)].shape

(128046, 13)

In [21]:
# count PMIDs with PMCID and DOI not matched in unpaywall
pubmed.loc[(pubmed['PMC'] == 1) & (pubmed['unpaywall_not_matched'] == 1)].shape

(36681, 13)

In [22]:
# count PMIDs with PMCID and without DOI
pubmed.loc[(pubmed['PMC'] == 1) & (pubmed['doi_eupmc'] == 0) & (pubmed['doi_apd'] == 0) & (pubmed['doi_pubmed'] == 0)].shape

(607323, 13)