# Library Access PennText Accuracy Analysis

https://github.com/greenelab/library-access

In [1]:
import pandas

In [2]:
# Location of the manual DOI check table (the URL embeds a fixed revision hash).
url = "https://github.com/greenelab/library-access/raw/46a529c6b4dc1017dfc90172e5090e10086488e2/evaluate_library_access_from_output_tsv/manual-doi-checks.tsv"

# Short column names used throughout the notebook:
#   automated indicator            -> penntext
#   manual check, outside campus   -> open_access
#   manual check, inside campus    -> penn_access
renamer = {
    'full_text_indicator_automated': 'penntext',
    'full_text_indicator_manual_outside_campus': 'open_access',
    'full_text_indicator_manual_inside_campus': 'penn_access',
}

# Read the table, apply the short names, and keep only the DOI plus the
# three renamed indicator columns (in that order).
df = (
    pandas.read_table(url)
    .rename(columns=renamer)
    .loc[:, ['doi', *renamer.values()]]
)
df.head(2)

Unnamed: 0,doi,penntext,open_access,penn_access
0,10.1007/bf01441062,0,0,0
1,10.20531/tfb.2016.44.1.11,0,1,1


In [3]:
# Sanity check: every row flagged as open access must also be flagged as
# accessible from inside campus.
open_rows = df[df.open_access == 1]
assert open_rows.penn_access.all()

# Contingency table of open_access (rows) against penn_access (columns),
# including row/column totals.
pandas.crosstab(index=df.open_access, columns=df.penn_access, margins=True)

penn_access,0,1,All
open_access,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,59,68,127
1,0,73,73
All,59,141,200


In [4]:
# Of the 100 articles that PennText reports as not accessible,
# count those that are in fact open access.
len(df[(df.penntext == 0) & (df.open_access == 1)])

29

In [5]:
# Toll access articles: rows that are not flagged as open access.
toll_df = df[df.open_access == 0]

# Fraction of toll access articles Penn can reach, grouped by PennText
# status and rendered as a percentage with one decimal place.
(
    toll_df
    .groupby('penntext')['penn_access']
    .mean()
    .map('{:.1%}'.format)
)

penntext
0    25.4%
1    89.3%
Name: penn_access, dtype: object

In [6]:
# PennText false positives: PennText reports access, but the manual
# inside-campus check found none.
df[(df.penn_access == 0) & (df.penntext == 1)]

Unnamed: 0,doi,penntext,open_access,penn_access
126,10.1109/chicc.2016.7554656,1,0,0
139,10.1080/09585192.2016.1242508,1,0,0
154,10.1109/tcyb.2016.2545688,1,0,0
169,10.1080/14754835.2014.923754,1,0,0
175,10.1080/01490419.2011.637154,1,0,0
194,10.1111/j.2044-8325.2012.02058.x,1,0,0


In [7]:
# PennText false negatives among toll access articles: Penn has access
# (manual inside-campus check) although PennText reports none.
toll_df[(toll_df.penn_access == 1) & (toll_df.penntext == 0)]

Unnamed: 0,doi,penntext,open_access,penn_access
4,10.1111/j.1478-4408.1958.tb02258.x,0,0,1
5,10.1111/j.1550-7408.1962.tb02648.x,0,0,1
9,10.1136/bmj.1.4706.586,0,0,1
21,10.1002/14651858.cd008009.pub2,0,0,1
23,10.1002/prac.18430290165,0,0,1
26,10.1111/j.1468-5914.1986.tb00063.x,0,0,1
28,10.1097/ccm.0b013e31821b85c6,0,0,1
39,10.1007/s00261-016-0956-8,0,0,1
43,10.1107/s0108767388009286,0,0,1
46,10.1002/chin.197531174,0,0,1
