# Library Access PennText Accuracy Analysis

https://github.com/greenelab/library-access

In [1]:
import pathlib

import pandas

In [2]:
# Read the manual DOI checks and give the full-text indicator columns short names
renamer = {
    'full_text_indicator_automated': 'penntext',
    'full_text_indicator_manual_outside_campus': 'open_access',
    'full_text_indicator_manual_inside_campus': 'penn_access',
}
keep_columns = ['doi'] + list(renamer.values())
strat_df = pandas.read_table('manual-doi-checks.tsv').rename(columns=renamer)[keep_columns]
# 1 where the PennText call equals the manual inside-campus call, otherwise 0
strat_df['penntext_correct'] = strat_df.penntext.eq(strat_df.penn_access).astype(int)
strat_df.head(2)

Unnamed: 0,doi,penntext,open_access,penn_access,penntext_correct
0,10.1007/bf01441062,0,0,0,1
1,10.20531/tfb.2016.44.1.11,0,1,1,0


In [3]:
# Every row flagged open_access == 1 must also have a truthy penn_access value
open_access_rows = strat_df[strat_df.open_access == 1]
assert open_access_rows.penn_access.all()
# Cross-tabulate open_access (rows) against penn_access (columns), with totals
pandas.crosstab(index=strat_df.open_access, columns=strat_df.penn_access, margins=True)

penn_access,0,1,All
open_access,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,59,68,127
1,0,73,73
All,59,141,200


In [4]:
# Mean of penntext_correct within each PennText stratum (0 and 1)
strat_accuracy = strat_df.groupby('penntext')['penntext_correct'].mean()
strat_accuracy

penntext
0    0.53
1    0.94
Name: penntext_correct, dtype: float64

In [5]:
# Count the articles that PennText marks as no access (penntext == 0)
# but that were manually found to be open access
is_open_without_penntext = (strat_df.penntext == 0) & (strat_df.open_access == 1)
int(is_open_without_penntext.sum())

29

In [6]:
# Toll access articles: the rows that are not open access
toll_df = strat_df[strat_df.open_access == 0]
# Share of toll access articles that Penn can access, per PennText status
toll_access_rate = toll_df.groupby('penntext')['penn_access'].mean()
toll_access_rate.map('{:.1%}'.format)

penntext
0    25.4%
1    89.3%
Name: penn_access, dtype: object

In [7]:
# Rows where PennText reports access (penntext == 1) but the manual
# inside-campus check found none (penn_access == 0)
lacks_access = strat_df.penn_access == 0
penntext_claims_access = strat_df.penntext == 1
strat_df[lacks_access & penntext_claims_access]

Unnamed: 0,doi,penntext,open_access,penn_access,penntext_correct
126,10.1109/chicc.2016.7554656,1,0,0,0
139,10.1080/09585192.2016.1242508,1,0,0,0
154,10.1109/tcyb.2016.2545688,1,0,0,0
169,10.1080/14754835.2014.923754,1,0,0,0
175,10.1080/01490419.2011.637154,1,0,0,0
194,10.1111/j.2044-8325.2012.02058.x,1,0,0,0


In [8]:
# Toll access rows where the manual inside-campus check found access
# (penn_access == 1) although PennText reports none (penntext == 0)
has_access = toll_df.penn_access == 1
penntext_denies_access = toll_df.penntext == 0
toll_df[has_access & penntext_denies_access]

Unnamed: 0,doi,penntext,open_access,penn_access,penntext_correct
4,10.1111/j.1478-4408.1958.tb02258.x,0,0,1,0
5,10.1111/j.1550-7408.1962.tb02648.x,0,0,1,0
9,10.1136/bmj.1.4706.586,0,0,1,0
21,10.1002/14651858.cd008009.pub2,0,0,1,0
23,10.1002/prac.18430290165,0,0,1,0
26,10.1111/j.1468-5914.1986.tb00063.x,0,0,1,0
28,10.1097/ccm.0b013e31821b85c6,0,0,1,0
39,10.1007/s00261-016-0956-8,0,0,1,0
43,10.1107/s0108767388009286,0,0,1,0
46,10.1002/chin.197531174,0,0,1,0


## Integrate with prevalence of PennText calls on all oaDOI DOIs

In [9]:
# Load the xz-compressed table of PennText indicators and shorten the column name
path = pathlib.Path('../data/library_coverage_xml_and_fulltext_indicators.tsv.xz')
penntext_df = (
    pandas.read_table(path, compression='xz')
    .rename(columns={'full_text_indicator': 'penntext'})
)
penntext_df.tail(2)

Unnamed: 0,doi,penntext
290118,10.9799/ksfan.2012.25.2.215,0
290119,10.9799/ksfan.2013.26.3.502,0


In [10]:
# Fraction of all DOIs falling in each PennText stratum
penntext_proportions = penntext_df['penntext'].value_counts(normalize=True)
penntext_proportions

1    0.84402
0    0.15598
Name: penntext, dtype: float64

In [11]:
# Overall accuracy (adjusting for stratification): weight each stratum's
# accuracy by that stratum's share of all DOIs, then add the terms up
weighted_accuracy = strat_accuracy * penntext_proportions
overall_accuracy = sum(weighted_accuracy)
overall_accuracy

0.87604808355163377

## Create a sample of 500 DOIs using as many existing calls as possible

In [12]:
# Number of DOIs per PennText stratum that a 500-DOI sample should contain,
# proportional to each stratum's share and rounded to whole DOIs
counts = penntext_proportions.mul(500).round().astype(int)
counts

1    422
0     78
Name: penntext, dtype: int64

In [13]:
# Drop the DOIs that already appear in strat_df, then shuffle the rest
# with a fixed seed so the row order is reproducible
already_checked = penntext_df.doi.isin(strat_df.doi)
penntext_remaining_df = penntext_df[~already_checked].sample(frac=1, random_state=0)
len(penntext_remaining_df)

289920

In [14]:
# For each PennText stratum, list the rows from strat_df first and the rows
# from penntext_remaining_df after them, so truncating to the desired count
# keeps the strat_df rows ahead of the others.
no_access_pool = pandas.concat([
    strat_df.query("penntext == 0"),
    penntext_remaining_df.query("penntext == 0"),
])
access_pool = pandas.concat([
    strat_df.query("penntext == 1"),
    penntext_remaining_df.query("penntext == 1"),
])
# counts is indexed by PennText status, so look the targets up by label
manual_df = pandas.concat([
    no_access_pool.iloc[:counts.loc[0]],
    access_pool.iloc[:counts.loc[1]],
])
manual_df = manual_df[['doi', 'penntext', 'penn_access', 'open_access']]
manual_df.head(2)

Unnamed: 0,doi,penntext,penn_access,open_access
0,10.1007/bf01441062,0,0.0,0.0
1,10.20531/tfb.2016.44.1.11,0,1.0,1.0


In [15]:
# Show the final two rows of the combined sample
manual_df.iloc[-2:]

Unnamed: 0,doi,penntext,penn_access,open_access
112072,10.1021/ic5005243,1,,
145766,10.1080/00221325.2016.1148659,1,,


In [16]:
# Number of rows in the combined sample
manual_df.shape[0]

500

In [17]:
# Write the sample as a tab-separated file without the index column.
# '%.0g' renders float values with one significant digit, so 0.0 and 1.0
# are written as 0 and 1.
manual_df.to_csv(
    'manual-doi-checks-500.tsv',
    sep='\t',
    index=False,
    float_format='%.0g',
)