# Library Access PennText Accuracy Analysis on 500 random DOIs

Repository: https://github.com/greenelab/library-access

In [1]:
import pathlib

import pandas

In [2]:
# Render a fraction as a percentage with one decimal place, e.g. 0.5 -> '50.0%'.
PERCENT_TEMPLATE = '{:.1%}'
formatter = PERCENT_TEMPLATE.format

In [3]:
# Load the manual DOI checks and flag, per row, whether the PennText value
# matches the manually recorded Penn access value (1 = match, 0 = mismatch).
manual_df = pandas.read_table('manual-doi-checks-500.tsv')
penntext_matches_manual = manual_df.penntext.eq(manual_df.penn_access)
manual_df['penntext_correct'] = penntext_matches_manual.astype(int)
manual_df.head(n=2)

Unnamed: 0,doi,penntext,penn_access,open_access,penn_access_date,open_access_date,penntext_correct
0,10.1007/bf01441062,0,0,0,2018-01-04,2018-01-04,1
1,10.20531/tfb.2016.44.1.11,0,1,1,2018-01-04,2018-01-04,0


## Summarizing manual access

In [4]:
# Sanity check: every article recorded as open access must also be recorded as
# accessible at Penn. Then cross-tabulate the two manual columns with totals.
open_access_rows = manual_df.query("open_access == 1")
assert open_access_rows.penn_access.all()
pandas.crosstab(index=manual_df.open_access, columns=manual_df.penn_access, margins=True)

penn_access,0,1,All
open_access,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,63,263,326
1,0,174,174
All,63,437,500


In [5]:
# Mean of each 0/1 column (the share of articles flagged 1), shown as a percentage
indicator_columns = ['penntext', 'open_access', 'penn_access', 'penntext_correct']
manual_df[indicator_columns].mean().map(formatter)

penntext            84.4%
open_access         34.8%
penn_access         87.4%
penntext_correct    88.2%
dtype: object

## PennText accuracy

In [6]:
def summarize(df):
    """
    Summarize PennText accuracy for the articles in df.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `penntext_correct` column whose sum is the number of
        correct PennText calls and whose mean is the accuracy.

    Returns
    -------
    pandas.Series (object dtype) with three entries, in this order:
        n_articles          number of rows in df
        n_penntext_correct  sum of `penntext_correct`
        penntext_accuracy   mean of `penntext_correct`, rendered as text by
                            the notebook-level `formatter`
    """
    # Build the Series in a single step with an explicit dtype. A bare
    # `pandas.Series()` has no dtype (which triggers a deprecation warning on
    # pandas 1.x) and then has to be enlarged one key at a time.
    return pandas.Series(
        {
            'n_articles': len(df),
            'n_penntext_correct': df.penntext_correct.sum(),
            'penntext_accuracy': formatter(df.penntext_correct.mean()),
        },
        dtype=object,
    )

def summarize_by_columns(df, columns):
    """
    Group df by columns and calculate penntext accuracy for each group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarize; it is passed group by group to `summarize`.
    columns : str or iterable of str
        Column name(s) to group by. A single string is treated as one
        column name rather than being split into characters.

    Returns
    -------
    pandas.DataFrame with one row per observed group: the grouping columns
    followed by the entries produced by `summarize`, with the two count
    columns cast to int.
    """
    columns = [columns] if isinstance(columns, str) else list(columns)
    # Group the frame that was passed in. The previous version grouped the
    # global `manual_df`, so the `df` argument was silently ignored.
    summary_df = df.groupby(columns).apply(summarize).reset_index()
    # `summarize` returns an object-dtype Series (counts mixed with a
    # formatted string), so the count columns need an explicit integer cast.
    return summary_df.astype({'n_articles': int, 'n_penntext_correct': int})

In [7]:
# PennText accuracy across every manually checked DOI
summarize(df=manual_df)

n_articles              500
n_penntext_correct      441
penntext_accuracy     88.2%
dtype: object

In [8]:
# PennText accuracy split by open access status
summarize_by_columns(df=manual_df, columns=['open_access'])

Unnamed: 0,open_access,n_articles,n_penntext_correct,penntext_accuracy
0,0,326,290,89.0%
1,1,174,151,86.8%


In [9]:
# PennText accuracy split by manually recorded Penn access
summarize_by_columns(df=manual_df, columns=['penn_access'])

Unnamed: 0,penn_access,n_articles,n_penntext_correct,penntext_accuracy
0,0,63,41,65.1%
1,1,437,400,91.5%


In [10]:
# PennText accuracy split by the value PennText itself reported
summarize_by_columns(df=manual_df, columns=['penntext'])

Unnamed: 0,penntext,n_articles,n_penntext_correct,penntext_accuracy
0,0,78,41,52.6%
1,1,422,400,94.8%


In [11]:
# PennText accuracy for each open access / Penn access combination
summarize_by_columns(df=manual_df, columns=['open_access', 'penn_access'])

Unnamed: 0,open_access,penn_access,n_articles,n_penntext_correct,penntext_accuracy
0,0,0,63,41,65.1%
1,0,1,263,249,94.7%
2,1,1,174,151,86.8%


In [12]:
# PennText accuracy for each open access / PennText combination
summarize_by_columns(df=manual_df, columns=['open_access', 'penntext'])

Unnamed: 0,open_access,penntext,n_articles,n_penntext_correct,penntext_accuracy
0,0,0,55,41,74.5%
1,0,1,271,249,91.9%
2,1,0,23,0,0.0%
3,1,1,151,151,100.0%


In [13]:
# PennText accuracy for each Penn access / PennText combination
summarize_by_columns(df=manual_df, columns=['penn_access', 'penntext'])

Unnamed: 0,penn_access,penntext,n_articles,n_penntext_correct,penntext_accuracy
0,0,0,41,41,100.0%
1,0,1,22,0,0.0%
2,1,0,37,0,0.0%
3,1,1,400,400,100.0%


In [14]:
# PennText accuracy for every observed open access / Penn access / PennText combination
summarize_by_columns(df=manual_df, columns=['open_access', 'penn_access', 'penntext'])

Unnamed: 0,open_access,penn_access,penntext,n_articles,n_penntext_correct,penntext_accuracy
0,0,0,0,41,41,100.0%
1,0,0,1,22,0,0.0%
2,0,1,0,14,0,0.0%
3,0,1,1,249,249,100.0%
4,1,1,0,23,0,0.0%
5,1,1,1,151,151,100.0%


## DOIs with errors

In [15]:
# False positives: PennText reports access, but the manual check found none at Penn
false_positive_filter = "penn_access == 0 and penntext == 1"
manual_df.query(false_positive_filter)

Unnamed: 0,doi,penntext,penn_access,open_access,penn_access_date,open_access_date,penntext_correct
104,10.1109/chicc.2016.7554656,1,0,0,2018-01-03,2018-01-04,0
117,10.1080/09585192.2016.1242508,1,0,0,2018-01-03,2018-01-04,0
132,10.1109/tcyb.2016.2545688,1,0,0,2018-01-03,2018-01-04,0
147,10.1080/14754835.2014.923754,1,0,0,2018-01-03,2018-01-04,0
153,10.1080/01490419.2011.637154,1,0,0,2018-01-03,2018-01-04,0
172,10.1111/j.2044-8325.2012.02058.x,1,0,0,2018-01-03,2018-01-04,0
180,10.1049/el.2015.0457,1,0,0,2018-01-12,2018-01-17,0
222,10.1109/jrproc.1955.277857,1,0,0,2018-01-12,2018-01-17,0
235,10.2307/1882737,1,0,0,2018-01-12,2018-01-17,0
237,10.2174/0929867311320230002,1,0,0,2018-01-12,2018-01-17,0


In [16]:
# Missed subscriptions: articles that are not open access and are accessible
# at Penn, yet PennText reports no access
missed_subscription_filter = "open_access == 0 and penn_access == 1 and penntext == 0"
manual_df.query(missed_subscription_filter)

Unnamed: 0,doi,penntext,penn_access,open_access,penn_access_date,open_access_date,penntext_correct
4,10.1111/j.1478-4408.1958.tb02258.x,0,1,0,2018-01-04,2018-01-04,0
5,10.1111/j.1550-7408.1962.tb02648.x,0,1,0,2018-01-04,2018-01-04,0
9,10.1136/bmj.1.4706.586,0,1,0,2018-01-04,2018-01-04,0
21,10.1002/14651858.cd008009.pub2,0,1,0,2018-01-04,2018-01-04,0
23,10.1002/prac.18430290165,0,1,0,2018-01-04,2018-01-04,0
26,10.1111/j.1468-5914.1986.tb00063.x,0,1,0,2018-01-04,2018-01-04,0
28,10.1097/ccm.0b013e31821b85c6,0,1,0,2018-01-04,2018-01-04,0
39,10.1007/s00261-016-0956-8,0,1,0,2018-01-03,2018-01-04,0
43,10.1107/s0108767388009286,0,1,0,2018-01-03,2018-01-04,0
46,10.1002/chin.197531174,0,1,0,2018-01-03,2018-01-04,0


In [17]:
# Missed open access: articles that are open access (and so accessible at Penn),
# yet PennText reports no access
missed_open_access_filter = "open_access == 1 and penn_access == 1 and penntext == 0"
manual_df.query(missed_open_access_filter)

Unnamed: 0,doi,penntext,penn_access,open_access,penn_access_date,open_access_date,penntext_correct
1,10.20531/tfb.2016.44.1.11,0,1,1,2018-01-04,2018-01-04,0
3,10.1038/s41550-016-0030,0,1,1,2018-01-04,2018-01-04,0
10,10.1093/molehr/gaq017,0,1,1,2018-01-04,2018-01-04,0
11,10.1093/mnras/89.4.329,0,1,1,2018-01-04,2018-01-04,0
12,10.1093/qjmed/hct203,0,1,1,2018-01-04,2018-01-04,0
13,10.1097/acm.0000000000000545,0,1,1,2018-01-04,2018-01-04,0
18,10.3855/jidc.4620,0,1,1,2018-01-04,2018-01-04,0
20,10.1016/0021-9517(79)90166-0,0,1,1,2018-01-04,2018-01-04,0
29,10.1515/ci.2008.30.1.8,0,1,1,2018-01-04,2018-01-04,0
34,10.1016/j.scient.2011.05.025,0,1,1,2018-01-04,2018-01-04,0
