In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

# Apply matplotlib's 'ggplot' style sheet to figures drawn after this point.
plt.style.use('ggplot')
%matplotlib inline

In [2]:
# Relative path of the metadata CSV that is loaded into `df_allen` below.
ALLEN_AI_RES_PATH = "../res/allenai/metadata.csv"

In [3]:
# Load only the metadata columns this notebook uses; `usecols` keeps pandas from
# parsing and holding every other column of the CSV in memory.
ALLEN_COLUMNS = ['source_x', 'title', 'abstract', 'publish_time', 'journal']
# Re-index with the same list afterwards: `usecols` selects columns but does not
# control their order in the resulting frame.
df_allen = pd.read_csv(ALLEN_AI_RES_PATH, usecols=ALLEN_COLUMNS)[ALLEN_COLUMNS]
df_allen['is_pr'] = ~df_allen.source_x.isin(['biorxiv', 'medrxiv']) # those sources are considered non-peer reviewed

print(len(df_allen.index))
df_allen.head(5)

44220


Unnamed: 0,source_x,title,abstract,publish_time,journal,is_pr
0,Elsevier,Intrauterine virus infections and congenital h...,Abstract The etiologic basis for the vast majo...,1972-12-31,American Heart Journal,True
1,Elsevier,Coronaviruses in Balkan nephritis,,1980-03-31,American Heart Journal,True
2,Elsevier,Cigarette smoking and coronary heart disease: ...,,1980-03-31,American Heart Journal,True
3,Elsevier,Clinical and immunologic studies in identical ...,"Abstract Middle-aged female identical twins, o...",1973-08-31,The American Journal of Medicine,True
4,Elsevier,Epidemiology of community-acquired respiratory...,Abstract Upper respiratory tract infections ar...,1985-06-28,The American Journal of Medicine,True


In [4]:
# Drop every row whose title is missing; the mask is built once and reused for
# both the reported count and the filter.
title_missing = df_allen.title.isna()
print('Removing', title_missing.sum(), 'entries without titles')
df_allen = df_allen.loc[~title_missing]
print('Remaining', df_allen.shape[0])

Removing 224 entries without titles
Remaining 43996


In [5]:
# Drop rows whose title contains "Author index" (case-sensitive match).
has_author_index = df_allen.title.str.contains('Author index')
print('Removing', has_author_index.sum(), 'entries with title of format <x>Author index<z>')
df_allen = df_allen.loc[~has_author_index]
print('Remaining', df_allen.shape[0])

Removing 103 entries with title of format <x>Author index<z>
Remaining 43893


In [6]:
# Drop rows whose title is exactly "Authors".
# The count is the sum of the boolean mask, so the printed number is exactly the
# number of rows removed (independent of nulls in any other column) and no
# positional indexing into a label-indexed Series is needed.
is_authors_title = df_allen.title == 'Authors'
print('Removing', is_authors_title.sum(), 'entries with title = "Authors"')
df_allen = df_allen[~is_authors_title]
print('Remaining', len(df_allen.index))

Removing 3 entries with title = "Authors"
Remaining 43890


In [7]:
# Drop rows whose title is exactly "Index of Authors".
# The count is the sum of the boolean mask, so the printed number is exactly the
# number of rows removed (independent of nulls in any other column) and no
# positional indexing into a label-indexed Series is needed.
is_index_of_authors = df_allen.title == 'Index of Authors'
print('Removing', is_index_of_authors.sum(), 'entries with title = "Index of Authors"')
df_allen = df_allen[~is_index_of_authors]
print('Remaining', len(df_allen.index))

Removing 8 entries with title = "Index of Authors"
Remaining 43882


In [8]:
# Drop rows whose title is exactly "Information for Authors".
# The count is the sum of the boolean mask, so the printed number is exactly the
# number of rows removed (independent of nulls in any other column) and no
# positional indexing into a label-indexed Series is needed.
is_information_for_authors = df_allen.title == 'Information for Authors'
print('Removing', is_information_for_authors.sum(), 'entries with title = "Information for Authors"')
df_allen = df_allen[~is_information_for_authors]
print('Remaining', len(df_allen.index))

Removing 66 entries with title = "Information for Authors"
Remaining 43816


In [9]:
# Drop rows whose title is exactly "Instructions for Authors".
# The count is the sum of the boolean mask, so the printed number is exactly the
# number of rows removed (independent of nulls in any other column) and no
# positional indexing into a label-indexed Series is needed.
is_instructions_for_authors = df_allen.title == 'Instructions for Authors'
print('Removing', is_instructions_for_authors.sum(), 'entries with title = "Instructions for Authors"')
df_allen = df_allen[~is_instructions_for_authors]
print('Remaining', len(df_allen.index))

Removing 4 entries with title = "Instructions for Authors"
Remaining 43812


In [10]:
# Drop rows whose title is exactly "Authors’ reply" (typographic apostrophe).
# The count is the sum of the boolean mask, so the printed number is exactly the
# number of rows removed (independent of nulls in any other column) and no
# positional indexing into a label-indexed Series is needed.
is_authors_reply = df_allen.title == 'Authors’ reply'
print('Removing', is_authors_reply.sum(), 'entries with title = "Authors’ reply"')
df_allen = df_allen[~is_authors_reply]
print('Remaining', len(df_allen.index))

Removing 1 entries with title = "Authors’ reply"
Remaining 43811


In [11]:
# Drop rows whose title contains "Author Index" (capitalised variant; the match
# is case-sensitive).
has_author_index_cap = df_allen.title.str.contains('Author Index')
print('Removing', has_author_index_cap.sum(), 'entries with title of format <x>Author Index<z>')
df_allen = df_allen.loc[~has_author_index_cap]
print('Remaining', df_allen.shape[0])

Removing 32 entries with title of format <x>Author Index<z>
Remaining 43779


In [12]:
# Drop rows whose title contains "subject index" (lower-case variant; the match
# is case-sensitive).
has_subject_index = df_allen.title.str.contains('subject index')
print('Removing', has_subject_index.sum(), 'entries with title of format <x>subject index<z>')
df_allen = df_allen.loc[~has_subject_index]
print('Remaining', df_allen.shape[0])

Removing 25 entries with title of format <x>subject index<z>
Remaining 43754


In [13]:
# Drop rows whose title contains "Cumulative", except those sourced from medrxiv,
# which are kept (presumably genuine papers rather than index pages - confirm).
# The reported count and the filter come from the same mask, so the printed
# number always equals the number of rows removed instead of relying on a
# hard-coded count of medrxiv exceptions.
cumulative_to_drop = df_allen.title.str.contains('Cumulative') & (df_allen.source_x != 'medrxiv')
print('Removing', cumulative_to_drop.sum(), 'entries with title of format <x>Cumulative<z>')
df_allen = df_allen[~cumulative_to_drop]
print('Remaining', len(df_allen.index))

Removing 50 entries with title of format <x>Cumulative<z>
Remaining 43704


In [14]:
# Drop rows whose title is exactly "Index".
# The count is the sum of the boolean mask, so the printed number is exactly the
# number of rows removed (independent of nulls in any other column) and no
# positional indexing into a label-indexed Series is needed.
is_index_title = df_allen.title == 'Index'
print('Removing', is_index_title.sum(), 'entries with title = "Index"')
df_allen = df_allen[~is_index_title]
print('Remaining', len(df_allen.index))

Removing 348 entries with title = "Index"
Remaining 43356


In [15]:
# Drop rows whose title contains "Subject Index" (capitalised variant; the match
# is case-sensitive).
has_subject_index_cap = df_allen.title.str.contains('Subject Index')
print('Removing', has_subject_index_cap.sum(), 'entries with title of format <x>Subject Index<z>')
df_allen = df_allen.loc[~has_subject_index_cap]
print('Remaining', df_allen.shape[0])

Removing 100 entries with title of format <x>Subject Index<z>
Remaining 43256


In [16]:
# Drop rows whose title contains "Index to" (case-sensitive match).
has_index_to = df_allen.title.str.contains('Index to')
print('Removing', has_index_to.sum(), 'entries with title of format <x>Index to<z>')
df_allen = df_allen.loc[~has_index_to]
print('Remaining', df_allen.shape[0])

Removing 24 entries with title of format <x>Index to<z>
Remaining 43232


In [17]:
# Drop Elsevier rows whose title contains "Index" and that have no abstract.
# NOTE(review): missing abstracts are detected by filling NaN with the sentinel
# 'Unknown', so an abstract whose text is literally 'Unknown' is treated as
# missing too - confirm whether that is intended or `.isna()` should be used.
mask = (df_allen.title.str.contains('Index')) & (df_allen.source_x=='Elsevier') & (df_allen.abstract.fillna('Unknown') == 'Unknown')
# Count straight from the mask so the printed number is exactly the number of
# rows removed, without positional indexing into a label-indexed Series.
print('Removing', mask.sum(), 'entries with title of format <x>Index<z>, source=Elsevier and no abstract')
df_allen = df_allen[~mask]
print('Remaining', len(df_allen.index))

Removing 39 entries with title of format <x>Index<z>, source=Elsevier and no abstract
Remaining 43193


In [18]:
# Display the rows whose title still contains "Index" for manual inspection.
still_has_index = df_allen.title.str.contains('Index')
df_allen.loc[still_has_index]

Unnamed: 0,source_x,title,abstract,publish_time,journal,is_pr
17301,Elsevier,Independent research of China in Science Citat...,Abstract The study explores the characteristic...,2013-01-31,Journal of Informetrics,True
18222,CZI,The Author's Response: Case of the Index Patie...,,2020,J Korean Med Sci,True
20318,PMC,Clustering Heart Rate Dynamics Is Associated w...,BACKGROUND: Genetic polymorphisms in the gene ...,2011 May 4,PLoS One,True
21825,PMC,Establishing and Applying a Schistosomiasis Ea...,BACKGROUND: China has made remarkable progress...,2014 Apr 4,PLoS One,True
24794,PMC,Progression of the Radiologic Severity Index p...,BACKGROUND: Radiologic severity may predict ad...,2018 May 17,PLoS One,True
27005,PMC,Social Network Characteristics and Body Mass I...,OBJECTIVES: Research has shown that obesity ap...,2013 Nov 28,J Prev Med Public Health,True
27876,PMC,1583. The Utility of the Immunodeficiency Scor...,BACKGROUND: Respiratory viral infections in HC...,2018 Nov 26,Open Forum Infect Dis,True
28082,PMC,2792. Association of Body Mass Index with Rate...,BACKGROUND: Obesity is a serious public health...,2019 Oct 23,Open Forum Infect Dis,True
28455,PMC,Severe Acute Respiratory Syndrome (SARS) in Si...,Severe acute respiratory syndrome (SARS) is an...,2003 Jun,Emerg Infect Dis,True
28671,PMC,Index Patient and SARS Outbreak in Hong Kong,During the global outbreak of severe acute res...,2004 Feb,Emerg Infect Dis,True


In [19]:
# Split the Elsevier rows into three groups by their abstract field: text that
# begins with the word "Abstract", missing abstracts, and everything else.
df_elsevier = df_allen.loc[df_allen.source_x == 'Elsevier']
elsevier_abstracts = df_elsevier.abstract

# Missing abstracts are filled with '' so they never count as "starts with".
starts_with_abstract = int(elsevier_abstracts.fillna('').str.startswith('Abstract').sum())
abstract_is_na = elsevier_abstracts.isna().sum()
elsevier_total = df_elsevier.shape[0]

print("Starting with 'Abstract'", starts_with_abstract)
print("No Abstract", abstract_is_na)
print("Rest", elsevier_total - starts_with_abstract - abstract_is_na)

Starting with 'Abstract' 9324
No Abstract 5429
Rest 3713


In [20]:
# Number of Elsevier rows remaining in `df_elsevier`.
df_elsevier.shape[0]

18466

Starting with 'Abstract' 9324
No Abstract 5980
Rest 4167

In [21]:
# NOTE(review): hard-coded counts. The last two differ from the values printed by
# the Elsevier breakdown cell in this notebook (5429 and 3713), and the total
# differs from len(df_elsevier.index) (18466) - presumably figures from an
# earlier run; confirm, and prefer computing this from the variables.
9324+5980+4167

19471