In [26]:
import re
import json

import pandas
import sklearn.tree

In [32]:
pubmed_df = pandas.read_table('data/pubmed-plos.tsv.gz', compression='gzip')
success_df = pandas.read_table('data/dates.tsv')

In [12]:
with open('scraper.ipynb') as read_file:
    notebook = json.load(read_file)

In [13]:
for cell in notebook['cells']:
    if cell['execution_count'] == 7:
        outputs = cell['outputs']
        break

texts = list()
for output in outputs:
    texts.extend(output['text'])

text = ''.join(texts)

In [14]:
rows = list()
matches = re.findall(r'10\.1371.+?(?=10\.1371)', text, flags=re.DOTALL)
for match in matches:
    doi, error = match.split(' ', 1)
    rows.append((doi, error))

error_df = pandas.DataFrame(rows, columns=['doi', 'error'])

In [15]:
error_count_df = error_df.groupby('error').apply(lambda df: pandas.Series({'count': len(df)})).reset_index()
error_count_df['error_id'] = range(1, 1 + len(error_count_df))
error_count_df

Unnamed: 0,error,count,error_id
0,'NoneType' object is not subscriptable\n,285,1
1,'NoneType' object is not subscriptable\n\nErro...,155,2
2,'NoneType' object is not subscriptable\n\nErro...,1,3
3,'NoneType' object is not subscriptable\n/home/...,1,4
4,"UserWarning: Error in function (type, msg, asE...",1,5
5,need more than 1 value to unpack\n,495,6
6,need more than 1 value to unpack\n\nError in f...,281,7
7,need more than 1 value to unpack\n\nError in f...,1,8
8,need more than 1 value to unpack\n\nError in x...,1,9
9,need more than 2 values to unpack\n,1,10


In [16]:
error_df = error_df.merge(error_count_df)
error_df = error_df[error_df['count'] > 1]

In [17]:
error_df.head()

Unnamed: 0,doi,error,count,error_id
0,10.1371/journal.pmed.0010016,need more than 1 value to unpack\n\nError in f...,281,7
1,10.1371/journal.ppat.1002451,need more than 1 value to unpack\n\nError in f...,281,7
2,10.1371/journal.pmed.0030126,need more than 1 value to unpack\n\nError in f...,281,7
3,10.1371/journal.pbio.1001436,need more than 1 value to unpack\n\nError in f...,281,7
4,10.1371/journal.ppat.1004806,need more than 1 value to unpack\n\nError in f...,281,7


In [18]:
success_df.head()

Unnamed: 0,doi,received,accepted,published
0,10.1371/journal.ppat.1004931,2014-11-14,2015-05-04,2015-06-09
1,10.1371/journal.pntd.0001279,2010-10-18,2011-07-05,2011-08-16
2,10.1371/journal.pbio.1001585,2013-02-26,2013-05-02,2013-06-11
3,10.1371/journal.pmed.0050101,2007-08-14,2008-03-18,2008-05-27
4,10.1371/journal.pcbi.1003558,2013-07-09,2014-02-19,2014-04-17


In [19]:
success_df = success_df[['doi']]
success_df['error_id'] = 0
outcome_df = pandas.concat([success_df, error_df[['doi', 'error_id']]])

In [20]:
xy_df = outcome_df.merge(pubmed_df)
for column in ['date_accepted', 'date_epublish', 'date_medline', 'date_pubmed', 'date_received', 'epub_date']:
    xy_df[column] = pandas.to_datetime(xy_df[column])
xy_df.head()

Unnamed: 0,doi,error_id,date_accepted,date_epublish,date_medline,date_pubmed,date_received,epub_date,journal,journal_abbrev,pubdate,pubmed_id,pubtype
0,10.1371/journal.ppat.1004931,0,2015-05-04,2015-06-09,2015-06-10 06:00:00,2015-06-10 06:00:00,2014-11-14,2015-06-09,PLoS pathogens,PLoS Pathog,2015 Jun,26057557,Journal Article
1,10.1371/journal.pntd.0001279,0,2011-07-05,2011-08-16,2011-12-17 06:00:00,2011-08-23 06:00:00,2010-10-18,2011-08-16,PLoS neglected tropical diseases,PLoS Negl Trop Dis,2011 Aug,21858242,Journal Article
2,10.1371/journal.pbio.1001585,0,2013-05-02,2013-06-11,2014-01-07 06:00:00,2013-06-19 06:00:00,2013-02-26,2013-06-11,PLoS biology,PLoS Biol,2013,23776409,Journal Article
3,10.1371/journal.pmed.0050101,0,2008-03-18,NaT,2008-08-08 09:00:00,2008-05-30 09:00:00,2007-08-14,NaT,PLoS medicine,PLoS Med,2008 May 27,18507497,Journal Article
4,10.1371/journal.pcbi.1003558,0,2014-02-19,2014-04-17,2014-12-15 06:00:00,2014-04-20 06:00:00,2013-07-09,2014-04-17,PLoS computational biology,PLoS Comput Biol,2014 Apr,24743341,Journal Article


In [21]:
y = xy_df.error_id
x = xy_df.drop(['error_id'], axis=1)
#x = pandas.get_dummies(x)


In [22]:
import sklearn.preprocessing
encoder = sklearn.preprocessing.LabelEncoder()
x_enc = x.copy()
for column in x_enc.columns:
    x_enc[column] = encoder.fit_transform(x_enc[column])

In [23]:
x.head()

Unnamed: 0,doi,date_accepted,date_epublish,date_medline,date_pubmed,date_received,epub_date,journal,journal_abbrev,pubdate,pubmed_id,pubtype
0,10.1371/journal.ppat.1004931,2015-05-04,2015-06-09,2015-06-10 06:00:00,2015-06-10 06:00:00,2014-11-14,2015-06-09,PLoS pathogens,PLoS Pathog,2015 Jun,26057557,Journal Article
1,10.1371/journal.pntd.0001279,2011-07-05,2011-08-16,2011-12-17 06:00:00,2011-08-23 06:00:00,2010-10-18,2011-08-16,PLoS neglected tropical diseases,PLoS Negl Trop Dis,2011 Aug,21858242,Journal Article
2,10.1371/journal.pbio.1001585,2013-05-02,2013-06-11,2014-01-07 06:00:00,2013-06-19 06:00:00,2013-02-26,2013-06-11,PLoS biology,PLoS Biol,2013,23776409,Journal Article
3,10.1371/journal.pmed.0050101,2008-03-18,NaT,2008-08-08 09:00:00,2008-05-30 09:00:00,2007-08-14,NaT,PLoS medicine,PLoS Med,2008 May 27,18507497,Journal Article
4,10.1371/journal.pcbi.1003558,2014-02-19,2014-04-17,2014-12-15 06:00:00,2014-04-20 06:00:00,2013-07-09,2014-04-17,PLoS computational biology,PLoS Comput Biol,2014 Apr,24743341,Journal Article


In [24]:
classifier = sklearn.tree.DecisionTreeClassifier(max_depth = 3)
classifier = classifier.fit(x_enc.as_matrix(), y.as_matrix())

In [27]:
sklearn.tree.export_graphviz(classifier, 'figure/error-classifier.dot')

In [28]:
import pydotplus
import io

In [29]:
with open('figure/error-classifier.dot') as read_file:
    graph = pydotplus.graph_from_dot_data(read_file.read())
graph.write_pdf('figure/error-classifier.pdf') 

True

In [31]:
list(error_count_df.error)

["'NoneType' object is not subscriptable\n",
 "'NoneType' object is not subscriptable\n\nError in function (type, msg, asError = TRUE)  : \n  Couldn't resolve host 'biology.plosjournals.org'\n",
 "'NoneType' object is not subscriptable\n\nError in parse.response(r, parser, encoding = encoding) : \n  client error: (404) Not Found\n",
 'need more than 1 value to unpack\n',
 "need more than 1 value to unpack\n\nError in function (type, msg, asError = TRUE)  : \n  Couldn't resolve host 'biology.plosjournals.org'\n",
 'need more than 1 value to unpack\n\nError in function (type, msg, asError = TRUE)  : \n  Recv failure: Connection reset by peer\n',
 'need more than 1 value to unpack\n\nError in xml_apply(x, XML::xmlValue, ..., .type = character(1)) : \n  Unknown input of class: NULL\n',
 'need more than 2 values to unpack\n',
 "need more than 2 values to unpack\n\nError in function (type, msg, asError = TRUE)  : \n  Couldn't resolve host 'biology.plosjournals.org'\n"]