# Stats for the Rephetio project

In [1]:
import json
import export
import re

import pandas

## Retrieve export

Uncomment the command below to download the Rephetio export from Thinklab.

In [2]:
# ! python export.py --project rephetio --output export/rephetio.json

## Process export

In [3]:
# Read the export from the same path the download command above writes to
# (export/rephetio.json). The file is decoded as UTF-8 explicitly so the
# result does not depend on the platform's default encoding.
# NOTE: binding the loaded dict to `export` shadows the imported `export` module.
with open('export/rephetio.json', encoding='utf-8') as read_file:
    export = json.load(read_file)
# Display the value stored under the export's 'retrieved' key
export['retrieved']

'2016-04-26T00:42:28.782606Z'

In [4]:
# Top-level keys of the export (iterating a dict yields its keys)
list(export)

['comments', 'retrieved', 'threads', 'documents', 'profiles', 'notes']

In [5]:
# Tabulate the export's 'profiles' entries and preview the first two rows
profile_df = pandas.DataFrame(data=export['profiles'])
profile_df.head(n=2)

Unnamed: 0,first_name,last_name,url,username
0,Jesse,Spaulding,/u/jspauld,jspauld
1,Daniel,Himmelstein,/u/dhimmel,dhimmel


In [6]:
# Tabulate the export's 'threads' entries and preview the first two rows
thread_df = pandas.DataFrame(data=export['threads'])
thread_df.head(n=2)

Unnamed: 0,document,doi,doi_field,profile,published,subject,topic_field,url,views
0,,10.15363/thinklab.d21,,17,2015-01-14T05:55:24.808111Z,How should we construct a catalog of drug indi...,"Bioinformatics,Natural Language Processing,Ind...",/discussion/how-should-we-construct-a-catalog-...,313
1,,10.15363/thinklab.d22,,17,2015-01-16T00:46:28.770398Z,Suggestions for additional information types?,"Bioinformatics,Chemoinformatics,Databases",/discussion/suggestions-for-additional-informa...,81


In [7]:
# Tabulate the export's 'comments' entries and preview the first two rows
comment_df = pandas.DataFrame(data=export['comments'])
comment_df.head(n=2)

Unnamed: 0,body_html,body_md,profile,published,thread,url
0,<p>We are looking to construct a catalog of <a...,We are looking to construct a catalog of [indi...,17,2015-01-14T05:55:24.832895Z,21,/discussion/how-should-we-construct-a-catalog-...
1,<p>Are there any types of nodes or edges that ...,Are there any types of nodes or edges that you...,17,2015-01-16T00:46:28.796323Z,22,/discussion/suggestions-for-additional-informa...


In [8]:
# Tabulate the export's 'notes' entries and preview the first two rows
note_df = pandas.DataFrame(data=export['notes'])
note_df.head(n=2)

Unnamed: 0,added,body_html,body_md,comment,profile,url
0,2015-01-23T05:05:37.375664Z,"<p>I don't see the ""attached reference"". Can y...","I don't see the ""attached reference"". Can you ...",38,17,/discussion/seeking-an-open-source-implementat...
1,2015-01-23T19:27:36.905732Z,<p>I'm talking about the reference you provide...,"I'm talking about the reference you provided, ...",38,23,/discussion/seeking-an-open-source-implementat...


In [9]:
# Markdown bodies of every comment and every note, as plain Python lists
comments = comment_df['body_md'].tolist()
notes = note_df['body_md'].tolist()

# Analyzing discussions

## Extract citations in comments and notes

In [10]:
# A citation is a bracketed group whose content starts with '@10.', e.g.
# [@10.1000/abc]; one group may hold several space-separated '@10.' tokens.
pattern = re.compile(r'\[(@10\..+?)\]')

# Collect every '@10.'-prefixed token from all comments and notes, drop the
# leading '@', de-duplicate through a set, and keep the result as a sorted list.
dois = sorted({
    token.lstrip('@')
    for content in comments + notes
    for match in pattern.findall(content)
    for token in match.split(' ')
    if token.startswith('@10.')
})

In [11]:
# Example DOIs: the first five entries of the sorted DOI list
dois[:5]

['10.1001/jama.1994.03510380059038',
 '10.1001/jama.271.14.1103',
 '10.1002/0470114754',
 '10.1002/0471142905.hg1011s57',
 '10.1002/14651858.CD003256.pub2']

In [12]:
# Number of distinct DOIs cited in discussions (comments and notes combined)
len(dois)

270

In [13]:
# DOI registrants: the part of each DOI before the first '/', with the
# leading '10.' removed, de-duplicated and sorted
registrants = sorted({doi.split('/', 1)[0][len('10.'):] for doi in dois})
len(registrants)

42

In [14]:
# Example DOI registrants: the first five registrant codes in sorted order
registrants[:5]

['1001', '1002', '1007', '1016', '1021']

## Participation

In [15]:
# Number of commenters: distinct profile values among the comments (notes are not counted here)
comment_df.profile.nunique()

40

In [16]:
# Number of comments (length of the body_md list taken from comment_df)
len(comments)

424

In [17]:
# Number of notes (length of the body_md list taken from note_df)
len(notes)

144

## Number of characters

In [18]:
# Count characters across all comments and notes, treating each CRLF line
# ending as a single newline character.
characters = 0
for content in comments + notes:
    characters += len(content.replace('\r\n', '\n'))
print('{:,} characters in discussions'.format(characters))

694,254 characters in discussions


## Word count

In [19]:
import collections

# Tally lowercased, whitespace-delimited tokens from every comment and note.
# One-character tokens are skipped, with 'i' as the only exception.
words = collections.Counter(
    word
    for text in comments + notes
    for word in map(str.lower, text.split())
    if len(word) != 1 or word == 'i'
)

In [20]:
# Total word occurrences, also expressed in units of 6,000 words
# (the per-article length this cell divides by).
total_words = sum(words.values())
article_equivalents = total_words / 6000
summary = '{:,} words in discussions equating to ~{:.2f} academic articles'
print(summary.format(total_words, article_equivalents))

83,546 words in discussions equating to ~13.92 academic articles


In [21]:
# Five most frequent words with their counts
words.most_common(5)

[('the', 4171), ('to', 2161), ('of', 1993), ('and', 1549), ('for', 1239)]