# Stats for the Rephetio project

In [1]:
import collections
import json
import re

import pandas

import export

## Retrieve export

Download the Rephetio export from Thinklab. Comment out the command below to reuse an existing `export/rephetio.json`.

In [2]:
# Shell out to export.py for the rephetio project, passing export/rephetio.json
# as the --output path; the "Process export" cell reads that same path.
! python export.py --project rephetio --output export/rephetio.json

## Process export

In [3]:
# Parse the downloaded Thinklab export. The file is decoded as UTF-8 explicitly
# so the result does not depend on the platform's default text encoding.
# NOTE: this assignment rebinds `export`, replacing the module imported under
# the same name in the imports cell.
with open('export/rephetio.json', encoding='utf-8') as read_file:
    export = json.load(read_file)
# Value stored under the export's 'retrieved' key
export['retrieved']

'2016-04-11T22:19:53.722649Z'

In [4]:
# Top-level keys of the export
list(export)

['comments', 'documents', 'notes', 'threads', 'profiles', 'retrieved']

In [22]:
# One row per thread, built from the 'fields' entry of each thread record
thread_fields = [thread['fields'] for thread in export['threads']]
thread_df = pandas.DataFrame(thread_fields)
thread_df.head(2)

Unnamed: 0,document,doi_field,profile,published,subject,topic_field
0,,,17,2015-01-14T05:55:24.808Z,How should we construct a catalog of drug indi...,"Bioinformatics,Natural Language Processing,Ind..."
1,,,17,2015-01-16T00:46:28.770Z,Suggestions for additional information types?,"Bioinformatics,Chemoinformatics,Databases"


In [24]:
# Distinct values found in the threads' doi_field column
set(thread_df['doi_field'])

{'', '10.1093/nar/gkv1075', None}

In [5]:
# One row per comment, built from the 'fields' entry of each comment record
comment_fields = [comment['fields'] for comment in export['comments']]
comment_df = pandas.DataFrame(comment_fields)
comment_df.head(2)

Unnamed: 0,body_html,body_md,profile,published,thread
0,<p>We are looking to construct a catalog of <a...,We are looking to construct a catalog of [indi...,17,2015-01-14T05:55:24.832Z,21
1,<p>Are there any types of nodes or edges that ...,Are there any types of nodes or edges that you...,17,2015-01-16T00:46:28.796Z,22


In [6]:
# One row per note, built from the 'fields' entry of each note record
note_fields = [note['fields'] for note in export['notes']]
note_df = pandas.DataFrame(note_fields)
note_df.head(2)

Unnamed: 0,added,body_html,body_md,comment,profile
0,2015-01-23T05:05:37.375Z,"<p>I don't see the ""attached reference"". Can y...","I don't see the ""attached reference"". Can you ...",38,17
1,2015-01-23T19:27:36.905Z,<p>I'm talking about the reference you provide...,"I'm talking about the reference you provided, ...",38,23


In [7]:
# Markdown bodies of every comment and every note, as plain Python lists
comments = comment_df['body_md'].tolist()
notes = note_df['body_md'].tolist()

# Analyzing discussions

## Extract citations in comments and notes

In [8]:
# A citation is a bracketed group whose first token starts with "@10.",
# e.g. "[@10.1000/abc @10.1000/def]"; one group may hold several DOIs.
pattern = re.compile(r'\[(@10\..+?)\]')

# Collect each space-separated "@10."-prefixed token from every bracketed
# group, dropping the leading "@"; tokens without that prefix are ignored.
cited = {
    token.lstrip('@')
    for text in comments + notes
    for group in pattern.findall(text)
    for token in group.split(' ')
    if token.startswith('@10.')
}

# Deduplicated DOIs in sorted order
dois = sorted(cited)

In [9]:
# Example DOIs: the first five entries of the sorted DOI list
dois[:5]

['10.1001/jama.1994.03510380059038',
 '10.1001/jama.271.14.1103',
 '10.1002/0471142905.hg1011s57',
 '10.1002/14651858.CD003256.pub2',
 '10.1002/asi.20438']

In [10]:
# Number of distinct DOIs cited in discussions (comments and notes)
len(dois)

248

In [11]:
# DOI registrants: the part of each DOI before the first "/", with its
# first three characters (the "10." prefix) removed, deduplicated and sorted
registrants = sorted({doi.partition('/')[0][3:] for doi in dois})
len(registrants)

39

In [12]:
# Example DOI registrants: the first five registrant codes in sorted order
registrants[:5]

['1001', '1002', '1007', '1016', '1021']

## Participation

In [13]:
# Number of commenters: distinct values of the comments' profile column
comment_df['profile'].nunique()

40

In [14]:
# Number of comments (entries in the list of comment bodies)
len(comments)

403

In [15]:
# Number of notes (entries in the list of note bodies)
len(notes)

133

## Number of characters

In [16]:
# Total length of all comment and note bodies; each "\r\n" line ending is
# collapsed to "\n" first so it counts as a single character.
body_lengths = [len(text.replace('\r\n', '\n')) for text in comments + notes]
characters = sum(body_lengths)
print('{:,} characters in discussions'.format(characters))

662,501 characters in discussions


## Word count

In [32]:
# Count lowercased, whitespace-delimited tokens across all comments and notes.
# Single-character tokens are skipped, except for "i".
# (`collections` is imported with the other modules in the imports cell.)
lowered_tokens = (
    token.lower()
    for text in comments + notes
    for token in text.split()
)
words = collections.Counter(
    token for token in lowered_tokens
    if len(token) != 1 or token == 'i'
)

In [42]:
# Total token count, also expressed as a number of academic articles.
# 6000 words per article is the conversion factor this notebook assumes.
WORDS_PER_ARTICLE = 6000
total_words = sum(words.values())
article_equivalents = total_words / WORDS_PER_ARTICLE
print('{:,} words in discussions equating to ~{:.2f} academic articles'.format(
    total_words, article_equivalents))

79,781 words in discussions equating to ~13.30 academic articles


In [36]:
# Five most frequent tokens with their counts
words.most_common(5)

[('the', 3929), ('to', 2064), ('of', 1893), ('and', 1483), ('for', 1178)]