# Tweets from bioRxiv and medRxiv

Find and display events from a single DOI prefix in a specified period of time, and find the most tweeted DOIs.

In [35]:
import sys
sys.path.insert(0, '..')

import pandas # data analysis library
import json
import datetime
import mrced2 # module to run event data queries
import os # some file manipulations
import math # some number manipulations
import altair.vegalite.v3 as alt # some data visualizations
from IPython.display import Markdown as md # some markdown manipulations
from datetime import datetime, timedelta # some date manipulations

In [36]:
# Query configuration shared by the cells below.
email = "martin@front-matter.io"  # contact address handed to the mrced2 API clients
prefix = "10.1101"  # DOI prefix whose tweet events are queried

# Take a single timestamp so that both ends of the window derive from the same
# instant; two separate today() calls could straddle midnight and yield a
# window that is one day too long.
window_days = 7
today = datetime.today()
start_date = (today - timedelta(window_days)).strftime('%Y-%m-%d')
end_date = today.strftime('%Y-%m-%d')

In [37]:
# Build the initial query with rows set to 0; the number of hits for this
# query is read in a later cell to work out how many pages to download.
ed = mrced2.eventData(email=email)
count_query = {
    'obj-id.prefix': prefix,
    'source': 'twitter',
    'rows': 0,
    'from-occurred-date': start_date,
    'until-occurred-date': end_date,
}
ed.buildQuery(count_query)

https://api.eventdata.crossref.org/v1/events?mailto=Anonymous&obj-id.prefix=10.1101&source=twitter&rows=0&from-occurred-date=2021-04-27&until-occurred-date=2021-05-04


In [38]:
# Run the query built in the previous cell.
# NOTE(review): `retry` is presumably the maximum number of attempts — confirm in mrced2.
ed.runQuery(retry=5)

Event Data query started...
API query complete  200
output file written to 1101/tweets.json


In [39]:
# Number of 1000-row result pages needed to cover every hit of the query above
# (the paged download in the next cell requests 1000 rows per page).
hit_count = ed.events.getHits()
pages = math.ceil(hit_count / 1000)

13250 events found


In [40]:
# Fetch all result pages for the search.
# `email`, `prefix`, `start_date` and `end_date` come from the configuration
# cell near the top of the notebook. They are deliberately not recomputed here,
# so the paged download uses exactly the date window for which `pages` was
# calculated.
ed = mrced2.eventData(email=email)
page_query = {
    'rows': 1000,
    'obj-id.prefix': prefix,
    'source': 'twitter',
    'from-occurred-date': start_date,
    'until-occurred-date': end_date,
}
ed.getAllPages(pages, page_query, fileprefix='1101/tweets_')

https://api.eventdata.crossref.org/v1/events?mailto=Anonymous&rows=1000&obj-id.prefix=10.1101&source=twitter&from-occurred-date=2021-04-27&until-occurred-date=2021-05-04
Event Data query started...
API query complete  200
output file written to 1101/tweets_0000.json
https://api.eventdata.crossref.org/v1/events?mailto=Anonymous&cursor=bb266559-fac9-4fdf-8cca-8ec696c22d9b&rows=1000&obj-id.prefix=10.1101&source=twitter&from-occurred-date=2021-04-27&until-occurred-date=2021-05-04
Event Data query started...
API query complete  200
output file written to 1101/tweets_0001.json
https://api.eventdata.crossref.org/v1/events?mailto=Anonymous&cursor=fe149a6e-1473-4a2d-9f83-54beed28f9b2&rows=1000&obj-id.prefix=10.1101&source=twitter&from-occurred-date=2021-04-27&until-occurred-date=2021-05-04
Event Data query started...
API query complete  200
output file written to 1101/tweets_0002.json
https://api.eventdata.crossref.org/v1/events?mailto=Anonymous&cursor=94b200a0-06a1-4405-8310-ce40d1c4b766&rows=

Initialisation: merge the downloaded result pages into a single file so that the properties of the results can be examined.

In [41]:
jd1 = mrced2.eventRecord()  # instance of a class to interpret the events

# Only hand the downloaded result pages (tweets_<digits>.json) to the merger.
# A bare os.listdir() also returns .DS_Store, .gitkeep, the CSV exports,
# tweets_sorted.json and tweets.json (the file the merge itself writes), none of
# which are result pages. sorted() makes the merge order deterministic, because
# os.listdir() returns entries in arbitrary order.
# NOTE(review): pages left over from an earlier run with more pages would still
# match this pattern — clear the folder beforehand if that can happen.
page_prefix, page_suffix = 'tweets_', '.json'
files = sorted(
    name
    for name in os.listdir('1101')
    if name.startswith(page_prefix)
    and name.endswith(page_suffix)
    and name[len(page_prefix):-len(page_suffix)].isdigit()
)

jd1.mergeJsons(files, folder='1101')  # load the json event data from multiple files

failed to load .DS_Store
failed to load preprint_tweets_no_abstract.csv
failed to load .gitkeep
failed to load tweets_sorted.json
failed to load preprint_tweets.csv
output file written to 1101/tweets.json


In [44]:
# Load the merged events; the context manager closes the file handle instead of
# leaving it open until garbage collection.
with open("1101/tweets.json") as merged_file:
    js = json.load(merged_file)

# Flatten the records found under message -> events into a DataFrame.
df = pandas.json_normalize(js, record_path=['message', 'events'])

# Number of events per obj_id, most frequent first.
gdf = (
    df.groupby(['obj_id'])
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)

# Keep only the obj_ids with at least 5 events.
cdf = gdf[gdf['count'] >= 5]

In [45]:
# `email` and `start_date` come from the configuration cell near the top of the
# notebook.
rest = mrced2.restApi(email=email)

# Query the REST API once per frequently tweeted DOI and keep the works whose
# "posted" value is on or after start_date (both sides are compared as
# '%Y-%m-%d'-style strings, as in the original check).
data = []

for index, row in cdf.iterrows():
    rest.runQuery(row)
    work = rest.work
    if work is not None and work["posted"] >= start_date:
        data.append(work)

tdf = pandas.DataFrame(data, columns=['doi','tweets','archive','subject-area','covid','title','authors','abstract','posted'])
tdf.to_csv('1101/preprint_tweets.csv')

# remove abstract column so that CSV size is smaller and it can be rendered nicely in GitHub
# (use the `columns=` keyword: passing the axis positionally, as in
# drop('abstract', 1), is rejected by newer pandas releases)
tdf = tdf.drop(columns='abstract')
tdf.to_csv('1101/preprint_tweets_no_abstract.csv')

tdf.head(50)

REST API query started for 10.1101/2021.04.27.441510...
REST API query complete  200
REST API query started for 10.1101/2021.04.10.21255248...
REST API query complete  200
REST API query started for 10.1101/2021.01.29.21250653...
REST API query complete  200
REST API query started for 10.1101/2021.02.27.433180...
REST API query complete  200
REST API query started for 10.1101/2021.04.26.441389...
REST API query complete  200
REST API query started for 10.1101/2021.02.17.21251946...
REST API query complete  200
REST API query started for 10.1101/2021.04.21.440862...
REST API query complete  200
REST API query started for 10.1101/2021.04.26.21256152...
REST API query complete  200
REST API query started for 10.1101/2021.01.28.428665...
REST API query complete  200
REST API query started for 10.1101/2020.04.30.066209...
REST API query complete  200
REST API query started for 10.1101/2021.02.27.21252597...
REST API query complete  200
REST API query started for 10.1101/2020.10.29.20220426.

Unnamed: 0,doi,tweets,archive,subject-area,covid,title,authors,posted
0,10.1101/2021.04.27.441510,1699,bioRxiv,Microbiology,True,Aerosol Exposure of Cynomolgus Macaques to SAR...,"[{'given': 'Sandra L.', 'family': 'Bixler', 's...",2021-04-27
1,10.1101/2021.04.26.21256152,170,medRxiv,Infectious Diseases (except HIV/AIDS),False,Effectiveness of portable air filtration on re...,"[{'given': 'Jung Hoon', 'family': 'Lee', 'sequ...",2021-04-28
2,10.1101/2021.04.26.21256136,131,medRxiv,Infectious Diseases (except HIV/AIDS),True,Detecting in-school transmission of SARS-CoV-2...,[{'ORCID': 'http://orcid.org/0000-0001-8011-00...,2021-04-28
3,10.1101/2021.04.28.441626,121,bioRxiv,Evolutionary Biology,False,A re-analysis of the data in Sharkey et al.'s ...,[{'ORCID': 'http://orcid.org/0000-0002-4452-28...,2021-04-29
4,10.1101/2021.04.29.442030,103,bioRxiv,Evolutionary Biology,False,Genomic consequences of domestication of the S...,"[{'given': 'Young', 'family': 'Kwon', 'sequenc...",2021-04-30
5,10.1101/2021.04.27.441464,100,bioRxiv,Biophysics,False,Stochastic dynamics of single molecules across...,[{'ORCID': 'http://orcid.org/0000-0002-2738-86...,2021-04-27
6,10.1101/2021.04.26.441518,75,bioRxiv,Immunology,True,Nucleocapsid vaccine elicits spike-independent...,[{'ORCID': 'http://orcid.org/0000-0001-7515-22...,2021-04-27
7,10.1101/2021.04.27.441667,74,bioRxiv,Evolutionary Biology,False,Enhanced lipogenesis through Pparγ helps cavef...,"[{'given': 'Shaolei', 'family': 'Xiong', 'sequ...",2021-04-28
8,10.1101/2021.04.30.441581,70,bioRxiv,Microbiology,False,RelA-SpoT Homologue toxins pyrophosphorylate t...,"[{'given': 'Tatsuaki', 'family': 'Kurata', 'se...",2021-04-30
9,10.1101/2021.04.26.21255788,68,medRxiv,Public and Global Health,True,Ethnic differences in SARS-CoV-2 vaccine hesit...,[{'ORCID': 'http://orcid.org/0000-0003-4915-07...,2021-04-28


### Tweets of bioRxiv and medRxiv preprints

In [51]:
# Summary figures for the sentence rendered below.
num_rows = tdf['archive'].count()

# Count by value rather than by position in value_counts(): the positional form
# labelled whichever archive had fewer rows as "bioRxiv", and raised when only
# one archive (or no SARS-CoV-2 preprint) was present.
num_covid = tdf['covid'].eq(True).sum()
num_biorxiv = tdf['archive'].eq('bioRxiv').sum()
num_medrxiv = tdf['archive'].eq('medRxiv').sum()

# `end_date` is the value from the configuration cell, i.e. the end of the
# window that was actually queried; it is not recomputed here.
max_count = tdf['tweets'].max()

md('{} preprints (including {} covering SARS-CoV-2, {} from bioRxiv and {} from medRxiv) published in the last 7 days before {} had been tweeted at least 5 times (maximum {}).'.format(num_rows, num_covid, num_biorxiv, num_medrxiv, end_date, max_count))

127 preprints (including 32 covering SARS-CoV-2, 28 from bioRxiv and 99 from medRxiv) published in the last 7 days before 2021-05-04 had been tweeted at least 5 times (maximum 1699).