# Tweets from bioRxiv and medRxiv

Find and display events from a single DOI prefix in a specified period of time, and find the most tweeted DOIs.

In [63]:
import sys
sys.path.insert(0, '..')

import pandas # data analysis library
import json
import datetime
import mrced2 # module to run event data queries
import os # some file manipulations
import math # some number manipulations
import altair.vegalite.v3 as alt # some data visualizations
from IPython.display import Markdown as md # some markdown manipulations
from datetime import datetime, timedelta # some date manipulations

In [64]:
email = "martin@front-matter.io"
prefix = "10.1101"
start_date = (datetime.today() - timedelta(7)).strftime('%Y-%m-%d')
end_date = datetime.today().strftime('%Y-%m-%d')

In [65]:
ed = mrced2.eventData(email = email)
ed.buildQuery({'obj-id.prefix' : prefix, 'source': 'twitter', 'rows': 0,'from-occurred-date' : start_date, 'until-occurred-date' : end_date})

https://api.eventdata.crossref.org/v1/events?mailto=Anonymous&obj-id.prefix=10.1101&source=twitter&rows=0&from-occurred-date=2021-04-25&until-occurred-date=2021-05-02


In [66]:
ed.runQuery(retry = 5)

Event Data query started...
API query complete  200
output file written to 1101/tweets.json


In [67]:
pages = math.ceil(ed.events.getHits() / 1000)

13542 events found


In [68]:
email = "martin@front-matter.io"
prefix = "10.1101"
start_date = (datetime.today() - timedelta(7)).strftime('%Y-%m-%d')
end_date = datetime.today().strftime('%Y-%m-%d')

# find the all result pages for the search
ed = mrced2.eventData(email = email)
ed.getAllPages(pages, {'rows': 1000, 'obj-id.prefix' : prefix, 'source': 'twitter', 'from-occurred-date' : start_date, 'until-occurred-date' : end_date}, fileprefix = '1101/tweets_') 

https://api.eventdata.crossref.org/v1/events?mailto=Anonymous&rows=1000&obj-id.prefix=10.1101&source=twitter&from-occurred-date=2021-04-25&until-occurred-date=2021-05-02
Event Data query started...
API query complete  200
output file written to 1101/tweets_0000.json
https://api.eventdata.crossref.org/v1/events?mailto=Anonymous&cursor=18547a97-afa6-4587-9a8a-3ba3844a96ae&rows=1000&obj-id.prefix=10.1101&source=twitter&from-occurred-date=2021-04-25&until-occurred-date=2021-05-02
Event Data query started...
API query complete  200
output file written to 1101/tweets_0001.json
https://api.eventdata.crossref.org/v1/events?mailto=Anonymous&cursor=c1ee360a-d543-4fa8-ba7c-6a209dc59cf7&rows=1000&obj-id.prefix=10.1101&source=twitter&from-occurred-date=2021-04-25&until-occurred-date=2021-05-02
Event Data query started...
API query complete  200
output file written to 1101/tweets_0002.json
https://api.eventdata.crossref.org/v1/events?mailto=Anonymous&cursor=685cfa3c-ca3f-49ba-a48b-bda4e4599626&rows=

Initialisation to look at the properties of the results.

In [69]:
jd1 = mrced2.eventRecord() # instance of a class to interpret the events
files = os.listdir('1101') # get all the filenames

jd1.mergeJsons(files, folder = '1101') # load the json event data from multiple files

failed to load .DS_Store
failed to load .gitkeep
failed to load tweets_sorted.json
failed to load preprint_tweets.csv
output file written to 1101/tweets.json


In [70]:
js = json.load(open("1101/tweets.json"))
df=pandas.json_normalize(js, record_path = ['message', 'events'])
gdf = df.groupby(['obj_id']).size().reset_index(name='count').sort_values('count', ascending=False)
cdf = gdf[gdf['count'] >= 10]

In [71]:
email = "martin@front-matter.io"
rest = mrced2.restApi(email = email)

data = []

for index, row in cdf.iterrows():
    rest.runQuery(row)
    if rest.work is not None:
        data.append(rest.work)
    
tdf = pandas.DataFrame(data, columns=['doi','tweets','archive','subject-area','covid','title','authors','abstract','posted'])
tdf.to_csv('1101/preprint_tweets.csv')
    
tdf.head(50)

REST API query started for 10.1101/2021.04.27.441510...
REST API query complete  200
REST API query started for 10.1101/2021.02.27.433180...
REST API query complete  200
REST API query started for 10.1101/2021.01.29.21250653...
REST API query complete  200
REST API query started for 10.1101/2021.04.21.440862...
REST API query complete  200
REST API query started for 10.1101/2021.04.10.21255248...
REST API query complete  200
REST API query started for 10.1101/2021.04.26.441389...
REST API query complete  200
REST API query started for 10.1101/2021.04.21.21255873...
REST API query complete  200
REST API query started for 10.1101/2021.02.17.21251946...
REST API query complete  200
REST API query started for 10.1101/2021.04.06.21254882...
REST API query complete  200
REST API query started for 10.1101/2020.08.21.20179671...
REST API query complete  200
REST API query started for 10.1101/2021.01.28.428665...
REST API query complete  200
REST API query started for 10.1101/2020.10.29.2022042

Unnamed: 0,doi,tweets,archive,subject-area,covid,title,authors,abstract,posted
0,10.1101/2021.04.27.441510,1661,bioRxiv,Microbiology,True,Aerosol Exposure of Cynomolgus Macaques to SAR...,"[{'given': 'Sandra L.', 'family': 'Bixler', 's...",<jats:title>Abstract</jats:title><jats:p>The e...,2021-04-27
1,10.1101/2021.02.27.433180,640,bioRxiv,Immunology,True,Negligible impact of SARS-CoV-2 variants on CD...,"[{'given': 'Alison', 'family': 'Tarke', 'seque...",<jats:title>SUMMARY</jats:title>\n ...,2021-03-01
2,10.1101/2021.01.29.21250653,615,medRxiv,Allergy and Immunology,True,Robust spike antibody responses and increased ...,"[{'given': 'Florian', 'family': 'Krammer', 'se...",<jats:title>Abstract</jats:title><jats:p>An im...,2021-02-01
3,10.1101/2021.04.21.440862,566,bioRxiv,Immunology,True,Rapid induction of antigen-specific CD4+ T cel...,[{'ORCID': 'http://orcid.org/0000-0002-0180-27...,<jats:title>Summary</jats:title><jats:p>The SA...,2021-04-22
4,10.1101/2021.04.10.21255248,471,medRxiv,Infectious Diseases (except HIV/AIDS),True,Retrospective cohort study of Ivermectin as a ...,"[{'given': 'Jose', 'family': 'Morgenstern', 's...",<jats:p>This observational and retrospective c...,2021-04-17
5,10.1101/2021.04.26.441389,327,bioRxiv,Microbiology,False,Antiviral defense via nucleotide depletion in ...,[{'ORCID': 'http://orcid.org/0000-0003-3113-67...,<jats:p>DNA viruses and retroviruses need to c...,2021-04-26
6,10.1101/2021.04.21.21255873,262,medRxiv,Infectious Diseases (except HIV/AIDS),True,Real World Effectiveness of COVID-19 mRNA Vacc...,[{'ORCID': 'http://orcid.org/0000-0002-3464-21...,<jats:title>Abstract</jats:title><jats:sec><ja...,2021-04-23
7,10.1101/2021.02.17.21251946,198,medRxiv,Infectious Diseases (except HIV/AIDS),True,Estimating the Failure Risk of Quarantine Syst...,[{'ORCID': 'http://orcid.org/0000-0002-4427-73...,<jats:p><jats:bold>Objectives:</jats:bold> To ...,2021-02-19
8,10.1101/2021.04.06.21254882,193,medRxiv,Epidemiology,True,Evidence for increased breakthrough rates of S...,"[{'given': 'Talia', 'family': 'Kustin', 'seque...",<jats:title>Summary</jats:title><jats:p>The SA...,2021-04-09
9,10.1101/2020.08.21.20179671,168,medRxiv,Epidemiology,True,Evidence for treatment with estradiol for wome...,[{'ORCID': 'http://orcid.org/0000-0002-1979-38...,<jats:title>Abstract</jats:title><jats:sec><ja...,2020-08-24


### Tweets of bioRxiv and medRxiv preprints

In [None]:
num_rows = tdf['archive'].count()
num_biorxiv = tdf['archive'].value_counts(ascending=True)[0]
num_medrxiv = tdf['archive'].value_counts(ascending=True)[1]
num_covid = tdf['covid'].value_counts(ascending=True)[1]
max_count = tdf['tweets'].max()

md('{} preprints (including {} covering SARS-CoV-2, {} from bioRxiv and {} from medRxiv) had at least 10 tweets (maximum {} tweets) in the 7 days before {}.'.format(num_rows, num_covid, num_biorxiv, num_medrxiv, max_count, datetime.today().strftime('%B %-d, %Y')))