# Tweets from Research Square

Find and display events from the Research Square DOI prefix 10.21203 from the last 7 days, and find the most tweeted preprints.

In [50]:
import sys
sys.path.append('../')
sys.path.insert(0, '..')

import pandas as pd # data analysis library
import json
import mrced2 # module to run event data queries
import os # some file manipulations
import math # some number manipulations
import altair.vegalite.v3 as alt # some data visualizations
from IPython.display import Markdown as md # some markdown manipulations
from datetime import datetime, date, timedelta # some date manipulations

In [51]:
mailto = "info@front-matter.io"
prefix = "10.21203"
start_date = (date.today() - timedelta(days = 7)).strftime('%Y-%m-%d')
end_date = date.today()

In [52]:
ed = mrced2.eventData(mailto = mailto, outputFile = '21203/tweets.json')
ed.buildQuery({'obj-id.prefix' : prefix, 'source': 'twitter', 'rows': 0,'from-occurred-date' : start_date, 'until-occurred-date' : end_date})

https://api.eventdata.crossref.org/v1/events?mailto=info@front-matter.io&obj-id.prefix=10.21203&source=twitter&rows=0&from-occurred-date=2021-05-10&until-occurred-date=2021-05-17


In [53]:
ed.runQuery(retry = 5)

Event Data query started...
API query complete  200
output file written to 21203/tweets.json


In [54]:
pages = math.ceil(ed.events.getHits() / 1000)

2164 events found


In [56]:
mailto = "info@front-matter.io"
prefix = "10.21203"
start_date = date.today() - timedelta(days = 7)
end_date = date.today()

# find the all result pages for the search
ed = mrced2.eventData(mailto = mailto, outputFile = '21203/tweets.json')
ed.getAllPages(pages, {'rows': 1000, 'obj-id.prefix' : prefix, 'source': 'twitter', 'from-occurred-date' : start_date, 'until-occurred-date' : end_date}, fileprefix = '21203/tweets_') 

https://api.eventdata.crossref.org/v1/events?mailto=info@front-matter.io&rows=1000&obj-id.prefix=10.21203&source=twitter&from-occurred-date=2021-05-10&until-occurred-date=2021-05-17
Event Data query started...
API query complete  200
output file written to 21203/tweets_0000.json
https://api.eventdata.crossref.org/v1/events?mailto=info@front-matter.io&cursor=71451718-3d5d-40b9-bf29-aa1cf376fe01&rows=1000&obj-id.prefix=10.21203&source=twitter&from-occurred-date=2021-05-10&until-occurred-date=2021-05-17
Event Data query started...
API query complete  200
output file written to 21203/tweets_0001.json
https://api.eventdata.crossref.org/v1/events?mailto=info@front-matter.io&cursor=1e2c8f37-152f-4c78-bfa2-7c5971f5e38c&rows=1000&obj-id.prefix=10.21203&source=twitter&from-occurred-date=2021-05-10&until-occurred-date=2021-05-17
Event Data query started...
API query complete  200
output file written to 21203/tweets_0002.json


Initialisation to look at the properties of the results.

In [57]:
jd1 = mrced2.eventRecord(outputFile = '21203/tweets.json') # instance of a class to interpret the events
files = os.listdir('21203') # get all the filenames

jd1.mergeJsons(files, folder = '21203') # load the json event data from multiple files

output file written to 1101/tweets.json


In [58]:
js = json.load(open("21203/tweets.json"))
df = pd.json_normalize(js, record_path = ['message', 'events'])
gdf = df.groupby(['obj_id']).size().reset_index(name='count').sort_values('count', ascending=False)
cdf = gdf[gdf['count'] >= 5]

KeyError: 'obj_id'

In [None]:
email = "info@front-matter.io"
rest = mrced2.restApi(email = email)

data = []

for index, row in cdf.iterrows():
    rest.runQuery(row)
    if rest.work is not None and date.fromisoformat(rest.work["posted"]) >= start_date:
        data.append(rest.work)
    
tdf = pd.DataFrame(data, columns=['doi','tweets','archive','subject-area','covid','title','authors','abstract','posted'])
tdf.to_csv('1101/preprint_tweets_' + str(date.today().strftime('%Y-%m-%d')) + '.csv')

tdf.head(50)

### Tweets of Research Square preprints

In [None]:
num_rows = tdf['archive'].count()
num_covid = tdf['covid'].value_counts(ascending=True)[1]
end_date = date.today().strftime('%Y-%m-%d')
max_count = tdf['tweets'].max()

md('{} preprints (including {} covering SARS-CoV-2, from Research Square published in the last 7 days before {} had been tweeted at least 5 times (maximum {}).'.format(num_rows, num_covid, end_date, max_count))