#  Computational Philosophy and History of Science
## Analysing scientific publications: the case of Remdesivir research
## Gerd Graßhoff

In [35]:
import pandas as pd
import spacy
from spacy import displacy
from spacy.matcher import Matcher

In [36]:
import warnings
warnings.filterwarnings('ignore')

In [37]:
# Import English library
nlp = spacy.load('en_core_web_lg')

In [38]:
dfRem=pd.read_json("dfRemdesivierResearchObjects.json")[["title","abstract"]]

In [39]:
len(dfRem)  

2180

In [40]:
dfRem.head(2)

Unnamed: 0,title,abstract
0,Evaluation of the efficacy and safety of intra...,BACKGROUND: Coronavirus disease 2019 (COVID-19...
1,Role of adjunctive treatment strategies in COV...,The coronavirus disease (COVID-19) pandemic ha...


In [41]:
dfRem=dfRem.dropna(subset=["abstract"])
len(dfRem)

2135

# Filter

Initially just for those abstracts containing the expression "We show"

In [42]:
suche="We show"
def filter(zeile):
    cond=zeile["abstract"].str.contains(suche,na=False)
    return(cond)

In [43]:
df1=dfRem[filter]
df1

Unnamed: 0,title,abstract
341,In silico detection of SARS-CoV-2 specific B-c...,Abstract Rapid generation of diagnostics is pa...
537,Structure based drug discovery by virtual scre...,&lt;p&gt;Background&lt;/p&gt;&lt;p&gt;The curr...
649,Defining the Pandemic at the State Level: Sequ...,"In December of 2019, a novel coronavirus, SARS..."
685,COVID-19 research in Wikipedia,Wikipedia is one of the main sources of free k...
1542,SARS-CoV-2 and SARS-CoV differ in their cell t...,SARS-CoV-2 is a novel coronavirus currently ca...


# Pattern for propositional attitudes 

## "we [VERB]"

https://spacy.io/usage/rule-based-matching

https://explosion.ai/demos/matcher?

In [44]:
dfRem["doc"]=dfRem["abstract"].apply(nlp)

In [45]:
exemplar=nlp(dfRem.iloc[2].abstract)

In [46]:
for i,sent in enumerate(exemplar.sents):
    print(f" Satz {i}= {sent}") 

 Satz 0= Middle East respiratory syndrome coronavirus (MERS-CoV) is the causative agent of a severe respiratory disease associated with more than 2468 human infections and over 851 deaths in 27 countries since 2012.
 Satz 1= There are no approved treatments for MERS-CoV infection although a combination of lopinavir, ritonavir and interferon beta (LPV/RTV-IFNb) is currently being evaluated in humans in the Kingdom of Saudi Arabia.
 Satz 2= Here, we show that remdesivir (RDV) and IFNb have superior antiviral activity to LPV and RTV in vitro.
 Satz 3= In mice, both prophylactic and therapeutic RDV improve pulmonary function and reduce lung viral loads and severe lung pathology.
 Satz 4= In contrast, prophylactic LPV/RTV-IFNb slightly reduces viral loads without impacting other disease parameters.
 Satz 5= Therapeutic LPV/RTV-IFNb improves pulmonary function but does not reduce virus replication or severe lung pathology.
 Satz 6= Thus, we provide in vivo evidence of the potential for RDV t

In [47]:
sentences=[s for s in exemplar.sents]
s=sentences[6].text
doc=nlp(s)
print(doc)

Thus, we provide in vivo evidence of the potential for RDV to treat MERS-CoV infections.


In [48]:
displacy.render(doc, style="ent")

In [49]:
displacy.render(doc, style="dep")

In [50]:
matcher = Matcher(nlp.vocab)

In [51]:
pattern = [{"LOWER":"we"},{'POS': 'VERB'}]
matcher.add("matching",None,pattern)

In [52]:
matches=matcher(doc)
for _, start, end in matches:
    span = doc[start:end]  # The matched span
    print(span.text)
    span2=doc[end-1]
    print(f"das Verb ist: {span2.lemma_}")

we provide
das Verb ist: provide


In [53]:
listverbs=[]
def actionverbs(a):
    abstract=nlp(a)
    matches=matcher(abstract)
    for _,start,end in matches:
        verb=abstract[end-1]
        listverbs.append(verb.lemma_)

In [54]:
actionverbs(a)
listverbs

['show', 'provide']

In [55]:
dfRem["abstract"].apply(actionverbs)
len(listverbs)

1484

In [56]:
from collections import Counter
counts=Counter(listverbs)
print(sorted(counts))

['abstract', 'adapt', 'address', 'administer', 'administrate', 'advance', 'advise', 'advocate', 'aim', 'allow', 'analyse', 'analyze', 'anticipate', 'apply', 'appreciate', 'approach', 'argue', 'ask', 'assay', 'assess', 'assume', 'await', 'base', 'believe', 'build', 'calculate', 'call', 'can', 'carry', 'challenge', 'characterise', 'characterize', 'choose', 'clone', 'co', 'collect', 'combine', 'come', 'comment', 'compare', 'conclude', 'conduct', 'confirm', 'consider', 'construct', 'contend', 'could', 'create', 'decide', 'declare', 'define', 'demonstrate', 'denote', 'deploy', 'describe', 'design', 'detail', 'detect', 'determine', 'develop', 'disagree', 'discover', 'discuss', 'divide', 'dock', 'download', 'draw', 'ease', 'elucidate', 'emphasize', 'employ', 'encourage', 'engineer', 'enrol', 'enter', 'establish', 'estimate', 'evaluate', 'evidence', 'examine', 'exclude', 'expect', 'explain', 'explore', 'express', 'extend', 'extract', 'find', 'focus', 'follow', 'form', 'found10', 'frame', 'gene

# Propositional research attitudes

- GOAL (intention, goal):  
    - aim, seek, study, undertake, investigate, intend
- DISCOV: 
    - show, propose, obtain, report, observe, create, demonstrate, detect, scrutinize, provide

## training data

- prepare matching function for sentences, output in format JSONL

In [57]:
verbtags={"GOAL":
       ["aim","seek","study","undertake","investigate","intend"],
       "DISCOV":
        ["show","provide","notice","report"]
         }

In [58]:
def actionverb(abstract):
    matches=matcher(abstract)
    listDisc=[]
    for _,start,end in matches:
        verb=abstract[end-1].text
        verblem=abstract[end-1].lemma_
        for ent,verbs in verbtags.items():
            if verblem in verbs:
                dict_disc=(start+1,end,ent)
                listDisc.append(dict_disc)
    return(listDisc)

In [59]:
dfRem["discNER"]=dfRem["doc"].apply(actionverb)  

In [60]:
dfRem.head(19)

Unnamed: 0,title,abstract,doc,discNER
0,Evaluation of the efficacy and safety of intra...,BACKGROUND: Coronavirus disease 2019 (COVID-19...,"(BACKGROUND, :, Coronavirus, disease, 2019, (,...",[]
1,Role of adjunctive treatment strategies in COV...,The coronavirus disease (COVID-19) pandemic ha...,"(The, coronavirus, disease, (, COVID-19, ), pa...",[]
2,Comparative therapeutic efficacy of remdesivir...,Middle East respiratory syndrome coronavirus (...,"(Middle, East, respiratory, syndrome, coronavi...","[(79, 80, DISCOV), (161, 162, DISCOV)]"
3,Therapeutic strategies for critically ill pati...,Since the 2019 novel coronavirus disease (COVI...,"(Since, the, 2019, novel, coronavirus, disease...",[]
4,Battling COVID-19: using old weapons for a new...,Coronavirus disease-19 (COVID-19) has reached ...,"(Coronavirus, disease-19, (, COVID-19, ), has,...",[]
5,"The origin, transmission and clinical therapie...","An acute respiratory disease, caused by a nove...","(An, acute, respiratory, disease, ,, caused, b...",[]
6,SARS-CoV-2 RNA polymerase as target for antivi...,A new human coronavirus named SARS-CoV-2 was i...,"(A, new, human, coronavirus, named, SARS, -, C...",[]
7,Current regulatory approaches for accessing po...,This commentary aims to elaborate challenges i...,"(This, commentary, aims, to, elaborate, challe...",[]
8,SARS-CoV-2 and COVID-19: The most important re...,Coronavirus disease 2019 (COVID-19) caused by ...,"(Coronavirus, disease, 2019, (, COVID-19, ), c...",[]
9,Emerging SARS-CoV-2 mutation hot spots include...,BACKGROUND: SARS-CoV-2 is a RNA coronavirus re...,"(BACKGROUND, :, SARS, -, CoV-2, is, a, RNA, co...","[(232, 233, DISCOV)]"


# Causal hypothesis

In [61]:
for sent in sentences:
    print(f"*****\nfor {sent}")
    chunks=list(sent.noun_chunks)
    print("noun chunks:",chunks)

*****
for Middle East respiratory syndrome coronavirus (MERS-CoV) is the causative agent of a severe respiratory disease associated with more than 2468 human infections and over 851 deaths in 27 countries since 2012.
noun chunks: [Middle East respiratory syndrome coronavirus, MERS-CoV, the causative agent, a severe respiratory disease, more than 2468 human infections, 851 deaths, 27 countries]
*****
for There are no approved treatments for MERS-CoV infection although a combination of lopinavir, ritonavir and interferon beta (LPV/RTV-IFNb) is currently being evaluated in humans in the Kingdom of Saudi Arabia.
noun chunks: [no approved treatments, MERS-CoV infection, a combination, lopinavir, ritonavir, interferon beta, (LPV/RTV-IFNb, humans, the Kingdom, Saudi Arabia]
*****
for Here, we show that remdesivir (RDV) and IFNb have superior antiviral activity to LPV and RTV in vitro.
noun chunks: [we, remdesivir, (RDV, IFNb, superior antiviral activity, LPV, RTV]
*****
for In mice, both prop

In [62]:
# similarity to key expression. Here "MERS infection"

In [63]:
key_term="MERS infection"
for sent in sentences:
    chunks=list(sent.noun_chunks)
    for noun in chunks:
        sim=noun.similarity(nlp(key_term))
        print(f"{noun} #### ähnlich: {sim}")

Middle East respiratory syndrome coronavirus #### ähnlich: 0.48299568738055154
MERS-CoV #### ähnlich: 0.4550794638568333
the causative agent #### ähnlich: 0.31675976176508136
a severe respiratory disease #### ähnlich: 0.4990230397309604
more than 2468 human infections #### ähnlich: 0.41296312851298905
851 deaths #### ähnlich: 0.15388795007959338
27 countries #### ähnlich: 0.13847852632223484
no approved treatments #### ähnlich: 0.22902666198807248
MERS-CoV infection #### ähnlich: 0.7989696703497365
a combination #### ähnlich: 0.13322061421844744
lopinavir #### ähnlich: 0.0
ritonavir #### ähnlich: 0.0
interferon beta #### ähnlich: 0.32932173709274914
(LPV/RTV-IFNb #### ähnlich: -0.026341724138412275
humans #### ähnlich: 0.2824499231186059
the Kingdom #### ähnlich: 0.08676200994420641
Saudi Arabia #### ähnlich: 0.041518648812961745
we #### ähnlich: 0.0802233813756872
remdesivir #### ähnlich: 0.0
(RDV #### ähnlich: 0.08677537997556373
IFNb #### ähnlich: 0.0
superior antiviral activity ###

In [64]:
def similar_nchunks(doc,keyterm,cumlist,threshhold=0.5):
    key=nlp(keyterm)
    chunks=list(doc.noun_chunks)
    for noun in chunks:
        simval=noun.similarity(key)
        if simval>threshhold:
            cumlist.append((noun,simval))
    return(cumlist)

In [65]:
similar_nchunks(exemplar,"agent",[])

[(the causative agent, 0.7939071980284536)]

In [66]:
cumlist=[]
dfRem["doc"].apply(lambda x:similar_nchunks(x,"agent",cumlist))
cumlist

[(the causative agent, 0.7939071980284536),
 (adjunctive agents, 0.6236452101155399),
 (the antiviral properties, 0.526081203569923),
 (an antiviral agent, 0.7801708209980864),
 (antiviral agent, 0.7826824688710227),
 (investigational agents, 0.645840763294751),
 (an antiviral agent, 0.7801708209980864),
 (antiviral agents, 0.6711018678024206),
 (COVID-19 and anticancer agents, 0.6757634646731178),
 (other antiviral agents, 0.6656073448165073),
 (These agents, 0.664216703793501),
 (iron chelating agents, 0.6223720438296962),
 (the causative agent, 0.7939071980284536),
 (the causative agent, 0.7939071980284536),
 (each anti-coronavirus agents, 0.5591464172720746),
 (other anti-coronavirus agents, 0.5588481449742129),
 (the causative agent, 0.7939071980284536),
 (effective antiviral agents, 0.6482813972279166),
 (therapeutic agents, 0.6809085885018754),
 (commonly utilized antiinflammatory agents, 0.6073953910142822),
 (Other agents, 0.7175655901380203),
 (concomitant QTc-prolonging agen

In [67]:
cumlist=[]
dfRem["doc"].apply(lambda x:similar_nchunks(x,"evidence",cumlist,threshhold=0.7))
cumlist

[(the clinical evidence, 0.8400976398363346),
 (vivo evidence, 0.7396345690464189),
 (increasingly evidence, 0.8400862495116782),
 (These findings, 0.7233815540617284),
 (Some evidence, 0.9008331003170855),
 (new evidence, 0.8342938365455351),
 (the evidence, 0.8920556548929212),
 (Emerging evidence, 0.8307155818971217),
 (latest clinical and epidemiological evidence, 0.7752953267572479),
 (findings, 0.7154176601291731),
 (These findings, 0.7233815540617284),
 (The current evidence, 0.7855865444171195),
 (evidence, 1.0),
 (No evidence, 0.8855322381207609),
 (preliminary evidence, 0.8874567960660089),
 (evidence, 1.0),
 (the recent clinical findings, 0.7023658315184546),
 (evidence, 1.0),
 (evidence, 1.0),
 (evidence, 1.0),
 (clinical evidence, 0.8462254888715148),
 (specific findings, 0.7020126788076866),
 (Currently available evidence, 0.7102334475438903),
 (largely favorable findings, 0.706427681966554),
 (These findings, 0.7233815540617284),
 (FINDINGS, 0.7154176601291731),
 (these 

# THANK YOU