# Libraries, data

In [1]:
import pandas as pd
import spacy
from spacy import displacy
from spacy.matcher import Matcher
import re

In [2]:
# Load spaCy's small English pipeline; the cells below call it as `nlp`.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Read the research-object records, then keep only the two text columns.
dfRem = pd.read_json("dfRemdesivierResearchObjects.json")
dfRem = dfRem[["title", "abstract"]]

In [4]:
# Number of records loaded.
dfRem.shape[0]

2180

In [5]:
# Preview the first two records.
dfRem.iloc[:2]

Unnamed: 0,title,abstract
0,Evaluation of the efficacy and safety of intra...,BACKGROUND: Coronavirus disease 2019 (COVID-19...
1,Role of adjunctive treatment strategies in COV...,The coronavirus disease (COVID-19) pandemic ha...


In [6]:
# Discard records that have no abstract, then show how many remain.
dfRem = dfRem.dropna(axis="index", subset=["abstract"])
dfRem.shape[0]

2135

# Filter

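Initially, keep only those abstracts that contain the exact phrase "We show". The match is case-sensitive, so an abstract that only contains "we show" mid-sentence is not selected.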

In [7]:
# Phrase to look for in the abstracts.
suche = "We show"


def filter(zeile):
    """Return a boolean mask marking the abstracts that contain `suche`.

    `zeile` must offer an "abstract" column with the pandas `.str` accessor,
    i.e. a DataFrame (this is what `dfRem[filter]` passes in), not a single
    row. Missing abstracts are reported as False.

    Note: this name shadows Python's built-in `filter` in the notebook.
    """
    return zeile["abstract"].str.contains(suche, na=False)

In [8]:
# Keep the rows for which the mask returned by `filter` is True, then display them.
df1 = dfRem.loc[filter]
df1

Unnamed: 0,title,abstract
341,In silico detection of SARS-CoV-2 specific B-c...,Abstract Rapid generation of diagnostics is pa...
537,Structure based drug discovery by virtual scre...,&lt;p&gt;Background&lt;/p&gt;&lt;p&gt;The curr...
649,Defining the Pandemic at the State Level: Sequ...,"In December of 2019, a novel coronavirus, SARS..."
685,COVID-19 research in Wikipedia,Wikipedia is one of the main sources of free k...
1542,SARS-CoV-2 and SARS-CoV differ in their cell t...,SARS-CoV-2 is a novel coronavirus currently ca...


# Pattern for propositional attitudes "we [VERB]"

https://spacy.io/usage/rule-based-matching

https://explosion.ai/demos/matcher?

In [9]:
# Parse all abstracts in one batched pass instead of one nlp() call per row.
# nlp.pipe yields the Docs in input order, so the list lines up row-for-row
# with dfRem.
dfRem["doc"] = list(nlp.pipe(dfRem["abstract"]))

In [10]:
# Take the abstract of the third record as a worked example and display it.
df = dfRem
a = df["abstract"].iloc[2]
a

'Middle East respiratory syndrome coronavirus (MERS-CoV) is the causative agent of a severe respiratory disease associated with more than 2468 human infections and over 851 deaths in 27 countries since 2012. There are no approved treatments for MERS-CoV infection although a combination of lopinavir, ritonavir and interferon beta (LPV/RTV-IFNb) is currently being evaluated in humans in the Kingdom of Saudi Arabia. Here, we show that remdesivir (RDV) and IFNb have superior antiviral activity to LPV and RTV in vitro. In mice, both prophylactic and therapeutic RDV improve pulmonary function and reduce lung viral loads and severe lung pathology. In contrast, prophylactic LPV/RTV-IFNb slightly reduces viral loads without impacting other disease parameters. Therapeutic LPV/RTV-IFNb improves pulmonary function but does not reduce virus replication or severe lung pathology. Thus, we provide in vivo evidence of the potential for RDV to treat MERS-CoV infections.'

In [11]:
# Run the example abstract through the spaCy pipeline.
abstract = nlp(a)

In [12]:
# List the sentences spaCy found in the example abstract, numbered from 0.
for nr, satz in enumerate(abstract.sents):
    print(f" Satz {nr}= {satz}")

 Satz 0= Middle East respiratory syndrome coronavirus (MERS-CoV) is the causative agent of a severe respiratory disease associated with more than 2468 human infections and over 851 deaths in 27 countries since 2012.
 Satz 1= There are no approved treatments for MERS-CoV infection although a combination of lopinavir, ritonavir and interferon beta (LPV/RTV-IFNb) is currently being evaluated in humans in the Kingdom of Saudi Arabia.
 Satz 2= Here, we show that remdesivir (RDV) and IFNb have superior antiviral activity to LPV and RTV in vitro.
 Satz 3= In mice, both prophylactic and therapeutic RDV improve pulmonary function and reduce lung viral loads and severe lung pathology.
 Satz 4= In contrast, prophylactic LPV/RTV-IFNb slightly reduces viral loads without impacting other disease parameters.
 Satz 5= Therapeutic LPV/RTV-IFNb improves pulmonary function but does not reduce virus replication or severe lung pathology.
 Satz 6= Thus, we provide in vivo evidence of the potential for RDV t

In [13]:
# Materialise the sentence spans, re-parse the one at index 6 as its own Doc
# and display its text.
sentences = list(abstract.sents)
s = sentences[6].text
doc = nlp(s)
s

'Thus, we provide in vivo evidence of the potential for RDV to treat MERS-CoV infections.'

In [14]:
# Visualise the named entities of the single-sentence Doc.
displacy.render(doc, style="ent")

In [15]:
# Rule-based token matcher sharing the vocabulary of the loaded pipeline.
matcher = Matcher(nlp.vocab)

In [16]:
# Two-token pattern: the word "we" (any casing) directly followed by a verb.
pattern = [{"LOWER": "we"}, {"POS": "VERB"}]
# Register the pattern as a list of patterns. The older positional form
# add(key, on_match, *patterns) is rejected by spaCy v3; the list form is
# accepted from spaCy v2.2.2 onwards.
matcher.add("matching", [pattern])

In [17]:
# For every match print the matched text, then the lemma of its last token.
matches = matcher(doc)
for _, start, end in matches:
    print(doc[start:end].text)
    verb_token = doc[end - 1]  # last token of the matched span
    print(f"das Verb ist: {verb_token.lemma_}")

we provide
das Verb ist: provide


In [18]:
# Collects the verb lemmas found by actionverbs() across calls.
listverbs = []


def actionverbs(a):
    """Parse the text `a` and record the lemma of the last token of each match.

    The text is run through `nlp`, the module-level `matcher` is applied to the
    result, and for every match the lemma of its final token is appended to the
    module-level `listverbs`. Nothing is returned.
    """
    parsed = nlp(a)
    listverbs.extend(parsed[stop - 1].lemma_ for _, _, stop in matcher(parsed))

In [19]:
# Try the collector on the example abstract; its lemmas are appended to
# listverbs, which is then displayed.
actionverbs(a)
listverbs

['show', 'provide']

In [None]:
# Start from an empty list: the single-abstract demo call above already added
# its verbs, and re-running this cell would otherwise count every abstract again.
listverbs.clear()
dfRem["abstract"].apply(actionverbs)
len(listverbs)

In [None]:
from collections import Counter

# Tally the collected lemmas, then print the distinct lemmas in alphabetical
# order (iterating a Counter yields its keys only, not the frequencies).
counts = Counter(listverbs)
print(sorted(counts.keys()))

# Propositional research attitudes

- GOAL (intention, goal):  
    - aim, seek, study, undertake, investigate, intend
- DISCOV: 
    - show, propose, obtain, report, observe, create, demonstrate, detect, scrutinize, provide

## training data

- prepare matching function for sentences, output in format JSONL

In [None]:
# Maps each propositional-attitude label to the verb lemmas that signal it.
# Every lemma belongs to exactly one label: "intend" is kept under GOAL only,
# so a single "we intend" match is no longer tagged as both GOAL and DISCOV.
# NOTE(review): the prose list for DISCOV also names propose, obtain, observe,
# create, demonstrate, detect and scrutinize, and does not name "notice" --
# confirm which set is intended.
verbtags = {
    "GOAL": ["aim", "seek", "study", "undertake", "investigate", "intend"],
    "DISCOV": ["show", "provide", "notice", "report"],
}

In [None]:
# for testing: work on a one-column DataFrame copy. The double brackets keep a
# DataFrame; single brackets would give a Series, on which selecting the "doc"
# column raises a KeyError.
dfRem1 = dfRem[["doc"]].copy()

In [None]:
def actionverb(abstract):
    """Label the matched verbs of a parsed abstract with their category.

    `abstract` is passed to the module-level `matcher` and indexed by token
    position. For every match, the lemma of the match's last token is looked up
    in each verb list of the module-level `verbtags`.

    Returns a list of `(start + 1, end, label)` tuples, one for each
    (match, label) pair whose verb list contains the lemma; `start` and `end`
    are the match boundaries reported by the matcher. The list is empty when
    nothing qualifies.
    """
    return [
        (start + 1, end, label)
        for _, start, end in matcher(abstract)
        for label, lemmas in verbtags.items()
        if abstract[end - 1].lemma_ in lemmas
    ]

In [None]:
# Attach, per parsed abstract, the list of tuples returned by actionverb.
dfRem1["discNER"] = dfRem1["doc"].map(actionverb)

In [None]:
# Inspect the object the "discNER" values were assigned to; dfRem itself never
# receives that column.
dfRem1.head(19)