# Project Corona
## Philosophy and History of Science with Computational Means
Prof Dr. Gerd Graßhoff
### Filter dataframe, NLTP

# Libraries, data

In [1]:
import pandas as pd
import spacy
from spacy import displacy
from spacy.matcher import Matcher
import re

In [2]:
# Import English library
nlp = spacy.load('en_core_web_sm')

In [27]:
dfRem=pd.read_json("dfRemdesivierResearchObjects.json")[["title","abstract"]]

In [28]:
len(dfRem)

2180

In [29]:
dfRem.head(2)

Unnamed: 0,title,abstract
0,Evaluation of the efficacy and safety of intra...,BACKGROUND: Coronavirus disease 2019 (COVID-19...
1,Role of adjunctive treatment strategies in COV...,The coronavirus disease (COVID-19) pandemic ha...


In [30]:
dfRem=dfRem.dropna(subset=["abstract"])
len(dfRem)

2135

# Filter

## selecting rows

In [31]:
suche="We show"
def filter(zeile):
    cond=zeile["abstract"].str.contains(suche,na=False)
    return(cond)

In [32]:
df=dfRem[filter]
df

Unnamed: 0,title,abstract
341,In silico detection of SARS-CoV-2 specific B-c...,Abstract Rapid generation of diagnostics is pa...
537,Structure based drug discovery by virtual scre...,&lt;p&gt;Background&lt;/p&gt;&lt;p&gt;The curr...
649,Defining the Pandemic at the State Level: Sequ...,"In December of 2019, a novel coronavirus, SARS..."
685,COVID-19 research in Wikipedia,Wikipedia is one of the main sources of free k...
1542,SARS-CoV-2 and SARS-CoV differ in their cell t...,SARS-CoV-2 is a novel coronavirus currently ca...


# Pattern maching

In [33]:
a=df.iloc[0].abstract
a

'Abstract Rapid generation of diagnostics is paramount to understand epidemiology and to control the spread of emerging infectious diseases such as COVID-19. Computational methods to predict serodiagnostic epitopes that are specific for the pathogen could help accelerate the development of new diagnostics. A systematic survey of 27 SARS-CoV-2 proteins was conducted to assess whether existing B-cell epitope prediction methods, combined with comprehensive mining of sequence databases and structural data, could predict whether a particular protein would be suitable for serodiagnosis. Nine of the predictions were validated with recombinant SARS-CoV-2 proteins in the ELISA format using plasma and sera from patients with SARS-CoV-2 infection, and a further 11 predictions were compared to the recent literature. Results appeared to be in agreement with 12 of the predictions, in disagreement with 3, while a further 5 were deemed inconclusive. We showed that two of our top five candidates, the N

In [34]:
abstract=nlp(a)

In [35]:
for i,sent in enumerate(abstract.sents):
    print(f" Satz {i}= {sent}")

 Satz 0= Abstract Rapid generation of diagnostics is paramount to understand epidemiology and to control the spread of emerging infectious diseases such as COVID-19.
 Satz 1= Computational methods to predict serodiagnostic epitopes that are specific for the pathogen could help accelerate the development of new diagnostics.
 Satz 2= A systematic survey of 27 SARS-CoV-2 proteins was conducted to assess whether existing B-cell epitope prediction methods, combined with comprehensive mining of sequence databases and structural data, could predict whether a particular protein would be suitable for serodiagnosis.
 Satz 3= Nine of the predictions were validated with recombinant SARS-CoV-2 proteins in the ELISA format using plasma and sera from patients with SARS-CoV-2 infection, and a further 11 predictions were compared to the recent literature.
 Satz 4= Results appeared to be in agreement with 12 of the predictions, in disagreement with 3, while a further 5 were deemed inconclusive.
 Satz 5=

In [36]:
sentences=[s for s in abstract.sents]
s=sentences[5].text
s

'We showed that two of our top five candidates, the N-terminal fragment of the nucleoprotein and the receptor-binding domain of the spike protein, have the highest sensitivity and specificity and signal-to-noise ratio for detecting COVID-19 sera/plasma by ELISA.'

In [37]:
# https://spacy.io/usage/linguistic-features
doc=nlp(s)
for token in doc:
        print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

We -PRON- PRON PRP nsubj Xx True True
showed show VERB VBD ROOT xxxx True False
that that SCONJ IN mark xxxx True True
two two NUM CD nsubj xxx True True
of of ADP IN prep xx True True
our -PRON- DET PRP$ poss xxx True True
top top ADJ JJ amod xxx True True
five five NUM CD nummod xxxx True True
candidates candidate NOUN NNS pobj xxxx True False
, , PUNCT , punct , False False
the the DET DT det xxx True True
N n ADJ JJ compound X True False
- - PUNCT HYPH punct - False False
terminal terminal NOUN NN compound xxxx True False
fragment fragment NOUN NN appos xxxx True False
of of ADP IN prep xx True True
the the DET DT det xxx True True
nucleoprotein nucleoprotein PROPN NNP pobj xxxx True False
and and CCONJ CC cc xxx True True
the the DET DT det xxx True True
receptor receptor NOUN NN npadvmod xxxx True False
- - PUNCT HYPH punct - False False
binding bind VERB VBG amod xxxx True False
domain domain NOUN NN conj xxxx True False
of of ADP IN prep xx True True
the the DET DT det xxx True

In [38]:
displacy.render(doc, style="ent")

# Pattern matching
https://spacy.io/usage/rule-based-matching

https://explosion.ai/demos/matcher?

In [39]:
matcher = Matcher(nlp.vocab)

In [40]:
pattern = [{"LOWER":"we"},{'POS': 'VERB'}]
matcher.add("matching",None,pattern)

In [41]:
matches=matcher(doc)

In [42]:
print(matches)

[(1221037237276548748, 0, 2)]


In [43]:
for match_id, start, end in matches:
    span = doc[start:end]  # The matched span
    print(span.text)

We showed


In [44]:
pattern1 = [{"POS":"PRN"},{"LOWER":"showed"}]
matcher.add("m2",None,pattern1)

In [45]:
matches2=matcher(doc)

In [46]:
for _, start, end in matches:
    span = doc[start:end]  # The matched span
    print(span.text)
    span2=doc[end-1]
    print(f"das Verb ist: {span2.lemma_}")

We showed
das Verb ist: show


In [47]:
matches=matcher(abstract)
for _,start,end in matches:
    verb=abstract[end-1]
    print(verb)

showed


In [54]:
listverb=[]
def actionverb(a):
    abstract=nlp(a)
    matches=matcher(abstract)
    for _,start,end in matches:
        verb=abstract[end-1]
        listverb.append(verb.lemma_)

In [55]:
actionverb(a)
listverb

['show']

In [57]:
dfRem["abstract"].apply(actionverb)
len(listverb)

2973

In [58]:
from collections import Counter
Counter(listverb)

Counter({'show': 67,
         'provide': 72,
         'summarize': 56,
         'highlight': 20,
         'analyze': 34,
         'characterize': 8,
         'notice': 8,
         'identify': 110,
         'describe': 114,
         'report': 158,
         'review': 92,
         'reflect': 2,
         'will': 80,
         'discuss': 88,
         'propose': 76,
         'elucidate': 2,
         'present': 102,
         'prioritize': 4,
         'conclude': 18,
         'hypothesize': 18,
         'develop': 26,
         'predict': 24,
         'advocate': 6,
         'focus': 24,
         'estimate': 14,
         'run': 2,
         'suggest': 40,
         'aim': 132,
         'search': 54,
         'could': 10,
         'must': 10,
         'emphasize': 2,
         'hope': 18,
         'study': 14,
         'evaluate': 38,
         'follow': 8,
         'assess': 18,
         'illustrate': 2,
         'believe': 38,
         'use': 76,
         'screen': 22,
         'find': 130,
       