# 3 Verb Filter

## 0 Import Libraries

In [79]:
import pandas as pd
import spacy
from spacy.matcher import Matcher
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

In [80]:
# Load spaCy's large English pipeline ("en_core_web_lg") with the "ner" component disabled: nlp
nlp = spacy.load("en_core_web_lg", disable=["ner"])

## 1 Load Dataframe

In [81]:
# Load the sentences dataframe from '../data/HCQ_sentences.json'
# (the path is relative to the notebook's working directory): sentences_df
sentences_df = pd.read_json("../data/HCQ_sentences.json")

In [82]:
# Show the first 3 rows of 'sentences_df'
sentences_df.head(3)

Unnamed: 0,sentence_id,title,sentence
0,pub.1126880632-0,COVID-19 and what pediatric rheumatologists sh...,"On March 11th, 2020 the World Health Organizat..."
1,pub.1126880632-1,COVID-19 and what pediatric rheumatologists sh...,"The infection, transmitted by 2019 novel coron..."
2,pub.1126880632-2,COVID-19 and what pediatric rheumatologists sh...,"Italy was early and severely involved, with a ..."


## 2 Verb Filter: A first approach

### 2.1 An Initial Search Pattern

In [83]:
# Parse one example sentence of 'sentences_df' (Sentence ID: pub.1126655433-13) into a Doc object: doc
# This Doc is the running example for the POS/dependency inspection and the matcher tests below.
doc = nlp("These results do not support the use of HCQ in patients hospitalised for documented SARS-CoV-2-positive hypoxic pneumonia.")

In [84]:
# Print each token of 'doc' next to its coarse-grained Part-of-Speech (POS) tag
for token in doc:
    print(f"{token.text} {token.pos_}")

These DET
results NOUN
do AUX
not PART
support VERB
the DET
use NOUN
of ADP
HCQ PROPN
in ADP
patients NOUN
hospitalised VERB
for ADP
documented VERB
SARS PROPN
- PUNCT
CoV-2-positive NOUN
hypoxic ADJ
pneumonia NOUN
. PUNCT


In [85]:
# Collect the text of every token in 'doc' that is tagged as a verb: pos_verb
pos_verb = [token.text for token in doc if token.pos_ == "VERB"]

print(pos_verb)

['support', 'hospitalised', 'documented']


In [86]:
# Print each token of 'doc' with its POS tag and its dependency label (DEP)
for token in doc:
    print(f"{token.text} {token.pos_} {token.dep_}")

These DET det
results NOUN nsubj
do AUX aux
not PART neg
support VERB ROOT
the DET det
use NOUN dobj
of ADP prep
HCQ PROPN pobj
in ADP prep
patients NOUN pobj
hospitalised VERB acl
for ADP prep
documented VERB amod
SARS PROPN nmod
- PUNCT punct
CoV-2-positive NOUN nmod
hypoxic ADJ amod
pneumonia NOUN pobj
. PUNCT punct


In [87]:
# Collect the text of every token in 'doc' that is a verb AND the root of the sentence: pos_dep_verb
pos_dep_verb = [
    token.text
    for token in doc
    if token.pos_ == "VERB" and token.dep_ == "ROOT"
]

print(pos_dep_verb)

['support']


In [88]:
# Initial search pattern to find SUPPORT-verbs: 'initial_search_pattern'
# It is a single-token pattern: the token must be tagged VERB and be the ROOT of the dependency parse.
initial_search_pattern = [{"POS": "VERB", "DEP": "ROOT"}]

In [89]:
# Make a matcher to test 'initial_search_pattern' on 'doc': 'test_matcher'
# It shares the vocabulary of 'nlp'; 'validate=True' asks spaCy to validate patterns that are added to it.
test_matcher = Matcher(nlp.vocab, validate=True)

In [90]:
# Add pattern 'initial_search_pattern' to matcher 'test_matcher' under the key "TEST_ID".
# The patterns are passed as one list, the calling convention accepted since spaCy 2.2.2 and
# the only one accepted by spaCy 3; the legacy form add(key, None, pattern) fails on spaCy 3.
test_matcher.add("TEST_ID", [initial_search_pattern])

In [91]:
# Apply 'test_matcher' on 'doc' and print the text of every matched span
# Each match is unpacked into its ID and the start/end token indices used to slice 'doc'.
for match_id, start, end in test_matcher(doc):
    print(doc[start:end].text)

support


In [92]:
# An alternative way to code it: the tuple unpacking is written with explicit parentheses,
# the loop itself is the same as in the previous cell
for (match_id, start, end) in test_matcher(doc):
    print(doc[start:end].text)

support


**Detour: What does a match (=Matcher(Doc)) look like?**

In [93]:
# Apply 'test_matcher' on 'doc' and keep the returned matches: match
match = test_matcher(doc)

# Show 'match' (each item is a (match_id, start, end) tuple)
match

[(1961263444387358288, 4, 5)]

In [94]:
# Tell what type of object 'match' (the return value of the matcher call) is
type(match)

list

In [95]:
# Tell how many items (i.e. matches found in 'doc') are in 'match'
len(match)

1

In [96]:
# Show the first (and here only) item in list 'match'
match[0]

(1961263444387358288, 4, 5)

In [97]:
# Tell what type of object the first item in 'match' is
type(match[0])

tuple

In [98]:
# Slice 'doc' with the last two numbers of the tuple (start and end token index) and print the text
print(doc[4:5].text)     # start index = 4; end index = 5 (the end index is exclusive)

support


In [99]:
# Tell what type of object the slice 'doc[4:5]' is
type(doc[4:5])

spacy.tokens.span.Span

In [100]:
# Make a new Doc for which 'test_matcher' finds no match (counter-example): doc2
doc2 = nlp("COVID-19 is a global health threat.")

In [101]:
# Print text, POS tag and dependency label of each token in 'doc2' to show that
# no token is both a verb and the root of the sentence
for token in doc2:
    print(f"{token.text} {token.pos_} {token.dep_}")

COVID-19 PROPN nsubj
is AUX ROOT
a DET det
global ADJ amod
health NOUN compound
threat NOUN attr
. PUNCT punct


In [102]:
# Apply 'test_matcher' on 'doc2' and keep the returned matches: match2
match2 = test_matcher(doc2)

# Show 'match2' (an empty list means that nothing matched)
match2

[]

In [103]:
# Tell how many items (matches found in 'doc2') are in 'match2'
len(match2)

0

### 2.2 List of SUPPORT-verbs

In [104]:
# Parse every sentence in column 'sentence' of dataframe 'sentences_df' with 'nlp.pipe'
# and collect the resulting Doc objects in a list: doc_list
doc_list = list(nlp.pipe(sentences_df["sentence"].to_list()))

In [105]:
# Print only the first ten Docs of 'doc_list' to keep the output short
print(doc_list[:10])

[On March 11th, 2020 the World Health Organization declared COVID-19 a global pandemic., The infection, transmitted by 2019 novel coronavirus (2019-nCov), was first discovered in December 2019, in Wuhan, Hubei Province, and then rapidly spread worldwide., Italy was early and severely involved, with a critical spread of the infection and a very high number of victims., Person-to-person spread mainly occurs via respiratory droplets and contact., The median incubation period is 5 days., The spectrum of respiratory symptoms may range from mild to severe, strictly depending on the age of the patient and the underlying comorbidities., In children COVID-19 related disease is less frequent and less aggressive., In Italy 1% of positive cases are under 18 years of age, and no deaths have been recorded before 29 years of age., For patients affected by rheumatic disease, despite the concerns related to the imbalance of their immune response and the effect of immunosuppressive treatments, there are

In [106]:
# Make matcher 'initial_search_matcher' (same vocabulary as 'nlp', pattern validation switched on);
# it is applied to all Docs in 'doc_list' below
initial_search_matcher = Matcher(nlp.vocab, validate=True)

In [107]:
# Add pattern 'initial_search_pattern' to 'initial_search_matcher' under the key "INITIAL_SEARCH_ID".
# The patterns are passed as one list, the calling convention accepted since spaCy 2.2.2 and
# the only one accepted by spaCy 3; the legacy form add(key, None, pattern) fails on spaCy 3.
initial_search_matcher.add("INITIAL_SEARCH_ID", [initial_search_pattern])

In [108]:
# Define function 'match_verb_root()'
def match_verb_root(Doc_list, matcher=None):
    """Collect the lemmas of all spans that a matcher finds in a collection of Docs.

    Parameters
    ----------
    Doc_list : iterable
        Doc objects (sentences) to search.
    matcher : callable, optional
        Called with one Doc; must return an iterable of (match_id, start, end)
        tuples. Defaults to the notebook-level 'initial_search_matcher', which is
        the matcher this function used before the parameter existed.

    Returns
    -------
    list of str
        The unique lemmas of the matched spans, sorted alphabetically
        without regard to upper/lower case.
    """
    if matcher is None:
        matcher = initial_search_matcher     # Resolved at call time, not at definition time

    # For every match, slice the Doc with the start and end index and take the
    # basic form (lemma) of the token(s) in that slice; the set removes duplicates
    lemma = {
        parsed[start:end].lemma_
        for parsed in Doc_list
        for _, start, end in matcher(parsed)
    }

    # 'sorted()' accepts the set directly and returns a new list
    return sorted(lemma, key=str.lower)

In [109]:
# Apply 'match_verb_root()' to all Docs in 'doc_list' to get the sorted list of root-verb lemmas: root_verb_lemma
root_verb_lemma = match_verb_root(doc_list)

In [110]:
# Print all lemmas found; the SUPPORT-verbs are selected manually from this list
print(root_verb_lemma)

['add', 'adjust', 'affect', 'aim', 'allocate', 'analyze', 'appear', 'assess', 'assign', 'associate', 'base', 'become', 'bring', 'call', 'carry', 'cause', 'claim', 'comprise', 'confirm', 'consider', 'declare', 'demonstrate', 'diagnose', 'die', 'direct', 'discover', 'draw', 'drive', 'enrol', 'enter', 'evaluate', 'exclude', 'expect', 'experience', 'explore', 'face', 'find', 'focus', 'follow', 'force', 'help', 'hospitalise', 'identify', 'improve', 'include', 'increase', 'indicate', 'issue', 'know', 'launch', 'like', 'make', 'measure', 'need', 'observe', 'obtain', 'occur', 'perform', 'play', 'predispose', 'present', 'progress', 'provide', 'raise', 'range', 'receive', 'recommend', 'record', 'register', 'report', 'represent', 'require', 'result', 'reveal', 'review', 'seek', 'set', 'share', 'shorten', 'show', 'spread', 'suggest', 'support', 'transfer', 'treat', 'understand', 'use']


**Aid for selecting verbs manually**

In [111]:
# Make generator from 'root_verb_lemma': verb_generator
# Each 'next(verb_generator)' call in the following cell yields the next lemma to inspect.
verb_generator = (verb for verb in root_verb_lemma)

In [112]:
# Next item of 'verb_generator': verb_to_prove
# (re-run this cell to step through the lemmas; 'next()' raises StopIteration
# once every lemma in 'root_verb_lemma' has been shown)
verb_to_prove = next(verb_generator)

# Show current 'verb_to_prove'
print(f"verb_to_prove: {verb_to_prove}")

# Collect every Doc in 'doc_list' that has at least one 'initial_search_matcher' match
# whose lemma equals the current 'verb_to_prove': found_sentences
# - 'any()' stops at the first hit, so each Doc is added at most once and no
#   'not in found_sentences' scan is needed
# - the comprehension variable stays local to the comprehension, so the example Doc
#   bound to the global name 'doc' earlier in the notebook is not overwritten by this cell
found_sentences = [
    candidate
    for candidate in doc_list
    if any(
        candidate[start:end].lemma_ == verb_to_prove
        for _, start, end in initial_search_matcher(candidate)
    )
]

print(f"Sentences found with '{verb_to_prove}' in it: ")
print("----------------------------------------\n")

# Show each found Doc (sentence) with its respective index number
for sentence_number, sentence in enumerate(found_sentences):
    print(f"({sentence_number}) {sentence}")

verb_to_prove: add
Sentences found with 'add' in it: 
----------------------------------------

(0) Depending on their clinical presentation, azithromycin was added to the treatment.
(1) Azithromycin added to hydroxychloroquine


In [113]:
#selected_verbs = []

In [114]:
#selected_verbs.append(verb_to_prove)

In [115]:
#print(selected_verbs)

```python
selected_verbs = ['reveal', 'show', 'suggest', 'support']
```

4 SUPPORT-verbs:
* reveal
* show
* suggest
* support

### 2.3 A First Verb Filter

In [116]:
# Make matcher 'verb_matcher_1' (same vocabulary as 'nlp', pattern validation switched on);
# it becomes the first verb filter once the SUPPORT-verb pattern is added
verb_matcher_1 = Matcher(nlp.vocab, validate=True)

In [117]:
# Make search pattern for 'verb_matcher_1': support_verbs_pattern
# Single-token pattern: a VERB that is the ROOT of the sentence and whose lemma is
# one of the four manually selected SUPPORT-verbs.
support_verbs_pattern = [{"POS": "VERB", "DEP": "ROOT", "LEMMA": {"IN": ['reveal', 'show', 'suggest', 'support']}}]

In [118]:
# Add 'support_verbs_pattern' to 'verb_matcher_1' under the key "VERB_ID".
# The patterns are passed as one list, the calling convention accepted since spaCy 2.2.2 and
# the only one accepted by spaCy 3; the legacy form add(key, None, pattern) fails on spaCy 3.
verb_matcher_1.add("VERB_ID", [support_verbs_pattern])

In [119]:
# Keep only those Docs (sentences) of 'doc_list' for which 'verb_matcher_1'
# returns a non-empty list of matches: verb_filtered_sentences_1
verb_filtered_sentences_1 = [
    sentence_doc
    for sentence_doc in doc_list
    if verb_matcher_1(sentence_doc)
]

In [120]:
# Print every sentence that passed the first verb filter, prefixed with its running number
for sentence_number, sentence in enumerate(verb_filtered_sentences_1):
    print(f"({sentence_number}) {sentence}")

(0) This work was supported by the Emergent Projects of National Science and Technology (2020YFC0844500), National Natural Science Foundation of China (81970020, 81770025), National Key Research and Development Program of China (2016YFC0901104), Shanghai Municipal Key Clinical Specialty (shslczdzk02202, shslczdzk01103), National Innovative Research Team of High-level Local Universities in Shanghai, Shanghai Key Discipline for Respiratory Diseases (2017ZZ02014), National Major Scientific and Technological Special Project for Significant New Drugs Development (2017ZX09304007), Key Projects in the National Science and Technology Pillar Program during the Thirteenth Five-year Plan Period (2018ZX09206005-004, 2017ZX10202202-005-004, 2017ZX10203201-008).
(1) This re-analysis reveals severe limitations in the methodology of this study, including ambiguous inclusion/exclusion of participant data and inconsistent analysis techniques, and yielded nonsignificant differences between control and tr

In [121]:
# Show the number of items in 'doc_list' and in 'verb_filtered_sentences_1' using 'len()'
print(f"Number of sentences in 'doc_list': {len(doc_list)}")
print(f"Number of sentences in 'verb_filtered_sentences_1': {len(verb_filtered_sentences_1)}")

Number of sentences in 'doc_list': 216
Number of sentences in 'verb_filtered_sentences_1': 10


### 2.4 Evaluation I

**For the following steps see:** 
Youtube: ["Intro to NLP with spaCy (3): Detecting programming languages | Episode 3: Evaluation"](https://youtu.be/4V0JDdohxAk)

#### 2.4.1 Measuring Device

In [122]:
# Load the manually labeled sentences from '../labeling/sentences_labeled.xlsx' as a dataframe: measuring_df_1
# (column 'label' is the reference the predictions are compared with below)
measuring_df_1 = pd.read_excel("../labeling/sentences_labeled.xlsx")

# Show first 5 rows of 'measuring_df_1'
measuring_df_1.head()

Unnamed: 0,label,sentence
0,0,"On March 11th, 2020 the World Health Organizat..."
1,0,"The infection, transmitted by 2019 novel coron..."
2,0,"Italy was early and severely involved, with a ..."
3,0,Person-to-person spread mainly occurs via resp...
4,0,The median incubation period is 5 days.


In [123]:
# Add column 'prediction': True if 'verb_matcher_1' finds at least one match in the
# parsed sentence of column 'sentence', otherwise False
measuring_df_1 = measuring_df_1.assign(
    prediction=lambda df: [
        bool(verb_matcher_1(parsed)) for parsed in nlp.pipe(df["sentence"])
    ]
)

In [124]:
# Show rows no. 71-75 of 'measuring_df_1' (the end position 76 of the slice is exclusive)
measuring_df_1.iloc[71:76]

Unnamed: 0,label,sentence,prediction
71,0,Proposals should be directed to the correspond...,False
72,0,Recent publications have brought attention to ...,False
73,0,The scientific community should consider this ...,False
74,0,A recent open-label study claimed that hydroxy...,False
75,1,This re-analysis reveals severe limitations in...,True


In [125]:
# Replace the booleans ('True'/'False') in column 'prediction' by the integers '1'/'0' (dtype int8)
measuring_df_1 = measuring_df_1.assign(
    prediction=measuring_df_1["prediction"].astype(np.int8)
)

In [126]:
# Show rows no. 71-75 of 'measuring_df_1' again to check the converted 'prediction' column
measuring_df_1.iloc[71:76]

Unnamed: 0,label,sentence,prediction
71,0,Proposals should be directed to the correspond...,0
72,0,Recent publications have brought attention to ...,0
73,0,The scientific community should consider this ...,0
74,0,A recent open-label study claimed that hydroxy...,0
75,1,This re-analysis reveals severe limitations in...,1


In [127]:
# Put the columns into the order sentence - label - prediction
measuring_df_1 = measuring_df_1.loc[:, ["sentence", "label", "prediction"]]

In [128]:
# Show rows no. 71-75 of 'measuring_df_1' with the rearranged columns
measuring_df_1.iloc[71:76]

Unnamed: 0,sentence,label,prediction
71,Proposals should be directed to the correspond...,0,0
72,Recent publications have brought attention to ...,0,0
73,The scientific community should consider this ...,0,0
74,A recent open-label study claimed that hydroxy...,0,0
75,This re-analysis reveals severe limitations in...,1,1


#### 2.4.2 Metrics

**Confusion Matrix**

In [129]:
# Confusion matrix for the first verb filter: manual labels (true values) vs. matcher predictions
confusion_matrix(
    y_true=measuring_df_1["label"],
    y_pred=measuring_df_1["prediction"],
)

array([[200,   2],
       [  6,   8]], dtype=int64)

**Classification Report**

In [130]:
# Classification report for the first verb filter (manual labels vs. matcher predictions), printed as text
report_1 = classification_report(
    y_true=measuring_df_1["label"],
    y_pred=measuring_df_1["prediction"],
)
print(report_1)

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       202
           1       0.80      0.57      0.67        14

    accuracy                           0.96       216
   macro avg       0.89      0.78      0.82       216
weighted avg       0.96      0.96      0.96       216



#### 2.4.3 Mistakes

**False Positives**

In [131]:
# Select the sentences that the matcher predicted (prediction == 1) although they are labeled 0: false_positives
# Both conditions are combined into one boolean mask, so the frame is indexed once instead of
# chaining two '.loc' calls and applying a full-length mask to the already filtered frame.
false_positives = measuring_df_1.loc[
    (measuring_df_1["prediction"] == 1) & (measuring_df_1["label"] == 0),
    ["sentence"],
]

In [132]:
# Show the false positives (index = row number in 'measuring_df_1')
false_positives

Unnamed: 0,sentence
60,This work was supported by the Emergent Projec...
143,Current international society recommendations ...


In [133]:
# Tell what type of object 'false_positives' is (selecting with the column list ["sentence"] keeps it a dataframe)
type(false_positives)

pandas.core.frame.DataFrame

In [134]:
# Print the full text of every false positive, prefixed with its running number
for sentence_number, sentence in enumerate(false_positives["sentence"]):
    print("(" + str(sentence_number) + ") " + str(sentence))

(0) This work was supported by the Emergent Projects of National Science and Technology (2020YFC0844500), National Natural Science Foundation of China (81970020, 81770025), National Key Research and Development Program of China (2016YFC0901104), Shanghai Municipal Key Clinical Specialty (shslczdzk02202, shslczdzk01103), National Innovative Research Team of High-level Local Universities in Shanghai, Shanghai Key Discipline for Respiratory Diseases (2017ZZ02014), National Major Scientific and Technological Special Project for Significant New Drugs Development (2017ZX09304007), Key Projects in the National Science and Technology Pillar Program during the Thirteenth Five-year Plan Period (2018ZX09206005-004, 2017ZX10202202-005-004, 2017ZX10203201-008).
(1) Current international society recommendations suggest that patients with rheumatic diseases on immunosuppressive therapy should not stop glucocorticoids during COVID-19 infection, although minimum possible doses may be used.


**False Negatives**

In [135]:
# Select the sentences that the matcher missed (prediction == 0) although they are labeled 1: false_negatives
# Both conditions are combined into one boolean mask, so the frame is indexed once instead of
# chaining two '.loc' calls and applying a full-length mask to the already filtered frame.
false_negatives = measuring_df_1.loc[
    (measuring_df_1["prediction"] == 0) & (measuring_df_1["label"] == 1),
    ["sentence"],
]

In [136]:
# Print the full text of every false negative, prefixed with its running number
for sentence_number, sentence in enumerate(false_negatives["sentence"]):
    print("(" + str(sentence_number) + ") " + str(sentence))

(0) We were unable to confirm a benefit of hydroxychloroquine or chloroquine, when used alone or with a macrolide, on in-hospital outcomes for COVID-19.
(1) The administration of HCQ did not result in a significantly higher negative conversion probability than SOC alone in patients mainly hospitalized with persistent mild to moderate COVID-19.
(2) Adverse events were higher in HCQ recipients than in HCQ non-recipients.
(3) Although mortality rate was not significantly different between cases and controls, frequency of adverse effects was substantially higher in HCQ regimen group.
(4) Use of these drugs is premature and potentially harmful
(5) Among patients with COVID-19, the use of HCQ could significantly shorten TTCR and promote the absorption of pneumonia.


## 3 Verb Filter: Refinements

In [137]:
# Make a Doc of the sentence we want the verb filter to find
# (it is the first of the false negatives printed above): doc3
doc3 = nlp("We were unable to confirm a benefit of hydroxychloroquine or chloroquine, when used alone or with a macrolide, on in-hospital outcomes for COVID-19.")

In [138]:
# Print text, POS tag, dependency label and lemma of each token in 'doc3'
for token in doc3:
    print(f"{token.text} {token.pos_} {token.dep_} {token.lemma_}")

We PRON nsubj -PRON-
were AUX ROOT be
unable ADJ acomp unable
to PART aux to
confirm VERB xcomp confirm
a DET det a
benefit NOUN dobj benefit
of ADP prep of
hydroxychloroquine NOUN pobj hydroxychloroquine
or CCONJ cc or
chloroquine NOUN conj chloroquine
, PUNCT punct ,
when ADV advmod when
used VERB advcl use
alone ADV advmod alone
or CCONJ cc or
with ADP conj with
a DET det a
macrolide NOUN pobj macrolide
, PUNCT punct ,
on ADP prep on
in ADP nmod in
- PUNCT punct -
hospital NOUN pobj hospital
outcomes NOUN pobj outcome
for ADP prep for
COVID-19 PROPN pobj COVID-19
. PUNCT punct .


In [139]:
# Look up the description of the dependency label 'xcomp' (the label of 'confirm' in 'doc3')
spacy.explain("xcomp")

'open clausal complement'

In [140]:
# Make additional search pattern: confirm_pattern
# Single-token pattern: a VERB with dependency label 'xcomp' whose lemma is 'confirm'.
confirm_pattern = [{"POS": "VERB", "DEP": "xcomp", "LEMMA": "confirm"}]

In [141]:
# Make a new verb matcher 'verb_matcher_2' (same vocabulary as 'nlp', pattern validation switched on);
# 'verb_matcher_1' is left untouched
verb_matcher_2 = Matcher(nlp.vocab, validate=True)

In [142]:
# Repeat search pattern for 'verb_matcher_2': support_verbs_pattern
# (same pattern as the one defined for 'verb_matcher_1': root VERB with one of the four SUPPORT-verb lemmas)
support_verbs_pattern = [{"POS": "VERB", "DEP": "ROOT", "LEMMA": {"IN": ['reveal', 'show', 'suggest', 'support']}}]

In [143]:
# Add 'support_verbs_pattern' and 'confirm_pattern' to 'verb_matcher_2' under the shared key "VERB_ID";
# a Doc matches as soon as one of the two patterns matches.
# The patterns are passed as one list, the calling convention accepted since spaCy 2.2.2 and
# the only one accepted by spaCy 3; the legacy form add(key, None, *patterns) fails on spaCy 3.
verb_matcher_2.add("VERB_ID", [support_verbs_pattern, confirm_pattern])

In [144]:
# Keep only those Docs (sentences) of 'doc_list' for which 'verb_matcher_2'
# returns a non-empty list of matches: verb_filtered_sentences_2
verb_filtered_sentences_2 = [
    sentence_doc
    for sentence_doc in doc_list
    if verb_matcher_2(sentence_doc)
]

In [145]:
# Print every sentence that passed the refined verb filter, prefixed with its running number
for sentence_number, sentence in enumerate(verb_filtered_sentences_2):
    print(f"({sentence_number}) {sentence}")

(0) We were unable to confirm a benefit of hydroxychloroquine or chloroquine, when used alone or with a macrolide, on in-hospital outcomes for COVID-19.
(1) This work was supported by the Emergent Projects of National Science and Technology (2020YFC0844500), National Natural Science Foundation of China (81970020, 81770025), National Key Research and Development Program of China (2016YFC0901104), Shanghai Municipal Key Clinical Specialty (shslczdzk02202, shslczdzk01103), National Innovative Research Team of High-level Local Universities in Shanghai, Shanghai Key Discipline for Respiratory Diseases (2017ZZ02014), National Major Scientific and Technological Special Project for Significant New Drugs Development (2017ZX09304007), Key Projects in the National Science and Technology Pillar Program during the Thirteenth Five-year Plan Period (2018ZX09206005-004, 2017ZX10202202-005-004, 2017ZX10203201-008).
(2) This re-analysis reveals severe limitations in the methodology of this study, includ

In [146]:
# Show the number of items in 'doc_list' and in 'verb_filtered_sentences_2' using 'len()'
print(f"Number of sentences in 'doc_list': {len(doc_list)}")
print(f"Number of sentences in 'verb_filtered_sentences_2': {len(verb_filtered_sentences_2)}")

Number of sentences in 'doc_list': 216
Number of sentences in 'verb_filtered_sentences_2': 11


## 4 Evaluation II

**Measuring Device**

In [147]:
# Load the manually labeled sentences from '../labeling/sentences_labeled.xlsx' again,
# as a fresh dataframe for the second evaluation: measuring_df_2
measuring_df_2 = pd.read_excel("../labeling/sentences_labeled.xlsx")

In [148]:
# Add column 'prediction': True if 'verb_matcher_2' finds at least one match in the
# parsed sentence of column 'sentence', otherwise False
measuring_df_2 = measuring_df_2.assign(
    prediction=lambda df: [
        bool(verb_matcher_2(parsed)) for parsed in nlp.pipe(df["sentence"])
    ]
)

In [149]:
# Replace the booleans ('True'/'False') in column 'prediction' by the integers '1'/'0' (dtype int8)
measuring_df_2 = measuring_df_2.assign(
    prediction=measuring_df_2["prediction"].astype(np.int8)
)

In [150]:
# Put the columns into the order sentence - label - prediction
measuring_df_2 = measuring_df_2.loc[:, ["sentence", "label", "prediction"]]

In [151]:
# Show rows no. 71-75 of 'measuring_df_2' (the end position 76 of the slice is exclusive)
measuring_df_2.iloc[71:76]

Unnamed: 0,sentence,label,prediction
71,Proposals should be directed to the correspond...,0,0
72,Recent publications have brought attention to ...,0,0
73,The scientific community should consider this ...,0,0
74,A recent open-label study claimed that hydroxy...,0,0
75,This re-analysis reveals severe limitations in...,1,1


**Confusion Matrix**

In [152]:
# Confusion matrix for the refined verb filter: manual labels (true values) vs. matcher predictions
confusion_matrix(
    y_true=measuring_df_2["label"],
    y_pred=measuring_df_2["prediction"],
)

array([[200,   2],
       [  5,   9]], dtype=int64)

**Classification Report**

In [153]:
# Classification report for the refined verb filter (manual labels vs. matcher predictions), printed as text
report_2 = classification_report(
    y_true=measuring_df_2["label"],
    y_pred=measuring_df_2["prediction"],
)
print(report_2)

              precision    recall  f1-score   support

           0       0.98      0.99      0.98       202
           1       0.82      0.64      0.72        14

    accuracy                           0.97       216
   macro avg       0.90      0.82      0.85       216
weighted avg       0.97      0.97      0.97       216

