# 5 Entity Labels

## 0 Import Libraries

In [1]:
import pandas as pd
import spacy
from spacy.matcher import Matcher
from spacy.pipeline import EntityRuler
from spacy import displacy

In [2]:
# Load the large English spaCy model: nlp
# The model's "ner" component is disabled at load time; entity labels are added
# further down in this notebook by a rule-based EntityRuler instead.
nlp = spacy.load("en_core_web_lg", disable=["ner"])

## 1 Load Dataframe

In [3]:
# Load the sentences dataframe from 'HCQ_sentences.json': sentences_df
# The path is relative, so it is resolved against the notebook's working directory.
sentences_df = pd.read_json("../data/HCQ_sentences.json")

In [4]:
# Preview the first three rows of 'sentences_df'
sentences_df.head(3)

Unnamed: 0,sentence_id,title,sentence
0,pub.1126880632-0,COVID-19 and what pediatric rheumatologists sh...,"On March 11th, 2020 the World Health Organizat..."
1,pub.1126880632-1,COVID-19 and what pediatric rheumatologists sh...,"The infection, transmitted by 2019 novel coron..."
2,pub.1126880632-2,COVID-19 and what pediatric rheumatologists sh...,"Italy was early and severely involved, with a ..."


## 2 Verb Filtered Sentences

In [5]:
# Run every string of column 'sentence' of 'sentences_df' through the 'nlp' pipeline
# and collect the resulting Doc-objects (one per sentence) in a list: doc_list
doc_list = list(nlp.pipe(sentences_df["sentence"].to_list()))

In [6]:
# Make a token matcher on the vocabulary of 'nlp', with pattern validation
# switched on (validate=True): verb_matcher_2
verb_matcher_2 = Matcher(nlp.vocab, validate=True)

In [7]:
# Search pattern for 'verb_matcher_2': support_verbs_pattern
# One token: a VERB that is the ROOT of the sentence and whose lemma is one of the SUPPORT-verbs.
support_verbs_pattern = [
    {
        "POS": "VERB",
        "DEP": "ROOT",
        "LEMMA": {"IN": ["reveal", "show", "suggest", "support"]},
    }
]

In [8]:
# Additional search pattern for 'verb_matcher_2': confirm_pattern
# One token: the VERB "confirm" (by lemma) attached with dependency label "xcomp".
confirm_pattern = [dict(POS="VERB", DEP="xcomp", LEMMA="confirm")]

In [9]:
# Register 'support_verbs_pattern' and 'confirm_pattern' on 'verb_matcher_2' under the
# single match ID "VERB_ID"; the second argument (None) means no on-match callback is passed.
verb_matcher_2.add("VERB_ID", None, support_verbs_pattern, confirm_pattern)

In [10]:
# Keep only those Doc-objects (sentences) of 'doc_list' in which 'verb_matcher_2'
# finds at least one match: verb_filtered_sentences_2
verb_filtered_sentences_2 = [
    candidate
    for candidate in doc_list
    if verb_matcher_2(candidate)  # a non-empty match list is truthy
]

In [11]:
# Print every verb filtered sentence, prefixed with its running number in parentheses
for position, filtered_doc in enumerate(verb_filtered_sentences_2):
    print(f"({position}) {filtered_doc}")

(0) We were unable to confirm a benefit of hydroxychloroquine or chloroquine, when used alone or with a macrolide, on in-hospital outcomes for COVID-19.
(1) This work was supported by the Emergent Projects of National Science and Technology (2020YFC0844500), National Natural Science Foundation of China (81970020, 81770025), National Key Research and Development Program of China (2016YFC0901104), Shanghai Municipal Key Clinical Specialty (shslczdzk02202, shslczdzk01103), National Innovative Research Team of High-level Local Universities in Shanghai, Shanghai Key Discipline for Respiratory Diseases (2017ZZ02014), National Major Scientific and Technological Special Project for Significant New Drugs Development (2017ZX09304007), Key Projects in the National Science and Technology Pillar Program during the Thirteenth Five-year Plan Period (2018ZX09206005-004, 2017ZX10202202-005-004, 2017ZX10203201-008).
(2) This re-analysis reveals severe limitations in the methodology of this study, includ

## 3 Noun Filtered Sentences

In [12]:
# Make a second token matcher on the vocabulary of 'nlp', again with
# pattern validation switched on (validate=True): noun_matcher_2
noun_matcher_2 = Matcher(nlp.vocab, validate=True)

In [13]:
# EVIDENCE-noun patterns for 'noun_matcher_2'
# Each pattern is one token: a NOUN with dependency label "nsubj" and the given lemma.
# The five patterns differ only in the lemma, so they are produced from one template;
# every name still receives its own, independent list.
analysis_pattern, evidence_pattern, finding_pattern, result_pattern, survey_pattern = (
    [{"POS": "NOUN", "DEP": "nsubj", "LEMMA": noun_lemma}]
    for noun_lemma in ("analysis", "evidence", "finding", "result", "survey")
)

In [14]:
# Additional search pattern for 'noun_matcher_2': trial_pattern
# One token: the NOUN "trial" (by lemma) with dependency label "pobj".
trial_pattern = [dict(POS="NOUN", DEP="pobj", LEMMA="trial")]

In [15]:
# Additional search pattern for 'noun_matcher_2': 'we_pattern'
# One token: a pronoun (POS "PRON") with dependency label "nsubj" and lemma "-PRON-".
# NOTE(review): the pattern has no constraint on the surface form, and "-PRON-" is
# presumably the lemma spaCy v2 assigns to every personal pronoun - so this likely matches
# any pronominal subject ("it", "they", ...), not only "we". Confirm whether that is intended.
we_pattern = [{"POS": "PRON", "DEP": "nsubj", "LEMMA": "-PRON-"}]

In [16]:
# Register all seven search patterns on 'noun_matcher_2' under the single match ID "NOUN_ID";
# the second argument (None) means no on-match callback is passed.
noun_matcher_2.add("NOUN_ID", None, 
                   analysis_pattern, 
                   evidence_pattern, 
                   finding_pattern, 
                   result_pattern, 
                   survey_pattern, 
                   trial_pattern, 
                   we_pattern)

In [17]:
# Keep those Doc-objects of 'verb_filtered_sentences_2' in which 'noun_matcher_2' finds at
# least one match. Note that the sentence *text* (a string), not the Doc itself, is stored:
# noun_filtered_sentences_2
noun_filtered_sentences_2 = [
    candidate.text
    for candidate in verb_filtered_sentences_2
    if noun_matcher_2(candidate)  # a non-empty match list is truthy
]

In [18]:
# Print every noun filtered sentence, prefixed with its running number in parentheses
for position, sentence_text in enumerate(noun_filtered_sentences_2):
    print(f"({position}) {sentence_text}")

(0) We were unable to confirm a benefit of hydroxychloroquine or chloroquine, when used alone or with a macrolide, on in-hospital outcomes for COVID-19.
(1) This re-analysis reveals severe limitations in the methodology of this study, including ambiguous inclusion/exclusion of participant data and inconsistent analysis techniques, and yielded nonsignificant differences between control and treatment groups across any treatment days.
(2) This systematic review and meta-analysis showed no clinical benefits regarding HCQ treatment with/without azithromycin for COVID-19 patients.
(3) These results do not support the use of HCQ in patients hospitalised for documented SARS-CoV-2-positive hypoxic pneumonia.
(4) Interpretation Preliminary findings suggest that the higher CQ dosage (10-day regimen) should not be recommended for COVID-19 treatment because of its potential safety hazards.
(5) Preliminary evidence suggests potential benefit with chloroquine or hydroxychloroquine.
(6) The findings s

In [19]:
# Show the number of items in 'doc_list', 'verb_filtered_sentences_2' and
# 'noun_filtered_sentences_2' using 'len()'
print(f"Number of sentences in 'doc_list': {len(doc_list)}")
print(f"Number of sentences in 'verb_filtered_sentences_2': {len(verb_filtered_sentences_2)}")
print(f"Number of sentences in 'noun_filtered_sentences_2': {len(noun_filtered_sentences_2)}")

Number of sentences in 'doc_list': 216
Number of sentences in 'verb_filtered_sentences_2': 11
Number of sentences in 'noun_filtered_sentences_2': 9


## 4 Label Entities

The entity labelling below uses spaCy's rule-based `EntityRuler`; see:

[https://spacy.io/usage/rule-based-matching#entityruler](https://spacy.io/usage/rule-based-matching#entityruler)

In [20]:
# Initialize spaCy's EntityRuler for 'nlp', with pattern validation switched on
# (validate=True): ruler
# The ruler starts without patterns; they are added step by step in the cells below.
ruler = EntityRuler(nlp, validate=True)

### 4.1 SUPPORT-verbs

In [21]:
# Label patterns for entity "SUPP" (SUPPORT-verb): verb_patterns
# Two single-token patterns, both labelled "SUPP":
#   1. a ROOT verb whose lemma is one of the SUPPORT-verbs
#   2. the verb "confirm" with dependency label "xcomp"
verb_patterns = [
    {"label": "SUPP", "pattern": [token_spec]}
    for token_spec in (
        {"POS": "VERB", "DEP": "ROOT", "LEMMA": {"IN": ["reveal", "show", "suggest", "support"]}},
        {"POS": "VERB", "DEP": "xcomp", "LEMMA": "confirm"},
    )
]

In [22]:
# Add the "SUPP" label patterns in 'verb_patterns' to 'ruler'
ruler.add_patterns(verb_patterns)

In [23]:
# Add 'ruler' to the pipeline of 'nlp'.
# If a component with the ruler's name is already registered (e.g. this cell is re-run
# without restarting the kernel), swap it for the current 'ruler' instead of calling
# 'add_pipe' a second time, which fails because the component name is already taken.
if ruler.name in nlp.pipe_names:
    nlp.replace_pipe(ruler.name, ruler)
else:
    nlp.add_pipe(ruler)

In [24]:
# Run the strings in 'noun_filtered_sentences_2' through 'nlp' (which now includes 'ruler')
# to get Doc-objects with labeled entities, and collect them in a list: disagreement_sentences_1
disagreement_sentences_1 = list(nlp.pipe(noun_filtered_sentences_2))

In [25]:
# Show example sentence (disagreement_sentences_1[6]) with its labeled entities (style="ent").
# At this point only the "SUPP" patterns have been added to 'ruler'.
displacy.render(disagreement_sentences_1[6], style="ent", jupyter=True)

### 4.2 Nouns

#### 4.2.1 EVIDENCE-nouns

In [26]:
# For each Doc-object in 'disagreement_sentences_1': print the text of every noun chunk
# whose root token is a nominal subject (dependency label "nsubj")
for parsed_sentence in disagreement_sentences_1:
    for noun_chunk in parsed_sentence.noun_chunks:
        if noun_chunk.root.dep_ != "nsubj":
            continue
        print(noun_chunk.text)

We
This re-analysis
This systematic review and meta-analysis
These results
Interpretation Preliminary findings
Preliminary evidence
The findings
these drugs
our survey


In [55]:
# Show example sentence (disagreement_sentences_1[3]) as a dependency tree (style="dep")
displacy.render(disagreement_sentences_1[3], style="dep", jupyter=True)

In [28]:
# As above, but also print the lemma of the head token of each subject chunk's root
# (i.e. the verb the nominal subject belongs to)
for parsed_sentence in disagreement_sentences_1:
    for noun_chunk in parsed_sentence.noun_chunks:
        if noun_chunk.root.dep_ != "nsubj":
            continue
        print(noun_chunk.text, noun_chunk.root.head.lemma_)

We be
This re-analysis reveal
This systematic review and meta-analysis show
These results support
Interpretation Preliminary findings suggest
Preliminary evidence suggest
The findings support
these drugs have
our survey show


In [29]:
# Print only those subject noun chunks whose head verb is a SUPPORT-verb (matched by lemma)
for parsed_sentence in disagreement_sentences_1:
    for noun_chunk in parsed_sentence.noun_chunks:
        is_subject = noun_chunk.root.dep_ == "nsubj"
        if is_subject and noun_chunk.root.head.lemma_ in ['reveal', 'show', 'suggest', 'support']:
            print(noun_chunk.text)

This re-analysis
This systematic review and meta-analysis
These results
Interpretation Preliminary findings
Preliminary evidence
The findings
our survey


In [30]:
# Define function 'evidence_chunks()'. The function creates a list of label patterns to label noun chunks
# in sentences gained from a list of Doc-objects
def evidence_chunks(Doc_list, support_lemmas=('reveal', 'show', 'suggest', 'support')):
    """Build "EVID" label patterns from the subjects of SUPPORT-verbs.

    For every noun chunk in every Doc, the chunk text becomes a pattern when the
    chunk's root token has dependency label "nsubj" and the lemma of that root's
    head token is one of ``support_lemmas``.

    Parameters
    ----------
    Doc_list : iterable
        Objects exposing ``noun_chunks``; each chunk must provide ``text``,
        ``root.dep_`` and ``root.head.lemma_``.
    support_lemmas : iterable of str, optional
        Lemmas that count as SUPPORT-verbs. Defaults to the four lemmas used
        throughout this notebook.

    Returns
    -------
    list of dict
        One ``{"label": "EVID", "pattern": <chunk text>}`` entry per distinct
        chunk text, in order of first occurrence.
    """
    support_lemmas = frozenset(support_lemmas)
    evid_chunks = []
    seen_texts = set()  # a repeated chunk text would only add a redundant pattern
    for doc in Doc_list:
        for chunk in doc.noun_chunks:
            root = chunk.root
            if root.dep_ != "nsubj" or root.head.lemma_ not in support_lemmas:
                continue
            if chunk.text in seen_texts:
                continue
            seen_texts.add(chunk.text)
            evid_chunks.append({"label": "EVID", "pattern": chunk.text})

    return evid_chunks

In [31]:
# Make a list of label patterns by applying 'evidence_chunks()' to 'disagreement_sentences_1': evidence_patterns
evidence_patterns = evidence_chunks(disagreement_sentences_1)

In [32]:
# Show the generated "EVID" label patterns in 'evidence_patterns'
evidence_patterns

[{'label': 'EVID', 'pattern': 'This re-analysis'},
 {'label': 'EVID', 'pattern': 'This systematic review and meta-analysis'},
 {'label': 'EVID', 'pattern': 'These results'},
 {'label': 'EVID', 'pattern': 'Interpretation Preliminary findings'},
 {'label': 'EVID', 'pattern': 'Preliminary evidence'},
 {'label': 'EVID', 'pattern': 'The findings'},
 {'label': 'EVID', 'pattern': 'our survey'}]

In [33]:
# Add the "EVID" label patterns in 'evidence_patterns' to 'ruler'
# (they are plain strings, whereas the "SUPP" patterns added earlier are token-attribute lists)
ruler.add_patterns(evidence_patterns)

  self.phrase_matcher.add(label, patterns)


#### 4.2.2 Add "trial_label_pattern"

In [34]:
# Re-run the strings in 'noun_filtered_sentences_2' through 'nlp' so that the patterns added to
# 'ruler' since 'disagreement_sentences_1' was built are applied as well;
# make a list of these Docs: disagreement_sentences_2
disagreement_sentences_2 = list(nlp.pipe(noun_filtered_sentences_2))

In [35]:
# Show the Docs 6, 0 and 8 of 'disagreement_sentences_2' with labeled named entities.
# Each Doc gets a caption line, its entity rendering and a dashed separator.
print()  # blank line ahead of the first caption
for doc_index in (6, 0, 8):
    print(f"disagreement_sentences_2[{doc_index}]:\n")
    displacy.render(disagreement_sentences_2[doc_index], style="ent", jupyter=True)
    print("------------------------------------------------------------\n")


disagreement_sentences_2[6]:



------------------------------------------------------------

disagreement_sentences_2[0]:



------------------------------------------------------------

disagreement_sentences_2[8]:



------------------------------------------------------------



In [36]:
# Further "EVID" label pattern, given as a literal phrase: trial_label_pattern
trial_label_pattern = [
    dict(label="EVID", pattern="multicenter clinical trials"),
]

In [37]:
# Add the phrase pattern in 'trial_label_pattern' (label "EVID") to 'ruler'
ruler.add_patterns(trial_label_pattern)

#### 4.2.3 Add "we_label_pattern"

In [38]:
# Further label pattern marking the literal phrase "We" as a 'SCI'-entity: we_label_pattern
we_label_pattern = [
    dict(label="SCI", pattern="We"),
]

In [39]:
# Add the phrase pattern in 'we_label_pattern' (label "SCI") to 'ruler'
ruler.add_patterns(we_label_pattern)

  self.phrase_matcher.add(label, patterns)


In [40]:
# Re-run the strings in 'noun_filtered_sentences_2' through 'nlp' so that 'trial_label_pattern'
# and 'we_label_pattern', added to 'ruler' in the cells above, are applied as well;
# make a list of these Docs: disagreement_sentences_3
disagreement_sentences_3 = list(nlp.pipe(noun_filtered_sentences_2))

In [41]:
# Show the Docs 0 and 8 of 'disagreement_sentences_3' with labeled named entities.
# Each Doc gets a caption line, its entity rendering and a dashed separator.
for doc_index in (0, 8):
    print(f"disagreement_sentences_3[{doc_index}]:\n")
    displacy.render(disagreement_sentences_3[doc_index], style="ent", jupyter=True)
    print("------------------------------------------------------------\n")

disagreement_sentences_3[0]:



------------------------------------------------------------

disagreement_sentences_3[8]:



------------------------------------------------------------



### 4.3 Negations

In [42]:
# Label patterns for entities that are negations: negation_patterns
# One single-token pattern, labelled "NEG", matching the lemmas "not", "no" and "unable".
negation_patterns = [
    {
        "label": "NEG",
        "pattern": [{"LEMMA": {"IN": ["not", "no", "unable"]}}],
    }
]

In [43]:
# Add the "NEG" label patterns in 'negation_patterns' to 'ruler'
ruler.add_patterns(negation_patterns)

In [44]:
# Re-run the strings in 'noun_filtered_sentences_2' through 'nlp' so that the "NEG" patterns
# added to 'ruler' in the cell above are applied as well;
# make a list of these Docs: disagreement_sentences_4
disagreement_sentences_4 = list(nlp.pipe(noun_filtered_sentences_2))

In [45]:
# Render every Doc in 'disagreement_sentences_4', preceded by its running number and followed
# by a dashed separator.
# Highlighted entity labels: 'SUPP' (SUPPORT-verb), 'EVID' (EVIDENCE-noun), 'SCI' (group of
# scientists), 'NEG' (negation)
for position, labeled_doc in enumerate(disagreement_sentences_4):
    print(f"({position})")
    displacy.render(labeled_doc, style="ent", jupyter=True)
    print("----------------------------------------------------------------------\n")

(0)


----------------------------------------------------------------------

(1)


----------------------------------------------------------------------

(2)


----------------------------------------------------------------------

(3)


----------------------------------------------------------------------

(4)


----------------------------------------------------------------------

(5)


----------------------------------------------------------------------

(6)


----------------------------------------------------------------------

(7)


----------------------------------------------------------------------

(8)


----------------------------------------------------------------------



## 5 Separate Sentences with Regard to Negation

In [46]:
# Make a token matcher on the vocabulary of 'nlp', with pattern validation
# switched on (validate=True): negation_matcher
negation_matcher = Matcher(nlp.vocab, validate=True)

In [47]:
# Search pattern for 'negation_matcher': negation_pattern
# One token whose lemma is "not", "no" or "unable".
negation_pattern = [
    {"LEMMA": {"IN": ["not", "no", "unable"]}},
]

In [48]:
# Register 'negation_pattern' on 'negation_matcher' under the match ID "NEGATION_ID";
# the second argument (None) means no on-match callback is passed.
negation_matcher.add("NEGATION_ID", None, negation_pattern)

In [49]:
# List of affirmative sentences, i.e. Docs in which 'negation_matcher' finds no match: sents
# Starts empty; it is filled when 'negation_filter()' is called further down.
sents = []

In [50]:
# List of negated sentences, i.e. Docs in which 'negation_matcher' finds a match: negated_sents
# Starts empty; it is filled when 'negation_filter()' is called further down.
negated_sents = []

In [51]:
# Define a function that separates Docs from a list according to whether a negation occurs in a Doc or not:
# negation_filter()
def negation_filter(sent_list, matcher=None, affirmative=None, negated=None):
    """Sort the items of ``sent_list`` into an affirmative and a negated list.

    Each item is passed to ``matcher``; an item with at least one match is
    appended to ``negated``, every other item to ``affirmative``.

    Parameters
    ----------
    sent_list : iterable
        Items (Doc-objects in this notebook) to classify.
    matcher : callable, optional
        Called with one item and expected to return a sized collection of
        matches. Defaults to the module-level ``negation_matcher``.
    affirmative : list, optional
        Receives items without a match. Defaults to the module-level ``sents``.
    negated : list, optional
        Receives items with a match. Defaults to the module-level
        ``negated_sents``.

    Returns
    -------
    None
        The target lists are extended in place, so calling the function twice
        on the same targets appends the items twice.
    """
    # The defaults are looked up at call time, so the notebook-level objects are
    # only required when the caller does not supply a replacement.
    if matcher is None:
        matcher = negation_matcher
    if affirmative is None:
        affirmative = sents
    if negated is None:
        negated = negated_sents

    for doc in sent_list:
        if len(matcher(doc)) > 0:
            negated.append(doc)
        else:
            affirmative.append(doc)

In [52]:
# Apply 'negation_filter()' to 'disagreement_sentences_4'. Each Doc of 'disagreement_sentences_4'
# is appended either to 'sents' (no negation found) or to 'negated_sents' (negation found).
# Re-running this cell without first resetting both lists appends the Docs a second time.
negation_filter(disagreement_sentences_4)

**sents**

In [53]:
# Render every Doc in 'sents', preceded by its running number and followed by a dashed separator.
# Highlighted entity labels: 'SUPP' (SUPPORT-verb), 'EVID' (EVIDENCE-noun), 'SCI' (group of
# scientists), 'NEG' (negation)
for position, affirmative_doc in enumerate(sents):
    print(f"({position})")
    displacy.render(affirmative_doc, style="ent", jupyter=True)
    print("----------------------------------------------------------------------\n")

(0)


----------------------------------------------------------------------

(1)


----------------------------------------------------------------------

(2)


----------------------------------------------------------------------

(3)


----------------------------------------------------------------------

(4)


----------------------------------------------------------------------



**negated_sents**

In [54]:
# Render every Doc in 'negated_sents', preceded by its running number and followed by a dashed
# separator.
# Highlighted entity labels: 'SUPP' (SUPPORT-verb), 'EVID' (EVIDENCE-noun), 'SCI' (group of
# scientists), 'NEG' (negation)
for position, negated_doc in enumerate(negated_sents):
    print(f"({position})")
    displacy.render(negated_doc, style="ent", jupyter=True)
    print("----------------------------------------------------------------------\n")

(0)


----------------------------------------------------------------------

(1)


----------------------------------------------------------------------

(2)


----------------------------------------------------------------------

(3)


----------------------------------------------------------------------

