# 4 Noun Filter

## 0 Import Libraries

In [1]:
import pandas as pd
import spacy
from spacy.matcher import Matcher
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Load the spaCy pipeline 'en_core_web_lg' with its 'ner' component disabled: nlp
nlp = spacy.load("en_core_web_lg", disable=["ner"])

## 1 Load Dataframe

In [3]:
# Read '../data/HCQ_sentences.json' into a dataframe: sentences_df
sentences_df = pd.read_json("../data/HCQ_sentences.json")

In [4]:
# Show the first three rows of 'sentences_df'
sentences_df.head(3)

Unnamed: 0,sentence_id,title,sentence
0,pub.1126880632-0,COVID-19 and what pediatric rheumatologists sh...,"On March 11th, 2020 the World Health Organizat..."
1,pub.1126880632-1,COVID-19 and what pediatric rheumatologists sh...,"The infection, transmitted by 2019 novel coron..."
2,pub.1126880632-2,COVID-19 and what pediatric rheumatologists sh...,"Italy was early and severely involved, with a ..."


## 2 Verb Filtered Sentences

In [5]:
# Take the texts of column 'sentence' of 'sentences_df', run them through
# the pipeline and keep the resulting Doc-objects in a list: doc_list
sentence_texts = sentences_df["sentence"].to_list()
doc_list = list(nlp.pipe(sentence_texts))

In [6]:
# Make a token matcher on the pipeline's vocabulary, with pattern validation
# switched on: 'verb_matcher_2'
verb_matcher_2 = Matcher(nlp.vocab, validate=True)

In [7]:
# Search pattern for 'verb_matcher_2': support_verbs_pattern
# One token that is a verb, is the ROOT of the sentence and whose lemma is
# one of the listed support verbs
support_verbs_pattern = [
    {
        "POS": "VERB",
        "DEP": "ROOT",
        "LEMMA": {"IN": ["reveal", "show", "suggest", "support"]},
    }
]

In [8]:
# Additional search pattern for 'verb_matcher_2': confirm_pattern
# One token that is a verb with dependency 'xcomp' and lemma 'confirm'
confirm_token = {"POS": "VERB", "DEP": "xcomp", "LEMMA": "confirm"}
confirm_pattern = [confirm_token]

In [9]:
# Add 'support_verbs_pattern' and 'confirm_pattern' to 'verb_matcher_2' under the key "VERB_ID".
# The patterns are passed as one list: this form is accepted since spaCy 2.2.2 and is the
# only one spaCy 3 supports (the older 'add(key, None, *patterns)' form was removed there).
verb_matcher_2.add("VERB_ID", [support_verbs_pattern, confirm_pattern])

In [10]:
# Keep only those Doc-objects (sentences) of 'doc_list' for which 'verb_matcher_2'
# returns at least one match: verb_filtered_sentences_2
verb_filtered_sentences_2 = [
    parsed_doc for parsed_doc in doc_list if verb_matcher_2(parsed_doc)
]

In [11]:
# Print every sentence in 'verb_filtered_sentences_2', prefixed with its list index
for sentence_number, sentence in enumerate(verb_filtered_sentences_2):
    print(f"({sentence_number}) {sentence}")

(0) We were unable to confirm a benefit of hydroxychloroquine or chloroquine, when used alone or with a macrolide, on in-hospital outcomes for COVID-19.
(1) This work was supported by the Emergent Projects of National Science and Technology (2020YFC0844500), National Natural Science Foundation of China (81970020, 81770025), National Key Research and Development Program of China (2016YFC0901104), Shanghai Municipal Key Clinical Specialty (shslczdzk02202, shslczdzk01103), National Innovative Research Team of High-level Local Universities in Shanghai, Shanghai Key Discipline for Respiratory Diseases (2017ZZ02014), National Major Scientific and Technological Special Project for Significant New Drugs Development (2017ZX09304007), Key Projects in the National Science and Technology Pillar Program during the Thirteenth Five-year Plan Period (2018ZX09206005-004, 2017ZX10202202-005-004, 2017ZX10203201-008).
(2) This re-analysis reveals severe limitations in the methodology of this study, includ

In [12]:
# Show the number of items in 'doc_list' and 'verb_filtered_sentences_2' using 'len()'
print(f"Number of sentences in 'doc_list': {len(doc_list)}")
print(f"Number of sentences in 'verb_filtered_sentences_2': {len(verb_filtered_sentences_2)}")

Number of sentences in 'doc_list': 216
Number of sentences in 'verb_filtered_sentences_2': 11


## 3 Noun Filter

### 3.1 An Initial Search Pattern

In [13]:
# Parse one example sentence of 'sentences_df' (Sentence ID: pub.1126655433-13) into a Doc-object: doc
doc = nlp("These results do not support the use of HCQ in patients hospitalised for documented SARS-CoV-2-positive hypoxic pneumonia.")

In [14]:
# Print the text and the coarse Part-of-Speech (POS) tag of every token in 'doc'
for token in doc:
    print(token.text, token.pos_)

These DET
results NOUN
do AUX
not PART
support VERB
the DET
use NOUN
of ADP
HCQ PROPN
in ADP
patients NOUN
hospitalised VERB
for ADP
documented VERB
SARS PROPN
- PUNCT
CoV-2-positive NOUN
hypoxic ADJ
pneumonia NOUN
. PUNCT


In [15]:
# Gather the text of every token in 'doc' whose POS tag is "NOUN" and show the list: pos_noun
pos_noun = [token.text for token in doc if token.pos_ == "NOUN"]

print(pos_noun)

['results', 'use', 'patients', 'CoV-2-positive', 'pneumonia']


In [16]:
# Print the text, the POS tag and the dependency label (DEP) of every token in 'doc'
for token in doc:
    print(token.text, token.pos_, token.dep_)

These DET det
results NOUN nsubj
do AUX aux
not PART neg
support VERB ROOT
the DET det
use NOUN dobj
of ADP prep
HCQ PROPN pobj
in ADP prep
patients NOUN pobj
hospitalised VERB acl
for ADP prep
documented VERB amod
SARS PROPN nmod
- PUNCT punct
CoV-2-positive NOUN nmod
hypoxic ADJ amod
pneumonia NOUN pobj
. PUNCT punct


In [17]:
# Look up spaCy's description of the dependency label "nsubj"
spacy.explain("nsubj")

'nominal subject'

In [18]:
# Gather the text of every token in 'doc' that is a noun AND the nominal subject,
# then show the list: pos_dep_noun
pos_dep_noun = [
    token.text
    for token in doc
    if token.pos_ == "NOUN" and token.dep_ == "nsubj"
]

print(pos_dep_noun)

['results']


In [19]:
# Initial search pattern to find EVIDENCE-nouns: 'initial_search_pattern_noun'
# One token that is a noun (POS) and the nominal subject (DEP "nsubj")
initial_search_pattern_noun = [{"POS": "NOUN", "DEP": "nsubj"}]

In [20]:
# Make a matcher (with pattern validation) to test 'initial_search_pattern_noun' on 'doc': 'test_matcher_noun'
test_matcher_noun = Matcher(nlp.vocab, validate=True)

In [21]:
# Add pattern 'initial_search_pattern_noun' to matcher 'test_matcher_noun' under the key "TEST_ID".
# The pattern is wrapped in a list: this form is accepted since spaCy 2.2.2 and is the only
# one spaCy 3 supports (the older 'add(key, None, *patterns)' form was removed there).
test_matcher_noun.add("TEST_ID", [initial_search_pattern_noun])

In [22]:
# Apply 'test_matcher_noun' on 'doc' and print the text of every matched span
# (each match is a tuple of match id, start token index and end token index)
for match_id, start, end in test_matcher_noun(doc):
    print(doc[start:end].text)

results


### 3.2 List of EVIDENCE-nouns

In [23]:
# Make a new matcher (with pattern validation): initial_search_matcher_noun
initial_search_matcher_noun = Matcher(nlp.vocab, validate=True)

In [24]:
# Initial search pattern to find EVIDENCE-nouns: 'initial_search_pattern_noun'
# (same definition as in section 3.1: a noun that is the nominal subject)
initial_search_pattern_noun = [{"POS": "NOUN", "DEP": "nsubj"}]

In [25]:
# Add pattern 'initial_search_pattern_noun' to matcher 'initial_search_matcher_noun'
# under the key "INITIAL_SEARCH_ID". The pattern is wrapped in a list: this form is
# accepted since spaCy 2.2.2 and is the only one spaCy 3 supports.
initial_search_matcher_noun.add("INITIAL_SEARCH_ID", [initial_search_pattern_noun])

In [26]:
# Define function 'match_noun_nsubj()'
def match_noun_nsubj(Doc_list, matcher=None):
    """Return the sorted, de-duplicated lemmas of all spans matched in 'Doc_list'.

    Parameters
    ----------
    Doc_list : iterable
        Doc-objects to search. Each one is passed to the matcher and sliced
        with the start/end indices of every match.
    matcher : callable, optional
        Called as 'matcher(Doc)' and expected to return an iterable of
        '(match_id, start, end)' tuples. Defaults to the notebook-level
        'initial_search_matcher_noun'.

    Returns
    -------
    list of str
        The distinct lemmas, sorted case-insensitively. Lemmas that differ only
        in case are ordered by their exact spelling, so the result does not
        depend on the iteration order of the intermediate set.
    """
    if matcher is None:
        # Fall back to the matcher defined in the notebook (resolved at call time)
        matcher = initial_search_matcher_noun

    # For every match in every Doc: slice the Doc by start/end index and
    # collect the lemma of that span; the set removes duplicates
    lemma_noun = {
        Doc[start:end].lemma_
        for Doc in Doc_list
        for _match_id, start, end in matcher(Doc)
    }

    # Sort alphabetically ignoring case; the exact spelling breaks ties
    return sorted(lemma_noun, key=lambda lemma: (lemma.lower(), lemma))

In [27]:
# Collect the lemmas matched in the verb filtered sentences: noun_nsubj_lemma
noun_nsubj_lemma = match_noun_nsubj(verb_filtered_sentences_2)

In [28]:
# Show the candidate lemmas
print(noun_nsubj_lemma)

['analysis', 'drug', 'evidence', 'finding', 'patient', 'recommendation', 'result', 'survey']


**Aid for selecting nouns manually**

In [29]:
# Make a generator from 'noun_nsubj_lemma' so the nouns can be stepped through one at a time: noun_generator
noun_generator = (noun for noun in noun_nsubj_lemma)

In [30]:
# Next item of 'noun_generator': noun_to_prove
# (re-run this cell to step to the next noun; 'next()' raises StopIteration
# once the generator is exhausted)
noun_to_prove = next(noun_generator)

# Show current 'noun_to_prove'
print(f"noun_to_prove: {noun_to_prove}")

# Collect every Doc of 'verb_filtered_sentences_2' that has at least one match of
# 'initial_search_matcher_noun' whose lemma equals 'noun_to_prove': found_sentences_1
# A comprehension with 'any()' adds each Doc at most once and, unlike a 'for' loop
# over a variable called 'doc', does not overwrite the notebook-level example 'doc'.
found_sentences_1 = [
    candidate_doc
    for candidate_doc in verb_filtered_sentences_2
    if any(
        candidate_doc[start:end].lemma_ == noun_to_prove
        for _match_id, start, end in initial_search_matcher_noun(candidate_doc)
    )
]

print(f"Sentences found with '{noun_to_prove}' in it: ")
print("----------------------------------------\n")

# Show each found Doc (sentence) with its respective index number
for sentence_number, sentence in enumerate(found_sentences_1):
    print(f"({sentence_number}) {sentence}")

noun_to_prove: analysis
Sentences found with 'analysis' in it: 
----------------------------------------

(0) This re-analysis reveals severe limitations in the methodology of this study, including ambiguous inclusion/exclusion of participant data and inconsistent analysis techniques, and yielded nonsignificant differences between control and treatment groups across any treatment days.
(1) This systematic review and meta-analysis showed no clinical benefits regarding HCQ treatment with/without azithromycin for COVID-19 patients.


In [31]:
#selected_nouns = []

In [32]:
#selected_nouns.append(noun_to_prove)

In [33]:
#print(selected_nouns)

```python
selected_nouns = ['analysis', 'evidence', 'finding', 'result', 'survey']
```

5 EVIDENCE-nouns:
* analysis
* evidence
* finding
* result
* survey

### 3.3 A First Noun Filter

In [34]:
# Make matcher 'noun_matcher_1' on the pipeline's vocabulary, with pattern validation switched on
noun_matcher_1 = Matcher(nlp.vocab, validate=True)

In [35]:
# Search patterns for 'noun_matcher_1': one single-token pattern per EVIDENCE-noun.
# Every pattern asks for a noun that is the nominal subject and has the given lemma.
(
    analysis_pattern,
    evidence_pattern,
    finding_pattern,
    result_pattern,
    survey_pattern,
) = (
    [{"POS": "NOUN", "DEP": "nsubj", "LEMMA": evidence_lemma}]
    for evidence_lemma in ("analysis", "evidence", "finding", "result", "survey")
)

In [36]:
# Add search patterns to 'noun_matcher_1' under the key "NOUN_ID".
# The patterns are passed as one list: this form is accepted since spaCy 2.2.2 and is the
# only one spaCy 3 supports (the older 'add(key, None, *patterns)' form was removed there).
noun_matcher_1.add(
    "NOUN_ID",
    [
        analysis_pattern,
        evidence_pattern,
        finding_pattern,
        result_pattern,
        survey_pattern,
    ],
)

In [37]:
# Keep only those Doc-objects (sentences) of 'verb_filtered_sentences_2' for which
# 'noun_matcher_1' returns at least one match: noun_filtered_sentences_1
noun_filtered_sentences_1 = [
    verb_doc for verb_doc in verb_filtered_sentences_2 if noun_matcher_1(verb_doc)
]

In [38]:
# Print every sentence in 'noun_filtered_sentences_1', prefixed with its list index
for sentence_number, sentence in enumerate(noun_filtered_sentences_1):
    print(f"({sentence_number}) {sentence}")

(0) This re-analysis reveals severe limitations in the methodology of this study, including ambiguous inclusion/exclusion of participant data and inconsistent analysis techniques, and yielded nonsignificant differences between control and treatment groups across any treatment days.
(1) This systematic review and meta-analysis showed no clinical benefits regarding HCQ treatment with/without azithromycin for COVID-19 patients.
(2) These results do not support the use of HCQ in patients hospitalised for documented SARS-CoV-2-positive hypoxic pneumonia.
(3) Interpretation Preliminary findings suggest that the higher CQ dosage (10-day regimen) should not be recommended for COVID-19 treatment because of its potential safety hazards.
(4) Preliminary evidence suggests potential benefit with chloroquine or hydroxychloroquine.
(5) The findings support the hypothesis that these drugs have efficacy in the treatment of COVID-19.
(6) Despite its small sample size, our survey shows that hydroxychloro

In [39]:
# Show the number of items in 'doc_list', 'verb_filtered_sentences_2' and
# 'noun_filtered_sentences_1' using 'len()'
print(f"Number of sentences in 'doc_list': {len(doc_list)}")
print(f"Number of sentences in 'verb_filtered_sentences_2': {len(verb_filtered_sentences_2)}")
print(f"Number of sentences in 'noun_filtered_sentences_1': {len(noun_filtered_sentences_1)}")

Number of sentences in 'doc_list': 216
Number of sentences in 'verb_filtered_sentences_2': 11
Number of sentences in 'noun_filtered_sentences_1': 7


### 3.4 Evaluation I

**For the following steps see:** 
Youtube: ["Intro to NLP with spaCy (3): Detecting programming languages | Episode 3: Evaluation"](https://youtu.be/4V0JDdohxAk)

#### 3.4.1 Measuring Device

In [40]:
# Load '../labeling/sentences_labeled.xlsx' as a dataframe: measuring_df_3
measuring_df_3 = pd.read_excel("../labeling/sentences_labeled.xlsx")

In [41]:
# Make a list like 'noun_filtered_sentences_1', except that it holds the sentence text (strings) instead of Docs
noun_filtered_strings_1 = [doc.text for doc in noun_filtered_sentences_1]

In [42]:
# Define function 'compare_sentences()': It returns a boolean ('True'/'False') telling
# whether a sentence (string) is contained in a collection of filtered sentences
def compare_sentences(sentence, filtered_strings=None):
    """Return True if 'sentence' is contained in 'filtered_strings', else False.

    Parameters
    ----------
    sentence : str
        The sentence text to look up.
    filtered_strings : container of str, optional
        The sentences selected by the filter. Defaults to the notebook-level
        'noun_filtered_strings_1', so 'Series.apply(compare_sentences)' keeps working.

    Returns
    -------
    bool
    """
    if filtered_strings is None:
        # Fall back to the list built from 'noun_filtered_sentences_1' (resolved at call time)
        filtered_strings = noun_filtered_strings_1
    return sentence in filtered_strings

In [43]:
# New column 'prediction': the result of 'compare_sentences()' for every entry of
# 'measuring_df_3["sentence"]'
measuring_df_3["prediction"] = measuring_df_3["sentence"].map(compare_sentences)

In [44]:
# Show rows no. 71-75 of 'measuring_df_3' (the end of the 'iloc' slice is exclusive)
measuring_df_3.iloc[71:76]

Unnamed: 0,label,sentence,prediction
71,0,Proposals should be directed to the correspond...,False
72,0,Recent publications have brought attention to ...,False
73,0,The scientific community should consider this ...,False
74,0,A recent open-label study claimed that hydroxy...,False
75,1,This re-analysis reveals severe limitations in...,True


In [45]:
# Cast column 'prediction' to 8-bit integers so that the booleans ('True'/'False') become '1'/'0'
measuring_df_3 = measuring_df_3.astype({"prediction": np.int8})

In [46]:
# Show rows no. 71-75 of 'measuring_df_3' again, now with the integer 'prediction' column
measuring_df_3.iloc[71:76]

Unnamed: 0,label,sentence,prediction
71,0,Proposals should be directed to the correspond...,0
72,0,Recent publications have brought attention to ...,0
73,0,The scientific community should consider this ...,0
74,0,A recent open-label study claimed that hydroxy...,0
75,1,This re-analysis reveals severe limitations in...,1


In [47]:
# Put the columns into the order 'sentence', 'label', 'prediction'
measuring_df_3 = measuring_df_3.loc[:, ["sentence", "label", "prediction"]]

In [48]:
# Show rows no. 71-75 of 'measuring_df_3' with the rearranged columns
measuring_df_3.iloc[71:76]

Unnamed: 0,sentence,label,prediction
71,Proposals should be directed to the correspond...,0,0
72,Recent publications have brought attention to ...,0,0
73,The scientific community should consider this ...,0,0
74,A recent open-label study claimed that hydroxy...,0,0
75,This re-analysis reveals severe limitations in...,1,1


#### 3.4.2 Metrics

**Confusion Matrix**

In [49]:
# Confusion matrix with 'measuring_df_3["label"]' as the true classes and
# 'measuring_df_3["prediction"]' as the predicted classes
confusion_matrix(
    y_true=measuring_df_3["label"],
    y_pred=measuring_df_3["prediction"],
)

array([[202,   0],
       [  7,   7]], dtype=int64)

**Classification Report**

In [50]:
# Build the classification report with 'measuring_df_3["label"]' as the true classes and
# 'measuring_df_3["prediction"]' as the predicted classes, then show it on screen
report_3 = classification_report(
    y_true=measuring_df_3["label"],
    y_pred=measuring_df_3["prediction"],
)
print(report_3)

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       202
           1       1.00      0.50      0.67        14

    accuracy                           0.97       216
   macro avg       0.98      0.75      0.82       216
weighted avg       0.97      0.97      0.96       216



#### 3.4.3 Mistakes

**False Positives**

In [51]:
# Sentences predicted as 1 although they are labeled 0: false_positives
# Both conditions are combined into a single boolean mask, so mask and frame share the
# same index; chaining '.loc' with a mask built on the unfiltered frame only works
# through implicit index realignment.
false_positives = measuring_df_3.loc[
    (measuring_df_3["prediction"] == 1) & (measuring_df_3["label"] == 0),
    ["sentence"],
]

In [52]:
# Print every false positive sentence, prefixed with a running number
for sentence_number, sentence in enumerate(false_positives["sentence"].to_list()):
    print(f"({sentence_number}) {sentence}")

**False Negatives**

In [53]:
# Sentences predicted as 0 although they are labeled 1: false_negatives
# Both conditions are combined into a single boolean mask, so mask and frame share the
# same index; chaining '.loc' with a mask built on the unfiltered frame only works
# through implicit index realignment.
false_negatives = measuring_df_3.loc[
    (measuring_df_3["prediction"] == 0) & (measuring_df_3["label"] == 1),
    ["sentence"],
]

In [54]:
# Print every false negative sentence, prefixed with a running number
for sentence_number, sentence in enumerate(false_negatives["sentence"].to_list()):
    print(f"({sentence_number}) {sentence}")

(0) We were unable to confirm a benefit of hydroxychloroquine or chloroquine, when used alone or with a macrolide, on in-hospital outcomes for COVID-19.
(1) The administration of HCQ did not result in a significantly higher negative conversion probability than SOC alone in patients mainly hospitalized with persistent mild to moderate COVID-19.
(2) Adverse events were higher in HCQ recipients than in HCQ non-recipients.
(3) Although mortality rate was not significantly different between cases and controls, frequency of adverse effects was substantially higher in HCQ regimen group.
(4) Use of these drugs is premature and potentially harmful
(5) Among patients with COVID-19, the use of HCQ could significantly shorten TTCR and promote the absorption of pneumonia.
(6) Chloroquine phosphate, an old drug for treatment of malaria, is shown to have apparent efficacy and acceptable safety against COVID-19 associated pneumonia in multicenter clinical trials conducted in China.


## 4 Noun Filter: Refinements

In [55]:
# Parse one sentence we want the noun filter to find (first false negative above) into a Doc: doc3
doc3 = nlp("We were unable to confirm a benefit of hydroxychloroquine or chloroquine, when used alone or with a macrolide, on in-hospital outcomes for COVID-19.")

In [56]:
# Show linguistic features of 'doc3': text, POS tag, dependency label and lemma of every token
for token in doc3:
    print(token.text, token.pos_, token.dep_, token.lemma_)

We PRON nsubj -PRON-
were AUX ROOT be
unable ADJ acomp unable
to PART aux to
confirm VERB xcomp confirm
a DET det a
benefit NOUN dobj benefit
of ADP prep of
hydroxychloroquine NOUN pobj hydroxychloroquine
or CCONJ cc or
chloroquine NOUN conj chloroquine
, PUNCT punct ,
when ADV advmod when
used VERB advcl use
alone ADV advmod alone
or CCONJ cc or
with ADP conj with
a DET det a
macrolide NOUN pobj macrolide
, PUNCT punct ,
on ADP prep on
in ADP nmod in
- PUNCT punct -
hospital NOUN pobj hospital
outcomes NOUN pobj outcome
for ADP prep for
COVID-19 PROPN pobj COVID-19
. PUNCT punct .


In [57]:
# Look up spaCy's description of the POS tag "PRON"
spacy.explain("PRON")

'pronoun'

In [58]:
# Make additional search pattern: we_pattern
# One token that is the pronoun "we" (any casing) used as nominal subject.
# The token is pinned by its lowercase text instead of LEMMA "-PRON-": that lemma is a
# placeholder shared by other personal pronouns as well (so the pattern would not be
# limited to "we"), and spaCy 3 no longer produces it at all.
we_pattern = [{"POS": "PRON", "DEP": "nsubj", "LOWER": "we"}]

In [59]:
# Parse another sentence we want the noun filter to find (last false negative above) into a Doc: doc4
doc4 = nlp("Chloroquine phosphate, an old drug for treatment of malaria, is shown to have apparent efficacy and acceptable safety against COVID-19 associated pneumonia in multicenter clinical trials conducted in China.")

In [60]:
# Show linguistic features of 'doc4': text, POS tag, dependency label and lemma of every token
for token in doc4:
    print(token.text, token.pos_, token.dep_, token.lemma_)

Chloroquine ADJ compound chloroquine
phosphate NOUN nsubjpass phosphate
, PUNCT punct ,
an DET det an
old ADJ amod old
drug NOUN appos drug
for ADP prep for
treatment NOUN pobj treatment
of ADP prep of
malaria NOUN pobj malaria
, PUNCT punct ,
is AUX auxpass be
shown VERB ROOT show
to PART aux to
have AUX xcomp have
apparent ADJ amod apparent
efficacy NOUN dobj efficacy
and CCONJ cc and
acceptable ADJ amod acceptable
safety NOUN conj safety
against ADP prep against
COVID-19 PROPN nummod COVID-19
associated VERB amod associate
pneumonia NOUN pobj pneumonia
in ADP prep in
multicenter ADJ amod multicenter
clinical ADJ amod clinical
trials NOUN pobj trial
conducted VERB acl conduct
in ADP prep in
China PROPN pobj China
. PUNCT punct .


In [62]:
# Additional search pattern: trial_pattern
# One token that is a noun, the object of a preposition (DEP "pobj") and has the lemma 'trial'
trial_token = {"POS": "NOUN", "DEP": "pobj", "LEMMA": "trial"}
trial_pattern = [trial_token]

In [61]:
# Rephrase the sentence of 'doc4' in the active voice (the trials become the subject) and parse it: doc4_1
doc4_1 = nlp("Multicenter clinical trials conducted in China show that chloroquine phosphate has apparent efficacy and acceptable safety against COVID-19 associated pneumonia.")

In [80]:
# Show linguistic features of 'doc4_1': text, POS tag, dependency label and lemma of every token
for token in doc4_1:
    print(token.text, token.pos_, token.dep_, token.lemma_)

Multicenter ADJ amod multicenter
clinical ADJ amod clinical
trials NOUN nsubj trial
conducted VERB acl conduct
in ADP prep in
China PROPN pobj China
show VERB ROOT show
that SCONJ mark that
chloroquine NOUN compound chloroquine
phosphate NOUN nsubj phosphate
has AUX ccomp have
apparent ADJ amod apparent
efficacy NOUN dobj efficacy
and CCONJ cc and
acceptable ADJ amod acceptable
safety NOUN conj safety
against ADP prep against
COVID-19 PROPN nummod COVID-19
associated VERB amod associate
pneumonia NOUN pobj pneumonia
. PUNCT punct .


In [81]:
# Gather the tokens of 'doc4_1' that are verbs AND the ROOT of the sentence, then show them
doc4_1_pos_dep = [
    token
    for token in doc4_1
    if token.pos_ == "VERB" and token.dep_ == "ROOT"
]

print(doc4_1_pos_dep)

[show]


In [82]:
# Gather the tokens of 'doc4_1' that are nouns AND a nominal subject, then show them
doc4_1_noun_nsubj = [
    token
    for token in doc4_1
    if token.pos_ == "NOUN" and token.dep_ == "nsubj"
]

print(doc4_1_noun_nsubj)

[trials, phosphate]


In [63]:
# Make a new noun matcher on the pipeline's vocabulary, with pattern validation switched on: 'noun_matcher_2'
noun_matcher_2 = Matcher(nlp.vocab, validate=True)

In [64]:
# Repeat the search patterns for 'noun_matcher_2': one single-token pattern per
# EVIDENCE-noun (a noun that is the nominal subject and has the given lemma)
(
    analysis_pattern,
    evidence_pattern,
    finding_pattern,
    result_pattern,
    survey_pattern,
) = (
    [{"POS": "NOUN", "DEP": "nsubj", "LEMMA": evidence_lemma}]
    for evidence_lemma in ("analysis", "evidence", "finding", "result", "survey")
)

In [65]:
# Repeat 'trial_pattern' for 'noun_matcher_2': a noun with lemma 'trial' that is the
# object of a preposition (DEP "pobj")
trial_pattern = [{"POS": "NOUN", "DEP": "pobj", "LEMMA": "trial"}]

In [66]:
# Repeat 'we_pattern' for 'noun_matcher_2'
# One token that is the pronoun "we" (any casing) used as nominal subject.
# The token is pinned by its lowercase text instead of LEMMA "-PRON-": that lemma is a
# placeholder shared by other personal pronouns as well (so the pattern would not be
# limited to "we"), and spaCy 3 no longer produces it at all.
we_pattern = [{"POS": "PRON", "DEP": "nsubj", "LOWER": "we"}]

In [67]:
# Add search patterns to 'noun_matcher_2' under the key "NOUN_ID".
# The patterns are passed as one list: this form is accepted since spaCy 2.2.2 and is the
# only one spaCy 3 supports (the older 'add(key, None, *patterns)' form was removed there).
noun_matcher_2.add(
    "NOUN_ID",
    [
        analysis_pattern,
        evidence_pattern,
        finding_pattern,
        result_pattern,
        survey_pattern,
        trial_pattern,
        we_pattern,
    ],
)

In [68]:
# Keep only those Doc-objects (sentences) of 'verb_filtered_sentences_2' for which
# 'noun_matcher_2' returns at least one match: noun_filtered_sentences_2
noun_filtered_sentences_2 = [
    verb_doc for verb_doc in verb_filtered_sentences_2 if noun_matcher_2(verb_doc)
]

In [69]:
# Print every sentence in 'noun_filtered_sentences_2', prefixed with its list index
for sentence_number, sentence in enumerate(noun_filtered_sentences_2):
    print(f"({sentence_number}) {sentence}")

(0) We were unable to confirm a benefit of hydroxychloroquine or chloroquine, when used alone or with a macrolide, on in-hospital outcomes for COVID-19.
(1) This re-analysis reveals severe limitations in the methodology of this study, including ambiguous inclusion/exclusion of participant data and inconsistent analysis techniques, and yielded nonsignificant differences between control and treatment groups across any treatment days.
(2) This systematic review and meta-analysis showed no clinical benefits regarding HCQ treatment with/without azithromycin for COVID-19 patients.
(3) These results do not support the use of HCQ in patients hospitalised for documented SARS-CoV-2-positive hypoxic pneumonia.
(4) Interpretation Preliminary findings suggest that the higher CQ dosage (10-day regimen) should not be recommended for COVID-19 treatment because of its potential safety hazards.
(5) Preliminary evidence suggests potential benefit with chloroquine or hydroxychloroquine.
(6) The findings s

In [70]:
# Show the number of items in 'doc_list', 'verb_filtered_sentences_2' and
# 'noun_filtered_sentences_2' using 'len()'
print(f"Number of sentences in 'doc_list': {len(doc_list)}")
print(f"Number of sentences in 'verb_filtered_sentences_2': {len(verb_filtered_sentences_2)}")
print(f"Number of sentences in 'noun_filtered_sentences_2': {len(noun_filtered_sentences_2)}")

Number of sentences in 'doc_list': 216
Number of sentences in 'verb_filtered_sentences_2': 11
Number of sentences in 'noun_filtered_sentences_2': 9


## 5 Evaluation II

**Measuring Device**

In [71]:
# Load '../labeling/sentences_labeled.xlsx' as a fresh dataframe: measuring_df_4
measuring_df_4 = pd.read_excel("../labeling/sentences_labeled.xlsx")

In [72]:
# Make a list like 'noun_filtered_sentences_2', except that it holds the sentence text (strings) instead of Docs
noun_filtered_strings_2 = [doc.text for doc in noun_filtered_sentences_2]

In [73]:
# Define function 'compare_sentences()': It returns a boolean ('True'/'False') telling
# whether a sentence (string) is contained in a collection of filtered sentences
def compare_sentences(sentence, filtered_strings=None):
    """Return True if 'sentence' is contained in 'filtered_strings', else False.

    Parameters
    ----------
    sentence : str
        The sentence text to look up.
    filtered_strings : container of str, optional
        The sentences selected by the filter. Defaults to the notebook-level
        'noun_filtered_strings_2', so 'Series.apply(compare_sentences)' keeps working.

    Returns
    -------
    bool
    """
    if filtered_strings is None:
        # Fall back to the list built from 'noun_filtered_sentences_2' (resolved at call time)
        filtered_strings = noun_filtered_strings_2
    return sentence in filtered_strings

In [74]:
# New column 'prediction': the result of 'compare_sentences()' for every entry of
# 'measuring_df_4["sentence"]'
measuring_df_4["prediction"] = measuring_df_4["sentence"].map(compare_sentences)

In [75]:
# Cast column 'prediction' to 8-bit integers so that the booleans ('True'/'False') become '1'/'0'
measuring_df_4 = measuring_df_4.astype({"prediction": np.int8})

In [76]:
# Put the columns into the order 'sentence', 'label', 'prediction'
measuring_df_4 = measuring_df_4.loc[:, ["sentence", "label", "prediction"]]

In [77]:
# Show rows no. 71-75 of 'measuring_df_4' (the end of the 'iloc' slice is exclusive)
measuring_df_4.iloc[71:76]

Unnamed: 0,sentence,label,prediction
71,Proposals should be directed to the correspond...,0,0
72,Recent publications have brought attention to ...,0,0
73,The scientific community should consider this ...,0,0
74,A recent open-label study claimed that hydroxy...,0,0
75,This re-analysis reveals severe limitations in...,1,1


**Confusion Matrix**

In [78]:
# Confusion matrix with 'measuring_df_4["label"]' as the true classes and
# 'measuring_df_4["prediction"]' as the predicted classes
confusion_matrix(
    y_true=measuring_df_4["label"],
    y_pred=measuring_df_4["prediction"],
)

array([[202,   0],
       [  5,   9]], dtype=int64)

**Classification Report**

In [79]:
# Build the classification report with 'measuring_df_4["label"]' as the true classes and
# 'measuring_df_4["prediction"]' as the predicted classes, then show it on screen
report_4 = classification_report(
    y_true=measuring_df_4["label"],
    y_pred=measuring_df_4["prediction"],
)
print(report_4)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       202
           1       1.00      0.64      0.78        14

    accuracy                           0.98       216
   macro avg       0.99      0.82      0.89       216
weighted avg       0.98      0.98      0.97       216

