# Current State of the Disagreement Project

## 0 Import Libraries

In [1]:
import pandas as pd
import spacy
from spacy.matcher import Matcher
from spacy.pipeline import EntityRuler
from spacy import displacy

In [2]:
# Load spaCy's English model "en_core_web_lg"; the resulting `nlp` object
# is the pipeline that later cells use to parse abstracts and sentences.
nlp = spacy.load("en_core_web_lg")

## 1 Outline

(A) Abstracts -> sentences -> filtered sentences (sections 2 - 4)

(B) Label entities (section 5)

(C) Disagreement pairs (section 6)
 - Separate affirmative sentences from negated sentences (6.1)
 - Assign sentences that show disagreement (6.2)
 
*Basic idea:*

>`"The findings support the hypothesis that these drugs have efficacy in the treatment of COVID-19."`

>`"These results do not support the use of HCQ in patients hospitalised for documented SARS-CoV-2-positive hypoxic pneumonia."`

|Sentence type|Noun (EVID/SCI)|Negation (NEG)|Verb (SUPP)|Statement (Span)|
|-------------|---------------|--------------|-----------|----------------|
|PRO          |`The findings` |              |`support`  |`the hypothesis that these drugs have efficacy in the treatment of COVID-19`|
|CON          |`These results`|`[do] not`    |`support`  |`the use of HCQ in patients hospitalised for documented SARS-CoV-2-positive hypoxic pneumonia`|

## 2 Load Dataframe

In [3]:
abstract_df = pd.read_json("data/HCQ_clean_abstracts.json")
abstract_df.head(3)

Unnamed: 0,Publication ID,title,abstract_clean
0,pub.1126880632,COVID-19 and what pediatric rheumatologists sh...,"On March 11th, 2020 the World Health Organizat..."
1,pub.1127834352,Hydroxychloroquine or chloroquine with or with...,"BACKGROUND: Hydroxychloroquine or chloroquine,..."
2,pub.1126667578,Hydroxychloroquine in patients mainly with mil...,Abstract Objectives To assess the efficacy and...


## 3 New Dataframe: From Abstracts to Sentences

In [5]:
# Make a dataframe that
## has a row for each sentence
## assigns a unique id to each sentence
## assigns the title of publication to each sentence
def single_sentences(dataframe, pipeline=None):
    """Split every abstract of `dataframe` into sentences, one output row each.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns "Publication ID", "title" and
        "abstract_clean". Rows are visited in their stored order, so the
        frame may carry any index (it does not have to be 0..n-1).
    pipeline : optional
        Object whose ``pipe(texts)`` yields one parsed document per text;
        each document must expose ``.sents`` and each sentence ``.text``.
        Defaults to the notebook-level ``nlp`` object.

    Returns
    -------
    pandas.DataFrame
        Columns "sentence_id", "title" and "sentence". A sentence id has the
        form "<Publication ID>-<n>", where n numbers the sentences of one
        abstract starting at 0.
    """
    if pipeline is None:
        pipeline = nlp  # notebook-level spaCy pipeline

    # Pull the columns out once; pairing them with zip() walks the rows by
    # position and therefore never confuses index labels with positions.
    publication_ids = dataframe["Publication ID"].tolist()
    titles = dataframe["title"].tolist()
    abstracts = dataframe["abstract_clean"].tolist()

    rows = []
    # pipe() parses the abstracts as a batch and yields them in input order.
    for publication_id, title, parsed in zip(publication_ids, titles, pipeline.pipe(abstracts)):
        for sentence_number, sentence in enumerate(parsed.sents):
            rows.append([f"{publication_id}-{sentence_number}", title, sentence.text])

    return pd.DataFrame(rows, columns=["sentence_id", "title", "sentence"])

In [5]:
# Create a dataframe that contains in each row one single sentence
# and its corresponding title and sentence ID as 
# its unique identifier: sentences_df
sentences_df = single_sentences(abstract_df)

In [6]:
sentences_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216 entries, 0 to 215
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   sentence_id  216 non-null    object
 1   title        216 non-null    object
 2   sentence     216 non-null    object
dtypes: object(3)
memory usage: 5.2+ KB


In [7]:
sentences_df.head(3)

Unnamed: 0,sentence_id,title,sentence
0,pub.1126880632-0,COVID-19 and what pediatric rheumatologists sh...,"On March 11th, 2020 the World Health Organizat..."
1,pub.1126880632-1,COVID-19 and what pediatric rheumatologists sh...,"The infection, transmitted by 2019 novel coron..."
2,pub.1126880632-2,COVID-19 and what pediatric rheumatologists sh...,"Italy was early and severely involved, with a ..."


In [8]:
# Make list of sentences (Doc-object) 
# from column 'sentence' of dataframe 'sentences_df':
# doc_list
doc_list = list(nlp.pipe(sentences_df["sentence"].to_list()))

In [9]:
len(doc_list)

216

## 4 Filter Sentences

### 4.1 Filter by Verb

**General Patterns:**

>`[{"POS": "VERB", "DEP": "ROOT", "LEMMA": {"IN": verbs}}]`

or

>`[{"POS": "VERB", "DEP": "xcomp", "LEMMA": {"IN": verbs}}]`

**Verbs that express a Support-Relation:**

* confirm
* find
* indicate
* reveal
* show
* suggest
* support

In [10]:
spacy.explain("xcomp")

'open clausal complement'

In [11]:
# Make  verb_matcher
verb_matcher = Matcher(nlp.vocab, validate=True)

In [12]:
# List of verbs
verbs = ["find", "indicate", "reveal", "show", "suggest", "support"]

# Make patterns
verb_pattern = [{"POS": "VERB", "DEP": "ROOT", "LEMMA": {"IN": verbs}}]
confirm_pattern = [{"POS": "VERB", "DEP": "xcomp", "LEMMA": "confirm"}]

In [13]:
# Add patterns to matcher
verb_matcher.add("SUPPORT_VERB_ID", None, verb_pattern, confirm_pattern)

In [14]:
# Keep only the sentences for which 'verb_matcher' reports at least one match
# (an empty match list is falsy)
verb_filtered_docs = [sentence_doc for sentence_doc in doc_list if verb_matcher(sentence_doc)]

### 4.2 Filter by Noun

**General Pattern:**

>`[{"DEP": "nsubj", "LEMMA": noun}]`

**Nouns that indicate Evidence:**

* analysis
* evidence
* finding
* result
* survey
* trial

**Pronoun, 1st person, plural:**

* we ("-PRON-")

In [15]:
# Make another matcher: 'noun_matcher'
noun_matcher = Matcher(nlp.vocab, validate=True)

In [16]:
# Make patterns to search for certain (evidence) nouns used as sentence subject
analysis_pattern = [{"DEP": "nsubj", "LEMMA": "analysis"}]
evidence_pattern = [{"DEP": "nsubj", "LEMMA": "evidence"}]
finding_pattern = [{"DEP": "nsubj", "LEMMA": "finding"}]

result_pattern = [{"DEP": "nsubj", "LEMMA": "result"}]
survey_pattern = [{"DEP": "nsubj", "LEMMA": "survey"}]
# NOTE(review): unlike the other nouns, "trial" is matched in any syntactic
# role (no "DEP": "nsubj") - confirm this is intended.
trial_pattern = [{"LEMMA": "trial"}]

# Make pattern to search for "we" as subject.
# Matching on the lower-cased token text keeps this to "we" only; the lemma
# "-PRON-" is shared by all pronouns and would also accept subjects such as
# "it" or "they".
we_pattern = [{"DEP": "nsubj", "LOWER": "we"}]

In [17]:
# Add Patterns to 'noun_matcher'
noun_matcher.add("NOUN_ID", None, analysis_pattern, evidence_pattern, finding_pattern, result_pattern, 
                     survey_pattern, trial_pattern, we_pattern)

In [18]:
# Narrow 'verb_filtered_docs' further: keep a sentence only if 'noun_matcher'
# reports at least one match for it (an empty match list is falsy)
noun_filtered_docs = [candidate for candidate in verb_filtered_docs if noun_matcher(candidate)]

In [19]:
print(f"Number of Sentences in 'doc_list': {len(doc_list)}")
print(f"Number of Sentences in 'verb_filtered_docs': {len(verb_filtered_docs)}")
print(f"Number of Sentences in 'noun_filtered_docs': {len(noun_filtered_docs)}")

Number of Sentences in 'doc_list': 216
Number of Sentences in 'verb_filtered_docs': 15
Number of Sentences in 'noun_filtered_docs': 12


### 4.3 Filter Dataframe by Relevant Sentences

In [20]:
# Goal: derive a new dataframe ('filtered_sentences_df') from 'sentences_df' that
# contains only the relevant sentences, i.e. sentences that were matched by both
# 'verb_matcher' and 'noun_matcher'.
# First step: collect the plain text of every Doc in 'noun_filtered_docs': filtered_strings
filtered_strings = [doc.text for doc in noun_filtered_docs]

In [21]:
# Filter 'sentences_df':
# Take only those rows where sentence in column 'sentence' is a member of 'filtered_strings'
# Store this filtered dataframe in 'filtered_sentences_df'
filtered_sentences_df = sentences_df.loc[sentences_df["sentence"].isin(filtered_strings)]

In [22]:
# Make proper index for 'filtered_sentences_df'

## Renumber the rows 0..n-1; drop=True discards the old row labels directly
## instead of first turning them into an 'index' column that must be removed again
filtered_sentences_df = filtered_sentences_df.reset_index(drop=True)

In [23]:
filtered_sentences_df

Unnamed: 0,sentence_id,title,sentence
0,pub.1127834352-20,Hydroxychloroquine or chloroquine with or with...,We were unable to confirm a benefit of hydroxy...
1,pub.1127182972-1,An independent appraisal and re-analysis of hy...,This re-analysis reveals severe limitations in...
2,pub.1126839717-12,Hydroxychloroquine Versus COVID-19: A Periodic...,The results of the meta-analysis on comparativ...
3,pub.1126839717-19,Hydroxychloroquine Versus COVID-19: A Periodic...,Meta-analysis indicated no significant prophyl...
4,pub.1126839717-21,Hydroxychloroquine Versus COVID-19: A Periodic...,This systematic review and meta-analysis showe...
5,pub.1126655433-13,No evidence of clinical efficacy of hydroxychl...,These results do not support the use of HCQ in...
6,pub.1126626949-14,Chloroquine diphosphate in two different dosag...,Interpretation Preliminary findings suggest th...
7,pub.1126596624-8,Rheumatologists’ perspective on coronavirus di...,Preliminary evidence suggests potential benefi...
8,pub.1127408847-18,Efficacy of chloroquine and hydroxychloroquine...,We found that COVID-19 infections are highly p...
9,pub.1127408847-21,Efficacy of chloroquine and hydroxychloroquine...,The findings support the hypothesis that these...


## 5 Label Entities

For the following see:

[https://spacy.io/usage/rule-based-matching#entityruler](https://spacy.io/usage/rule-based-matching#entityruler)

**Initialize EntityRuler**

In [24]:
# Initialize spacy's EntityRuler: ruler
ruler = EntityRuler(nlp, validate=True, overwrite_ents=True)

**SUPPORT-Verbs**

In [25]:
verb_patterns = [{"label": "SUPP", "pattern": [{"POS": "VERB", "DEP": "ROOT", "LEMMA": {"IN": ["find", "indicate", "reveal", "show", "suggest", "support"]}}]}, 
                 {"label": "SUPP", "pattern": [{"POS": "VERB", "DEP": "xcomp", "LEMMA": "confirm"}]}]

**Nouns**

In [26]:
# Phrase patterns that label the evidence-denoting subject of a sentence as EVID.
# Every phrase appears exactly once.
# NOTE(review): no phrase covers the bare subject "Meta-analysis", which opens
# one of the filtered sentences - confirm whether it should be added.
evidence_patterns = [
    {"label": "EVID", "pattern": phrase}
    for phrase in (
        "This re-analysis",
        "This systematic review and meta-analysis",
        "The results",
        "These results",
        "Interpretation Preliminary findings",
        "Preliminary evidence",
        "The findings",
        "our survey",
    )
]

In [27]:
# EntityRuler phrase pattern for trials (label EVID).
# NOTE(review): this rebinds 'trial_pattern', which earlier held the token pattern
# passed to 'noun_matcher.add'. Re-running that earlier cell after this one would
# hand the matcher this dict list instead - consider a distinct name.
trial_pattern = [{"label": "EVID", "pattern": "multicenter clinical trials"}]

In [28]:
# EntityRuler phrase pattern that labels the exact text "We" as SCI.
# NOTE(review): this rebinds 'we_pattern', which earlier held the token pattern
# passed to 'noun_matcher.add'. Re-running that earlier cell after this one would
# hand the matcher this dict list instead - consider a distinct name.
we_pattern = [{"label": "SCI", "pattern": "We"}]

**Negations**

In [29]:
negation_patterns = [{"label": "NEG", "pattern": [{"LEMMA": {"IN": ["not", "no", "unable"]}}]}]

**Apply Ruler to Sentences**

In [30]:
# Add patterns to ruler
ruler.add_patterns(verb_patterns)
ruler.add_patterns(evidence_patterns)
ruler.add_patterns(trial_pattern)
ruler.add_patterns(we_pattern)
ruler.add_patterns(negation_patterns)

  self.phrase_matcher.add(label, patterns)
  self.phrase_matcher.add(label, patterns)


In [31]:
# Add ruler to pipeline of the nlp-object.
# Adding a component whose name is already registered raises an error, so on a
# re-run of this cell the registered ruler is swapped for the current 'ruler'
# instead of being added a second time.
if ruler.name in nlp.pipe_names:
    nlp.replace_pipe(ruler.name, ruler)
else:
    nlp.add_pipe(ruler)

In [32]:
# Make list of Doc-objects from 'filtered_sentences_df["sentence"]': docs
docs = list(nlp.pipe(filtered_sentences_df["sentence"].to_list()))

In [33]:
# Show an example: Sentence with Named Entities
displacy.render(docs[5], style="ent", jupyter=True)

## 6 Disagreement Pairs

### 6.1 Separating Sentences with Regard to Negation

In [34]:
# Make 'negation_matcher'
negation_matcher = Matcher(nlp.vocab, validate=True)

In [35]:
# Negation Pattern
negation_pattern = [{"LEMMA": {"IN": ["not", "no", "unable"]}}]

In [36]:
# Add 'negation_pattern' to 'negation_matcher'
negation_matcher.add("NEGATION_ID", None, negation_pattern)

In [37]:
# List of affirmative sentences: sents
sents = []

In [38]:
# List of negated sentences: negated_sents
negated_sents = []

In [39]:
# Define function which sorts sentences into two groups 
# depending on whether a sentence is affirmative or negated
def negation_filter(sent_list):
    """Distribute the Doc-objects of `sent_list` over two notebook-level lists.

    A Doc for which 'negation_matcher' finds at least one match is appended
    to 'negated_sents'; every other Doc is appended to 'sents'. Nothing is
    returned, and the target lists are only appended to, so calling the
    function again adds the sentences again.
    """
    for candidate in sent_list:
        is_negated = len(negation_matcher(candidate)) > 0
        target_list = negated_sents if is_negated else sents
        target_list.append(candidate)

In [40]:
# Sort sentences in 'docs'
negation_filter(docs)

### 6.2 Pairs of Disagreeing Sentences

In [41]:
# Create 'span_matcher'
# Purpose: Slice Doc-objects in 'sents' and 'negated_sents' into Span-objects
# The Span-objects are needed to compute similarity
span_matcher = Matcher(nlp.vocab, validate=True)

span_pattern_1 = [{"POS": "VERB", "DEP": "ROOT", "LEMMA": {"IN": ["find", "indicate", "reveal", "show", "suggest", "support"]}}]
span_pattern_2 = [{"POS": "VERB", "DEP": "xcomp", "LEMMA": "confirm"}]

span_matcher.add("SPAN_ID", None, span_pattern_1, span_pattern_2)

In [42]:
# Define a function that creates pairs of disagreeing sentences and stores each pair in a list: disagreement_pairs()
def _statement_after_verb(doc, matcher, skip_tokens=0):
    """Return the part of `doc` that follows its last verb match, or None.

    None is returned when `matcher` finds nothing in `doc` or when no token
    is left after the match (and the skipped tokens).
    """
    matches = matcher(doc)
    if not matches:
        return None
    # When a sentence has several matches, the last one determines the span.
    _, _, verb_end = matches[-1]
    statement = doc[verb_end + skip_tokens:]
    return statement if len(statement) > 0 else None


# Define a function that creates pairs of disagreeing sentences and stores each pair in a list: disagreement_pairs()
def disagreement_pairs(list_of_affirmative_sentences, list_of_negated_sentences,
                       similarity_threshold=0.87, matcher=None):
    """Pair affirmative with negated sentences whose statements are similar.

    The statement of a sentence is the span that follows the verb located by
    `matcher`. An affirmative and a negated sentence form a pair when the
    similarity of their statements reaches `similarity_threshold`.

    Parameters
    ----------
    list_of_affirmative_sentences : iterable of Doc-objects
    list_of_negated_sentences : iterable of Doc-objects
    similarity_threshold : float, optional
        Minimum value of ``span1.similarity(span2)`` for a pair (default 0.87).
    matcher : callable, optional
        Called with a Doc; returns a list of ``(match_id, start, end)``
        tuples. Defaults to the notebook-level 'span_matcher'.

    Returns
    -------
    list of (Doc, Doc)
        Tuples ``(affirmative, negated)``, ordered by affirmative sentence
        first and negated sentence second. Sentences without a verb match
        or without any token after the verb are left out, so they can
        neither raise an error nor be compared via another sentence's span.
    """
    if matcher is None:
        matcher = span_matcher  # notebook-level Matcher for the support verbs

    # The negated statements do not depend on the affirmative sentence, so
    # they are sliced once up front.
    # NOTE(review): the negated statement starts one token later than the
    # affirmative one (the token right after the verb is dropped); this
    # asymmetry is kept as found - confirm it is intended.
    negated_statements = []
    for doc_neg in list_of_negated_sentences:
        span2 = _statement_after_verb(doc_neg, matcher, skip_tokens=1)
        if span2 is not None:
            negated_statements.append((doc_neg, span2))

    # List of pairs of disagreeing sentences, returned by the function
    pairs_of_disagreeing_sentences = []

    for doc in list_of_affirmative_sentences:
        span1 = _statement_after_verb(doc, matcher)
        if span1 is None:
            continue

        for doc_neg, span2 in negated_statements:
            # Sufficiently similar statements, one affirmed and one negated,
            # are taken to express disagreement.
            if span1.similarity(span2) >= similarity_threshold:
                pairs_of_disagreeing_sentences.append((doc, doc_neg))

    # Return the list of sentence pairs that show disagreement
    return pairs_of_disagreeing_sentences

In [43]:
# Make a list of pairs of disagreeing sentences from 'sents' and 'negated_sents': disagreementPairs
disagreementPairs = disagreement_pairs(sents, negated_sents)

In [44]:
# Show pairs of disagreeing sentences with highlighted entities
for (doc, doc_neg) in disagreementPairs[2:5]:
    print("==========\n")
    print("(PRO)\n")
    displacy.render(doc, style="ent", jupyter=True)
    print("\n(CON)\n")
    displacy.render(doc_neg, style="ent", jupyter=True)
    print("\n")


(PRO)




(CON)






(PRO)




(CON)






(PRO)




(CON)







In [45]:
sents[0]

This re-analysis reveals severe limitations in the methodology of this study, including ambiguous inclusion/exclusion of participant data and inconsistent analysis techniques, and yielded nonsignificant differences between control and treatment groups across any treatment days.