# Opposing sentences

## Import libraries

In [128]:
import pandas as pd
import spacy
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy import displacy

In [129]:
# Load spaCy's "en_core_web_lg" English pipeline; this one 'nlp' object is reused
# for every parsing and similarity call further down.
nlp = spacy.load("en_core_web_lg")

## Load dataframe

In [130]:
# Read the publication records from the local JSON export and preview the first rows.
df = pd.read_json("df_HCQ.json")
df.head()

Unnamed: 0,Publication ID,title,abstract,abstract_clean
0,pub.1126880632,COVID-19 and what pediatric rheumatologists sh...,"On March 11th, 2020 the World Health Organizat...","On March 11th, 2020 the World Health Organizat..."
1,pub.1127834352,Hydroxychloroquine or chloroquine with or with...,"BACKGROUND: Hydroxychloroquine or chloroquine,...","BACKGROUND: Hydroxychloroquine or chloroquine,..."
2,pub.1126667578,Hydroxychloroquine in patients mainly with mil...,Abstract Objectives To assess the efficacy and...,Abstract Objectives To assess the efficacy and...
3,pub.1125404383,Of chloroquine and COVID-19,Recent publications have brought attention to ...,Recent publications have brought attention to ...
4,pub.1127182972,An independent appraisal and re-analysis of hy...,A recent open-label study claimed that hydroxy...,A recent open-label study claimed that hydroxy...


In [131]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17 entries, 0 to 16
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Publication ID  17 non-null     object
 1   title           17 non-null     object
 2   abstract        17 non-null     object
 3   abstract_clean  17 non-null     object
dtypes: object(4)
memory usage: 680.0+ bytes


## Reshape dataframe

In [132]:
# Make new dataframe from 'df': df_HCQ
# '.copy()' makes 'df_HCQ' an independent frame, so the columns added to it later
# ('doc', 'sentences') do not trigger pandas' SettingWithCopyWarning.
df_HCQ = df[["Publication ID", "title", "abstract_clean"]].copy()

In [133]:
# Add column 'doc' to 'df_HCQ'
# 'df_HCQ["doc"]' shall contain Doc-objects made from abstracts.
# 'nlp.pipe' processes the abstracts as one batch instead of one pipeline call per
# row, and '.assign' builds a new frame instead of writing into what may be a
# slice of 'df' (which would raise SettingWithCopyWarning).
df_HCQ = df_HCQ.assign(doc=list(nlp.pipe(df_HCQ["abstract_clean"])))

In [134]:
def get_sentences(doc):
    """Return the sentences of `doc` as a list.

    `doc` is any object exposing an iterable `sents` attribute; the items are
    returned in iteration order.
    """
    return list(doc.sents)

In [135]:
# Add column 'sentences': for every Doc in 'doc', the list of its sentences
# as returned by 'get_sentences'.
df_HCQ["sentences"] = df_HCQ["doc"].apply(get_sentences)
df_HCQ.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17 entries, 0 to 16
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Publication ID  17 non-null     object
 1   title           17 non-null     object
 2   abstract_clean  17 non-null     object
 3   doc             17 non-null     object
 4   sentences       17 non-null     object
dtypes: object(5)
memory usage: 816.0+ bytes


In [136]:
df_HCQ.head(3)

Unnamed: 0,Publication ID,title,abstract_clean,doc,sentences
0,pub.1126880632,COVID-19 and what pediatric rheumatologists sh...,"On March 11th, 2020 the World Health Organizat...","(On, March, 11th, ,, 2020, the, World, Health,...","[(On, March, 11th, ,, 2020, the, World, Health..."
1,pub.1127834352,Hydroxychloroquine or chloroquine with or with...,"BACKGROUND: Hydroxychloroquine or chloroquine,...","(BACKGROUND, :, Hydroxychloroquine, or, chloro...","[(BACKGROUND, :), (Hydroxychloroquine, or, chl..."
2,pub.1126667578,Hydroxychloroquine in patients mainly with mil...,Abstract Objectives To assess the efficacy and...,"(Abstract, Objectives, To, assess, the, effica...","[(Abstract, Objectives, To, assess, the, effic..."


### Example abstracts

In [137]:
# Sentences of the abstract at row position 6, printed with their running index
sentences_6 = df_HCQ["sentences"].iloc[6]
for idx in range(len(sentences_6)):
    print(f"({idx}) {sentences_6[idx]}")

(0) Background Treatments are urgently needed to prevent respiratory failure and deaths from coronavirus disease 2019 (COVID-19).
(1) Hydroxychloroquine (HCQ) has received worldwide attention because of positive results from small studies.
(2) Methods
(3) We used data collected from routine care of all adults in 4 French hospitals with documented SARS-CoV-2 pneumonia and requiring oxygen ≥ 2 L/min to emulate a target trial aimed at assessing the effectiveness of HCQ at 600 mg/day.
(4) The composite primary endpoint was transfer to intensive care unit (ICU) within 7 days from inclusion and/or death from any cause.
(5) Analyses were adjusted for confounding factors by inverse probability of treatment weighting.
(6) Results
(7) This study included 181 patients with SARS-CoV-2 pneumonia; 84 received HCQ within 48 hours of admission (HCQ group) and 97 did not (no-HCQ group).
(8) Initial severity was well balanced between the groups.
(9) In the weighted analysis, 20.2% patients in the HCQ gr

In [138]:
# Sentences of the abstract at row position 14, printed with their running index
sentences_14 = df_HCQ["sentences"].iloc[14]
for idx in range(len(sentences_14)):
    print(f"({idx}) {sentences_14[idx]}")

(0) The coronavirus disease 2019 (COVID-19) virus is spreading rapidly, and scientists are endeavoring to discover drugs for its efficacious treatment in China.
(1) Chloroquine phosphate, an old drug for treatment of malaria, is shown to have apparent efficacy and acceptable safety against COVID-19 associated pneumonia in multicenter clinical trials conducted in China.
(2) The drug is recommended to be included in the next version of the Guidelines for the Prevention, Diagnosis, and Treatment of Pneumonia Caused by COVID-19 issued by the National Health Commission of the People's Republic of China for treatment of COVID-19 infection in larger populations in the future.


### Example of disagreement-sentences (from different abstracts)

In [139]:
# PRO: second sentence of abstract 14.
pro_example = sentences_14[1]
# CON: the disagreeing sentence is the last sentence of abstract 6
# (equivalently sentences_6[15]); '[-1]' is used to make that explicit.
con_example = sentences_6[-1]

# One call; the separators reproduce the blank lines between the two sentences.
print(f"(PRO) {pro_example}", "\n", f"(CON) {con_example}", sep="\n")

(PRO) Chloroquine phosphate, an old drug for treatment of malaria, is shown to have apparent efficacy and acceptable safety against COVID-19 associated pneumonia in multicenter clinical trials conducted in China.


(CON) These results do not support the use of HCQ in patients hospitalised for documented SARS-CoV-2-positive hypoxic pneumonia.


* The debated statement is a statement of causal relevancy. "Chloroquine phosphate"/"HCQ" is said to be effective against COVID-19 related pneumonia, AND it is said to be safe enough to do no harm to patients. I.e. (Hydroxy)chloroquine is suitable for giving it to patients. So HCQ is said to heal patients that suffer from COVID-19 related pneumonia. Thus, the debated statement is this: (CR) "Treatment with (Hydroxy)chloroquine is causally relevant for healing COVID-19 related pneumonia". If CR is true, then (Hydroxy)chloroquine could or should be used to treat COVID-19 related pneumonia. CR is the debated statement.
* PRO claims that some body of evidence "show[s]" CR. (The body of evidence is not stated in PRO, but I suppose some evidence is assumed that is suitable for "showing" CR.) CON claims that some body of evidence ("[t]hese results") "support[s]" CR. (Let's ignore for a moment that there is a negation in CON). Both PRO and CON say that there is a relation between some body of evidence and CR. Let's call that relation "support-relation", or SUP(x,y) for short. PRO states that the support-relation obtains between some body of evidence and CR (SUP(E,CR)), whereas CON denies that this relation obtains (NOT SUP(E,CR)).
* Claiming that the support-relation obtains between some body of evidence and CR means claiming that CR is true (or at least might well be true). But denying that the support-relation obtains is **not** to say that CR is false.
* There is disagreement between PRO and CON. The disagreement is **not** that PRO states that CR is true while CON states that CR is NOT true. Rather, the disagreement seems to be that PRO states SUP(E,CR) while CON states NOT SUP(E,CR). It therefore seems promising to search for expressions (verbs) that express SUP.

### Expression lists

#### Text doc

In [140]:
examples = [pro_example, con_example]

In [141]:
examples_text = [example.text for example in examples]

In [142]:
# Join the example sentences into one text. A single space separates them so the
# end of one sentence is never fused with the start of the next
# (e.g. "...in China.These results...") before the text is parsed again.
text = " ".join(examples_text)

In [143]:
exemp = nlp(text)

#### Verbs

In [144]:
# Verb matcher: matches a single token that is both the dependency ROOT of its
# sentence and tagged as a VERB.
# NOTE(review): 'add(key, None, pattern)' looks like the spaCy v2 call form
# (key, on_match, *patterns); spaCy v3 expects 'add(key, [pattern])' - confirm
# against the installed spaCy version before upgrading.
matcher_verb = Matcher(nlp.vocab)
pattern_verb = [{"DEP": "ROOT", "POS": "VERB"}]
matcher_verb.add("VERB_ID", None, pattern_verb)

In [145]:
verb_list = []

In [146]:
# Fill 'verb_list' with the lemma of every matched root verb, without duplicates
for _, tok_start, tok_end in matcher_verb(exemp):
    lemma = exemp[tok_start:tok_end].lemma_
    if lemma not in verb_list:
        verb_list.append(lemma)

In [147]:
verb_list

['show', 'support']

#### Statements

In [148]:
statements = []

In [149]:
# Fill 'statements'
for example in examples:
    for match_id, start, end in matcher_verb(example):
        if example[end + 1:-1].text in statements:
            continue
        else:
            statements.append(example[end + 1:-1].text)

In [150]:
statements

['have apparent efficacy and acceptable safety against COVID-19 associated pneumonia in multicenter clinical trials conducted in China',
 'use of HCQ in patients hospitalised for documented SARS-CoV-2-positive hypoxic pneumonia']

In [151]:
# Make docs from statements
statements_docs = list(nlp.pipe(statements))

In [152]:
# Statement matcher
matcher_3 = PhraseMatcher(nlp.vocab)
pattern_3 = statements_docs
matcher_3.add("STATEMENT", None, *pattern_3)

#### Evidence

In [153]:
#matcher_5 = Matcher(nlp.vocab)
#pattern_5 = [{"LEMMA": {"IN": verb_list}}]
#matcher_5.add("SUPPORT", None, pattern_5)

In [154]:
# Evidence matcher: a noun, optionally followed by an auxiliary and/or a negation,
# followed by one of the verbs collected in 'verb_list'.
matcher_2 = Matcher(nlp.vocab)
pattern_2 = [
    {"POS": "NOUN"},
    {"POS": "AUX", "OP": "?"},
    {"DEP": "neg", "OP": "?"},
    {"LEMMA": {"IN": verb_list}},
]
matcher_2.add("EVIDENCE_ID", None, pattern_2)

In [155]:
evidence_list = []

In [156]:
# Fill 'evidence_list' with the matched noun plus the token in front of it
# (e.g. "These results"), without duplicates.
for match_id, start, end in matcher_2(exemp):
    # Clamp at 0: for a match starting at the first token, 'start - 1' would be -1
    # and the slice would be empty instead of containing the noun.
    evidence_text = exemp[max(start - 1, 0):start + 1].text
    if evidence_text not in evidence_list:
        evidence_list.append(evidence_text)

In [157]:
evidence_list

['These results']

#### Not Support

In [158]:
# NOT_SUPPORT matcher
matcher_notSupport = Matcher(nlp.vocab)
pattern_notSupport = [{"DEP": "neg", "OP": "+"}, 
             {"LEMMA": {"IN": verb_list}}
]
matcher_notSupport.add("NOT_SUPPORT", None, pattern_notSupport)

In [159]:
notSupport_list = []

In [160]:
# Not support list
for match_id, start, end in matcher_notSupport(exemp):
    if exemp[start - 1:end].text in notSupport_list:
        continue
    else:
        notSupport_list.append(exemp[start - 1:end].text)

In [161]:
notSupport_list

['do not support']

#### Support

### Entity labelling

In [162]:
# Entities
exemp.ents = []

#### Statement

In [163]:
# Make entity and label 'STATEMENT'
for match_id, start, end in matcher_3(exemp):
    span_state = Span(exemp, start, end, label="STATEMENT")
    exemp.ents = list(exemp.ents) + [span_state]

#### Evidence

In [164]:
# Evidence matcher
matcher_6 = PhraseMatcher(nlp.vocab)
pattern_6 = list(nlp.pipe(evidence_list))
matcher_6.add("EVIDENCE", None, *pattern_6)

In [165]:
# Make entity and label 'EVIDENCE'
for match_id, start, end in matcher_6(exemp):
    span_evidence = Span(exemp, start, end, label="EVIDENCE")

    exemp.ents = list(exemp.ents) + [span_evidence]

#### Not-Support

In [166]:
# Not-Support matcher
matcher_7 = PhraseMatcher(nlp.vocab)
pattern_7 = list(nlp.pipe(notSupport_list))
matcher_7.add("NOT_SUPPORT", None, *pattern_7)

In [167]:
# Make entity and label 'NOT_SUPPORT'
for match_id, start, end in matcher_7(exemp):
    span_notSupport = Span(exemp, start, end, label="NOT_SUPPORT")

    exemp.ents = list(exemp.ents) + [span_notSupport]

In [168]:
#for match_id, start, end in matcher_5(exemp):
#    if exemp[start:end].text in support_list:
#        continue
#    else:
#        support_list.append(exemp[start:end].text)

### Disagreement - possibly

In [169]:
# List of doc.sents: 'sents'
sents = list(exemp.sents)

In [170]:
# Do sent1 and sent2 disagree?
# Two distinct sentences are reported as a (PRO)/(CON) pair when
#   1. sent1 contains a NOT_SUPPORT entity (it takes the CON role),
#   2. sent2 contains no NOT_SUPPORT entity (it takes the PRO role), and
#   3. their STATEMENT entities are similar enough to be about the same claim.
# Each qualifying pair is printed exactly once.

# Minimum similarity between two STATEMENT entities for the sentences to count
# as talking about the same statement.
SIMILARITY_THRESHOLD = 0.75


def _ents_with_label(sent, label):
    """Return the entities of `sent` whose label equals `label`."""
    return [ent for ent in sent.ents if ent.label_ == label]


for sent1 in sents:
    # Only a sentence that denies support can be the CON side.
    if not _ents_with_label(sent1, "NOT_SUPPORT"):
        continue
    statements_1 = _ents_with_label(sent1, "STATEMENT")

    for sent2 in sents:
        # Skip the sentence itself, and sentences that deny support themselves:
        # two denials do not disagree with each other.
        if sent1 == sent2 or _ents_with_label(sent2, "NOT_SUPPORT"):
            continue

        # Similarity of statements/causal hypotheses the sents express:
        # best score over all STATEMENT pairs, 0 if either sentence has none.
        similar = max(
            (
                ent1.similarity(ent2)
                for ent1 in statements_1
                for ent2 in _ents_with_label(sent2, "STATEMENT")
            ),
            default=0,
        )
        if similar < SIMILARITY_THRESHOLD:
            continue

        print("==========\n")
        print("(PRO):\n")
        displacy.render(sent2, style="ent", jupyter=True)
        print("\n\n")
        print("(CON):\n")
        displacy.render(sent1, style="ent", jupyter=True)
        print("\n==========")


(PRO):






(CON):




