## Implementing Benchmarks

In [182]:
import itertools
import logging
import os
import re
import ssl
import gensim.downloader as api
import nltk
import pandas as pd
import numpy as np
import warnings


from fse import SplitIndexedList
from fse.models import uSIF
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from nltk.tokenize.treebank import TreebankWordTokenizer

from datasets import load_dataset
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


import torch
import torch.nn as nn

nltk.download('vader_lexicon')

# Load GLOVE, which is necessary for uSIF embeddings
if not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None):
    ssl._create_default_https_context = ssl._create_unverified_context
logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
glove = api.load("glove-wiki-gigaword-300")

def polarity_v_score(text: str) -> float:
    """
    Calculate polarity of a sentence using Vader.

    :param text: input sentence
    :return: polarity value of sentence. Ranges from -1 (negative) to 1 (positive).
    """
    vader = SentimentIntensityAnalyzer()
    return vader.polarity_scores(text)["compound"]

def uSIF_similarity(row, model, s):
    prem = row["sentence1"]
    prem_idx = s.items.index(prem)
    hyp = row["sentence2"]
    hyp_idx = s.items.index(hyp)
    similarity = model.sv.similarity(prem_idx, hyp_idx)
    
    return similarity

def calculate_polarity_similarity(df):
    # Calculate polarity
    df["s1_polarity"] = df.apply(lambda row: polarity_v_score(row['sentence1']), axis=1)
    df["s2_polarity"] = df.apply(lambda row: polarity_v_score(row['sentence2']), axis=1)

    unique_claims_set = set(list(df["sentence1"]) + list(df["sentence2"]))
    s = SplitIndexedList(list(unique_claims_set))
    print(f"Found {len(s)} unique claims in DF")

    # Calculate similarity
    model = uSIF(glove, workers=2, lang_freq="en")
    model.train(s)
    
    df["uSIF_similarity"] = df.apply(lambda row: uSIF_similarity(row, model, s), axis=1)
    return df
    
def classify_polarity_similarity(df_dict, val_set="val"):
    
    feature_columns = ["s1_polarity", "s2_polarity", "uSIF_similarity"]
    X = df_dict["train"][feature_columns]
    y = df_dict["train"]["label"]

    model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
    model.fit(X, y)
    yhat = model.predict(df_dict[val_set][feature_columns])
    print(classification_report(df_dict[val_set]["label"], yhat, digits=3))

    return None


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/dnsosa/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
2022-11-01 17:38:02,365 : MainThread : INFO : loading projection weights from /Users/dnsosa/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz
2022-11-01 17:39:03,657 : MainThread : INFO : KeyedVectors lifecycle event {'msg': 'loaded (400000, 300) matrix of type float32 from /Users/dnsosa/gensim-data/glove-wiki-gigaword-300/glove-wiki-gigaword-300.gz', 'binary': False, 'encoding': 'utf8', 'datetime': '2022-11-01T17:39:03.656798', 'gensim': '4.2.0', 'python': '3.7.13 (default, Mar 28 2022, 07:24:34) \n[Clang 12.0.0 ]', 'platform': 'Darwin-20.6.0-x86_64-i386-64bit', 'event': 'load_word2vec_format'}


In [110]:
from covid_lit_contra_claims.data.constants import *

def load_roam_sep_data(roam_path):
    label_map = {"entailment": 0, "neutral": 1, "contradiction": 2}
    roam_df_list = []
    splits = ["Train", "Val", "Test"]
    for data_split in splits:
        roam_df = pd.read_excel(roam_path, sheet_name=data_split)
        roam_df = roam_df.drop(roam_df.columns[0], axis=1)
        roam_df = roam_df.dropna().reset_index(drop=True)
        roam_df = roam_df.rename(columns={"text1": "sentence1", "text2": "sentence2", "annotation": "label"})
        roam_df = roam_df[roam_df["label"].isin(label_map.keys())]
        roam_df.replace({"label": label_map})
        roam_df_list.append(roam_df)
    
    return roam_df_list

In [183]:
roam_df_list = load_roam_sep_data(ROAM_SEP_PATH)
roam_df_list_polsim = [calculate_polarity_similarity(roam_df_split) for roam_df_split in roam_df_list]
roam_df_dict = dict(zip(["train", "val", "test"], roam_df_list_polsim))
classify_polarity_similarity(roam_df_dict, val_set="test")

2022-11-01 17:39:09,859 : MainThread : INFO : no frequency mode: using wordfreq for estimation of frequency for language: en


Found 102 unique claims in DF


2022-11-01 17:39:11,273 : MainThread : INFO : scanning all indexed sentences and their word counts
2022-11-01 17:39:11,274 : MainThread : INFO : finished scanning 102 sentences with an average length of 33 and 3428 total words
2022-11-01 17:39:11,566 : MainThread : INFO : estimated memory for 102 sentences with 300 dimensions and 400000 vocabulary: 459 MB (0 GB)
2022-11-01 17:39:11,569 : MainThread : INFO : initializing sentence vectors for 102 sentences
2022-11-01 17:39:11,571 : MainThread : INFO : pre-computing uSIF weights for 400000 words
2022-11-01 17:39:12,689 : MainThread : INFO : begin training
2022-11-01 17:39:12,693 : MainThread : INFO : worker thread finished; awaiting finish of 1 more threads
2022-11-01 17:39:12,694 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2022-11-01 17:39:12,703 : MainThread : INFO : computing 5 principal components took 0s
2022-11-01 17:39:12,705 : MainThread : INFO : removing 5 principal components took 0s
2022-11-0

Found 48 unique claims in DF


2022-11-01 17:39:15,733 : MainThread : INFO : scanning all indexed sentences and their word counts
2022-11-01 17:39:15,733 : MainThread : INFO : finished scanning 48 sentences with an average length of 34 and 1653 total words
2022-11-01 17:39:16,017 : MainThread : INFO : estimated memory for 48 sentences with 300 dimensions and 400000 vocabulary: 459 MB (0 GB)
2022-11-01 17:39:16,018 : MainThread : INFO : initializing sentence vectors for 48 sentences
2022-11-01 17:39:16,019 : MainThread : INFO : pre-computing uSIF weights for 400000 words
2022-11-01 17:39:17,106 : MainThread : INFO : begin training
2022-11-01 17:39:17,109 : MainThread : INFO : worker thread finished; awaiting finish of 1 more threads
2022-11-01 17:39:17,109 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2022-11-01 17:39:17,114 : MainThread : INFO : computing 5 principal components took 0s
2022-11-01 17:39:17,115 : MainThread : INFO : removing 5 principal components took 0s
2022-11-01 1

Found 54 unique claims in DF


2022-11-01 17:39:20,569 : MainThread : INFO : scanning all indexed sentences and their word counts
2022-11-01 17:39:20,569 : MainThread : INFO : finished scanning 54 sentences with an average length of 36 and 1997 total words
2022-11-01 17:39:20,843 : MainThread : INFO : estimated memory for 54 sentences with 300 dimensions and 400000 vocabulary: 459 MB (0 GB)
2022-11-01 17:39:20,844 : MainThread : INFO : initializing sentence vectors for 54 sentences
2022-11-01 17:39:20,845 : MainThread : INFO : pre-computing uSIF weights for 400000 words
2022-11-01 17:39:21,947 : MainThread : INFO : begin training
2022-11-01 17:39:21,948 : MainThread : INFO : worker thread finished; awaiting finish of 1 more threads
2022-11-01 17:39:21,949 : MainThread : INFO : worker thread finished; awaiting finish of 0 more threads
2022-11-01 17:39:21,952 : MainThread : INFO : computing 5 principal components took 0s
2022-11-01 17:39:21,953 : MainThread : INFO : removing 5 principal components took 0s
2022-11-01 1

               precision    recall  f1-score   support

contradiction      0.000     0.000     0.000        21
   entailment      0.480     0.182     0.264        66
      neutral      0.543     0.880     0.672       100

     accuracy                          0.535       187
    macro avg      0.341     0.354     0.312       187
 weighted avg      0.460     0.535     0.452       187



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [168]:
#xx = roam_df_dict["test"]
#xx[xx["label"] == "contradiction"]

xx = roam_df_dict["train"]
xx[xx["label"] == "contradiction"]

Unnamed: 0,sentence1,sentence2,label,s1_polarity,s2_polarity,uSIF_similarity
8,"to date, only remdesivir and dexamethasone hav...",hydroxychloroquine was administered relatively...,contradiction,0.5574,-0.3412,0.381222
12,"to date, only remdesivir and dexamethasone hav...","in short, neither dexamethasone nor hydrocorti...",contradiction,0.5574,0.7351,0.178178
26,these findings formed the basis of a recent ra...,"in moderate to severe ards covid-19 patients, ...",contradiction,0.7425,-0.128,0.28745
33,these findings formed the basis of a recent ra...,cao and co-workers showed 199 adult patients w...,contradiction,0.7425,-0.8625,0.164452
37,these findings formed the basis of a recent ra...,"the third patient, who had started receiving h...",contradiction,0.7425,-0.3182,0.231615
62,it is important to underline that the immunomo...,"in short, neither dexamethasone nor hydrocorti...",contradiction,0.8627,0.7351,0.211342
66,it is important to underline that the immunomo...,withaferin a alone or in combination with drug...,contradiction,0.8627,-0.3182,0.44646
83,"the third patient, who had started receiving h...",lopinavir-ritonavir and ribavirin have been us...,contradiction,-0.3182,0.4939,0.391408
97,in the novel coronavirus pneumonia diagnosis ...,few drugs like remdesivir and dexamethasone ha...,contradiction,0.2263,0.9127,0.142546
110,in the novel coronavirus pneumonia diagnosis ...,the most prominent finding to emerge from this...,contradiction,0.2263,0.3804,0.172027


In [152]:
df.head()

Unnamed: 0,sentence1,sentence2,label,s1_polarity,s2_polarity,uSIF_similarity
0,mortality at 28 days was significantly lower i...,"finally, the most recent and promising researc...",entailment,0.7717,0.7624,0.266236
1,an in vitro study found that remdesivir and ch...,specific therapeutic procedures suggested to i...,entailment,-0.2023,-0.8316,0.249282
2,an in vitro study found that remdesivir and ch...,in the novel coronavirus pneumonia diagnosis ...,neutral,-0.2023,0.2263,0.23482
3,an in vitro study found that remdesivir and ch...,the most prominent finding to emerge from this...,neutral,-0.2023,0.3804,0.164445
4,an in vitro study found that remdesivir and ch...,"remdesivir, favipiravir, baricinitib, and anak...",neutral,-0.2023,-0.6597,0.129485


In [102]:
from collections import Counter
from itertools import product
from nltk.tokenize.treebank import TreebankWordTokenizer
from sklearn.linear_model import LogisticRegression



def classify_negative_parity(premise, hypothesis):
    pass


In [None]:
from covid_lit_contra_claims.evaluation.nli_utils import glove2dict

glove_lookup = glove2dict(
    os.path.join(GLOVE_HOME, 'glove.6B.300d.txt'))

def glove_leaves_phi(ex, np_func=np.mean):
    """
    Represent `ex` as a combination of the vector of their words,
    and concatenate these two combinator vectors.

    Parameters
    ----------
    ex : NLIExample

    np_func : function
        A numpy matrix operation that can be applied columnwise,
        like `np.mean`, `np.sum`, or `np.prod`. The requirement is that
        the function take `axis=0` as one of its arguments (to ensure
        columnwise combination) and that it return a vector of a
        fixed length, no matter what the size of the tree is.

    Returns
    -------
    np.array

    """
    prem_vecs = _get_tree_vecs(ex.premise, glove_lookup, np_func)
    hyp_vecs = _get_tree_vecs(ex.hypothesis, glove_lookup, np_func)
    return np.concatenate((prem_vecs, hyp_vecs))


def _get_tree_vecs(text, lookup, np_func):
    tokens = tokenizer.tokenize(text)    
    allvecs = np.array([lookup[w] for w in tokens if w in lookup])
    if len(allvecs) == 0:
        dim = len(next(iter(lookup.values())))
        feats = np.zeros(dim)
    else:
        feats = np_func(allvecs, axis=0)
    return feats

%%time
_ = nli.experiment(
    train_reader=nli.NLIReader(roam_datasets['train']),
    phi=glove_leaves_phi,
    train_func=fit_softmax_with_hyperparameter_search,
    assess_reader=nli.NLIReader(roam_datasets['val']),
    vectorize=False)  # Ask `experiment` not to featurize; we did it already.

In [146]:
glove

NameError: name 'glove' is not defined

Unnamed: 0,sentence1,sentence2,labels
0,mortality at 28 days was significantly lower i...,"finally, the most recent and promising researc...",entailment
1,an in vitro study found that remdesivir and ch...,specific therapeutic procedures suggested to i...,entailment
2,an in vitro study found that remdesivir and ch...,in the novel coronavirus pneumonia diagnosis ...,neutral
3,an in vitro study found that remdesivir and ch...,the most prominent finding to emerge from this...,neutral
4,an in vitro study found that remdesivir and ch...,"remdesivir, favipiravir, baricinitib, and anak...",neutral
...,...,...,...
429,"in the dexamethasone group, the incidence of d...",we recommend decreasing the dose of dexamethas...,neutral
430,we report herein our experience regarding the ...,our case also suggests that a brief course of ...,neutral
431,qt prolongation should be considered when usin...,conclusions therapeutic regimens of ifn- + lo...,neutral
432,roads less traveled might also be considered o...,arabi and colleagues initiated a placebo-contr...,neutral


In [1]:


snli = load_dataset("snli")


Downloading builder script:   0%|          | 0.00/3.82k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

Downloading and preparing dataset snli/plain_text (download: 90.17 MiB, generated: 65.51 MiB, post-processed: Unknown size, total: 155.68 MiB) to /Users/dnsosa/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b...


Downloading:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Dataset snli downloaded and prepared to /Users/dnsosa/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [54]:
snli


DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 550152
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
})

## CS 224 NLI Baselines

In [169]:
# From NLI
# References: https://github.com/cgpotts/cs224u/blob/afd64b41f845b0f444b152d0f7acf2a45228349a/nli.py#L186
from covid_lit_contra_claims.evaluation.nli_utils import fit_classifier_with_hyperparameter_search, glove2dict

tokenizer = TreebankWordTokenizer()

# Hypothesis only benchmark
def hypothesis_only_unigrams_phi(ex):
    return Counter(tokenizer.tokenize(ex.hypothesis))

def premise_only_unigrams_phi(ex):
    return Counter(tokenizer.tokenize(ex.premise))

def word_overlap_phi(ex):
    words1 = {w.lower() for w in tokenizer.tokenize(ex.premise)}
    words2 = {w.lower() for w in tokenizer.tokenize(ex.hypothesis)}
    return Counter(words1 & words2)

def word_cross_product_phi(ex):
    words1 = [w.lower() for w in tokenizer.tokenize(ex.premise)]
    words2 = [w.lower() for w in tokenizer.tokenize(ex.hypothesis)]
    return Counter([(w1, w2) for w1, w2 in product(words1, words2)])


#GLOVE_HOME = os.path.join('data', 'glove.6B')
#glove_lookup = glove2dict(
#    os.path.join(GLOVE_HOME, 'glove.6B.300d.txt'))

def glove_leaves_phi(ex, np_func=np.mean):
    """
    Represent `ex` as a combination of the vector of their words,
    and concatenate these two combinator vectors.

    Parameters
    ----------
    ex : NLIExample

    np_func : function
        A numpy matrix operation that can be applied columnwise,
        like `np.mean`, `np.sum`, or `np.prod`. The requirement is that
        the function take `axis=0` as one of its arguments (to ensure
        columnwise combination) and that it return a vector of a
        fixed length, no matter what the size of the tree is.

    Returns
    -------
    np.array

    """
    prem_vecs = _get_tree_vecs(ex.premise, glove_lookup, np_func)
    hyp_vecs = _get_tree_vecs(ex.hypothesis, glove_lookup, np_func)
    return np.concatenate((prem_vecs, hyp_vecs))


def _get_tree_vecs(text, lookup, np_func):
    tokens = tokenizer.tokenize(text)    
    allvecs = np.array([lookup[w] for w in tokens if w in lookup])
    if len(allvecs) == 0:
        dim = len(next(iter(lookup.values())))
        feats = np.zeros(dim)
    else:
        feats = np_func(allvecs, axis=0)
    return feats

In [98]:
def fit_softmax(X, y):
    mod = LogisticRegression(
        fit_intercept=True,
        solver='liblinear',
        multi_class='ovr')
    mod.fit(X, y)
    return mod

def fit_softmax_with_hyperparameter_search(X, y):
    """
    A MaxEnt model of dataset with hyperparameter cross-validation.

    Parameters
    ----------
    X : 2d np.array
        The matrix of features, one example per row.

    y : list
        The list of labels for rows in `X`.

    Returns
    -------
    sklearn.linear_model.LogisticRegression
        A trained model instance, the best model found.

    """

    mod = LogisticRegression(
        fit_intercept=True,
        max_iter=5,  ## A small number of iterations.
        solver='liblinear',
        multi_class='ovr')

    param_grid = {
        'C': [0.4, 0.6, 0.8, 1.0],
        'penalty': ['l1','l2']}

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        bestmod = fit_classifier_with_hyperparameter_search(
            X, y, mod, param_grid=param_grid, cv=3)

    return bestmod

In [178]:
def calculate_baseline_metrics(dataset, baseline_classifier, hp_opt=False, val_set="val"):
    
    if hp_opt:
        baseline_classifier_experiment_xval = nli.experiment(
            train_reader=nli.NLIReader(dataset['train']),
            phi=baseline_classifier,
            train_func=fit_softmax_with_hyperparameter_search,
            assess_reader=None,
            verbose=False)

        optimized_baseline_classifier = baseline_classifier_experiment_xval['model']
        del baseline_classifier_experiment_xval

        def fit_optimized_baseline_classifier(X, y):
            optimized_baseline_classifier.max_iter = 1000 # To convergence in this phase!
            optimized_baseline_classifier.fit(X, y)
            return optimized_baseline_classifier
        
    
        train_func = fit_optimized_baseline_classifier
        
    else: 
        train_func = fit_softmax
    
    baseline_results = nli.experiment(train_reader=nli.NLIReader(dataset['train']),
                                      phi=baseline_classifier,
                                      train_func=train_func,
                                      assess_reader=nli.NLIReader(dataset[val_set]))

    return baseline_results


### Load Dataset

In [74]:
%%time

from datasets import load_dataset
from covid_lit_contra_claims.evaluation import nli

from covid_lit_contra_claims.data.CreateDatasetUtilities import load_roam_full_data
from covid_lit_contra_claims.data.CreateDataset import create_roam_dataset
from covid_lit_contra_claims.data.constants import *

roam_dataset = create_roam_dataset(ROAM_SEP_PATH)
roam_dataset = roam_dataset.rename_column("labels", "label")
roam_dataset = roam_dataset.rename_column("sentence1", "premise")
roam_dataset = roam_dataset.rename_column("sentence2", "hypothesis")

# snli = load_dataset("snli")

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

CPU times: user 495 ms, sys: 38.2 ms, total: 533 ms
Wall time: 549 ms


### Word Overlap

In [179]:
%%time
word_overlap_results = calculate_baseline_metrics(roam_dataset, word_overlap_phi, hp_opt=True, val_set="test")

Best params: {'C': 1.0, 'penalty': 'l2'}
Best score: 0.387
               precision    recall  f1-score   support

contradiction      0.000     0.000     0.000        21
   entailment      0.455     0.303     0.364        66
      neutral      0.563     0.800     0.661       100

     accuracy                          0.535       187
    macro avg      0.339     0.368     0.342       187
 weighted avg      0.462     0.535     0.482       187

CPU times: user 413 ms, sys: 148 ms, total: 561 ms
Wall time: 818 ms


### Word Cross-Product

In [180]:
%%time
word_cross_product_results = calculate_baseline_metrics(roam_dataset, word_cross_product_phi, hp_opt=True, val_set="test")

Best params: {'C': 1.0, 'penalty': 'l1'}
Best score: 0.559
               precision    recall  f1-score   support

contradiction      0.167     0.048     0.074        21
   entailment      0.360     0.545     0.434        66
      neutral      0.531     0.430     0.475       100

     accuracy                          0.428       187
    macro avg      0.353     0.341     0.328       187
 weighted avg      0.430     0.428     0.415       187

CPU times: user 26.2 s, sys: 946 ms, total: 27.1 s
Wall time: 11.4 s


### Hypothesis- and Premise-Only Unigrams

In [181]:
%%time
hypothesis_unigrams_results = calculate_baseline_metrics(roam_dataset, hypothesis_only_unigrams_phi, hp_opt=False, val_set="test")
premise_unigrams_results = calculate_baseline_metrics(roam_dataset, premise_only_unigrams_phi, hp_opt=False, val_set="test")

               precision    recall  f1-score   support

contradiction      0.000     0.000     0.000        21
   entailment      0.340     0.258     0.293        66
      neutral      0.537     0.720     0.615       100

     accuracy                          0.476       187
    macro avg      0.292     0.326     0.303       187
 weighted avg      0.407     0.476     0.433       187

               precision    recall  f1-score   support

contradiction      0.000     0.000     0.000        21
   entailment      0.333     0.439     0.379        66
      neutral      0.510     0.510     0.510       100

     accuracy                          0.428       187
    macro avg      0.281     0.316     0.296       187
 weighted avg      0.390     0.428     0.407       187

CPU times: user 239 ms, sys: 6.83 ms, total: 246 ms
Wall time: 248 ms


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Glove Embeddings

In [None]:


%%time
_ = nli.experiment(
    train_reader=nli.NLIReader(roam_datasets['train']),
    phi=glove_leaves_phi,
    train_func=fit_softmax_with_hyperparameter_search,
    assess_reader=nli.NLIReader(roam_datasets['val']),
    vectorize=False)  # Ask `experiment` not to featurize; we did it already.