In [1]:
# Directory that holds the ALIGNED GloVe embeddings
DATA_DIR = './data'

# Root directory of the TensorBoard logs (the checkpoint and results_*.json
# lookups further down are also resolved relative to it)
LOGS_DIR = './lisa_logs'

# Row order of the models in the result tables
MODEL_ORDER = [
    'baseline',
    'lstm',
    'bilstm',
    'bilstm-max',
]

## Package Imports

In [2]:
from encoders import *
from glove import GloVeEmbeddings
from models import Classifier
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
from torchtext.data.utils import get_tokenizer

import glob, os, re, spacy, torch
import pandas as pd

# The tokenizer created further down is configured with the 'en_core_web_sm'
# spaCy model, so download it here when it is not installed as a package yet.
if not spacy.util.is_package("en_core_web_sm"):
    print("Downloading SpaCy English model (small)")
    spacy.cli.download("en_core_web_sm")

## Model Demonstration

In [3]:
# Embedding store built from DATA_DIR; its `vectors` and `get_id` are used by the cells below
glove = GloVeEmbeddings(DATA_DIR)
# torchtext tokenizer backed by the small English spaCy model
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

Reading pre-trained GloVe embeddings from disk


In [4]:
# Checkpoints are searched for below LOGS_DIR; the regex captures the directory
# name right in front of `version_<n>` and that name is used as the model name.
CKPTS_GLOB = "*/*/checkpoints/*.ckpt"
CKPTS_PATTERN = r"([^\/]+)\/version_\d+.*\.ckpt"

EMBED_DIM = 300
LSTM_STATE_DIM = 2048

# Maps model name -> loaded Classifier. If several checkpoints resolve to the
# same model name, the one visited last by glob wins.
models = {}
for ckpt_name in glob.glob(os.path.join(LOGS_DIR, CKPTS_GLOB)):
    # Extract model name from checkpoint name
    res = re.search(CKPTS_PATTERN, ckpt_name)
    if res is None:
        # re.search returns None for a path that does not follow the expected
        # layout; skip it instead of failing on `res.group(1)`
        print(f"Skipping checkpoint with unexpected path '{ckpt_name}'")
        continue
    model_name = res.group(1)

    # NOTE(review): repr_dim is computed but not read anywhere in this cell —
    # confirm whether a later cell depends on it before removing it.
    if model_name == "baseline":
        repr_dim = EMBED_DIM
        encoder = BaselineEncoder()
    else:
        repr_dim = LSTM_STATE_DIM

        if model_name == "lstm":
            encoder = LSTMEncoder(EMBED_DIM, LSTM_STATE_DIM)
        elif model_name == "bilstm":
            repr_dim *= 2
            encoder = BiLSTMEncoder(EMBED_DIM, LSTM_STATE_DIM)
        elif model_name == "bilstm-max":
            repr_dim *= 2
            encoder = MaxBiLSTMEncoder(EMBED_DIM, LSTM_STATE_DIM)
        else:
            print(f"Encountered unsupported encoder architecture '{model_name}'")
            continue

    model_args = {"embeddings": glove.vectors, "encoder": encoder}
    model = Classifier.load_from_checkpoint(ckpt_name, **model_args)
    model.load_embeddings(glove.vectors)
    # A freshly loaded module is in training mode; the models in this notebook
    # are only used for prediction, so switch them to evaluation mode to keep
    # train-only layers (e.g. dropout) from affecting the outputs.
    model.eval()
    models[model_name] = model

In [5]:
# Maps the index of the highest model output to its NLI label
INT_TO_CLASS = {
    0: "entailment",
    1: "neutral",
    2: "contradiction"
}

@torch.no_grad()
def inference(model_name: str, premise: str, hypothesis: str) -> str:
    """Predict the NLI relation between a premise and a hypothesis.

    Relies on the notebook-level `models`, `tokenizer` and `glove` objects.

    Args:
        model_name: Key of the model to use in the `models` dict.
        premise: Raw premise sentence.
        hypothesis: Raw hypothesis sentence.

    Returns:
        The INT_TO_CLASS label selected by the argmax over the model output.

    Raises:
        ValueError: If `model_name` is not a key of `models`, or if either
            sentence yields no tokens.
    """
    if model_name not in models:
        # ValueError is a subclass of Exception, so callers that caught the
        # previous generic Exception keep working
        raise ValueError(f"Unknown encoder type '{model_name}'!")

    # Load model from dict
    model = models[model_name]

    # Lowercase + tokenize, then convert each token to its ID
    premise_ids = [glove.get_id(t) for t in tokenizer(premise.lower())]
    hypothesis_ids = [glove.get_id(t) for t in tokenizer(hypothesis.lower())]

    # A sentence without tokens would reach the model as an empty sequence of
    # length 0; reject it here with a clear message instead
    if not premise_ids or not hypothesis_ids:
        raise ValueError("Premise and hypothesis must each contain at least one token!")

    # Convert to tensors with an extra dimension (batch_size=1); int64 is the
    # index dtype accepted by every embedding lookup / tensor indexing path
    p = torch.tensor(premise_ids, dtype=torch.long).unsqueeze(0)
    h = torch.tensor(hypothesis_ids, dtype=torch.long).unsqueeze(0)

    # Count length of each sentence
    p_len = torch.tensor([len(premise_ids)], dtype=torch.long)
    h_len = torch.tensor([len(hypothesis_ids)], dtype=torch.long)

    logits = model(p, h, p_len, h_len)
    return INT_TO_CLASS[logits.argmax().item()]

In [6]:
# must be one of: 'baseline', 'lstm', 'bilstm', 'bilstm-max'
# (inference() only accepts names that are keys of the `models` dict)
MODEL_NAME = 'bilstm-max'

# Example sentence pair to classify
PREMISE = 'The dog is eating.'
HYPOTHESIS = 'The dog sleeps.'

# Last expression of the cell, so the predicted label is shown as the cell output
inference(MODEL_NAME, PREMISE, HYPOTHESIS)

'contradiction'

## Results Overview

In [7]:
def highlight_max(s):
    """Return one CSS string per entry of `s`, bold for every entry equal to the maximum.

    Entries that are not equal to `s.max()` get an empty style string.
    """
    bold, plain = 'font-weight: bold', ''
    peak = s.max()
    return [bold if entry == peak else plain for entry in s]

def format_df(df):
    """Return a Styler for `df` as used by the result tables of this notebook.

    The styler applies `highlight_max`, formats every value with "{:2.2f}",
    and centers the header and body cells.
    """
    header_rule = {
        'selector': 'thead th',
        'props': [('text-align', 'center'), ('vertical-align', 'bottom')],
    }
    cell_rule = {
        'selector': 'td',
        'props': [('text-align', 'center'), ('padding', '0.5em 1.5em')],
    }
    return (
        df.style
        .apply(highlight_max)
        .format("{:2.2f}")
        .set_table_styles([header_rule, cell_rule])
    )

### Performance Comparison
(corresponds to Table 3 in Conneau et al.)

In [8]:
# Every entry two levels below LOGS_DIR is inspected; the regex captures the
# model name and whether the entry is a `version_<n>` run or the `eval` run.
LOGS_GLOB = "*/*"
LOGS_PATTERN = r"([^\/]+)\/((?:version_\d+)|(?:eval))"

# model name -> {'dev': ..., 'test': ...}. The table is built once after the
# loop instead of growing a DataFrame with pd.concat on every iteration.
nli_records = {}
for log_name in glob.glob(os.path.join(LOGS_DIR, LOGS_GLOB)):
    # Extract model & version name from logfile name
    res = re.search(LOGS_PATTERN, log_name)
    if res is None:
        # The glob also matches entries that are neither `version_<n>` nor
        # `eval`; skip them instead of failing on `res.group(1)`
        print(f"Skipping unexpected log entry '{log_name}'")
        continue
    model_name = res.group(1)
    is_test = res.group(2) == "eval"

    # Read the TFEvents file
    ea = EventAccumulator(log_name)
    ea.Reload()

    if is_test:
        # Read the test_acc value
        acc = ea.Scalars('test_acc')[0].value
    else:
        # Read all val_acc values and pick the maximum
        acc = max(event.value for event in ea.Scalars('val_acc'))

    # Convert accuracy to percentage
    acc *= 100

    # As before, a later entry for the same model and column overwrites an
    # earlier one
    col_name = 'test' if is_test else 'dev'
    nli_records.setdefault(model_name, {})[col_name] = acc

# One row per model in order of first appearance; a missing value stays NaN
nli_df = pd.DataFrame(
    list(nli_records.values()),
    index=list(nli_records.keys()),
    columns=['dev', 'test'],
)

In [9]:
def calculate_accuracy(df: pd.DataFrame, name: str) -> pd.DataFrame:
    """Summarise the validation accuracies in `df` as one micro/macro row.

    Args:
        df: Frame with one column per task and at least the rows 'devacc'
            and 'ndev'.
        name: Index label of the returned row.

    Returns:
        A one-row DataFrame indexed by `name` with the columns 'micro'
        ('devacc' weighted by each task's share of 'ndev') and 'macro'
        (plain mean of 'devacc').
    """
    # Tasks without a validation accuracy are left out; this is the case in
    # non-classification tasks, such as SICK-R and STS14
    has_dev_acc = df.loc['devacc'].notnull()
    scored = df.loc[:, has_dev_acc]

    dev_acc = scored.loc['devacc']
    dev_size = scored.loc['ndev']

    # Each task's share of all validation samples is its micro-accuracy weight
    share = dev_size / dev_size.sum()

    macro = dev_acc.mean()
    micro = (dev_acc * share).sum()

    return pd.DataFrame({'micro': [micro], 'macro': [macro]}, index=[name])

In [10]:
# Result files are looked up directly in LOGS_DIR; the regex captures the part
# of the file name between `results_` and `.json` as the model name.
RESULTS_GLOB = "results_*.json"
RESULTS_PATTERN = r"results*_([^\.]+)\.json"

# Rows are collected first and concatenated once after the loop instead of
# growing the DataFrame with pd.concat on every iteration.
transfer_rows = []
for results_file in glob.glob(os.path.join(LOGS_DIR, RESULTS_GLOB)):
    # Extract model name from file name
    res = re.search(RESULTS_PATTERN, results_file)
    if res is None:
        # The glob is looser than the regex (e.g. a name with an extra dot);
        # skip such files instead of failing on `res.group(1)`
        print(f"Skipping results file with unexpected name '{results_file}'")
        continue
    model_name = res.group(1)

    # Convert json to dataframe
    df = pd.read_json(results_file)
    # Calculate accuracies and create dataframe row
    transfer_rows.append(calculate_accuracy(df, model_name))

# pd.concat raises on an empty list, so fall back to an empty frame
transfer_df = pd.concat(transfer_rows) if transfer_rows else pd.DataFrame()

In [11]:
# Put the NLI and transfer results side by side under a two-level column header
performance_df = pd.concat((nli_df, transfer_df), axis=1, keys=['NLI', 'Transfer'])
# DataFrame.reindex returns a new frame instead of modifying in place, so the
# result has to be assigned back for MODEL_ORDER to take effect
performance_df = performance_df.reindex(MODEL_ORDER)

format_df(performance_df)

Unnamed: 0_level_0,NLI,NLI,Transfer,Transfer
Unnamed: 0_level_1,dev,test,micro,macro
baseline,65.72,65.33,79.83,78.39
lstm,81.45,81.26,77.32,76.42
bilstm,80.87,80.66,79.95,79.28
bilstm-max,84.37,83.85,81.85,81.16


### SentEval Comparison
(corresponds to Table 4 in Conneau et al.)

In [12]:
# Builds the SentEval table: one row per model, one column per task/metric.
# Same file lookup as for the transfer results: results_*.json directly in LOGS_DIR.
RESULTS_GLOB = "results_*.json"
RESULTS_PATTERN = r"results*_([^\.]+)\.json"

senteval_df = pd.DataFrame()
for results_file in glob.glob(os.path.join(LOGS_DIR, RESULTS_GLOB)):
    # Extract model name from file name
    # NOTE(review): re.search returns None when a file name does not match the
    # pattern, in which case `res.group(1)` raises AttributeError.
    res = re.search(RESULTS_PATTERN, results_file)
    model_name = res.group(1)

    # Convert json to dataframe
    df = pd.read_json(results_file)

    # Select the 'acc' row for every task that has one, except MRPC
    # (MRPC is handled separately because its F1 score is reported as well)
    df_class = df.loc[['acc'], df.loc['acc'].notnull()].drop('MRPC', axis=1)
    df_class.index = [model_name]

    # Select accuracy and F1 score for the MRPC task, stored under the
    # two-level columns ('MRPC', 'acc') and ('MRPC', 'f1')
    mrpc_cols = pd.MultiIndex.from_product((['MRPC'], ['acc','f1']))
    mrpc_vals = df.loc[['acc', 'f1'], 'MRPC'].array
    df_mrpc = pd.DataFrame([mrpc_vals], columns=mrpc_cols, index=[model_name])

    # Select the 'pearson' value of the SICKRelatedness (SICK-R) task
    df_sickr = pd.DataFrame(df.loc[['pearson'], 'SICKRelatedness'])
    df_sickr.index = [model_name]

    # Select pearson value(s) for the STS14 task; the ['pearson'] lookup on the
    # 'all' cell yields a mapping whose keys become the second column level
    dict_sts14 = df.loc['all', 'STS14']['pearson']
    sts14_cols = pd.MultiIndex.from_product((['STS14'], dict_sts14.keys()))
    df_sts14 = pd.DataFrame([dict_sts14.values()], columns=sts14_cols, index=[model_name])

    # Concat all tasks to one dataframe row
    scores_df = pd.concat((df_class, df_mrpc, df_sickr, df_sts14), axis=1)

    # Append the row to the SentEval scores dataframe
    senteval_df = pd.concat((senteval_df, scores_df), axis=0)

# Column order of the final table; tuples address the two-level MRPC/STS14 columns
TASK_ORDER = [
    'MR', 'CR', 'SUBJ', 'MPQA', 'SST2', 'TREC',
    ('MRPC', 'acc'), ('MRPC', 'f1'),
    'SICKRelatedness', 'SICKEntailment',
    ('STS14', 'mean'), ('STS14', 'wmean')
]

# Display names, derived from TASK_ORDER by position: the indices below must be
# kept in sync with TASK_ORDER whenever that list changes
TASK_NAMES = TASK_ORDER.copy()
TASK_NAMES[4] = 'SST'
TASK_NAMES[6] = 'MRPC<br>accuracy' ; TASK_NAMES[7] = 'MRPC<br>F1-score'
TASK_NAMES[-4] = 'SICK-R' ; TASK_NAMES[-3] = 'SICK-E'
TASK_NAMES[-2] = 'STS14<br>average<br>pearson' ; TASK_NAMES[-1] = 'STS14<br>weighted<br>pearson'

# Order the rows by MODEL_ORDER and the columns by TASK_ORDER
senteval_df = senteval_df.reindex(MODEL_ORDER).T.reindex(TASK_ORDER).T
senteval_df.columns = TASK_NAMES  # swap in the display names (same positions as TASK_ORDER)

format_df(senteval_df)

Unnamed: 0,MR,CR,SUBJ,MPQA,SST,TREC,MRPC accuracy,MRPC F1-score,SICK-R,SICK-E,STS14 average pearson,STS14 weighted pearson
baseline,75.07,79.23,90.67,84.76,78.25,71.4,70.9,80.19,0.8,78.22,0.45,0.46
lstm,72.62,76.71,86.53,85.03,76.77,72.2,71.65,81.36,0.85,83.42,0.55,0.56
bilstm,73.24,78.49,89.89,84.98,78.36,76.8,72.12,81.06,0.86,83.46,0.56,0.58
bilstm-max,75.8,81.46,91.2,85.6,79.13,79.2,73.1,81.15,0.88,85.2,0.65,0.66


## Error Analysis