In [1]:
import sys
import spacy
import wandb
import numpy as np
processor = spacy.load('en_core_web_sm')

from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
import json
from checklist.test_suite import TestSuite


class BatchedInference:
    """Run gradient-free, batched inference with a classification model and its tokenizer.

    Calling an instance with a batch of texts returns the arg-max class index
    for every text together with the softmax of the logits.
    """

    def __init__(self, model, tokenizer, checkpoint_path=None, device="cpu"):
        """Store the model/tokenizer pair and prepare the model for inference.

        Args:
            model: Model called as ``model(input_ids, attention_mask=...)`` whose
                output exposes a ``logits`` attribute.
            tokenizer: Callable invoked as ``tokenizer(batch, padding=True)`` that
                returns a mapping with ``"input_ids"`` and ``"attention_mask"``.
            checkpoint_path: Optional path to a checkpoint whose ``"model"`` entry
                is loaded into ``model`` (see ``load_model_from_checkpoint``).
            device: Device the model and every input batch are placed on.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.softmax = torch.nn.Softmax(dim=-1)
        if checkpoint_path:
            self.load_model_from_checkpoint(checkpoint_path, device=device)
        # Move the model even when no checkpoint is given: __call__ always sends
        # the inputs to `device`, so leaving the model elsewhere would fail with
        # a device mismatch on the first forward pass.
        self.model.to(device)
        self.model = self.model.eval()

    def __call__(self, batch):
        """Classify a batch of texts.

        Args:
            batch: Texts passed unchanged to the tokenizer.

        Returns:
            A ``(prediction, confidence)`` tuple of CPU tensors: the arg-max over
            the last logits dimension, and the softmax over that same dimension.
        """
        with torch.no_grad():
            tokenized_input = self.tokenizer(batch, padding=True)
            input_ids = torch.tensor(tokenized_input["input_ids"]).to(self.device)
            attention_mask = torch.tensor(tokenized_input["attention_mask"]).to(self.device)
            output = self.model(input_ids, attention_mask=attention_mask)
            prediction = output.logits.argmax(dim=-1).cpu()
            confidence = self.softmax(output.logits).cpu()
        return prediction, confidence

    def load_model_from_checkpoint(self, checkpoint_path, device="cpu"):
        """Load the weights stored under ``checkpoint["model"]`` into ``self.model``.

        Every occurrence of ``"module."`` is removed from the parameter names
        before loading (presumably the prefix left by a DataParallel-style
        wrapper during training -- confirm against the training code).

        Args:
            checkpoint_path: File readable by ``torch.load`` that contains a
                ``"model"`` state dict.
            device: Device used as ``map_location`` and as the model's target device.
        """
        import warnings  # local import: only this method needs it

        # NOTE(review): torch.load may unpickle arbitrary objects; only load
        # checkpoints that come from a trusted source.
        checkpoint = torch.load(checkpoint_path, map_location=torch.device(device))
        model_state_dict = {k.replace("module.", ""): v for (k, v) in checkpoint["model"].items()}
        # strict=False tolerates key mismatches; report them so that a checkpoint
        # that matches few (or no) parameters is not evaluated unnoticed.
        load_result = self.model.load_state_dict(model_state_dict, strict=False)
        missing_keys = list(getattr(load_result, "missing_keys", None) or [])
        unexpected_keys = list(getattr(load_result, "unexpected_keys", None) or [])
        if missing_keys or unexpected_keys:
            warnings.warn(
                "Checkpoint {!r} did not match the model exactly "
                "(missing keys: {}, unexpected keys: {}).".format(
                    checkpoint_path, missing_keys, unexpected_keys
                )
            )
        self.model.to(device)

    @classmethod
    def from_model_name(cls, model_name, checkpoint_path=None, device="cpu"):
        """Build an instance from a pretrained model name.

        The tokenizer and the sequence-classification model are both created
        with ``from_pretrained(model_name)``; ``checkpoint_path`` and ``device``
        are forwarded to the constructor.
        """
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        return cls(model=model, tokenizer=tokenizer, checkpoint_path=checkpoint_path, device=device)
    
    
def save_test_results(config, test_suite):
    """Log every test's results to W&B and write them all to one JSON file.

    For each entry of ``test_suite.tests`` the samples, predictions,
    confidences, expectation results, stats, failing samples and pass flags are
    collected, logged with ``wandb.log`` and stored under the test's name. The
    combined mapping is dumped as JSON to ``config["results_path"]`` and that
    file is then handed to ``wandb.save``.

    Args:
        config: Mapping that must contain ``"results_path"``, the output JSON path.
        test_suite: Object whose ``tests`` mapping yields tests exposing ``data``,
            ``results`` (with ``"preds"``, ``"confs"``, ``"expect_results"`` and
            ``"passed"``), ``get_stats()`` and ``fail_idxs()``.

    Returns:
        None.
    """
    import os  # local import: only needed to create the output directory

    # Take the path from `config` rather than from a notebook global so the file
    # written here is always the same one that is passed to wandb.save below.
    results_path = config["results_path"]

    tests_information = {}
    for test_name in test_suite.tests:
        test = test_suite.tests[test_name]
        results = test.results
        record = {
            "samples": test.data,
            "predictions": [pred.tolist() for pred in results["preds"]],
            "confidences": [conf.tolist() for conf in results["confs"]],
            "expect_results": [expect_result.tolist() for expect_result in results["expect_results"]],
            "stats": test.get_stats(),
            "failures": [test.data[idx] for idx in test.fail_idxs()],
            "passed": results["passed"].tolist(),
        }
        tests_information[test_name] = record
        wandb.log({"test": test_name, **record})

    # Create the target directory if needed so a fresh checkout does not fail on open().
    parent_dir = os.path.dirname(results_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(results_path, "w") as f:
        json.dump(tests_information, f)

    wandb.save(results_path, base_path="../../")

In [2]:
# File from which every evaluation cell below loads the test suite (via TestSuite.from_file).
test_suite_path = "testset_19_07_21.pkl"

#### Random Seed 0 - Vanilla 

In [115]:
# Pretrained model/tokenizer name and the checkpoint file loaded on top of it for this section.
model_name = "albert-large-v2"
checkpoint_path = "model-outputs/final-models/rs0-shuffle-train/albert-large-v2_6.pt"
# NOTE(review): this rebinds `pipeline`, which the first cell imports from transformers.
# pred_and_conf (defined below) looks this global up each time it is called.
pipeline = BatchedInference.from_model_name(
    model_name, checkpoint_path=checkpoint_path, device="cuda"
)

def pred_and_conf(data, batch_size=32):
    """Run the module-level ``pipeline`` over ``data`` in consecutive slices.

    Args:
        data: Sliceable collection of inputs.
        batch_size: Maximum number of inputs handed to ``pipeline`` per call.

    Returns:
        ``(predictions, confidences)``: the per-slice predictions joined with
        ``np.hstack`` and the per-slice confidences joined with ``np.vstack``.
    """
    label_chunks, prob_chunks = [], []
    for start in range(0, len(data), batch_size):
        chunk_preds, chunk_confs = pipeline(data[start:start + batch_size])
        label_chunks.append(chunk_preds.numpy().tolist())
        prob_chunks.append(chunk_confs.numpy())
    return np.hstack(label_chunks), np.vstack(prob_chunks)

Some weights of the model checkpoint at albert-large-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You s

In [116]:
# JSON file that receives this run's test results; also recorded in `config` below.
results_path = "results/checklist/rs0_shuffle_train_6_testset_19_07_21.json"
# Run metadata: passed to wandb.init as the run config and to save_test_results.
config = {
    "project_name": "checklist_evaluation", 
    "run_name": "rs0_shuffle_train_6_testset_19_07_21",
    "model": "albert-large-v2", 
    "checkpoint": checkpoint_path, 
    "test_suite": test_suite_path,
    "results_path": results_path
}
# Start a W&B run named after this configuration.
wandb.init(config=config, project=config["project_name"], name=config["run_name"])

# Load the suite from disk, run it with pred_and_conf as the prediction function
# (overwriting any stored results), then log and save the outcome.
test_suite = TestSuite.from_file(test_suite_path)
test_suite.run(pred_and_conf, overwrite=True, seed=0)
save_test_results(config, test_suite)

VBox(children=(Label(value=' 4.74MB of 4.74MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test,"used to, but now"
_runtime,90
_timestamp,1626950109
_step,17


0,1
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██


[34m[1mwandb[0m: wandb version 0.11.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Running Movie sentiments
Predicting 58 examples
Running Single positive words
Predicting 22 examples
Running Single negative words
Predicting 14 examples
Running Sentiment-laden words in context
Predicting 1350 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5000 examples
Running Simple negations: negative
Predicting 1350 examples
Running Hard: Negation of positive with neutral stuff in the middle (should be negative)
Predicting 500 examples
Running Movie genre specific sentiments
Predicting 736 examples
Running Change names
Predicting 1617 examples
Running Polarizing Negative Names - Positive Instances
Predicting 1727 examples
Running Polarizing Positive Names - Negative Instances
Predicting 1353 examples
Running Polarizing Negative Names - Negative Instances
Predicting 1353 examples
Running Polarizing Positive Names - Positive Instances
Predicting 1727 examples
Running Change Movie Industries
Predicting 252 examples
Running Movie

In [117]:
# Print the per-test summary (test cases, fail counts, example failures) of the current `test_suite`.
test_suite.summary()

Vocabulary

Single positive words
Test cases:      22
Fails (rate):    0 (0.0%)


Single negative words
Test cases:      14
Fails (rate):    0 (0.0%)


Sentiment-laden words in context
Test cases:      1350
Fails (rate):    12 (0.9%)

Example fails:
0.5 I abhorred this actor.
----
0.5 We abhorred that actor.
----
0.8 I abhorred that director.
----


add positive phrases
Test cases:      500
Fails (rate):    3 (0.6%)

Example fails:
0.4 The lack of pace kills it , although , in a movie about cancer , this might be apt .
0.1 The lack of pace kills it , although , in a movie about cancer , this might be apt. I would watch this again.

----
0.1 The central story lacks punch .
0.0 The central story lacks punch. I would watch this again.

----
0.9 A very stylish but ultimately extremely silly tale ... a slick piece of nonsense but nothing more .
0.1 A very stylish but ultimately extremely silly tale ... a slick piece of nonsense but nothing more. It is good.

----


add negative phrases
Test

#### Random Seed 0 - SWA

In [118]:
# Pretrained model/tokenizer name and the checkpoint file loaded on top of it for this section.
model_name = "albert-large-v2"
checkpoint_path = "model-outputs/final-models/rs0-swa-linear-60-start2-drop-shuffle/albert-large-v2_7.pt"
# NOTE(review): this rebinds `pipeline`, which the first cell imports from transformers.
# pred_and_conf (defined below) looks this global up each time it is called.
pipeline = BatchedInference.from_model_name(
    model_name, checkpoint_path=checkpoint_path, device="cuda"
)

def pred_and_conf(data, batch_size=32):
    """Run the module-level ``pipeline`` over ``data`` in consecutive slices.

    Args:
        data: Sliceable collection of inputs.
        batch_size: Maximum number of inputs handed to ``pipeline`` per call.

    Returns:
        ``(predictions, confidences)``: the per-slice predictions joined with
        ``np.hstack`` and the per-slice confidences joined with ``np.vstack``.
    """
    label_chunks, prob_chunks = [], []
    for start in range(0, len(data), batch_size):
        chunk_preds, chunk_confs = pipeline(data[start:start + batch_size])
        label_chunks.append(chunk_preds.numpy().tolist())
        prob_chunks.append(chunk_confs.numpy())
    return np.hstack(label_chunks), np.vstack(prob_chunks)

Some weights of the model checkpoint at albert-large-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You s

In [119]:
# JSON file that receives this run's test results; also recorded in `config` below.
results_path = "results/checklist/rs0-swa-linear-60-start2-drop-shuffle_7_testset_19_07_21.json"
# Run metadata: passed to wandb.init as the run config and to save_test_results.
config = {
    "project_name": "checklist_evaluation", 
    "run_name": "rs0-swa-linear-60-start2-drop-shuffle_7_testset_19_07_21",
    "model": "albert-large-v2", 
    "checkpoint": checkpoint_path, 
    "test_suite": test_suite_path,
    "results_path": results_path
}
# Start a W&B run named after this configuration.
wandb.init(config=config, project=config["project_name"], name=config["run_name"])

# Load the suite from disk, run it with pred_and_conf as the prediction function
# (overwriting any stored results), then log and save the outcome.
test_suite = TestSuite.from_file(test_suite_path)
test_suite.run(pred_and_conf, overwrite=True, seed=0)
save_test_results(config, test_suite)

VBox(children=(Label(value=' 4.80MB of 4.80MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test,"used to, but now"
_runtime,90
_timestamp,1626950367
_step,17


0,1
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██


[34m[1mwandb[0m: wandb version 0.11.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Running Movie sentiments
Predicting 58 examples
Running Single positive words
Predicting 22 examples
Running Single negative words
Predicting 14 examples
Running Sentiment-laden words in context
Predicting 1350 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5000 examples
Running Simple negations: negative
Predicting 1350 examples
Running Hard: Negation of positive with neutral stuff in the middle (should be negative)
Predicting 500 examples
Running Movie genre specific sentiments
Predicting 736 examples
Running Change names
Predicting 1617 examples
Running Polarizing Negative Names - Positive Instances
Predicting 1727 examples
Running Polarizing Positive Names - Negative Instances
Predicting 1353 examples
Running Polarizing Negative Names - Negative Instances
Predicting 1353 examples
Running Polarizing Positive Names - Positive Instances
Predicting 1727 examples
Running Change Movie Industries
Predicting 252 examples
Running Movie

In [120]:
# Print the per-test summary (test cases, fail counts, example failures) of the current `test_suite`.
test_suite.summary()

Vocabulary

Single positive words
Test cases:      22
Fails (rate):    0 (0.0%)


Single negative words
Test cases:      14
Fails (rate):    0 (0.0%)


Sentiment-laden words in context
Test cases:      1350
Fails (rate):    3 (0.2%)

Example fails:
0.6 We abhor the show.
----
0.6 I abhor the show.
----
0.6 I abhor this director.
----


add positive phrases
Test cases:      500
Fails (rate):    4 (0.8%)

Example fails:
0.8 A very stylish but ultimately extremely silly tale ... a slick piece of nonsense but nothing more .
0.7 A very stylish but ultimately extremely silly tale ... a slick piece of nonsense but nothing more. I value it.

----
0.8 We need ( Moore 's ) noisy , cocky energy , his passion and class consciousness ; we need his shticks , we need his stones .
0.7 We need ( Moore 's ) noisy , cocky energy , his passion and class consciousness ; we need his shticks , we need his stones. I would watch this again.

----
0.5 So routine , familiar and predictable , it raises the possib

#### Random Seed 1 - Vanilla

In [121]:
# Pretrained model/tokenizer name and the checkpoint file loaded on top of it for this section.
model_name = "albert-large-v2"
checkpoint_path = "model-outputs/final-models/rs1-shuffle-train/albert-large-v2_2.pt"
# NOTE(review): this rebinds `pipeline`, which the first cell imports from transformers.
# pred_and_conf (defined below) looks this global up each time it is called.
pipeline = BatchedInference.from_model_name(
    model_name, checkpoint_path=checkpoint_path, device="cuda"
)

def pred_and_conf(data, batch_size=32):
    """Run the module-level ``pipeline`` over ``data`` in consecutive slices.

    Args:
        data: Sliceable collection of inputs.
        batch_size: Maximum number of inputs handed to ``pipeline`` per call.

    Returns:
        ``(predictions, confidences)``: the per-slice predictions joined with
        ``np.hstack`` and the per-slice confidences joined with ``np.vstack``.
    """
    label_chunks, prob_chunks = [], []
    for start in range(0, len(data), batch_size):
        chunk_preds, chunk_confs = pipeline(data[start:start + batch_size])
        label_chunks.append(chunk_preds.numpy().tolist())
        prob_chunks.append(chunk_confs.numpy())
    return np.hstack(label_chunks), np.vstack(prob_chunks)

Some weights of the model checkpoint at albert-large-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You s

In [122]:
# JSON file that receives this run's test results; also recorded in `config` below.
results_path = "results/checklist/rs1_shuffle_train_2_testset_19_07_21.json"
# Run metadata: passed to wandb.init as the run config and to save_test_results.
config = {
    "project_name": "checklist_evaluation", 
    "run_name": "rs1_shuffle_train_2_testset_19_07_21",
    "model": "albert-large-v2", 
    "checkpoint": checkpoint_path, 
    "test_suite": test_suite_path,
    "results_path": results_path
}
# Start a W&B run named after this configuration.
wandb.init(config=config, project=config["project_name"], name=config["run_name"])

# Load the suite from disk, run it with pred_and_conf as the prediction function
# (overwriting any stored results), then log and save the outcome.
test_suite = TestSuite.from_file(test_suite_path)
test_suite.run(pred_and_conf, overwrite=True, seed=0)
save_test_results(config, test_suite)

VBox(children=(Label(value=' 4.79MB of 4.79MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test,"used to, but now"
_runtime,91
_timestamp,1626950556
_step,17


0,1
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██


[34m[1mwandb[0m: wandb version 0.11.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Running Movie sentiments
Predicting 58 examples
Running Single positive words
Predicting 22 examples
Running Single negative words
Predicting 14 examples
Running Sentiment-laden words in context
Predicting 1350 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5000 examples
Running Simple negations: negative
Predicting 1350 examples
Running Hard: Negation of positive with neutral stuff in the middle (should be negative)
Predicting 500 examples
Running Movie genre specific sentiments
Predicting 736 examples
Running Change names
Predicting 1617 examples
Running Polarizing Negative Names - Positive Instances
Predicting 1727 examples
Running Polarizing Positive Names - Negative Instances
Predicting 1353 examples
Running Polarizing Negative Names - Negative Instances
Predicting 1353 examples
Running Polarizing Positive Names - Positive Instances
Predicting 1727 examples
Running Change Movie Industries
Predicting 252 examples
Running Movie

In [123]:
# Print the per-test summary (test cases, fail counts, example failures) of the current `test_suite`.
test_suite.summary()

Vocabulary

Single positive words
Test cases:      22
Fails (rate):    0 (0.0%)


Single negative words
Test cases:      14
Fails (rate):    0 (0.0%)


Sentiment-laden words in context
Test cases:      1350
Fails (rate):    0 (0.0%)


add positive phrases
Test cases:      500
Fails (rate):    0 (0.0%)


add negative phrases
Test cases:      500
Fails (rate):    3 (0.6%)

Example fails:
0.1 Too bad the former Murphy Brown does n't pop Reese back .
0.2 Too bad the former Murphy Brown does n't pop Reese back. I regret it.

----
0.0 Edited and shot with a syncopated style mimicking the work of his subjects , Pray turns the idea of the documentary on its head , making it rousing , invigorating fun lacking any MTV puffery .
0.1 Edited and shot with a syncopated style mimicking the work of his subjects , Pray turns the idea of the documentary on its head , making it rousing , invigorating fun lacking any MTV puffery. I abhor it.
0.1 Edited and shot with a syncopated style mimicking the work o

#### Random Seed 1 - SWA

In [124]:
# Pretrained model/tokenizer name and the checkpoint file loaded on top of it for this section.
model_name = "albert-large-v2"
checkpoint_path = "model-outputs/final-models/rs1-swa-linear-75-start2-drop-shuffle/albert-large-v2_4.pt"
# NOTE(review): this rebinds `pipeline`, which the first cell imports from transformers.
# pred_and_conf (defined below) looks this global up each time it is called.
pipeline = BatchedInference.from_model_name(
    model_name, checkpoint_path=checkpoint_path, device="cuda"
)

def pred_and_conf(data, batch_size=32):
    """Run the module-level ``pipeline`` over ``data`` in consecutive slices.

    Args:
        data: Sliceable collection of inputs.
        batch_size: Maximum number of inputs handed to ``pipeline`` per call.

    Returns:
        ``(predictions, confidences)``: the per-slice predictions joined with
        ``np.hstack`` and the per-slice confidences joined with ``np.vstack``.
    """
    label_chunks, prob_chunks = [], []
    for start in range(0, len(data), batch_size):
        chunk_preds, chunk_confs = pipeline(data[start:start + batch_size])
        label_chunks.append(chunk_preds.numpy().tolist())
        prob_chunks.append(chunk_confs.numpy())
    return np.hstack(label_chunks), np.vstack(prob_chunks)

Some weights of the model checkpoint at albert-large-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You s

In [125]:
# JSON file that receives this run's test results; also recorded in `config` below.
# NOTE(review): this file name starts with "rs1_rs1-" whereas run_name below starts
# with "rs1-" -- confirm whether the doubled prefix is intended before relying on it.
results_path = "results/checklist/rs1_rs1-swa-linear-75-start2-drop-shuffle_4_testset_19_07_21.json"
# Run metadata: passed to wandb.init as the run config and to save_test_results.
config = {
    "project_name": "checklist_evaluation", 
    "run_name": "rs1-swa-linear-75-start2-drop-shuffle_4_testset_19_07_21",
    "model": "albert-large-v2", 
    "checkpoint": checkpoint_path, 
    "test_suite": test_suite_path,
    "results_path": results_path
}
# Start a W&B run named after this configuration.
wandb.init(config=config, project=config["project_name"], name=config["run_name"])

# Load the suite from disk, run it with pred_and_conf as the prediction function
# (overwriting any stored results), then log and save the outcome.
test_suite = TestSuite.from_file(test_suite_path)
test_suite.run(pred_and_conf, overwrite=True, seed=0)
save_test_results(config, test_suite)

VBox(children=(Label(value=' 4.61MB of 4.61MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test,"used to, but now"
_runtime,93
_timestamp,1626950656
_step,17


0,1
_runtime,▁▁▁▁▁█████████████
_timestamp,▁▁▁▁▁█████████████
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██


[34m[1mwandb[0m: wandb version 0.11.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Running Movie sentiments
Predicting 58 examples
Running Single positive words
Predicting 22 examples
Running Single negative words
Predicting 14 examples
Running Sentiment-laden words in context
Predicting 1350 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5000 examples
Running Simple negations: negative
Predicting 1350 examples
Running Hard: Negation of positive with neutral stuff in the middle (should be negative)
Predicting 500 examples
Running Movie genre specific sentiments
Predicting 736 examples
Running Change names
Predicting 1617 examples
Running Polarizing Negative Names - Positive Instances
Predicting 1727 examples
Running Polarizing Positive Names - Negative Instances
Predicting 1353 examples
Running Polarizing Negative Names - Negative Instances
Predicting 1353 examples
Running Polarizing Positive Names - Positive Instances
Predicting 1727 examples
Running Change Movie Industries
Predicting 252 examples
Running Movie

In [126]:
# Print the per-test summary (test cases, fail counts, example failures) of the current `test_suite`.
test_suite.summary()

Vocabulary

Single positive words
Test cases:      22
Fails (rate):    0 (0.0%)


Single negative words
Test cases:      14
Fails (rate):    0 (0.0%)


Sentiment-laden words in context
Test cases:      1350
Fails (rate):    0 (0.0%)


add positive phrases
Test cases:      500
Fails (rate):    0 (0.0%)


add negative phrases
Test cases:      500
Fails (rate):    0 (0.0%)


change neutral words with BERT
Test cases:      500
Fails (rate):    39 (7.8%)

Example fails:
0.9 There 's a disreputable air about the whole thing , and that 's what makes it irresistible .
0.3 There 's a disreputable air about the whole thing , maybe that 's what makes it irresistible .

----
0.8 In its ragged , cheap and unassuming way , the movie works .
0.0 In its ragged , cheap and unassuming way , the above works .
0.2 In its ragged , cheap and unassuming way , the code works .

----
0.3 A film of precious increments artfully camouflaged as everyday activities .
1.0 A film capturing precious increments artfull

#### Random Seed 2 - Vanilla

In [127]:
# Pretrained model/tokenizer name and the checkpoint file loaded on top of it for this section.
model_name = "albert-large-v2"
checkpoint_path = "model-outputs/final-models/rs2-shuffle-train/albert-large-v2_5.pt"
# NOTE(review): this rebinds `pipeline`, which the first cell imports from transformers.
# pred_and_conf (defined below) looks this global up each time it is called.
pipeline = BatchedInference.from_model_name(
    model_name, checkpoint_path=checkpoint_path, device="cuda"
)

def pred_and_conf(data, batch_size=32):
    """Run the module-level ``pipeline`` over ``data`` in consecutive slices.

    Args:
        data: Sliceable collection of inputs.
        batch_size: Maximum number of inputs handed to ``pipeline`` per call.

    Returns:
        ``(predictions, confidences)``: the per-slice predictions joined with
        ``np.hstack`` and the per-slice confidences joined with ``np.vstack``.
    """
    label_chunks, prob_chunks = [], []
    for start in range(0, len(data), batch_size):
        chunk_preds, chunk_confs = pipeline(data[start:start + batch_size])
        label_chunks.append(chunk_preds.numpy().tolist())
        prob_chunks.append(chunk_confs.numpy())
    return np.hstack(label_chunks), np.vstack(prob_chunks)

Some weights of the model checkpoint at albert-large-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You s

In [128]:
# JSON file that receives this run's test results; also recorded in `config` below.
results_path = "results/checklist/rs2-shuffle-train_5_testset_19_07_21.json"
# Run metadata: passed to wandb.init as the run config and to save_test_results.
config = {
    "project_name": "checklist_evaluation", 
    "run_name": "rs2-shuffle-train_5_testset_19_07_21",
    "model": "albert-large-v2", 
    "checkpoint": checkpoint_path, 
    "test_suite": test_suite_path,
    "results_path": results_path
}
# Start a W&B run named after this configuration.
wandb.init(config=config, project=config["project_name"], name=config["run_name"])

# Load the suite from disk, run it with pred_and_conf as the prediction function
# (overwriting any stored results), then log and save the outcome.
test_suite = TestSuite.from_file(test_suite_path)
test_suite.run(pred_and_conf, overwrite=True, seed=0)
save_test_results(config, test_suite)

VBox(children=(Label(value=' 4.60MB of 4.60MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test,"used to, but now"
_runtime,93
_timestamp,1626950757
_step,17


0,1
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██


[34m[1mwandb[0m: wandb version 0.11.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Running Movie sentiments
Predicting 58 examples
Running Single positive words
Predicting 22 examples
Running Single negative words
Predicting 14 examples
Running Sentiment-laden words in context
Predicting 1350 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5000 examples
Running Simple negations: negative
Predicting 1350 examples
Running Hard: Negation of positive with neutral stuff in the middle (should be negative)
Predicting 500 examples
Running Movie genre specific sentiments
Predicting 736 examples
Running Change names
Predicting 1617 examples
Running Polarizing Negative Names - Positive Instances
Predicting 1727 examples
Running Polarizing Positive Names - Negative Instances
Predicting 1353 examples
Running Polarizing Negative Names - Negative Instances
Predicting 1353 examples
Running Polarizing Positive Names - Positive Instances
Predicting 1727 examples
Running Change Movie Industries
Predicting 252 examples
Running Movie

In [129]:
# Print the per-test summary (test cases, fail counts, example failures) of the current `test_suite`.
test_suite.summary()

Vocabulary

Single positive words
Test cases:      22
Fails (rate):    0 (0.0%)


Single negative words
Test cases:      14
Fails (rate):    0 (0.0%)


Sentiment-laden words in context
Test cases:      1350
Fails (rate):    0 (0.0%)


add positive phrases
Test cases:      500
Fails (rate):    0 (0.0%)


add negative phrases
Test cases:      500
Fails (rate):    6 (1.2%)

Example fails:
0.0 Let 's cut to the consumer-advice bottom line : Stay home .
0.1 Let 's cut to the consumer-advice bottom line : Stay home. I abhor it.

----
0.0 Francophiles will snicker knowingly and you 'll want to slap them .
0.1 Francophiles will snicker knowingly and you 'll want to slap them. I abhor it.
0.1 Francophiles will snicker knowingly and you 'll want to slap them. I dread it.

----
0.3 The Transporter is as lively and as fun as it is unapologetically dumb
0.5 The Transporter is as lively and as fun as it is unapologetically dumb. I abhor it.
0.4 The Transporter is as lively and as fun as it is unapol

#### Random Seed 2 - SWA 

In [130]:
# Pretrained model/tokenizer name and the checkpoint file loaded on top of it for this section.
model_name = "albert-large-v2"
checkpoint_path = "model-outputs/final-models/rs2-swa-linear-60-start2-drop-shuffle/albert-large-v2_4.pt"
# NOTE(review): this rebinds `pipeline`, which the first cell imports from transformers.
# pred_and_conf (defined below) looks this global up each time it is called.
pipeline = BatchedInference.from_model_name(
    model_name, checkpoint_path=checkpoint_path, device="cuda"
)

def pred_and_conf(data, batch_size=32):
    """Run the module-level ``pipeline`` over ``data`` in consecutive slices.

    Args:
        data: Sliceable collection of inputs.
        batch_size: Maximum number of inputs handed to ``pipeline`` per call.

    Returns:
        ``(predictions, confidences)``: the per-slice predictions joined with
        ``np.hstack`` and the per-slice confidences joined with ``np.vstack``.
    """
    label_chunks, prob_chunks = [], []
    for start in range(0, len(data), batch_size):
        chunk_preds, chunk_confs = pipeline(data[start:start + batch_size])
        label_chunks.append(chunk_preds.numpy().tolist())
        prob_chunks.append(chunk_confs.numpy())
    return np.hstack(label_chunks), np.vstack(prob_chunks)

Some weights of the model checkpoint at albert-large-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You s

In [131]:
# JSON file that receives this run's test results; also recorded in `config` below.
results_path = "results/checklist/rs2-swa-linear-60-start2-drop-shuffle_4_testset_19_07_21.json"
# Run metadata: passed to wandb.init as the run config and to save_test_results.
config = {
    "project_name": "checklist_evaluation", 
    "run_name": "rs2-swa-linear-60-start2-drop-shuffle_4_testset_19_07_21",
    "model": "albert-large-v2", 
    "checkpoint": checkpoint_path, 
    "test_suite": test_suite_path,
    "results_path": results_path
}
# Start a W&B run named after this configuration.
wandb.init(config=config, project=config["project_name"], name=config["run_name"])

# Load the suite from disk, run it with pred_and_conf as the prediction function
# (overwriting any stored results), then log and save the outcome.
test_suite = TestSuite.from_file(test_suite_path)
test_suite.run(pred_and_conf, overwrite=True, seed=0)
save_test_results(config, test_suite)

VBox(children=(Label(value=' 4.63MB of 4.63MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test,"used to, but now"
_runtime,92
_timestamp,1626950856
_step,17


0,1
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██


[34m[1mwandb[0m: wandb version 0.11.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Running Movie sentiments
Predicting 58 examples
Running Single positive words
Predicting 22 examples
Running Single negative words
Predicting 14 examples
Running Sentiment-laden words in context
Predicting 1350 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5000 examples
Running Simple negations: negative
Predicting 1350 examples
Running Hard: Negation of positive with neutral stuff in the middle (should be negative)
Predicting 500 examples
Running Movie genre specific sentiments
Predicting 736 examples
Running Change names
Predicting 1617 examples
Running Polarizing Negative Names - Positive Instances
Predicting 1727 examples
Running Polarizing Positive Names - Negative Instances
Predicting 1353 examples
Running Polarizing Negative Names - Negative Instances
Predicting 1353 examples
Running Polarizing Positive Names - Positive Instances
Predicting 1727 examples
Running Change Movie Industries
Predicting 252 examples
Running Movie

In [132]:
# Print the per-test summary (test cases, fail counts, example failures) of the current `test_suite`.
test_suite.summary()

Vocabulary

Single positive words
Test cases:      22
Fails (rate):    0 (0.0%)


Single negative words
Test cases:      14
Fails (rate):    0 (0.0%)


Sentiment-laden words in context
Test cases:      1350
Fails (rate):    0 (0.0%)


add positive phrases
Test cases:      500
Fails (rate):    0 (0.0%)


add negative phrases
Test cases:      500
Fails (rate):    1 (0.2%)

Example fails:
0.0 Roger Michell , who did an appealing job directing Persuasion and Notting Hill in England , gets too artsy in his American debut .
0.1 Roger Michell , who did an appealing job directing Persuasion and Notting Hill in England , gets too artsy in his American debut. I regret it.

----


change neutral words with BERT
Test cases:      500
Fails (rate):    37 (7.4%)

Example fails:
0.8 If you pitch your expectations at an all time low , you could do worse than this oddly cheerful -- but not particularly funny -- body-switching farce .
0.0 If you pitch your expectations at an all time low , you could do w

#### Random Seed 3 - Vanilla

In [133]:
# Pretrained model/tokenizer name and the checkpoint file loaded on top of it for this section.
model_name = "albert-large-v2"
checkpoint_path = "model-outputs/final-models/rs3-shuffle-train/albert-large-v2_1.pt"
# NOTE(review): this rebinds `pipeline`, which the first cell imports from transformers.
# pred_and_conf (defined below) looks this global up each time it is called.
pipeline = BatchedInference.from_model_name(
    model_name, checkpoint_path=checkpoint_path, device="cuda"
)

def pred_and_conf(data, batch_size=32):
    """Run the notebook-level `pipeline` over `data` in mini-batches.

    This is the prediction callback handed to `test_suite.run(...)`.

    Args:
        data: Sequence of input texts. It is cut into consecutive slices of
            at most `batch_size` items and each slice is passed to `pipeline`.
        batch_size: Maximum number of texts per `pipeline` call (>= 1).

    Returns:
        Tuple `(predictions, confidences)` of NumPy arrays in input order:
        the per-batch predictions joined with `np.hstack` and the per-batch
        confidences stacked row-wise with `np.vstack`. For empty `data` two
        empty arrays are returned; the confidences then have shape `(0, 0)`
        because the number of classes is unknown without running a batch.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if len(data) == 0:
        # np.hstack / np.vstack raise on an empty list, so short-circuit.
        return np.array([], dtype=np.int64), np.empty((0, 0))

    batch_predictions = []
    batch_confidences = []
    # Iterate over slice offsets with a dedicated loop variable instead of
    # rebinding the `data` parameter inside its own loop.
    for start in range(0, len(data), batch_size):
        preds, confs = pipeline(data[start:start + batch_size])
        batch_predictions.append(preds.numpy())
        batch_confidences.append(confs.numpy())
    return np.hstack(batch_predictions), np.vstack(batch_confidences)

Some weights of the model checkpoint at albert-large-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You s

In [134]:
# Evaluate the seed-3 vanilla checkpoint on the CheckList suite and log to W&B.
# `checkpoint_path` comes from the model-loading cell above; `test_suite_path`
# must already be defined by an earlier cell.
# NOTE(review): presumably save_test_results writes to `results_path`; its
# body is not visible from this section, so confirm.
results_path = "results/checklist/rs3-shuffle-train_1_testset_19_07_21.json"
config = {
    "project_name": "checklist_evaluation", 
    "run_name": "rs3-shuffle-train_1_testset_19_07_21",
    "model": "albert-large-v2", 
    "checkpoint": checkpoint_path, 
    "test_suite": test_suite_path,
    "results_path": results_path
}
# Start a new W&B run; the whole config dict is attached to it.
wandb.init(config=config, project=config["project_name"], name=config["run_name"])

# Reload the suite from disk for this model, then run every test through
# `pred_and_conf`.
# NOTE(review): overwrite=True / seed=0 are CheckList options; presumably they
# replace results stored in the loaded suite and fix sampling. Confirm.
test_suite = TestSuite.from_file(test_suite_path)
test_suite.run(pred_and_conf, overwrite=True, seed=0)
save_test_results(config, test_suite)

VBox(children=(Label(value=' 4.61MB of 4.61MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test,"used to, but now"
_runtime,92
_timestamp,1626950955
_step,17


0,1
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██


[34m[1mwandb[0m: wandb version 0.11.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Running Movie sentiments
Predicting 58 examples
Running Single positive words
Predicting 22 examples
Running Single negative words
Predicting 14 examples
Running Sentiment-laden words in context
Predicting 1350 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5000 examples
Running Simple negations: negative
Predicting 1350 examples
Running Hard: Negation of positive with neutral stuff in the middle (should be negative)
Predicting 500 examples
Running Movie genre specific sentiments
Predicting 736 examples
Running Change names
Predicting 1617 examples
Running Polarizing Negative Names - Positive Instances
Predicting 1727 examples
Running Polarizing Positive Names - Negative Instances
Predicting 1353 examples
Running Polarizing Negative Names - Negative Instances
Predicting 1353 examples
Running Polarizing Positive Names - Positive Instances
Predicting 1727 examples
Running Change Movie Industries
Predicting 252 examples
Running Movie

In [135]:
# Print the CheckList report for the most recently run `test_suite`:
# per test, the number of test cases, the fail count/rate and example fails.
test_suite.summary()

Vocabulary

Single positive words
Test cases:      22
Fails (rate):    0 (0.0%)


Single negative words
Test cases:      14
Fails (rate):    0 (0.0%)


Sentiment-laden words in context
Test cases:      1350
Fails (rate):    0 (0.0%)


add positive phrases
Test cases:      500
Fails (rate):    0 (0.0%)


add negative phrases
Test cases:      500
Fails (rate):    0 (0.0%)


change neutral words with BERT
Test cases:      500
Fails (rate):    35 (7.0%)

Example fails:
0.2 A strong first quarter , slightly less so second quarter , and average second half .
0.5 really strong first quarter , slightly less so second quarter , and average second half .

----
0.9 Evokes a little of the fear that parents have for the possible futures of their children -- and the sometimes bad choices mothers and fathers make in the interests of doing them good .
0.0 Evokes very little of the fear that parents have for the possible futures of their children -- and the sometimes bad choices mothers and fathers mak

#### Random Seed 3 - SWA

In [136]:
# Model under evaluation: random seed 3, SWA training run.
model_name = "albert-large-v2"
checkpoint_path = "model-outputs/final-models/rs3-swa-linear-60-start2-drop-shuffle/albert-large-v2_8.pt"
# from_model_name builds the tokenizer and sequence-classification model from
# `model_name`, then loads the checkpoint weights with strict=False (so keys
# that do not match are ignored rather than raising) and moves the model to
# the given device.
# NOTE(review): this rebinds `pipeline`, shadowing the `pipeline` function
# imported from transformers at the top of the notebook.
pipeline = BatchedInference.from_model_name(
    model_name, checkpoint_path=checkpoint_path, device="cuda"
)

def pred_and_conf(data, batch_size=32):
    """Run the notebook-level `pipeline` over `data` in mini-batches.

    This is the prediction callback handed to `test_suite.run(...)`.

    Args:
        data: Sequence of input texts. It is cut into consecutive slices of
            at most `batch_size` items and each slice is passed to `pipeline`.
        batch_size: Maximum number of texts per `pipeline` call (>= 1).

    Returns:
        Tuple `(predictions, confidences)` of NumPy arrays in input order:
        the per-batch predictions joined with `np.hstack` and the per-batch
        confidences stacked row-wise with `np.vstack`. For empty `data` two
        empty arrays are returned; the confidences then have shape `(0, 0)`
        because the number of classes is unknown without running a batch.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if len(data) == 0:
        # np.hstack / np.vstack raise on an empty list, so short-circuit.
        return np.array([], dtype=np.int64), np.empty((0, 0))

    batch_predictions = []
    batch_confidences = []
    # Iterate over slice offsets with a dedicated loop variable instead of
    # rebinding the `data` parameter inside its own loop.
    for start in range(0, len(data), batch_size):
        preds, confs = pipeline(data[start:start + batch_size])
        batch_predictions.append(preds.numpy())
        batch_confidences.append(confs.numpy())
    return np.hstack(batch_predictions), np.vstack(batch_confidences)

Some weights of the model checkpoint at albert-large-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You s

In [137]:
# Evaluate the seed-3 SWA checkpoint on the CheckList suite and log to W&B.
# `checkpoint_path` comes from the model-loading cell above; `test_suite_path`
# must already be defined by an earlier cell.
# NOTE(review): presumably save_test_results writes to `results_path`; its
# body is not visible from this section, so confirm.
results_path = "results/checklist/rs3-swa-linear-60-start2-drop-shuffle_8_testset_19_07_21.json"
config = {
    "project_name": "checklist_evaluation", 
    "run_name": "rs3-swa-linear-60-start2-drop-shuffle_8_testset_19_07_21",
    "model": "albert-large-v2", 
    "checkpoint": checkpoint_path, 
    "test_suite": test_suite_path,
    "results_path": results_path
}
# Start a new W&B run; the whole config dict is attached to it.
wandb.init(config=config, project=config["project_name"], name=config["run_name"])

# Reload the suite from disk for this model, then run every test through
# `pred_and_conf`.
# NOTE(review): overwrite=True / seed=0 are CheckList options; presumably they
# replace results stored in the loaded suite and fix sampling. Confirm.
test_suite = TestSuite.from_file(test_suite_path)
test_suite.run(pred_and_conf, overwrite=True, seed=0)
save_test_results(config, test_suite)

VBox(children=(Label(value=' 4.59MB of 4.59MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test,"used to, but now"
_runtime,93
_timestamp,1626951055
_step,17


0,1
_runtime,▁▁▁▁██████████████
_timestamp,▁▁▁▁██████████████
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██


[34m[1mwandb[0m: wandb version 0.11.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Running Movie sentiments
Predicting 58 examples
Running Single positive words
Predicting 22 examples
Running Single negative words
Predicting 14 examples
Running Sentiment-laden words in context
Predicting 1350 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5000 examples
Running Simple negations: negative
Predicting 1350 examples
Running Hard: Negation of positive with neutral stuff in the middle (should be negative)
Predicting 500 examples
Running Movie genre specific sentiments
Predicting 736 examples
Running Change names
Predicting 1617 examples
Running Polarizing Negative Names - Positive Instances
Predicting 1727 examples
Running Polarizing Positive Names - Negative Instances
Predicting 1353 examples
Running Polarizing Negative Names - Negative Instances
Predicting 1353 examples
Running Polarizing Positive Names - Positive Instances
Predicting 1727 examples
Running Change Movie Industries
Predicting 252 examples
Running Movie

In [138]:
# Print the CheckList report for the most recently run `test_suite`:
# per test, the number of test cases, the fail count/rate and example fails.
test_suite.summary()

Vocabulary

Single positive words
Test cases:      22
Fails (rate):    0 (0.0%)


Single negative words
Test cases:      14
Fails (rate):    0 (0.0%)


Sentiment-laden words in context
Test cases:      1350
Fails (rate):    0 (0.0%)


add positive phrases
Test cases:      500
Fails (rate):    0 (0.0%)


add negative phrases
Test cases:      500
Fails (rate):    0 (0.0%)


change neutral words with BERT
Test cases:      500
Fails (rate):    40 (8.0%)

Example fails:
0.2 The 3D images only enhance the film 's otherworldly quality , giving it a strange combo of you-are-there closeness with the disorienting unreality of the seemingly broken-down fourth wall of the movie screen .
0.6 The 3D images only enhance the film 's otherworldly quality , giving it a strange combo of you-are-there closeness with the disorienting unreality of the seemingly broken-down fourth wall of the big screen .

----
1.0 The production values are up there .
0.0 The production values are up below .

----
0.9 Run , 

#### Random Seed 4 - Vanilla

In [139]:
# Model under evaluation: random seed 4, vanilla (non-SWA) training run.
model_name = "albert-large-v2"
checkpoint_path = "model-outputs/final-models/rs4-shuffle-train/albert-large-v2_1.pt"
# from_model_name builds the tokenizer and sequence-classification model from
# `model_name`, then loads the checkpoint weights with strict=False (so keys
# that do not match are ignored rather than raising) and moves the model to
# the given device.
# NOTE(review): this rebinds `pipeline`, shadowing the `pipeline` function
# imported from transformers at the top of the notebook.
pipeline = BatchedInference.from_model_name(
    model_name, checkpoint_path=checkpoint_path, device="cuda"
)

def pred_and_conf(data, batch_size=32):
    """Run the notebook-level `pipeline` over `data` in mini-batches.

    This is the prediction callback handed to `test_suite.run(...)`.

    Args:
        data: Sequence of input texts. It is cut into consecutive slices of
            at most `batch_size` items and each slice is passed to `pipeline`.
        batch_size: Maximum number of texts per `pipeline` call (>= 1).

    Returns:
        Tuple `(predictions, confidences)` of NumPy arrays in input order:
        the per-batch predictions joined with `np.hstack` and the per-batch
        confidences stacked row-wise with `np.vstack`. For empty `data` two
        empty arrays are returned; the confidences then have shape `(0, 0)`
        because the number of classes is unknown without running a batch.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if len(data) == 0:
        # np.hstack / np.vstack raise on an empty list, so short-circuit.
        return np.array([], dtype=np.int64), np.empty((0, 0))

    batch_predictions = []
    batch_confidences = []
    # Iterate over slice offsets with a dedicated loop variable instead of
    # rebinding the `data` parameter inside its own loop.
    for start in range(0, len(data), batch_size):
        preds, confs = pipeline(data[start:start + batch_size])
        batch_predictions.append(preds.numpy())
        batch_confidences.append(confs.numpy())
    return np.hstack(batch_predictions), np.vstack(batch_confidences)

Some weights of the model checkpoint at albert-large-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You s

In [140]:
# Evaluate the seed-4 vanilla checkpoint on the CheckList suite and log to W&B.
# `checkpoint_path` comes from the model-loading cell above; `test_suite_path`
# must already be defined by an earlier cell.
# NOTE(review): presumably save_test_results writes to `results_path`; its
# body is not visible from this section, so confirm.
results_path = "results/checklist/rs4-shuffle-train_1_testset_19_07_21.json"
config = {
    "project_name": "checklist_evaluation", 
    "run_name": "rs4-shuffle-train_1_testset_19_07_21",
    "model": "albert-large-v2", 
    "checkpoint": checkpoint_path, 
    "test_suite": test_suite_path,
    "results_path": results_path
}
# Start a new W&B run; the whole config dict is attached to it.
wandb.init(config=config, project=config["project_name"], name=config["run_name"])

# Reload the suite from disk for this model, then run every test through
# `pred_and_conf`.
# NOTE(review): overwrite=True / seed=0 are CheckList options; presumably they
# replace results stored in the loaded suite and fix sampling. Confirm.
test_suite = TestSuite.from_file(test_suite_path)
test_suite.run(pred_and_conf, overwrite=True, seed=0)
save_test_results(config, test_suite)

VBox(children=(Label(value=' 4.60MB of 4.60MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test,"used to, but now"
_runtime,92
_timestamp,1626951154
_step,17


0,1
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██


[34m[1mwandb[0m: wandb version 0.11.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Running Movie sentiments
Predicting 58 examples
Running Single positive words
Predicting 22 examples
Running Single negative words
Predicting 14 examples
Running Sentiment-laden words in context
Predicting 1350 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5000 examples
Running Simple negations: negative
Predicting 1350 examples
Running Hard: Negation of positive with neutral stuff in the middle (should be negative)
Predicting 500 examples
Running Movie genre specific sentiments
Predicting 736 examples
Running Change names
Predicting 1617 examples
Running Polarizing Negative Names - Positive Instances
Predicting 1727 examples
Running Polarizing Positive Names - Negative Instances
Predicting 1353 examples
Running Polarizing Negative Names - Negative Instances
Predicting 1353 examples
Running Polarizing Positive Names - Positive Instances
Predicting 1727 examples
Running Change Movie Industries
Predicting 252 examples
Running Movie

In [141]:
# Print the CheckList report for the most recently run `test_suite`:
# per test, the number of test cases, the fail count/rate and example fails.
test_suite.summary()

Vocabulary

Single positive words
Test cases:      22
Fails (rate):    0 (0.0%)


Single negative words
Test cases:      14
Fails (rate):    0 (0.0%)


Sentiment-laden words in context
Test cases:      1350
Fails (rate):    0 (0.0%)


add positive phrases
Test cases:      500
Fails (rate):    0 (0.0%)


add negative phrases
Test cases:      500
Fails (rate):    3 (0.6%)

Example fails:
0.1 Where last time jokes flowed out of Cho 's life story , which provided an engrossing dramatic through line , here the comedian hides behind obviously constructed routines .
0.2 Where last time jokes flowed out of Cho 's life story , which provided an engrossing dramatic through line , here the comedian hides behind obviously constructed routines. Never watching this again.
0.2 Where last time jokes flowed out of Cho 's life story , which provided an engrossing dramatic through line , here the comedian hides behind obviously constructed routines. I regret it.

----
0.0 Roger Michell , who did an appea

#### Random Seed 4 - SWA

In [142]:
# Model under evaluation: random seed 4, SWA training run.
model_name = "albert-large-v2"
checkpoint_path = "model-outputs/final-models/rs4-swa-linear-75-start2-drop-shuffle/albert-large-v2_6.pt"
# from_model_name builds the tokenizer and sequence-classification model from
# `model_name`, then loads the checkpoint weights with strict=False (so keys
# that do not match are ignored rather than raising) and moves the model to
# the given device.
# NOTE(review): this rebinds `pipeline`, shadowing the `pipeline` function
# imported from transformers at the top of the notebook.
pipeline = BatchedInference.from_model_name(
    model_name, checkpoint_path=checkpoint_path, device="cuda"
)

def pred_and_conf(data, batch_size=32):
    """Run the notebook-level `pipeline` over `data` in mini-batches.

    This is the prediction callback handed to `test_suite.run(...)`.

    Args:
        data: Sequence of input texts. It is cut into consecutive slices of
            at most `batch_size` items and each slice is passed to `pipeline`.
        batch_size: Maximum number of texts per `pipeline` call (>= 1).

    Returns:
        Tuple `(predictions, confidences)` of NumPy arrays in input order:
        the per-batch predictions joined with `np.hstack` and the per-batch
        confidences stacked row-wise with `np.vstack`. For empty `data` two
        empty arrays are returned; the confidences then have shape `(0, 0)`
        because the number of classes is unknown without running a batch.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if len(data) == 0:
        # np.hstack / np.vstack raise on an empty list, so short-circuit.
        return np.array([], dtype=np.int64), np.empty((0, 0))

    batch_predictions = []
    batch_confidences = []
    # Iterate over slice offsets with a dedicated loop variable instead of
    # rebinding the `data` parameter inside its own loop.
    for start in range(0, len(data), batch_size):
        preds, confs = pipeline(data[start:start + batch_size])
        batch_predictions.append(preds.numpy())
        batch_confidences.append(confs.numpy())
    return np.hstack(batch_predictions), np.vstack(batch_confidences)

Some weights of the model checkpoint at albert-large-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You s

In [143]:
# Evaluate the seed-4 SWA checkpoint on the CheckList suite and log to W&B.
# `checkpoint_path` comes from the model-loading cell above; `test_suite_path`
# must already be defined by an earlier cell.
# NOTE(review): presumably save_test_results writes to `results_path`; its
# body is not visible from this section, so confirm.
results_path = "results/checklist/rs4-swa-linear-75-start2-drop-shuffle_6_testset_19_07_21.json"
config = {
    "project_name": "checklist_evaluation", 
    "run_name": "rs4-swa-linear-75-start2-drop-shuffle_6_testset_19_07_21",
    "model": "albert-large-v2", 
    "checkpoint": checkpoint_path, 
    "test_suite": test_suite_path,
    "results_path": results_path
}
# Start a new W&B run; the whole config dict is attached to it.
wandb.init(config=config, project=config["project_name"], name=config["run_name"])

# Reload the suite from disk for this model, then run every test through
# `pred_and_conf`.
# NOTE(review): overwrite=True / seed=0 are CheckList options; presumably they
# replace results stored in the loaded suite and fix sampling. Confirm.
test_suite = TestSuite.from_file(test_suite_path)
test_suite.run(pred_and_conf, overwrite=True, seed=0)
save_test_results(config, test_suite)

VBox(children=(Label(value=' 4.60MB of 4.60MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test,"used to, but now"
_runtime,92
_timestamp,1626951254
_step,17


0,1
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██


[34m[1mwandb[0m: wandb version 0.11.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Running Movie sentiments
Predicting 58 examples
Running Single positive words
Predicting 22 examples
Running Single negative words
Predicting 14 examples
Running Sentiment-laden words in context
Predicting 1350 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5000 examples
Running Simple negations: negative
Predicting 1350 examples
Running Hard: Negation of positive with neutral stuff in the middle (should be negative)
Predicting 500 examples
Running Movie genre specific sentiments
Predicting 736 examples
Running Change names
Predicting 1617 examples
Running Polarizing Negative Names - Positive Instances
Predicting 1727 examples
Running Polarizing Positive Names - Negative Instances
Predicting 1353 examples
Running Polarizing Negative Names - Negative Instances
Predicting 1353 examples
Running Polarizing Positive Names - Positive Instances
Predicting 1727 examples
Running Change Movie Industries
Predicting 252 examples
Running Movie

In [144]:
# Print the CheckList report for the most recently run `test_suite`:
# per test, the number of test cases, the fail count/rate and example fails.
test_suite.summary()

Vocabulary

Single positive words
Test cases:      22
Fails (rate):    0 (0.0%)


Single negative words
Test cases:      14
Fails (rate):    0 (0.0%)


Sentiment-laden words in context
Test cases:      1350
Fails (rate):    0 (0.0%)


add positive phrases
Test cases:      500
Fails (rate):    0 (0.0%)


add negative phrases
Test cases:      500
Fails (rate):    2 (0.4%)

Example fails:
0.0 Without September 11 , Collateral Damage would have been just another bad movie .
0.1 Without September 11 , Collateral Damage would have been just another bad movie. Never watching this again.

----
0.2 Is this progress ?
0.4 Is this progress. Never watching this again.

----


change neutral words with BERT
Test cases:      500
Fails (rate):    33 (6.6%)

Example fails:
0.9 The Transporter is as lively and as fun as it is unapologetically dumb
0.3 * Transporter is as lively and as fun as it is unapologetically dumb

----
0.6 If you pitch your expectations at an all time low , you could do worse tha

#### Random Seed 5 - Vanilla

In [3]:
# Model under evaluation: random seed 5, vanilla (non-SWA) training run.
model_name = "albert-large-v2"
checkpoint_path = "model-outputs/final-models/rs5-shuffle-train/albert-large-v2_4.pt"
# from_model_name builds the tokenizer and sequence-classification model from
# `model_name`, then loads the checkpoint weights with strict=False (so keys
# that do not match are ignored rather than raising) and moves the model to
# the given device.
# NOTE(review): this rebinds `pipeline`, shadowing the `pipeline` function
# imported from transformers at the top of the notebook.
pipeline = BatchedInference.from_model_name(
    model_name, checkpoint_path=checkpoint_path, device="cuda"
)

def pred_and_conf(data, batch_size=32):
    """Run the notebook-level `pipeline` over `data` in mini-batches.

    This is the prediction callback handed to `test_suite.run(...)`.

    Args:
        data: Sequence of input texts. It is cut into consecutive slices of
            at most `batch_size` items and each slice is passed to `pipeline`.
        batch_size: Maximum number of texts per `pipeline` call (>= 1).

    Returns:
        Tuple `(predictions, confidences)` of NumPy arrays in input order:
        the per-batch predictions joined with `np.hstack` and the per-batch
        confidences stacked row-wise with `np.vstack`. For empty `data` two
        empty arrays are returned; the confidences then have shape `(0, 0)`
        because the number of classes is unknown without running a batch.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if len(data) == 0:
        # np.hstack / np.vstack raise on an empty list, so short-circuit.
        return np.array([], dtype=np.int64), np.empty((0, 0))

    batch_predictions = []
    batch_confidences = []
    # Iterate over slice offsets with a dedicated loop variable instead of
    # rebinding the `data` parameter inside its own loop.
    for start in range(0, len(data), batch_size):
        preds, confs = pipeline(data[start:start + batch_size])
        batch_predictions.append(preds.numpy())
        batch_confidences.append(confs.numpy())
    return np.hstack(batch_predictions), np.vstack(batch_confidences)

Some weights of the model checkpoint at albert-large-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You s

In [4]:
# Evaluate the seed-5 vanilla checkpoint on the CheckList suite and log to W&B.
# `checkpoint_path` comes from the model-loading cell above; `test_suite_path`
# must already be defined by an earlier cell.
# NOTE(review): presumably save_test_results writes to `results_path`; its
# body is not visible from this section, so confirm.
results_path = "results/checklist/rs5-shuffle-train_4_testset_19_07_21.json"
config = {
    "project_name": "checklist_evaluation", 
    "run_name": "rs5-shuffle-train_4_testset_19_07_21",
    "model": "albert-large-v2", 
    "checkpoint": checkpoint_path, 
    "test_suite": test_suite_path,
    "results_path": results_path
}
# Start a new W&B run; the whole config dict is attached to it.
wandb.init(config=config, project=config["project_name"], name=config["run_name"])

# Reload the suite from disk for this model, then run every test through
# `pred_and_conf`.
# NOTE(review): overwrite=True / seed=0 are CheckList options; presumably they
# replace results stored in the loaded suite and fix sampling. Confirm.
test_suite = TestSuite.from_file(test_suite_path)
test_suite.run(pred_and_conf, overwrite=True, seed=0)
save_test_results(config, test_suite)

[34m[1mwandb[0m: Currently logged in as: [33mukh[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Running Movie sentiments
Predicting 58 examples
Running Single positive words
Predicting 22 examples
Running Single negative words
Predicting 14 examples
Running Sentiment-laden words in context
Predicting 1350 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5000 examples
Running Simple negations: negative
Predicting 1350 examples
Running Hard: Negation of positive with neutral stuff in the middle (should be negative)
Predicting 500 examples
Running Movie genre specific sentiments
Predicting 736 examples
Running Change names
Predicting 1617 examples
Running Polarizing Negative Names - Positive Instances
Predicting 1727 examples
Running Polarizing Positive Names - Negative Instances
Predicting 1353 examples
Running Polarizing Negative Names - Negative Instances
Predicting 1353 examples
Running Polarizing Positive Names - Positive Instances
Predicting 1727 examples
Running Change Movie Industries
Predicting 252 examples
Running Movie

In [5]:
# Print the CheckList report for the most recently run `test_suite`:
# per test, the number of test cases, the fail count/rate and example fails.
test_suite.summary()

Vocabulary

Single positive words
Test cases:      22
Fails (rate):    0 (0.0%)


Single negative words
Test cases:      14
Fails (rate):    0 (0.0%)


Sentiment-laden words in context
Test cases:      1350
Fails (rate):    0 (0.0%)


add positive phrases
Test cases:      500
Fails (rate):    0 (0.0%)


add negative phrases
Test cases:      500
Fails (rate):    39 (7.8%)

Example fails:
0.0 The issue of faith is not explored very deeply
0.1 The issue of faith is not explored very deeply. I despise it.
0.1 The issue of faith is not explored very deeply. I regret it.

----
0.0 Even legends like Alfred Hitchcock and John Huston occasionally directed trifles ... so it 's no surprise to see a world-class filmmaker like Zhang Yimou behind the camera for a yarn that 's ultimately rather inconsequential .
0.2 Even legends like Alfred Hitchcock and John Huston occasionally directed trifles ... so it 's no surprise to see a world-class filmmaker like Zhang Yimou behind the camera for a yarn that

#### Random Seed 5 - SWA

In [6]:
# Model under evaluation: random seed 5, SWA training run.
model_name = "albert-large-v2"
checkpoint_path = "model-outputs/final-models/rs5-swa-linear-60-start2-drop-shuffle/albert-large-v2_3.pt"
# from_model_name builds the tokenizer and sequence-classification model from
# `model_name`, then loads the checkpoint weights with strict=False (so keys
# that do not match are ignored rather than raising) and moves the model to
# the given device.
# NOTE(review): this rebinds `pipeline`, shadowing the `pipeline` function
# imported from transformers at the top of the notebook.
pipeline = BatchedInference.from_model_name(
    model_name, checkpoint_path=checkpoint_path, device="cuda"
)

def pred_and_conf(data, batch_size=32):
    """Run the notebook-level `pipeline` over `data` in mini-batches.

    This is the prediction callback handed to `test_suite.run(...)`.

    Args:
        data: Sequence of input texts. It is cut into consecutive slices of
            at most `batch_size` items and each slice is passed to `pipeline`.
        batch_size: Maximum number of texts per `pipeline` call (>= 1).

    Returns:
        Tuple `(predictions, confidences)` of NumPy arrays in input order:
        the per-batch predictions joined with `np.hstack` and the per-batch
        confidences stacked row-wise with `np.vstack`. For empty `data` two
        empty arrays are returned; the confidences then have shape `(0, 0)`
        because the number of classes is unknown without running a batch.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if len(data) == 0:
        # np.hstack / np.vstack raise on an empty list, so short-circuit.
        return np.array([], dtype=np.int64), np.empty((0, 0))

    batch_predictions = []
    batch_confidences = []
    # Iterate over slice offsets with a dedicated loop variable instead of
    # rebinding the `data` parameter inside its own loop.
    for start in range(0, len(data), batch_size):
        preds, confs = pipeline(data[start:start + batch_size])
        batch_predictions.append(preds.numpy())
        batch_confidences.append(confs.numpy())
    return np.hstack(batch_predictions), np.vstack(batch_confidences)

Some weights of the model checkpoint at albert-large-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You s

In [7]:
# Evaluate the seed-5 SWA checkpoint on the CheckList suite and log to W&B.
# `checkpoint_path` comes from the model-loading cell above; `test_suite_path`
# must already be defined by an earlier cell.
# NOTE(review): presumably save_test_results writes to `results_path`; its
# body is not visible from this section, so confirm.
results_path = "results/checklist/rs5-swa-linear-60-start2-drop-shuffle_3_testset_19_07_21.json"
config = {
    "project_name": "checklist_evaluation", 
    "run_name": "rs5-swa-linear-60-start2-drop-shuffle_3_testset_19_07_21",
    "model": "albert-large-v2", 
    "checkpoint": checkpoint_path, 
    "test_suite": test_suite_path,
    "results_path": results_path
}
# Start a new W&B run; the whole config dict is attached to it.
wandb.init(config=config, project=config["project_name"], name=config["run_name"])

# Reload the suite from disk for this model, then run every test through
# `pred_and_conf`.
# NOTE(review): overwrite=True / seed=0 are CheckList options; presumably they
# replace results stored in the loaded suite and fix sampling. Confirm.
test_suite = TestSuite.from_file(test_suite_path)
test_suite.run(pred_and_conf, overwrite=True, seed=0)
save_test_results(config, test_suite)

VBox(children=(Label(value=' 4.67MB of 4.67MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test,"used to, but now"
_runtime,91
_timestamp,1631610025
_step,17


0,1
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██


[34m[1mwandb[0m: wandb version 0.12.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Running Movie sentiments
Predicting 58 examples
Running Single positive words
Predicting 22 examples
Running Single negative words
Predicting 14 examples
Running Sentiment-laden words in context
Predicting 1350 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5000 examples
Running Simple negations: negative
Predicting 1350 examples
Running Hard: Negation of positive with neutral stuff in the middle (should be negative)
Predicting 500 examples
Running Movie genre specific sentiments
Predicting 736 examples
Running Change names
Predicting 1617 examples
Running Polarizing Negative Names - Positive Instances
Predicting 1727 examples
Running Polarizing Positive Names - Negative Instances
Predicting 1353 examples
Running Polarizing Negative Names - Negative Instances
Predicting 1353 examples
Running Polarizing Positive Names - Positive Instances
Predicting 1727 examples
Running Change Movie Industries
Predicting 252 examples
Running Movie

In [8]:
# Print the CheckList report for the most recently run `test_suite`:
# per test, the number of test cases, the fail count/rate and example fails.
test_suite.summary()

Vocabulary

Single positive words
Test cases:      22
Fails (rate):    0 (0.0%)


Single negative words
Test cases:      14
Fails (rate):    0 (0.0%)


Sentiment-laden words in context
Test cases:      1350
Fails (rate):    0 (0.0%)


add positive phrases
Test cases:      500
Fails (rate):    1 (0.2%)

Example fails:
0.9 Is n't it great ?
0.0 Is n't it great. I would watch this again.
0.1 Is n't it great. I value it.

----


add negative phrases
Test cases:      500
Fails (rate):    9 (1.8%)

Example fails:
0.0 You would be better off investing in the worthy EMI recording that serves as the soundtrack , or the home video of the 1992 Malfitano-Domingo production .
0.1 You would be better off investing in the worthy EMI recording that serves as the soundtrack , or the home video of the 1992 Malfitano-Domingo production. I regret it.

----
0.0 Schindler 's List it ai n't .
0.2 Schindler 's List it ai n't. I regret it.

----
0.1 This pep-talk for faith , hope and charity does little to off

#### Random Seed 6 - Vanilla

In [9]:
# Model under evaluation: random seed 6, vanilla (non-SWA) training run.
model_name = "albert-large-v2"
checkpoint_path = "model-outputs/final-models/rs6-shuffle-train/albert-large-v2_2.pt"
# from_model_name builds the tokenizer and sequence-classification model from
# `model_name`, then loads the checkpoint weights with strict=False (so keys
# that do not match are ignored rather than raising) and moves the model to
# the given device.
# NOTE(review): this rebinds `pipeline`, shadowing the `pipeline` function
# imported from transformers at the top of the notebook.
pipeline = BatchedInference.from_model_name(
    model_name, checkpoint_path=checkpoint_path, device="cuda"
)

def pred_and_conf(data, batch_size=32):
    """Run the notebook-level `pipeline` over `data` in mini-batches.

    This is the prediction callback handed to `test_suite.run(...)`.

    Args:
        data: Sequence of input texts. It is cut into consecutive slices of
            at most `batch_size` items and each slice is passed to `pipeline`.
        batch_size: Maximum number of texts per `pipeline` call (>= 1).

    Returns:
        Tuple `(predictions, confidences)` of NumPy arrays in input order:
        the per-batch predictions joined with `np.hstack` and the per-batch
        confidences stacked row-wise with `np.vstack`. For empty `data` two
        empty arrays are returned; the confidences then have shape `(0, 0)`
        because the number of classes is unknown without running a batch.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if len(data) == 0:
        # np.hstack / np.vstack raise on an empty list, so short-circuit.
        return np.array([], dtype=np.int64), np.empty((0, 0))

    batch_predictions = []
    batch_confidences = []
    # Iterate over slice offsets with a dedicated loop variable instead of
    # rebinding the `data` parameter inside its own loop.
    for start in range(0, len(data), batch_size):
        preds, confs = pipeline(data[start:start + batch_size])
        batch_predictions.append(preds.numpy())
        batch_confidences.append(confs.numpy())
    return np.hstack(batch_predictions), np.vstack(batch_confidences)

Some weights of the model checkpoint at albert-large-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You s

In [10]:
# Evaluate the seed-6 vanilla checkpoint on the CheckList suite and log to W&B.
# `checkpoint_path` comes from the model-loading cell above; `test_suite_path`
# must already be defined by an earlier cell.
# NOTE(review): presumably save_test_results writes to `results_path`; its
# body is not visible from this section, so confirm.
results_path = "results/checklist/rs6-shuffle-train_2_testset_19_07_21.json"
config = {
    "project_name": "checklist_evaluation", 
    "run_name": "rs6-shuffle-train_2_testset_19_07_21",
    "model": "albert-large-v2", 
    "checkpoint": checkpoint_path, 
    "test_suite": test_suite_path,
    "results_path": results_path
}
# Start a new W&B run; the whole config dict is attached to it.
wandb.init(config=config, project=config["project_name"], name=config["run_name"])

# Reload the suite from disk for this model, then run every test through
# `pred_and_conf`.
# NOTE(review): overwrite=True / seed=0 are CheckList options; presumably they
# replace results stored in the loaded suite and fix sampling. Confirm.
test_suite = TestSuite.from_file(test_suite_path)
test_suite.run(pred_and_conf, overwrite=True, seed=0)
save_test_results(config, test_suite)

VBox(children=(Label(value=' 4.60MB of 4.60MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test,"used to, but now"
_runtime,90
_timestamp,1631610269
_step,17


0,1
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██


[34m[1mwandb[0m: wandb version 0.12.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Running Movie sentiments
Predicting 58 examples
Running Single positive words
Predicting 22 examples
Running Single negative words
Predicting 14 examples
Running Sentiment-laden words in context
Predicting 1350 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5000 examples
Running Simple negations: negative
Predicting 1350 examples
Running Hard: Negation of positive with neutral stuff in the middle (should be negative)
Predicting 500 examples
Running Movie genre specific sentiments
Predicting 736 examples
Running Change names
Predicting 1617 examples
Running Polarizing Negative Names - Positive Instances
Predicting 1727 examples
Running Polarizing Positive Names - Negative Instances
Predicting 1353 examples
Running Polarizing Negative Names - Negative Instances
Predicting 1353 examples
Running Polarizing Positive Names - Positive Instances
Predicting 1727 examples
Running Change Movie Industries
Predicting 252 examples
Running Movie

In [11]:
# Print the CheckList report for the most recently run `test_suite`:
# per test, the number of test cases, the fail count/rate and example fails.
test_suite.summary()

Vocabulary

Single positive words
Test cases:      22
Fails (rate):    0 (0.0%)


Single negative words
Test cases:      14
Fails (rate):    0 (0.0%)


Sentiment-laden words in context
Test cases:      1350
Fails (rate):    0 (0.0%)


add positive phrases
Test cases:      500
Fails (rate):    0 (0.0%)


add negative phrases
Test cases:      500
Fails (rate):    11 (2.2%)

Example fails:
0.1 And in truth , cruel as it may sound , he makes Arnold Schwarzenegger look like Spencer Tracy .
0.2 And in truth , cruel as it may sound , he makes Arnold Schwarzenegger look like Spencer Tracy. I abhor it.
0.2 And in truth , cruel as it may sound , he makes Arnold Schwarzenegger look like Spencer Tracy. I despise it.

----
0.2 ( Director ) Byler may yet have a great movie in him , but Charlotte Sometimes is only half of one .
0.3 ( Director ) Byler may yet have a great movie in him , but Charlotte Sometimes is only half of one. I dread it.

----
0.1 Though the book runs only about 300 pages , it is

#### Random Seed 6 - SWA

In [12]:
# Model under evaluation: random seed 6, SWA training run.
model_name = "albert-large-v2"
checkpoint_path = "model-outputs/final-models/rs6-swa-linear-60-start2-drop-shuffle/albert-large-v2_7.pt"
# from_model_name builds the tokenizer and sequence-classification model from
# `model_name`, then loads the checkpoint weights with strict=False (so keys
# that do not match are ignored rather than raising) and moves the model to
# the given device.
# NOTE(review): this rebinds `pipeline`, shadowing the `pipeline` function
# imported from transformers at the top of the notebook.
pipeline = BatchedInference.from_model_name(
    model_name, checkpoint_path=checkpoint_path, device="cuda"
)

def pred_and_conf(data, batch_size=32):
    """Run the module-level ``pipeline`` over ``data`` in mini-batches.

    Args:
        data: Sequence of inputs. It is sliced into chunks and each chunk is
            passed to ``pipeline`` unchanged.
        batch_size: Maximum number of items handed to ``pipeline`` per call.

    Returns:
        Tuple ``(predictions, confidences)`` of NumPy arrays: the per-batch
        predictions joined with ``np.hstack`` and the per-batch confidences
        stacked row-wise with ``np.vstack``. For empty ``data`` both arrays
        are empty (``confidences`` then has shape ``(0, 0)`` because the
        number of classes cannot be inferred without a prediction).
    """
    # np.hstack / np.vstack raise ValueError on an empty list, so answer the
    # no-input case directly instead of crashing.
    if len(data) == 0:
        return np.array([], dtype=np.int64), np.empty((0, 0))

    predictions = []
    confidences = []
    for start in range(0, len(data), batch_size):
        # Keep the chunk in its own name so `data` is never rebound mid-loop.
        batch = data[start:start + batch_size]
        # `pipeline` is resolved at call time, so the most recently created
        # inference wrapper is the one that gets used.
        preds, confs = pipeline(batch)
        predictions.append(preds.numpy().tolist())
        confidences.append(confs.numpy())
    return np.hstack(predictions), np.vstack(confidences)

Some weights of the model checkpoint at albert-large-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You s

In [13]:
# Single source of truth for the run name; the results file is named after it.
run_name = "rs6-swa-linear-60-start2-drop-shuffle_7_testset_19_07_21"
results_path = "results/checklist/" + run_name + ".json"

# Run metadata; passed to wandb.init as the run config and to save_test_results.
# `checkpoint_path` and `test_suite_path` come from earlier cells.
config = {
    "project_name": "checklist_evaluation",
    "run_name": run_name,
    "model": "albert-large-v2",
    "checkpoint": checkpoint_path,
    "test_suite": test_suite_path,
    "results_path": results_path,
}
wandb.init(config=config, project=config["project_name"], name=config["run_name"])

# Load the suite from disk, run it with `pred_and_conf` as the prediction
# function, and hand the populated suite to save_test_results.
test_suite = TestSuite.from_file(test_suite_path)
test_suite.run(pred_and_conf, overwrite=True, seed=0)
save_test_results(config, test_suite)

# Close the run explicitly instead of relying on a later wandb.init() to end it.
wandb.finish()

VBox(children=(Label(value=' 4.63MB of 4.63MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test,"used to, but now"
_runtime,90
_timestamp,1631610499
_step,17


0,1
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██


[34m[1mwandb[0m: wandb version 0.12.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Running Movie sentiments
Predicting 58 examples
Running Single positive words
Predicting 22 examples
Running Single negative words
Predicting 14 examples
Running Sentiment-laden words in context
Predicting 1350 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5000 examples
Running Simple negations: negative
Predicting 1350 examples
Running Hard: Negation of positive with neutral stuff in the middle (should be negative)
Predicting 500 examples
Running Movie genre specific sentiments
Predicting 736 examples
Running Change names
Predicting 1617 examples
Running Polarizing Negative Names - Positive Instances
Predicting 1727 examples
Running Polarizing Positive Names - Negative Instances
Predicting 1353 examples
Running Polarizing Negative Names - Negative Instances
Predicting 1353 examples
Running Polarizing Positive Names - Positive Instances
Predicting 1727 examples
Running Change Movie Industries
Predicting 252 examples
Running Movie

In [14]:
# Print the per-test report: number of test cases, failure count/rate and example failures.
test_suite.summary()

Vocabulary

Single positive words
Test cases:      22
Fails (rate):    0 (0.0%)


Single negative words
Test cases:      14
Fails (rate):    0 (0.0%)


Sentiment-laden words in context
Test cases:      1350
Fails (rate):    0 (0.0%)


add positive phrases
Test cases:      500
Fails (rate):    1 (0.2%)

Example fails:
1.0 Is n't it great ?
0.0 Is n't it great. I would watch this again.
0.0 Is n't it great. I value it.

----


add negative phrases
Test cases:      500
Fails (rate):    3 (0.6%)

Example fails:
0.0 Director Dirk Shafer and co-writer Greg Hinton ride the dubious divide where gay porn reaches for serious drama .
0.1 Director Dirk Shafer and co-writer Greg Hinton ride the dubious divide where gay porn reaches for serious drama. I abhor it.

----
0.0 Roger Michell , who did an appealing job directing Persuasion and Notting Hill in England , gets too artsy in his American debut .
0.1 Roger Michell , who did an appealing job directing Persuasion and Notting Hill in England , get

#### Random Seed 7 - Vanilla

In [15]:
# Random Seed 7 - Vanilla: base model name and the fine-tuned checkpoint to evaluate.
model_name = "albert-large-v2"
checkpoint_path = "model-outputs/final-models/rs7-shuffle-train/albert-large-v2_2.pt"

# `pred_and_conf` calls this module-level name. Note that it also shadows the
# `pipeline` function imported from transformers at the top of the notebook.
pipeline = BatchedInference.from_model_name(
    model_name=model_name,
    checkpoint_path=checkpoint_path,
    device="cuda",
)

def pred_and_conf(data, batch_size=32):
    """Run the module-level ``pipeline`` over ``data`` in mini-batches.

    Args:
        data: Sequence of inputs. It is sliced into chunks and each chunk is
            passed to ``pipeline`` unchanged.
        batch_size: Maximum number of items handed to ``pipeline`` per call.

    Returns:
        Tuple ``(predictions, confidences)`` of NumPy arrays: the per-batch
        predictions joined with ``np.hstack`` and the per-batch confidences
        stacked row-wise with ``np.vstack``. For empty ``data`` both arrays
        are empty (``confidences`` then has shape ``(0, 0)`` because the
        number of classes cannot be inferred without a prediction).
    """
    # np.hstack / np.vstack raise ValueError on an empty list, so answer the
    # no-input case directly instead of crashing.
    if len(data) == 0:
        return np.array([], dtype=np.int64), np.empty((0, 0))

    predictions = []
    confidences = []
    for start in range(0, len(data), batch_size):
        # Keep the chunk in its own name so `data` is never rebound mid-loop.
        batch = data[start:start + batch_size]
        # `pipeline` is resolved at call time, so the most recently created
        # inference wrapper is the one that gets used.
        preds, confs = pipeline(batch)
        predictions.append(preds.numpy().tolist())
        confidences.append(confs.numpy())
    return np.hstack(predictions), np.vstack(confidences)

Some weights of the model checkpoint at albert-large-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You s

In [16]:
# Single source of truth for the run name; the results file is named after it.
run_name = "rs7-shuffle-train_2_testset_19_07_21"
results_path = "results/checklist/" + run_name + ".json"

# Run metadata; passed to wandb.init as the run config and to save_test_results.
# `checkpoint_path` and `test_suite_path` come from earlier cells.
config = {
    "project_name": "checklist_evaluation",
    "run_name": run_name,
    "model": "albert-large-v2",
    "checkpoint": checkpoint_path,
    "test_suite": test_suite_path,
    "results_path": results_path,
}
wandb.init(config=config, project=config["project_name"], name=config["run_name"])

# Load the suite from disk, run it with `pred_and_conf` as the prediction
# function, and hand the populated suite to save_test_results.
test_suite = TestSuite.from_file(test_suite_path)
test_suite.run(pred_and_conf, overwrite=True, seed=0)
save_test_results(config, test_suite)

# Close the run explicitly instead of relying on a later wandb.init() to end it.
wandb.finish()

VBox(children=(Label(value=' 4.62MB of 4.62MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test,"used to, but now"
_runtime,90
_timestamp,1631611094
_step,17


0,1
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██


[34m[1mwandb[0m: wandb version 0.12.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Running Movie sentiments
Predicting 58 examples
Running Single positive words
Predicting 22 examples
Running Single negative words
Predicting 14 examples
Running Sentiment-laden words in context
Predicting 1350 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5000 examples
Running Simple negations: negative
Predicting 1350 examples
Running Hard: Negation of positive with neutral stuff in the middle (should be negative)
Predicting 500 examples
Running Movie genre specific sentiments
Predicting 736 examples
Running Change names
Predicting 1617 examples
Running Polarizing Negative Names - Positive Instances
Predicting 1727 examples
Running Polarizing Positive Names - Negative Instances
Predicting 1353 examples
Running Polarizing Negative Names - Negative Instances
Predicting 1353 examples
Running Polarizing Positive Names - Positive Instances
Predicting 1727 examples
Running Change Movie Industries
Predicting 252 examples
Running Movie

In [17]:
# Print the per-test report: number of test cases, failure count/rate and example failures.
test_suite.summary()

Vocabulary

Single positive words
Test cases:      22
Fails (rate):    0 (0.0%)


Single negative words
Test cases:      14
Fails (rate):    0 (0.0%)


Sentiment-laden words in context
Test cases:      1350
Fails (rate):    0 (0.0%)


add positive phrases
Test cases:      500
Fails (rate):    0 (0.0%)


add negative phrases
Test cases:      500
Fails (rate):    2 (0.4%)

Example fails:
0.0 Too bad the former Murphy Brown does n't pop Reese back .
0.1 Too bad the former Murphy Brown does n't pop Reese back. I regret it.

----
0.9 This is such a high-energy movie where the drumming and the marching are so excellent , who cares if the story 's a little weak .
0.9 This is such a high-energy movie where the drumming and the marching are so excellent , who cares if the story 's a little weak. I dread it.

----


change neutral words with BERT
Test cases:      500
Fails (rate):    39 (7.8%)

Example fails:
0.9 Disney 's live-action division has a history of releasing cinematic flotsam , but t

#### Random Seed 7 - SWA

In [18]:
# Random Seed 7 - SWA: base model name and the fine-tuned checkpoint to evaluate.
model_name = "albert-large-v2"
checkpoint_path = "model-outputs/final-models/rs7-swa-linear-60-start2-drop-shuffle/albert-large-v2_6.pt"

# `pred_and_conf` calls this module-level name. Note that it also shadows the
# `pipeline` function imported from transformers at the top of the notebook.
pipeline = BatchedInference.from_model_name(
    model_name=model_name,
    checkpoint_path=checkpoint_path,
    device="cuda",
)

def pred_and_conf(data, batch_size=32):
    """Run the module-level ``pipeline`` over ``data`` in mini-batches.

    Args:
        data: Sequence of inputs. It is sliced into chunks and each chunk is
            passed to ``pipeline`` unchanged.
        batch_size: Maximum number of items handed to ``pipeline`` per call.

    Returns:
        Tuple ``(predictions, confidences)`` of NumPy arrays: the per-batch
        predictions joined with ``np.hstack`` and the per-batch confidences
        stacked row-wise with ``np.vstack``. For empty ``data`` both arrays
        are empty (``confidences`` then has shape ``(0, 0)`` because the
        number of classes cannot be inferred without a prediction).
    """
    # np.hstack / np.vstack raise ValueError on an empty list, so answer the
    # no-input case directly instead of crashing.
    if len(data) == 0:
        return np.array([], dtype=np.int64), np.empty((0, 0))

    predictions = []
    confidences = []
    for start in range(0, len(data), batch_size):
        # Keep the chunk in its own name so `data` is never rebound mid-loop.
        batch = data[start:start + batch_size]
        # `pipeline` is resolved at call time, so the most recently created
        # inference wrapper is the one that gets used.
        preds, confs = pipeline(batch)
        predictions.append(preds.numpy().tolist())
        confidences.append(confs.numpy())
    return np.hstack(predictions), np.vstack(confidences)

Some weights of the model checkpoint at albert-large-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You s

In [19]:
# Single source of truth for the run name; the results file is named after it.
run_name = "rs7-swa-linear-60-start2-drop-shuffle_6_testset_19_07_21"
results_path = "results/checklist/" + run_name + ".json"

# Run metadata; passed to wandb.init as the run config and to save_test_results.
# `checkpoint_path` and `test_suite_path` come from earlier cells.
config = {
    "project_name": "checklist_evaluation",
    "run_name": run_name,
    "model": "albert-large-v2",
    "checkpoint": checkpoint_path,
    "test_suite": test_suite_path,
    "results_path": results_path,
}
wandb.init(config=config, project=config["project_name"], name=config["run_name"])

# Load the suite from disk, run it with `pred_and_conf` as the prediction
# function, and hand the populated suite to save_test_results.
test_suite = TestSuite.from_file(test_suite_path)
test_suite.run(pred_and_conf, overwrite=True, seed=0)
save_test_results(config, test_suite)

# Close the run explicitly instead of relying on a later wandb.init() to end it.
wandb.finish()

VBox(children=(Label(value=' 4.60MB of 4.60MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test,"used to, but now"
_runtime,92
_timestamp,1631611217
_step,17


0,1
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██


[34m[1mwandb[0m: wandb version 0.12.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Running Movie sentiments
Predicting 58 examples
Running Single positive words
Predicting 22 examples
Running Single negative words
Predicting 14 examples
Running Sentiment-laden words in context
Predicting 1350 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5000 examples
Running Simple negations: negative
Predicting 1350 examples
Running Hard: Negation of positive with neutral stuff in the middle (should be negative)
Predicting 500 examples
Running Movie genre specific sentiments
Predicting 736 examples
Running Change names
Predicting 1617 examples
Running Polarizing Negative Names - Positive Instances
Predicting 1727 examples
Running Polarizing Positive Names - Negative Instances
Predicting 1353 examples
Running Polarizing Negative Names - Negative Instances
Predicting 1353 examples
Running Polarizing Positive Names - Positive Instances
Predicting 1727 examples
Running Change Movie Industries
Predicting 252 examples
Running Movie

In [20]:
# Print the per-test report: number of test cases, failure count/rate and example failures.
test_suite.summary()

Vocabulary

Single positive words
Test cases:      22
Fails (rate):    0 (0.0%)


Single negative words
Test cases:      14
Fails (rate):    0 (0.0%)


Sentiment-laden words in context
Test cases:      1350
Fails (rate):    0 (0.0%)


add positive phrases
Test cases:      500
Fails (rate):    0 (0.0%)


add negative phrases
Test cases:      500
Fails (rate):    1 (0.2%)

Example fails:
0.0 Schindler 's List it ai n't .
0.2 Schindler 's List it ai n't. I regret it.

----


change neutral words with BERT
Test cases:      500
Fails (rate):    34 (6.8%)

Example fails:
0.8 Like a south-of-the-border Melrose Place .
0.1 Like another south-of-the-border Melrose Place .
0.5 Like in south-of-the-border Melrose Place .

----
1.0 Workmanlike , maybe , but still a film with all the elements that made the other three great , scary times at the movies .
0.0 Workmanlike , maybe , but still a film lacking all the elements that made the other three great , scary times at the movies .

----
1.0 In its 

#### Random Seed 8 - Vanilla

In [21]:
# Random Seed 8 - Vanilla: base model name and the fine-tuned checkpoint to evaluate.
model_name = "albert-large-v2"
checkpoint_path = "model-outputs/final-models/rs8-shuffle-train/albert-large-v2_3.pt"

# `pred_and_conf` calls this module-level name. Note that it also shadows the
# `pipeline` function imported from transformers at the top of the notebook.
pipeline = BatchedInference.from_model_name(
    model_name=model_name,
    checkpoint_path=checkpoint_path,
    device="cuda",
)

def pred_and_conf(data, batch_size=32):
    """Run the module-level ``pipeline`` over ``data`` in mini-batches.

    Args:
        data: Sequence of inputs. It is sliced into chunks and each chunk is
            passed to ``pipeline`` unchanged.
        batch_size: Maximum number of items handed to ``pipeline`` per call.

    Returns:
        Tuple ``(predictions, confidences)`` of NumPy arrays: the per-batch
        predictions joined with ``np.hstack`` and the per-batch confidences
        stacked row-wise with ``np.vstack``. For empty ``data`` both arrays
        are empty (``confidences`` then has shape ``(0, 0)`` because the
        number of classes cannot be inferred without a prediction).
    """
    # np.hstack / np.vstack raise ValueError on an empty list, so answer the
    # no-input case directly instead of crashing.
    if len(data) == 0:
        return np.array([], dtype=np.int64), np.empty((0, 0))

    predictions = []
    confidences = []
    for start in range(0, len(data), batch_size):
        # Keep the chunk in its own name so `data` is never rebound mid-loop.
        batch = data[start:start + batch_size]
        # `pipeline` is resolved at call time, so the most recently created
        # inference wrapper is the one that gets used.
        preds, confs = pipeline(batch)
        predictions.append(preds.numpy().tolist())
        confidences.append(confs.numpy())
    return np.hstack(predictions), np.vstack(confidences)

Some weights of the model checkpoint at albert-large-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You s

In [22]:
# Single source of truth for the run name; the results file is named after it.
run_name = "rs8-shuffle-train_3_testset_19_07_21"
results_path = "results/checklist/" + run_name + ".json"

# Run metadata; passed to wandb.init as the run config and to save_test_results.
# `checkpoint_path` and `test_suite_path` come from earlier cells.
config = {
    "project_name": "checklist_evaluation",
    "run_name": run_name,
    "model": "albert-large-v2",
    "checkpoint": checkpoint_path,
    "test_suite": test_suite_path,
    "results_path": results_path,
}
wandb.init(config=config, project=config["project_name"], name=config["run_name"])

# Load the suite from disk, run it with `pred_and_conf` as the prediction
# function, and hand the populated suite to save_test_results.
test_suite = TestSuite.from_file(test_suite_path)
test_suite.run(pred_and_conf, overwrite=True, seed=0)
save_test_results(config, test_suite)

# Close the run explicitly instead of relying on a later wandb.init() to end it.
wandb.finish()

VBox(children=(Label(value=' 4.63MB of 4.63MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test,"used to, but now"
_runtime,92
_timestamp,1631611325
_step,17


0,1
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁██████
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁██████
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██


[34m[1mwandb[0m: wandb version 0.12.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Running Movie sentiments
Predicting 58 examples
Running Single positive words
Predicting 22 examples
Running Single negative words
Predicting 14 examples
Running Sentiment-laden words in context
Predicting 1350 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5000 examples
Running Simple negations: negative
Predicting 1350 examples
Running Hard: Negation of positive with neutral stuff in the middle (should be negative)
Predicting 500 examples
Running Movie genre specific sentiments
Predicting 736 examples
Running Change names
Predicting 1617 examples
Running Polarizing Negative Names - Positive Instances
Predicting 1727 examples
Running Polarizing Positive Names - Negative Instances
Predicting 1353 examples
Running Polarizing Negative Names - Negative Instances
Predicting 1353 examples
Running Polarizing Positive Names - Positive Instances
Predicting 1727 examples
Running Change Movie Industries
Predicting 252 examples
Running Movie

In [23]:
# Print the per-test report: number of test cases, failure count/rate and example failures.
test_suite.summary()

Vocabulary

Single positive words
Test cases:      22
Fails (rate):    0 (0.0%)


Single negative words
Test cases:      14
Fails (rate):    0 (0.0%)


Sentiment-laden words in context
Test cases:      1350
Fails (rate):    0 (0.0%)


add positive phrases
Test cases:      500
Fails (rate):    1 (0.2%)

Example fails:
0.9 Is n't it great ?
0.0 Is n't it great. I would watch this again.
0.4 Is n't it great. I recommend it.

----


add negative phrases
Test cases:      500
Fails (rate):    2 (0.4%)

Example fails:
0.1 The only thing that could possibly make them less interesting than they already are is for them to get full montied into a scrappy , jovial team .
0.2 The only thing that could possibly make them less interesting than they already are is for them to get full montied into a scrappy , jovial team. Never watching this again.

----
0.0 Jolie 's performance vanishes somewhere between her hair and her lips .
0.1 Jolie 's performance vanishes somewhere between her hair and her lips

#### Random Seed 8 - SWA

In [24]:
# Random Seed 8 - SWA: base model name and the fine-tuned checkpoint to evaluate.
model_name = "albert-large-v2"
checkpoint_path = "model-outputs/final-models/rs8-swa-linear-60-start2-drop-shuffle/albert-large-v2_4.pt"

# `pred_and_conf` calls this module-level name. Note that it also shadows the
# `pipeline` function imported from transformers at the top of the notebook.
pipeline = BatchedInference.from_model_name(
    model_name=model_name,
    checkpoint_path=checkpoint_path,
    device="cuda",
)

def pred_and_conf(data, batch_size=32):
    """Run the module-level ``pipeline`` over ``data`` in mini-batches.

    Args:
        data: Sequence of inputs. It is sliced into chunks and each chunk is
            passed to ``pipeline`` unchanged.
        batch_size: Maximum number of items handed to ``pipeline`` per call.

    Returns:
        Tuple ``(predictions, confidences)`` of NumPy arrays: the per-batch
        predictions joined with ``np.hstack`` and the per-batch confidences
        stacked row-wise with ``np.vstack``. For empty ``data`` both arrays
        are empty (``confidences`` then has shape ``(0, 0)`` because the
        number of classes cannot be inferred without a prediction).
    """
    # np.hstack / np.vstack raise ValueError on an empty list, so answer the
    # no-input case directly instead of crashing.
    if len(data) == 0:
        return np.array([], dtype=np.int64), np.empty((0, 0))

    predictions = []
    confidences = []
    for start in range(0, len(data), batch_size):
        # Keep the chunk in its own name so `data` is never rebound mid-loop.
        batch = data[start:start + batch_size]
        # `pipeline` is resolved at call time, so the most recently created
        # inference wrapper is the one that gets used.
        preds, confs = pipeline(batch)
        predictions.append(preds.numpy().tolist())
        confidences.append(confs.numpy())
    return np.hstack(predictions), np.vstack(confidences)

Some weights of the model checkpoint at albert-large-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You s

In [25]:
# Single source of truth for the run name; the results file is named after it.
run_name = "rs8-swa-linear-60-start2-drop-shuffle_4_testset_19_07_21"
results_path = "results/checklist/" + run_name + ".json"

# Run metadata; passed to wandb.init as the run config and to save_test_results.
# `checkpoint_path` and `test_suite_path` come from earlier cells.
config = {
    "project_name": "checklist_evaluation",
    "run_name": run_name,
    "model": "albert-large-v2",
    "checkpoint": checkpoint_path,
    "test_suite": test_suite_path,
    "results_path": results_path,
}
wandb.init(config=config, project=config["project_name"], name=config["run_name"])

# Load the suite from disk, run it with `pred_and_conf` as the prediction
# function, and hand the populated suite to save_test_results.
test_suite = TestSuite.from_file(test_suite_path)
test_suite.run(pred_and_conf, overwrite=True, seed=0)
save_test_results(config, test_suite)

# Close the run explicitly instead of relying on a later wandb.init() to end it.
wandb.finish()

VBox(children=(Label(value=' 4.62MB of 4.62MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test,"used to, but now"
_runtime,91
_timestamp,1631611463
_step,17


0,1
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██


[34m[1mwandb[0m: wandb version 0.12.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Running Movie sentiments
Predicting 58 examples
Running Single positive words
Predicting 22 examples
Running Single negative words
Predicting 14 examples
Running Sentiment-laden words in context
Predicting 1350 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5000 examples
Running Simple negations: negative
Predicting 1350 examples
Running Hard: Negation of positive with neutral stuff in the middle (should be negative)
Predicting 500 examples
Running Movie genre specific sentiments
Predicting 736 examples
Running Change names
Predicting 1617 examples
Running Polarizing Negative Names - Positive Instances
Predicting 1727 examples
Running Polarizing Positive Names - Negative Instances
Predicting 1353 examples
Running Polarizing Negative Names - Negative Instances
Predicting 1353 examples
Running Polarizing Positive Names - Positive Instances
Predicting 1727 examples
Running Change Movie Industries
Predicting 252 examples
Running Movie

In [26]:
# Print the per-test report: number of test cases, failure count/rate and example failures.
test_suite.summary()

Vocabulary

Single positive words
Test cases:      22
Fails (rate):    0 (0.0%)


Single negative words
Test cases:      14
Fails (rate):    0 (0.0%)


Sentiment-laden words in context
Test cases:      1350
Fails (rate):    0 (0.0%)


add positive phrases
Test cases:      500
Fails (rate):    0 (0.0%)


add negative phrases
Test cases:      500
Fails (rate):    0 (0.0%)


change neutral words with BERT
Test cases:      500
Fails (rate):    46 (9.2%)

Example fails:
1.0 The production values are up there .
0.0 The production values are up below .
0.3 The production values are up by .

----
1.0 Workmanlike , maybe , but still a film with all the elements that made the other three great , scary times at the movies .
0.0 Workmanlike , maybe , but still a film lacking all the elements that made the other three great , scary times at the movies .

----
1.0 Run , do n't walk , to see this barbed and bracing comedy on the big screen .
0.3 Run , do n't walk , to see more barbed and bracing come

#### Random Seed 9 - Vanilla

In [27]:
# Random Seed 9 - Vanilla: base model name and the fine-tuned checkpoint to evaluate.
model_name = "albert-large-v2"
checkpoint_path = "model-outputs/final-models/rs9-shuffle-train/albert-large-v2_4.pt"

# `pred_and_conf` calls this module-level name. Note that it also shadows the
# `pipeline` function imported from transformers at the top of the notebook.
pipeline = BatchedInference.from_model_name(
    model_name=model_name,
    checkpoint_path=checkpoint_path,
    device="cuda",
)

def pred_and_conf(data, batch_size=32):
    """Run the module-level ``pipeline`` over ``data`` in mini-batches.

    Args:
        data: Sequence of inputs. It is sliced into chunks and each chunk is
            passed to ``pipeline`` unchanged.
        batch_size: Maximum number of items handed to ``pipeline`` per call.

    Returns:
        Tuple ``(predictions, confidences)`` of NumPy arrays: the per-batch
        predictions joined with ``np.hstack`` and the per-batch confidences
        stacked row-wise with ``np.vstack``. For empty ``data`` both arrays
        are empty (``confidences`` then has shape ``(0, 0)`` because the
        number of classes cannot be inferred without a prediction).
    """
    # np.hstack / np.vstack raise ValueError on an empty list, so answer the
    # no-input case directly instead of crashing.
    if len(data) == 0:
        return np.array([], dtype=np.int64), np.empty((0, 0))

    predictions = []
    confidences = []
    for start in range(0, len(data), batch_size):
        # Keep the chunk in its own name so `data` is never rebound mid-loop.
        batch = data[start:start + batch_size]
        # `pipeline` is resolved at call time, so the most recently created
        # inference wrapper is the one that gets used.
        preds, confs = pipeline(batch)
        predictions.append(preds.numpy().tolist())
        confidences.append(confs.numpy())
    return np.hstack(predictions), np.vstack(confidences)

Some weights of the model checkpoint at albert-large-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You s

In [28]:
# Single source of truth for the run name; the results file is named after it.
run_name = "rs9-shuffle-train_4_testset_19_07_21"
results_path = "results/checklist/" + run_name + ".json"

# Run metadata; passed to wandb.init as the run config and to save_test_results.
# `checkpoint_path` and `test_suite_path` come from earlier cells.
config = {
    "project_name": "checklist_evaluation",
    "run_name": run_name,
    "model": "albert-large-v2",
    "checkpoint": checkpoint_path,
    "test_suite": test_suite_path,
    "results_path": results_path,
}
wandb.init(config=config, project=config["project_name"], name=config["run_name"])

# Load the suite from disk, run it with `pred_and_conf` as the prediction
# function, and hand the populated suite to save_test_results.
test_suite = TestSuite.from_file(test_suite_path)
test_suite.run(pred_and_conf, overwrite=True, seed=0)
save_test_results(config, test_suite)

# Close the run explicitly instead of relying on a later wandb.init() to end it.
wandb.finish()

VBox(children=(Label(value=' 4.61MB of 4.61MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test,"used to, but now"
_runtime,92
_timestamp,1631611562
_step,17


0,1
_runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_timestamp,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██


[34m[1mwandb[0m: wandb version 0.12.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Running Movie sentiments
Predicting 58 examples
Running Single positive words
Predicting 22 examples
Running Single negative words
Predicting 14 examples
Running Sentiment-laden words in context
Predicting 1350 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5000 examples
Running Simple negations: negative
Predicting 1350 examples
Running Hard: Negation of positive with neutral stuff in the middle (should be negative)
Predicting 500 examples
Running Movie genre specific sentiments
Predicting 736 examples
Running Change names
Predicting 1617 examples
Running Polarizing Negative Names - Positive Instances
Predicting 1727 examples
Running Polarizing Positive Names - Negative Instances
Predicting 1353 examples
Running Polarizing Negative Names - Negative Instances
Predicting 1353 examples
Running Polarizing Positive Names - Positive Instances
Predicting 1727 examples
Running Change Movie Industries
Predicting 252 examples
Running Movie

In [29]:
# Print the per-test report: number of test cases, failure count/rate and example failures.
test_suite.summary()

Vocabulary

Single positive words
Test cases:      22
Fails (rate):    0 (0.0%)


Single negative words
Test cases:      14
Fails (rate):    0 (0.0%)


Sentiment-laden words in context
Test cases:      1350
Fails (rate):    0 (0.0%)


add positive phrases
Test cases:      500
Fails (rate):    0 (0.0%)


add negative phrases
Test cases:      500
Fails (rate):    0 (0.0%)


change neutral words with BERT
Test cases:      500
Fails (rate):    34 (6.8%)

Example fails:
0.6 Like a south-of-the-border Melrose Place .
0.1 Like another south-of-the-border Melrose Place .
0.3 Like in south-of-the-border Melrose Place .

----
1.0 Workmanlike , maybe , but still a film with all the elements that made the other three great , scary times at the movies .
0.0 Workmanlike , maybe , but still a film lacking all the elements that made the other three great , scary times at the movies .

----
0.4 In its ragged , cheap and unassuming way , the movie works .
0.6 In its ragged , cheap and unassuming way , t

#### Random Seed 9 - SWA

In [30]:
# Random Seed 9 - SWA: base model name and the fine-tuned checkpoint to evaluate.
model_name = "albert-large-v2"
checkpoint_path = "model-outputs/final-models/rs9-swa-linear-75-start2-drop-shuffle/albert-large-v2_6.pt"

# `pred_and_conf` calls this module-level name. Note that it also shadows the
# `pipeline` function imported from transformers at the top of the notebook.
pipeline = BatchedInference.from_model_name(
    model_name=model_name,
    checkpoint_path=checkpoint_path,
    device="cuda",
)

def pred_and_conf(data, batch_size=32):
    """Run the global ``pipeline`` over ``data`` in mini-batches.

    Args:
        data: Sliceable sequence of inputs (sized with ``len`` and split with
            ``data[i:i + batch_size]``); each slice is passed to ``pipeline``.
        batch_size: Maximum number of inputs per ``pipeline`` call. Must be a
            positive integer.

    Returns:
        A ``(predictions, confidences)`` tuple. ``predictions`` is the
        ``np.hstack`` of the per-batch predictions (one entry per input) and
        ``confidences`` is the ``np.vstack`` of the per-batch confidence
        arrays (one row per input). For empty ``data`` both arrays are empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if len(data) == 0:
        # np.hstack / np.vstack reject an empty list of arrays, so return
        # empty results directly. The number of classes is unknown without a
        # model call, hence the (0, 0) confidence shape.
        return np.array([], dtype=np.int64), np.empty((0, 0), dtype=np.float32)

    predictions = []
    confidences = []
    # Iterate with a dedicated index rather than rebinding the `data` argument.
    for start in range(0, len(data), batch_size):
        preds, confs = pipeline(data[start:start + batch_size])
        predictions.append(preds.numpy().tolist())
        confidences.append(confs.numpy())
    return np.hstack(predictions), np.vstack(confidences)

Some weights of the model checkpoint at albert-large-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-large-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You s

In [31]:
# Describe this evaluation run: the same settings are recorded in the wandb
# run config and handed to `save_test_results` together with the suite.
results_path = "results/checklist/rs9-swa-linear-75-start2-drop-shuffle_6_testset_19_07_21.json"
config = dict(
    project_name="checklist_evaluation",
    run_name="rs9-swa-linear-75-start2-drop-shuffle_6_testset_19_07_21",
    model="albert-large-v2",
    checkpoint=checkpoint_path,
    test_suite=test_suite_path,
    results_path=results_path,
)
wandb.init(project=config["project_name"], name=config["run_name"], config=config)

# Load the suite from disk, run it with this section's predictor (replacing
# any stored results, fixed seed), then pass the outcome on for saving.
test_suite = TestSuite.from_file(test_suite_path)
test_suite.run(pred_and_conf, seed=0, overwrite=True)
save_test_results(config, test_suite)

VBox(children=(Label(value=' 4.59MB of 4.59MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
test,"used to, but now"
_runtime,93
_timestamp,1631611661
_step,17


0,1
_runtime,▁▁▁▁▁▁▁▁▁▁████████
_timestamp,▁▁▁▁▁▁▁▁▁▁████████
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██


[34m[1mwandb[0m: wandb version 0.12.1 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Running Movie sentiments
Predicting 58 examples
Running Single positive words
Predicting 22 examples
Running Single negative words
Predicting 14 examples
Running Sentiment-laden words in context
Predicting 1350 examples
Running add positive phrases
Predicting 5500 examples
Running add negative phrases
Predicting 5000 examples
Running Simple negations: negative
Predicting 1350 examples
Running Hard: Negation of positive with neutral stuff in the middle (should be negative)
Predicting 500 examples
Running Movie genre specific sentiments
Predicting 736 examples
Running Change names
Predicting 1617 examples
Running Polarizing Negative Names - Positive Instances
Predicting 1727 examples
Running Polarizing Positive Names - Negative Instances
Predicting 1353 examples
Running Polarizing Negative Names - Negative Instances
Predicting 1353 examples
Running Polarizing Positive Names - Positive Instances
Predicting 1727 examples
Running Change Movie Industries
Predicting 252 examples
Running Movie

In [32]:
# Print the per-test case counts, failure rates and example failures for the
# suite run in the previous cell.
test_suite.summary()

Vocabulary

Single positive words
Test cases:      22
Fails (rate):    0 (0.0%)


Single negative words
Test cases:      14
Fails (rate):    0 (0.0%)


Sentiment-laden words in context
Test cases:      1350
Fails (rate):    0 (0.0%)


add positive phrases
Test cases:      500
Fails (rate):    0 (0.0%)


add negative phrases
Test cases:      500
Fails (rate):    1 (0.2%)

Example fails:
0.1 In the end , Punch-Drunk Love is one of those films that I wanted to like much more than I actually did .
0.3 In the end , Punch-Drunk Love is one of those films that I wanted to like much more than I actually did. I dread it.

----


change neutral words with BERT
Test cases:      500
Fails (rate):    39 (7.8%)

Example fails:
0.1 Ecks this one off your must-see list .
0.6 Ecks just one off your must-see list .

----
0.4 Must be seen to be believed .
1.0 Must be seen must be believed .

----
0.9 Absorbing and disturbing -- perhaps more disturbing than originally intended -- but a little clarity woul