In [1]:
from relation_modeling_utils import load_data

train_df = load_data("data/atomic2020_data-feb2021/train.tsv", multi_label=True)
val_df = load_data("data/atomic2020_data-feb2021/dev.tsv", multi_label=True)
test_df = load_data("data/atomic2020_data-feb2021/test.tsv", multi_label=True)

In [6]:
from kogito.inference import CommonsenseInference
from kogito.core.processors.relation import SWEMRelationMatcher
import time

csi = CommonsenseInference()
csi.remove_processor("simple_relation_matcher")
swem_matcher = SWEMRelationMatcher("swem_relation_matcher")
csi.add_processor(swem_matcher)
heads = val_df.text.to_list()
start = time.time()
kgraph = csi.infer(heads=heads, dry_run=True)
end = time.time()
print(f"Took {(end-start)/len(heads)} seconds per head")
# kgraph.to_jsonl("kgraph_modelbased_relations_swem.json")

Matching relations...
Took 8.954379081726074 seconds per head


In [7]:
from kogito.inference import CommonsenseInference
from kogito.core.processors.relation import DistilBertRelationMatcher
import time

csi = CommonsenseInference()
csi.remove_processor("simple_relation_matcher")
dbert_matcher = DistilBertRelationMatcher("dbert_relation_matcher")
csi.add_processor(dbert_matcher)
heads = val_df.text.to_list()
start = time.time()
kgraph = csi.infer(heads=heads, dry_run=True)
end = time.time()
print(f"Took {(end-start)/len(heads)} seconds per head")
# kgraph.to_jsonl("kgraph_modelbased_relations_dbert.json")

Matching relations...


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


Predicting: 0it [00:00, ?it/s]

Took 16.970062732696533 seconds per head


In [12]:
len(train_df), len(val_df), len(test_df)

(36940, 2962, 6569)

In [13]:
train_df.head()

Unnamed: 0,text,label
0,PersonX abandons ___ altogether,"[0, 0, 1]"
1,PersonX abandons the ___ altogether,"[0, 1, 1]"
2,PersonX abolishes ___ altogether,"[0, 1, 1]"
3,PersonX abolishes ___ in the states,"[0, 1, 1]"
4,PersonX abolishes the ___ altogether,"[0, 1, 1]"


In [10]:
from relation_modeling_utils import explode_labels
train_df, val_df, test_df = explode_labels(train_df), explode_labels(val_df), explode_labels(test_df)

In [7]:
train_df.label_0.value_counts(), train_df.label_1.value_counts(), train_df.label_2.value_counts()

(0    22457
 1    14483
 Name: label_0, dtype: int64,
 0    18538
 1    18402
 Name: label_1, dtype: int64,
 1    21006
 0    15934
 Name: label_2, dtype: int64)

In [8]:
val_df.label_0.value_counts(), val_df.label_1.value_counts(), val_df.label_2.value_counts()

(0    2630
 1     332
 Name: label_0, dtype: int64,
 1    2263
 0     699
 Name: label_1, dtype: int64,
 1    2228
 0     734
 Name: label_2, dtype: int64)

In [9]:
test_df.label_0.value_counts(), test_df.label_1.value_counts(), test_df.label_2.value_counts()

(0    4668
 1    1901
 Name: label_0, dtype: int64,
 1    4419
 0    2150
 Name: label_1, dtype: int64,
 0    3996
 1    2573
 Name: label_2, dtype: int64)

In [18]:
train_df[(train_df.label_0 == 1) & (train_df.label_1 == 1)]

Unnamed: 0,text,label,label_0,label_1,label_2
19717,alcohol,"[1, 1, 0]",1,1,0
19771,bag,"[1, 1, 0]",1,1,0
19822,beer,"[1, 1, 0]",1,1,0
19915,call,"[1, 1, 0]",1,1,0
19930,car,"[1, 1, 0]",1,1,0
...,...,...,...,...,...
26118,subject,"[1, 1, 0]",1,1,0
26215,flirt,"[1, 1, 1]",1,1,1
26572,ice skate,"[1, 1, 0]",1,1,0
26592,open door,"[1, 1, 0]",1,1,0


In [91]:
from relation_modeling_utils import get_class_dist_report
train_df_report = get_class_dist_report(train_df)

In [92]:
train_df_report

{('class_0', 0): 0.6079317812669194,
 ('class_0', 'class_0', 0, 0): 0.6079317812669194,
 ('class_0', 'class_0', 0, 1): 0.0,
 ('class_0', 1): 0.3920682187330807,
 ('class_0', 'class_0', 1, 0): 0.0,
 ('class_0', 'class_0', 1, 1): 0.3920682187330807,
 ('class_0', 'class_1', 0, 0): 0.1147265836491608,
 ('class_0', 'class_1', 0, 1): 0.49320519761775855,
 ('class_0', 'class_1', 1, 0): 0.3871142393069843,
 ('class_0', 'class_1', 1, 1): 0.0049539794260963724,
 ('class_0', 'class_2', 0, 0): 0.040931239848402814,
 ('class_0', 'class_2', 0, 1): 0.5670005414185165,
 ('class_0', 'class_2', 1, 0): 0.3904168922577152,
 ('class_0', 'class_2', 1, 1): 0.0016513264753654576,
 ('class_1', 0): 0.5018408229561451,
 ('class_1', 'class_0', 0, 0): 0.1147265836491608,
 ('class_1', 'class_0', 0, 1): 0.3871142393069843,
 ('class_1', 1): 0.4981591770438549,
 ('class_1', 'class_0', 1, 0): 0.49320519761775855,
 ('class_1', 'class_0', 1, 1): 0.0049539794260963724,
 ('class_1', 'class_1', 0, 0): 0.5018408229561451,
 (

In [56]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("PersonX abandons ___ altogether".replace("_", "").replace("  ", " "))

for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children])

PersonX nsubj abandons VERB []
abandons ROOT abandons VERB [PersonX, altogether]
altogether advmod abandons VERB []


In [63]:
from relation_modeling_utils import HeuristicClassifier
heuristic_model = HeuristicClassifier()
train_preds = heuristic_model.predict(train_df)
val_preds = heuristic_model.predict(val_df)
test_preds = heuristic_model.predict(test_df)

100%|██████████| 36940/36940 [01:38<00:00, 373.90it/s]
100%|██████████| 2962/2962 [00:07<00:00, 371.91it/s]
100%|██████████| 6569/6569 [00:17<00:00, 377.47it/s]


In [65]:
from relation_modeling_utils import report_metrics
report_metrics(torch.tensor(train_preds, dtype=float), torch.tensor(np.asarray(train_df.label.to_list())))

Accuracy=0.832, precision=0.813, recall=0.855, f1=0.832


In [66]:
report_metrics(torch.tensor(val_preds, dtype=float), torch.tensor(np.asarray(val_df.label.to_list())))

Accuracy=0.792, precision=0.814, recall=0.849, f1=0.825


In [67]:
report_metrics(torch.tensor(test_preds, dtype=float), torch.tensor(np.asarray(test_df.label.to_list())))

Accuracy=0.771, precision=0.747, recall=0.843, f1=0.781


In [76]:
from relation_modeling_utils import create_vocab
train_vocab, val_vocab, test_vocab = create_vocab(train_df), create_vocab(val_df), create_vocab(test_df)

100%|██████████| 140049/140049 [00:00<00:00, 1285019.43it/s]
100%|██████████| 14524/14524 [00:00<00:00, 1351062.82it/s]
100%|██████████| 27270/27270 [00:00<00:00, 1325960.40it/s]


In [10]:
len(train_vocab.intersection(val_vocab)) / len(train_vocab), len(train_vocab.intersection(val_vocab)) / len(val_vocab)

(0.15235929505400797, 0.8725581395348837)

In [12]:
len(train_vocab.intersection(test_vocab)) / len(train_vocab), len(train_vocab.intersection(test_vocab)) / len(test_vocab)

(0.27109559002680095, 0.8000958772770853)

In [8]:
import pandas as pd
atomic_df = pd.read_csv("data/atomic/v4_atomic_all_agg.csv")

In [None]:
atomic_df.head()

In [10]:
train_texts = set(train_df.text.to_list())
ood_test = [event for event in atomic_df.event if event not in train_texts]

In [12]:
len(atomic_df)

24312

In [11]:
len(ood_test)

4617

In [None]:
ood_df = pd.DataFrame({'text': ood_test})
ood_vocab = create_vocab(ood_df)

In [14]:
len(train_vocab.intersection(ood_vocab)) / len(train_vocab), len(train_vocab.intersection(ood_vocab)) / len(ood_vocab)

(0.17363761877690245, 0.8737229260318757)

In [None]:
import nltk
nltk.download('wordnet')

In [17]:
import spacy
from spacy_wordnet.wordnet_annotator import WordnetAnnotator 

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe("spacy_wordnet", after='tagger', config={'lang': nlp.lang})

<spacy_wordnet.wordnet_annotator.WordnetAnnotator at 0x7ff0c52bcdc0>

In [45]:
token = nlp('offered')[0]
token._.wordnet.synsets()

[Synset('offer.v.01'),
 Synset('offer.v.02'),
 Synset('volunteer.v.02'),
 Synset('offer.v.04'),
 Synset('offer.v.05'),
 Synset('offer.v.06'),
 Synset('offer.v.07'),
 Synset('offer.v.08'),
 Synset('offer.v.09'),
 Synset('put_up.v.02'),
 Synset('extend.v.04'),
 Synset('propose.v.05'),
 Synset('offer.v.13')]

In [None]:
train_vocab.intersection(test_vocab)

In [68]:
import pandas as pd
transomcs_df = pd.read_csv("data/TransOMCS_full.txt", sep="\t", header=None, names=["head", "relation", "tail", "score"])

In [69]:
transomcs_df.head()

Unnamed: 0,head,relation,tail,score
0,student,AtLocation,school,1.0
1,building,AtLocation,city,1.0
2,sugar,AtLocation,coffee,1.0
3,government,AtLocation,city,1.0
4,school,AtLocation,city,1.0


In [70]:
len(transomcs_df)

18481607

In [14]:
transomcs_df[transomcs_df['score'] > 0.5]

Unnamed: 0,head,relation,tail,score
0,student,AtLocation,school,1.00
1,building,AtLocation,city,1.00
2,sugar,AtLocation,coffee,1.00
3,government,AtLocation,city,1.00
4,school,AtLocation,city,1.00
...,...,...,...,...
5370463,bribe,UsedFor,lend,0.51
5370464,jefferson,HasA,county,0.51
5370465,man,UsedFor,give to,0.51
5370466,sender,UsedFor,know,0.51


In [18]:
transomcs_df[transomcs_df.isna().any(axis=1)]

Unnamed: 0,head,relation,tail,score
38215,,ReceivesAction,fasten,0.99
73145,,InstanceOf,warranty,0.99
108082,work,ReceivesAction,,0.99
114095,,ReceivesAction,read,0.99
124334,,ReceivesAction,lure,0.99
...,...,...,...,...
18379073,,ReceivesAction,dereference,0.00
18379482,ping,ReceivesAction,,0.00
18406398,,InstanceOf,betrothal,0.00
18438389,uptight,InstanceOf,,0.00


In [71]:
transomcs_df = transomcs_df.dropna()

In [16]:
len(transomcs_df)

18480653

In [72]:
transomcs_df = transomcs_df[transomcs_df.score >= 0.5]

In [73]:
len(transomcs_df)

5534596

In [74]:
transomcs_df.duplicated(subset=['head']).any()

True

In [77]:
from kogito.core.relation import CONCEPTNET_TO_ATOMIC_MAP, PHYSICAL_RELATIONS, EVENT_RELATIONS, SOCIAL_RELATIONS
from collections import defaultdict

def relation_to_class(relation):
    if relation in PHYSICAL_RELATIONS:
        return 0
    
    if relation in EVENT_RELATIONS:
        return 1
    
    if relation in SOCIAL_RELATIONS:
        return 2
    
    return None

test_ood_samples = []
unrecognized_rels = set()
head_label_map = defaultdict(set)

for row in transomcs_df.itertuples():
    heads = row.head.split()
    if not any([head in train_vocab for head in heads]):
        rel_class = relation_to_class(row.relation)
        if rel_class is None:
            atomic_relations = CONCEPTNET_TO_ATOMIC_MAP.get(row.relation)
            if atomic_relations:
                if not isinstance(atomic_relations, list):
                    atomic_relations = [atomic_relations]
                
                for rel in atomic_relations:
                    rel_class = relation_to_class(rel)
                    head_label_map[row.head].add(rel_class)
            else:
                unrecognized_rels.add(row.relation)
        else:
            head_label_map[row.head].add(rel_class)

for head, labels in head_label_map.items():
    final_label = [1 if label in labels else 0 for label in range(3)]
    test_ood_samples.append((head, final_label))

In [78]:
len(test_ood_samples)

41829

In [22]:
unrecognized_rels

{'CreatedBy', 'InstanceOf'}

In [79]:
test_ood_df = pd.DataFrame(test_ood_samples, columns=['text', 'label'])

In [80]:
test_ood_df.head()

Unnamed: 0,text,label
0,curator,"[1, 1, 1]"
1,foyer,"[1, 1, 1]"
2,yolk,"[1, 1, 0]"
3,fade,"[1, 1, 1]"
4,pave,"[1, 1, 1]"


In [81]:
test_ood_preds = heuristic_model.predict(test_ood_df)

100%|██████████| 41829/41829 [01:44<00:00, 401.65it/s]


In [82]:
report_metrics(torch.tensor(test_ood_preds, dtype=float), torch.tensor(np.asarray(test_ood_df.label.to_list())))

Accuracy=0.658, precision=0.785, recall=0.535, f1=0.604


In [93]:
test_ood_df_report = get_class_dist_report(test_ood_df)

In [94]:
test_ood_df_report

{('class_0', 0): 0.012025150015539459,
 ('class_0', 'class_0', 0, 0): 0.012025150015539459,
 ('class_0', 'class_0', 0, 1): 0.0,
 ('class_0', 1): 0.9879748499844605,
 ('class_0', 'class_0', 1, 0): 0.0,
 ('class_0', 'class_0', 1, 1): 0.9879748499844605,
 ('class_0', 'class_1', 0, 0): 0.004016352291472423,
 ('class_0', 'class_1', 0, 1): 0.008008797724067035,
 ('class_0', 'class_1', 1, 0): 0.35998948098209377,
 ('class_0', 'class_1', 1, 1): 0.6279853690023668,
 ('class_0', 'class_2', 0, 0): 0.007889263429677974,
 ('class_0', 'class_2', 0, 1): 0.0041358865858614835,
 ('class_0', 'class_2', 1, 0): 0.8271534103134189,
 ('class_0', 'class_2', 1, 1): 0.16082143967104162,
 ('class_1', 0): 0.36400583327356617,
 ('class_1', 'class_0', 0, 0): 0.004016352291472423,
 ('class_1', 'class_0', 0, 1): 0.35998948098209377,
 ('class_1', 1): 0.6359941667264338,
 ('class_1', 'class_0', 1, 0): 0.008008797724067035,
 ('class_1', 'class_0', 1, 1): 0.6279853690023668,
 ('class_1', 'class_1', 0, 0): 0.364005833273

In [25]:
test_ood_df = explode_labels(test_ood_df)

In [26]:
test_ood_df.label_0.value_counts(), test_ood_df.label_1.value_counts(), test_ood_df.label_2.value_counts()

(1    41326
 0      503
 Name: label_0, dtype: int64,
 1    26603
 0    15226
 Name: label_1, dtype: int64,
 0    34929
 1     6900
 Name: label_2, dtype: int64)

In [43]:
from kogito.core.processors.relation import SWEMRelationClassifier
import numpy as np
import torch
from torch.nn.utils.rnn import pad_sequence
import spacy
from relation_modeling_utils import HeadDataset
from torch.utils.data import DataLoader

nlp = spacy.load("en_core_web_sm")
vocab = np.load(
    "./data/vocab_glove_100d.npy", allow_pickle=True
).item()

swem_classifier = SWEMRelationClassifier(pooling="avg")
swem_classifier.load_state_dict(
    torch.load(
        "./models/swem_multi_label_finetune_state_dict.pth"
    )
)

<All keys matched successfully>

In [45]:
# swem_test_data = HeadDataset(test_ood_df, vocab=vocab)
swem_test_data = HeadDataset(test_df, vocab=vocab)
swem_test_dataloader = DataLoader(swem_test_data, batch_size=len(swem_test_data))

In [46]:
with torch.no_grad():
    swem_X, swem_y = next(iter(swem_test_dataloader))
    swem_preds = swem_classifier.forward(swem_X)

In [48]:
swem_y

tensor([[0, 1, 1],
        [0, 1, 1],
        [0, 1, 1],
        ...,
        [1, 0, 0],
        [1, 0, 0],
        [1, 0, 0]])

In [49]:
report_metrics(swem_preds, swem_y)

Accuracy=0.860, precision=0.829, recall=0.961, f1=0.878


In [9]:
from torch import nn
import torch.nn.functional as F
from transformers import DistilBertModel, DistilBertTokenizer
import pytorch_lightning as pl
from torch.utils.data import Dataset
import torchmetrics

class DistilBertHeadDataset(Dataset):
    def __init__(self, df):
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.labels = np.asarray(df['label'].to_list())
        self.texts = [self.tokenizer(text, padding='max_length', max_length=32, truncation=True,
                                     return_tensors="pt") for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]


class DistilBERTClassifier(pl.LightningModule):
    def __init__(self, num_classes=3, dropout=0.5, freeze_emb=False):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, num_classes)

        if freeze_emb:
            for parameter in self.distilbert.parameters():
                parameter.requires_grad = False
            self.classifier = nn.Sequential(self.linear)
        else:
            self.classifier = nn.Sequential(self.linear)
        self.test_accuracy = torchmetrics.Accuracy()
        self.test_precision = torchmetrics.Precision(num_classes=3, average='weighted')
        self.test_recall = torchmetrics.Recall(num_classes=3, average='weighted')
        self.test_f1 = torchmetrics.F1Score(num_classes=3, average='weighted')
    
    def forward(self, input_ids, mask):
        outputs = self.distilbert(input_ids=input_ids, attention_mask=mask, return_dict=False)
        outputs = self.classifier(outputs[0][:, 0, :])
        return outputs

    def test_step(self, batch, batch_idx):
        X, y = batch
        mask = X['attention_mask']
        input_ids = X['input_ids'].squeeze(1)
        outputs = self.forward(input_ids, mask)
        probs = F.sigmoid(outputs)
        self.test_accuracy(probs, y)
        self.test_precision(probs, y)
        self.test_recall(probs, y)
        self.test_f1(probs, y)
        return probs
    
    def test_epoch_end(self, outputs):
        results = dict(accuracy=self.test_accuracy.compute(),
                    precision=self.test_precision.compute(),
                    recall=self.test_recall.compute(),
                    F1=self.test_f1.compute())
        self.log_dict(results)
        

In [10]:
distilbert_classifier = DistilBERTClassifier.load_from_checkpoint('./models/distilbert/distilbert_model_20220404H1852.ckpt')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: Error(s) in loading state_dict for DistilBERTClassifier:
	Missing key(s) in state_dict: "classifier.0.weight", "classifier.0.bias". 
	Unexpected key(s) in state_dict: "classifier.1.weight", "classifier.1.bias". 

In [38]:
dbert_test_data = DistilBertHeadDataset(test_ood_df)
dbert_test_dataloader = DataLoader(dbert_test_data, batch_size=128)

In [58]:
trainer = pl.Trainer(accelerator="gpu", devices=[0])
trainer.test(distilbert_classifier, dbert_test_dataloader)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Missing logger folder: /root/kogito/examples/relation_modeling/lightning_logs
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]



{'accuracy': tensor(0.7189, device='cuda:0'), 'precision': tensor(0.8397, device='cuda:0'), 'recall': tensor(0.5454, device='cuda:0'), 'F1': tensor(0.5650, device='cuda:0')}
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'F1': 0.5650182366371155,
 'accuracy': 0.7189190983772278,
 'precision': 0.8396950960159302,
 'recall': 0.5454303026199341}
--------------------------------------------------------------------------------


[{'accuracy': 0.7189190983772278,
  'precision': 0.8396950960159302,
  'recall': 0.5454303026199341,
  'F1': 0.5650182366371155}]

In [36]:
from torch import nn
import torch.nn.functional as F
from transformers import BertModel, BertTokenizer
import pytorch_lightning as pl
from torch.utils.data import Dataset, DataLoader
import torchmetrics
import numpy as np

class BertHeadDataset(Dataset):
    def __init__(self, df):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.labels = np.asarray(df['label'].to_list())
        self.texts = [self.tokenizer(text, padding='max_length', max_length=32, truncation=True,
                                     return_tensors="pt") for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]


class BERTClassifier(pl.LightningModule):
    def __init__(self, num_classes=3, dropout=0.5, learning_rate=1e-4, freeze_emb=False):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, num_classes)

        if freeze_emb:
            for parameter in self.bert.parameters():
                parameter.requires_grad = False
            self.classifier = nn.Sequential(self.linear)
        else:
            self.classifier = nn.Sequential(self.dropout, self.linear)
        self.test_accuracy = torchmetrics.Accuracy()
        self.test_precision = torchmetrics.Precision(num_classes=3, average='weighted')
        self.test_recall = torchmetrics.Recall(num_classes=3, average='weighted')
        self.test_f1 = torchmetrics.F1Score(num_classes=3, average='weighted')
    
    def forward(self, input_ids, mask):
        _, outputs = self.bert(input_ids=input_ids, attention_mask=mask, return_dict=False)
        outputs = self.classifier(outputs)
        return outputs

    def test_step(self, batch, batch_idx):
        X, y = batch
        mask = X['attention_mask']
        input_ids = X['input_ids'].squeeze(1)
        outputs = self.forward(input_ids, mask)
        probs = F.sigmoid(outputs)
        self.test_accuracy(probs, y)
        self.test_precision(probs, y)
        self.test_recall(probs, y)
        self.test_f1(probs, y)
        return probs
    
    def test_epoch_end(self, outputs):
        results = dict(accuracy=self.test_accuracy.compute(),
                    precision=self.test_precision.compute(),
                    recall=self.test_recall.compute(),
                    F1=self.test_f1.compute())
        self.log_dict(results)

In [32]:
bert_classifier = BERTClassifier.load_from_checkpoint('./models/bert/bert_model_20220404H1850.ckpt')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [38]:
bert_test_data = BertHeadDataset(test_ood_df)
bert_test_dataloader = DataLoader(bert_test_data, batch_size=32)

In [39]:
bert_trainer = pl.Trainer(accelerator="gpu", devices=[0])
bert_trainer.test(bert_classifier, bert_test_dataloader)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]



--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'F1': 0.545831024646759,
 'accuracy': 0.7217401266098022,
 'precision': 0.7522579431533813,
 'recall': 0.5436528325080872}
--------------------------------------------------------------------------------


[{'accuracy': 0.7217401266098022,
  'precision': 0.7522579431533813,
  'recall': 0.5436528325080872,
  'F1': 0.545831024646759}]