In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import einops
import fancy_einsum
from tqdm import tqdm
import re
from sklearn.metrics import roc_curve, auc
import transformer_lens.utils as utils
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go

from sparse_autoencoder import SparseAutoencoder

from transformer_lens import HookedTransformer, HookedTransformerConfig

import transformer_lens.utils as utils
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def imshow(tensor, **kwargs):
    px.imshow(
        utils.to_numpy(tensor),
        color_continuous_midpoint=0.0,
        color_continuous_scale="RdBu",
        **kwargs,
    ).show()


def line(tensor, **kwargs):
    px.line(
        y=utils.to_numpy(tensor),
        **kwargs,
    ).show()


def scatter(x, y, xaxis="", yaxis="", caxis="", **kwargs):
    x = utils.to_numpy(x)
    y = utils.to_numpy(y)
    px.scatter(
        y=y,
        x=x,
        labels={"x": xaxis, "y": yaxis, "color": caxis},
        **kwargs,
    ).show()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### FUNCTION DEFINITIONS ###

# Loss function is MSE (reconstruction loss) + L1 norm of the learned activations + similarity loss
def loss_fn(decoded_activations, learned_activations, resid_streams, resid_labels, lambda_=0.01, alpha_=0.5, verbose=False):

    # RECONSTRUCTION LOSS
    recon_loss = F.mse_loss(decoded_activations, resid_streams)

    # SPARSITY LOSS
    learned_activations_flat = einops.rearrange(learned_activations, 'b s n -> (b s) n')
    sparsity_loss = torch.mean(torch.norm(learned_activations_flat, p=1, dim=1))

    # SIMILARITY LOSS
    # Pos and neg - pos is where resid_labels == 1, neg is where resid_labels == 0
    if alpha_ > 0:
        learned_activations_pos = learned_activations[resid_labels == 1, :, :]
        learned_activations_neg = learned_activations[resid_labels == 0, :, :]
        # Currently (N, S, D) and (M, S, D) -> need to be (D, S, N) and (D, S, M)
        learned_activations_pos = einops.rearrange(learned_activations_pos, 'n s d -> d s n')
        learned_activations_neg = einops.rearrange(learned_activations_neg, 'n s d -> d s n')
        pos_sim_loss = calculate_similarity_loss(learned_activations_pos, learned_activations_neg, verbose=verbose)
    else: 
        pos_sim_loss = torch.tensor(0.0)

    # combine
    return recon_loss + (lambda_ * sparsity_loss) + (alpha_ * pos_sim_loss), recon_loss, sparsity_loss, pos_sim_loss


def train(model, n_epochs, optimizer, train_streams, eval_streams, lambda_=0.01, alpha_=0.5, verbose=False):
    for epoch in tqdm(range(n_epochs)):
        model.train()
        optimizer.zero_grad()
        learned_activations, decoded_activations = model(train_streams)
        loss, recon_loss, sparsity_loss, pos_sim_loss = loss_fn(decoded_activations, learned_activations, train_streams, 
                                                                train_labels, lambda_=lambda_, alpha_=alpha_)
        loss.backward()
        optimizer.step()
        if epoch % (n_epochs // 10) == 0:
            model.eval()
            with torch.no_grad():
                eval_learned_activations, eval_decoded_activations = model(eval_streams)
                eval_loss, _, _, eval_pos_sim_loss = loss_fn(eval_decoded_activations, eval_learned_activations,
                                                             eval_streams, eval_labels, lambda_=lambda_, alpha_=alpha_, verbose=verbose)
                print(f"Train loss = {loss.item():.4f}, Eval loss = {eval_loss.item():.4f}")
    return model

def feature_string_to_head_and_layer(feature_index, head_labels):

    extraction = head_labels[feature_index]

    if 'mlp' in extraction.lower(): 
        layer = int(extraction.split('_')[0])
        head = 12
        return layer, head

    # Get head and layer e.g. 'L0H1' -> (0, 1)
    # Layer is everything after L and before H
    layer = int(re.findall(r'L(\d+)H', extraction)[0])
    # Head is everything after H
    head = int(re.findall(r'H(\d+)', extraction)[0])

    return layer, head

def gen_array_template(head_labels):

    # Plot the ground truth (head, layer) pairs (1 if in ground truth, 0 otherwise)
    heads = []
    layers = []
    for i, l in enumerate(head_labels):
        layer, head = feature_string_to_head_and_layer(i, head_labels)
        heads.append(head)
        layers.append(layer)

    heads = list(set(heads))
    layers = list(set(layers))

    return np.zeros((len(layers), len(heads)))

def softmax(x, axis):
    """Return the softmax of x (if x is a vector) or the softmax of each row (if x is a matrix)"""
    e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e_x / e_x.sum(axis=axis, keepdims=True)

def gen_softmaxed_unique_to_pos(all_indices, ground_truth_array, head_labels, normalise=False, across_layer=False):
    # Negative and positive indices
    halfway = all_indices.shape[0] // 2
    positive_indices = all_indices[:halfway, :]
    negative_indices = all_indices[halfway:, :]

    unique_to_positive_array = gen_array_template(head_labels)
    unique_to_negative_array = gen_array_template(head_labels)

    for i in range(len(head_labels)):
        # Calculate head and layer
        layer, head = feature_string_to_head_and_layer(i, head_labels)

        positive = set(positive_indices[:, i].tolist())
        negative = set(negative_indices[:, i].tolist())
        total_unique = positive.union(negative)

        # In positive but not negative
        unique_to_positive = list(positive - negative)
        # In negative but not positive
        unique_to_negative = list(negative - positive)

        if normalise:
            # Normalise by total number of unique indices
            unique_to_positive_array[layer, head] = len(unique_to_positive) / len(total_unique)
            unique_to_negative_array[layer, head] = len(unique_to_negative) / len(total_unique)
        
        else:
            # Set the values
            unique_to_positive_array[layer, head] = len(unique_to_positive)
            unique_to_negative_array[layer, head] = len(unique_to_negative)

    # If across_layer, we apply softmax to the rows (layers) of the array
    if across_layer:
        unique_to_positive_array = softmax(unique_to_positive_array, axis=0)
        unique_to_negative_array = softmax(unique_to_negative_array, axis=0)

    # Plot the ROC curve in plotly
    y_true = ground_truth_array.flatten()
    y_pred = unique_to_positive_array.flatten()

    # Normalise y_pred with softmax
    if not across_layer:
        y_pred = softmax(y_pred, axis=0)

    fpr, tpr, thresholds = roc_curve(y_true, y_pred)

    # Calculate ROC AUC
    roc_auc = auc(fpr, tpr)

    # Calculate F1
    f1 = 2 * (tpr * (1 - fpr)) / (tpr + (1 - fpr))

    return y_true, y_pred, fpr, tpr, roc_auc, f1, thresholds
    

# def gen_softmaxed_unique_to_pos(all_indices, ground_truth_array, head_labels, normalise=False):
#     # Negative and positive indices
#     positive_indices = all_indices[:250, :]
#     negative_indices = all_indices[250:, :]

#     unique_to_positive_array = gen_array_template(head_labels)
#     unique_to_negative_array = gen_array_template(head_labels)

#     for i in range(len(head_labels)):
#         # Calculate head and layer
#         layer, head = feature_string_to_head_and_layer(i, head_labels)

#         positive = set(positive_indices[:, i].tolist())
#         negative = set(negative_indices[:, i].tolist())
#         total_unique = positive.union(negative)

#         # In positive but not negative
#         unique_to_positive = list(positive - negative)
#         # In negative but not positive
#         unique_to_negative = list(negative - positive)

#         if normalise:
#             # Normalise by total number of unique indices
#             unique_to_positive_array[layer, head] = len(unique_to_positive) / len(total_unique)
#             unique_to_negative_array[layer, head] = len(unique_to_negative) / len(total_unique)
        
#         else:
#             # Set the values
#             unique_to_positive_array[layer, head] = len(unique_to_positive)
#             unique_to_negative_array[layer, head] = len(unique_to_negative)

#     # Plot the ROC curve in plotly
#     y_true = ground_truth_array.flatten()
#     y_pred = unique_to_positive_array.flatten()

#     # Normalise y_pred with softmax
#     def softmax(x): return np.exp(x) / np.sum(np.exp(x), axis=0)
#     y_pred = softmax(y_pred)

#     fpr, tpr, thresholds = roc_curve(y_true, y_pred)

#     # Calculate ROC AUC
#     roc_auc = auc(fpr, tpr)

#     # Calculate F1
#     f1 = 2 * (tpr * (1 - fpr)) / (tpr + (1 - fpr))

#     return y_true, y_pred, fpr, tpr, roc_auc, f1, thresholds

def gen_co_occurrence_matrix(all_indices, n_heads, n_feat):
    co_occurrence_matrix = np.zeros((n_heads, n_heads, n_feat, n_feat))

    for e in range(all_indices.shape[0]):  # For each example
        for h1 in range(n_heads):  # For each head
            c1 = all_indices[e, h1]  # Code in head h1
            for h2 in range(n_heads):  # For each other head
                if h1 != h2:  # Skip counting co-occurrence of a head with itself
                    c2 = all_indices[e, h2]  # Code in head h2
                    # Increment co-occurrence count for (h1, h2)
                    co_occurrence_matrix[h1, h2, c1, c2] += 1

    return co_occurrence_matrix

def normalize_co_occurrence_matrix(co_occurrence_matrix):
    # Assuming co_occurrence_matrix is of shape (n_heads, n_heads, n_feat, n_feat)
    n_heads, _, n_feat, _ = co_occurrence_matrix.shape
    normalized_matrix = np.zeros_like(co_occurrence_matrix)

    for h1 in range(n_heads):
        for h2 in range(n_heads):
            if h1 != h2:  # Skip self co-occurrences
                total_co_occurrences = np.sum(co_occurrence_matrix[h1, h2, :, :])
                if total_co_occurrences > 0:  # Avoid division by zero
                    normalized_matrix[h1, h2, :, :] = co_occurrence_matrix[h1, h2, :, :] / total_co_occurrences

    return normalized_matrix

def unique_co_occurrences(positive_matrix, negative_matrix, normalise=True):
    # Normalize matrices
    if normalise:
        positive_matrix = normalize_co_occurrence_matrix(positive_matrix)
        negative_matrix = normalize_co_occurrence_matrix(negative_matrix)

    n_heads, _, n_feat, _ = positive_matrix.shape
    unique_co_occurrence_counts = np.zeros((n_heads, n_heads))
    
    for h1 in range(n_heads):
        for h2 in range(n_heads):
            if h1 != h2:  # Skip self co-occurrences
                # Find co-occurrences in positive not present in negative
                unique_positives = positive_matrix[h1, h2, :, :] > 0
                negatives = negative_matrix[h1, h2, :, :] > 0
                # Boolean array of unique positives
                unique = unique_positives & ~negatives
                if normalise:
                    # Normalize count by total co-occurrences for this head pair in positive matrix
                    total_co_occurrences = np.sum(positive_matrix[h1, h2, :, :] > 0) + np.sum(negative_matrix[h1, h2, :, :] > 0)
                    if total_co_occurrences > 0:  # Avoid division by zero
                        unique_count_normalized = np.sum(unique) / total_co_occurrences
                    else:
                        unique_count_normalized = 0
                    # Set normalized unique counts for this head pair
                    unique_co_occurrence_counts[h1, h2] = unique_count_normalized
                else:
                    # Count unique co-occurrences
                    unique_co_occurrence_counts[h1, h2] = np.sum(unique)

    return unique_co_occurrence_counts

def pairwise_cosine_similarities(pos_examples, neg_examples, eps=1e-12):
    """
    pos_examples = (D, S, N)
    neg_examples = (D, S, M)

    Calculate the average cosine similarity for vectors at the same sequence
    position in pos_examples and neg_examples, vectorized.
    """

    # Reshape tensors for dot product computation
    pos_examples_perm = pos_examples.permute(1, 2, 0)  # Change to shape (S, N, D) for batch processing
    neg_examples_perm = neg_examples.permute(1, 0, 2)  # Change to shape (S, D, M) for correct dot product

    # Compute dot products. Now, using einsum for clarity and correctness
    dot_products = torch.einsum('snd,sdm->snm', pos_examples_perm, neg_examples_perm)

    # Calculate magnitudes for normalization
    magnitude_p = torch.sqrt(torch.einsum('snd,snd->sn', pos_examples_perm, pos_examples_perm) + eps).unsqueeze(-1)
    magnitude_n = torch.sqrt(torch.einsum('sdm,sdm->sm', neg_examples_perm, neg_examples_perm) + eps).unsqueeze(-2)

    # Calculate cosine similarities
    cosine_similarities = dot_products / (magnitude_p * magnitude_n + eps)

    # Average the cosine similarities for each position across all N, M pairs
    average_cosine_similarities_per_position = torch.mean(cosine_similarities, dim=(1, 2))

    # Finally, average these across all sequence positions
    final_scalar = torch.mean(average_cosine_similarities_per_position)

    return final_scalar

def calculate_similarity_loss(pos_examples, neg_examples, eps=1e-12, delta=1.0, verbose=False):

    # Positive-negative
    pos_neg_scalar = pairwise_cosine_similarities(pos_examples, neg_examples, eps)
    if verbose: print(f"Pos-neg loss = {pos_neg_scalar.item():.4f}")

    # Positive-positive
    pos_pos_scalar = pairwise_cosine_similarities(pos_examples, pos_examples, eps)
    if verbose: print(f"Pos-pos loss = {pos_pos_scalar.item():.4f}")
    
    return pos_neg_scalar + (delta - (pos_pos_scalar))

def calculate_f1_score(y_true, y_pred):
    # Calculate True Positives (TP)
    TP = np.sum((y_true == 1) & (y_pred == 1))
    
    # Calculate False Positives (FP)
    FP = np.sum((y_true == 0) & (y_pred == 1))
    
    # Calculate False Negatives (FN)
    FN = np.sum((y_true == 1) & (y_pred == 0))

    # Calculate Treu Negatives (TN)
    TN = np.sum((y_true == 0) & (y_pred == 0))
    
    # Calculate Precision and Recall
    Precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    Recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    
    # Calculate F1 Score
    F1 = 2 * (Precision * Recall) / (Precision + Recall) if (Precision + Recall) > 0 else 0
    
    return F1, Precision, Recall, TP, FP, TN, FN

In [4]:
results_overall = {}
tasks = ['gt', 'ioi', 'ds']

for task in tasks:
    results_task = {}
    num_train_examples = [5, 10, 25, 50, 100, 250]

    for num_examples in num_train_examples:
        trial_aucs = []
        for _ in range(5):
            print(f"Doing {num_examples} examples...")

            task = 'ioi'
            task_type = 'node'
            assert task_type in ['node', 'edge'], "Type must be either 'node' or 'edge'"
            print(f"Type: {task_type}")
            task_mappings = {
                'gt': 'Greater-than',
                'ioi': 'Indirect Object Identification',
                'ds': 'Docstring',
                'induction': 'Induction',
            }

            print(f"Task: {task_mappings[task]}")
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            print(f"Using device: {device}")

            num_unique = 300
            n_epochs = 250


            roc_results = []

            # Load residual streams
            resid_streams = torch.load(f"../data/{task}/resid_heads_mean.pt").to(device)
            head_labels = torch.load(f'../data/{task}/labels_heads_mean.pt')
            ground_truth = torch.load(f'../data/{task}/ground_truth.pt')
            print(ground_truth)


            # Shuffle and create the labels
            labels = torch.ones(resid_streams.shape[0]//2) # BIG ASSUMPTION: assumes first half is positive and second half is negative
            labels = torch.cat((labels, torch.zeros_like(labels)))
            permutation = torch.randperm(resid_streams.shape[0])
            resid_shuffled = resid_streams[permutation, :, :]
            labels_shuffled = labels[permutation]
            cutoff = num_examples #int(resid_shuffled.shape[0] * 0.8)
            train_streams = resid_shuffled[:cutoff, :, :].to(device)
            train_labels = labels_shuffled[:cutoff].to(device)
            eval_streams = resid_shuffled[cutoff:, :, :].to(device)
            eval_labels = labels_shuffled[cutoff:].to(device)


            model = SparseAutoencoder(n_input_features=resid_streams.shape[-1], n_learned_features=num_unique, geometric_median_dataset=None).to(device)

            optimizer = optim.Adam(model.parameters(), lr=0.001)

            model = train(model, n_epochs, optimizer, train_streams, eval_streams, lambda_=0.02, alpha_=0.0)
            model = model.to('cpu')
            resid_streams = resid_streams.to('cpu')
            # Save model
            torch.save(model, f'../models/{task}/sparse_autoencoder.pt')

            heads = []
            layers = []
            for i, l in enumerate(head_labels):
                layer, head = feature_string_to_head_and_layer(i, head_labels)
                heads.append(head)
                layers.append(layer)

            heads = list(set(heads))
            layers = list(set(layers))

            ground_truth_array = np.zeros((len(layers), len(heads)))
            for layer, head in ground_truth:
                ground_truth_array[layer, head] = 1

            normalise = False# if task == 'ds' else True

            # Plot the ground truth (head, layer) pairs (1 if in ground truth, 0 otherwise)

            model.eval()
            learned_activations = model(resid_streams)[0].detach().cpu().numpy()
            all_indices = np.argmax(learned_activations, axis=2)

            print(f"\n\nNormalise: {normalise}")
            y_true, y_pred, fpr, tpr, node_roc_auc, f1, thresholds = gen_softmaxed_unique_to_pos(all_indices, ground_truth_array, head_labels, normalise=normalise)

            # Print best f1 score (and corresponding threshold)
            node_best_f1 = np.max(f1)
            best_threshold = thresholds[np.argmax(f1)]
            print(f"Best F1 score: {node_best_f1:.4f}")

            # Print ROC AUC
            print(f"ROC AUC: {node_roc_auc:.4f}\n\n")

            trial_aucs.append(node_roc_auc)
        
        
        results_task[num_examples] = trial_aucs

    results_overall[task] = results_task

# Save to JSON
import json

with open('../results/sparse_autoencoder_results.json', 'w') as f:
    json.dump(results_overall, f)

Doing 5 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  7%|▋         | 17/250 [00:00<00:03, 60.08it/s]

Train loss = 1.1388, Eval loss = 1.0191


 16%|█▌        | 40/250 [00:00<00:02, 72.32it/s]

Train loss = 0.2864, Eval loss = 0.2848


 24%|██▎       | 59/250 [00:01<00:03, 59.95it/s]

Train loss = 0.1906, Eval loss = 0.1927


 39%|███▉      | 97/250 [00:01<00:01, 87.67it/s]

Train loss = 0.1614, Eval loss = 0.1639


 48%|████▊     | 121/250 [00:01<00:01, 88.77it/s]

Train loss = 0.1442, Eval loss = 0.1472


 58%|█████▊    | 146/250 [00:01<00:01, 92.26it/s]

Train loss = 0.1320, Eval loss = 0.1352


 69%|██████▉   | 172/250 [00:02<00:00, 95.22it/s]

Train loss = 0.1225, Eval loss = 0.1260


 79%|███████▉  | 198/250 [00:02<00:00, 95.43it/s]

Train loss = 0.1150, Eval loss = 0.1186


 84%|████████▎ | 209/250 [00:02<00:00, 83.48it/s]

Train loss = 0.1084, Eval loss = 0.1123


100%|██████████| 250/250 [00:03<00:00, 77.41it/s]

Train loss = 0.1028, Eval loss = 0.1068







Normalise: False
Best F1 score: 0.7421
ROC AUC: 0.8259


Doing 5 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  0%|          | 1/250 [00:00<00:36,  6.86it/s]

Train loss = 1.2060, Eval loss = 1.0624


 17%|█▋        | 42/250 [00:00<00:02, 96.90it/s]

Train loss = 0.2926, Eval loss = 0.2900


 28%|██▊       | 70/250 [00:00<00:01, 106.02it/s]

Train loss = 0.1917, Eval loss = 0.1951


 40%|████      | 100/250 [00:01<00:01, 113.00it/s]

Train loss = 0.1618, Eval loss = 0.1654


 45%|████▌     | 113/250 [00:01<00:01, 98.08it/s] 

Train loss = 0.1446, Eval loss = 0.1482


 57%|█████▋    | 143/250 [00:01<00:01, 106.00it/s]

Train loss = 0.1322, Eval loss = 0.1357


 67%|██████▋   | 167/250 [00:01<00:00, 96.90it/s] 

Train loss = 0.1240, Eval loss = 0.1277


 77%|███████▋  | 192/250 [00:02<00:00, 83.31it/s]

Train loss = 0.1179, Eval loss = 0.1217


 88%|████████▊ | 220/250 [00:02<00:00, 96.99it/s]

Train loss = 0.1125, Eval loss = 0.1164


100%|██████████| 250/250 [00:02<00:00, 90.45it/s]

Train loss = 0.1073, Eval loss = 0.1114







Normalise: False
Best F1 score: 0.7368
ROC AUC: 0.8437


Doing 5 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  0%|          | 1/250 [00:00<00:29,  8.49it/s]

Train loss = 1.1792, Eval loss = 1.0463


 18%|█▊        | 46/250 [00:00<00:01, 104.91it/s]

Train loss = 0.3190, Eval loss = 0.3171


 30%|██▉       | 74/250 [00:00<00:01, 106.61it/s]

Train loss = 0.1979, Eval loss = 0.1983


 34%|███▍      | 86/250 [00:00<00:01, 90.94it/s] 

Train loss = 0.1656, Eval loss = 0.1666


 47%|████▋     | 118/250 [00:01<00:01, 104.69it/s]

Train loss = 0.1472, Eval loss = 0.1484


 59%|█████▉    | 147/250 [00:01<00:00, 108.98it/s]

Train loss = 0.1343, Eval loss = 0.1356


 64%|██████▍   | 160/250 [00:01<00:00, 95.58it/s] 

Train loss = 0.1247, Eval loss = 0.1262


 77%|███████▋  | 193/250 [00:01<00:00, 107.18it/s]

Train loss = 0.1179, Eval loss = 0.1195


 89%|████████▉ | 223/250 [00:02<00:00, 110.51it/s]

Train loss = 0.1124, Eval loss = 0.1140


100%|██████████| 250/250 [00:02<00:00, 97.65it/s] 

Train loss = 0.1076, Eval loss = 0.1093







Normalise: False
Best F1 score: 0.7529
ROC AUC: 0.8331


Doing 5 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  6%|▌         | 15/250 [00:00<00:03, 68.60it/s]

Train loss = 1.1683, Eval loss = 1.0479


 18%|█▊        | 44/250 [00:00<00:02, 101.32it/s]

Train loss = 0.2839, Eval loss = 0.2939


 28%|██▊       | 71/250 [00:00<00:01, 103.49it/s]

Train loss = 0.1903, Eval loss = 0.2004


 40%|████      | 100/250 [00:01<00:01, 110.56it/s]

Train loss = 0.1577, Eval loss = 0.1659


 45%|████▌     | 113/250 [00:01<00:01, 94.82it/s] 

Train loss = 0.1398, Eval loss = 0.1471


 57%|█████▋    | 142/250 [00:01<00:01, 99.49it/s]

Train loss = 0.1276, Eval loss = 0.1345


 69%|██████▉   | 172/250 [00:01<00:00, 108.97it/s]

Train loss = 0.1188, Eval loss = 0.1252


 74%|███████▍  | 185/250 [00:02<00:00, 92.99it/s] 

Train loss = 0.1115, Eval loss = 0.1177


 88%|████████▊ | 219/250 [00:02<00:00, 105.42it/s]

Train loss = 0.1064, Eval loss = 0.1125


100%|██████████| 250/250 [00:02<00:00, 94.41it/s] 

Train loss = 0.1025, Eval loss = 0.1085







Normalise: False
Best F1 score: 0.7408
ROC AUC: 0.8318


Doing 5 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  0%|          | 1/250 [00:00<00:26,  9.46it/s]

Train loss = 1.2775, Eval loss = 1.1422


 18%|█▊        | 45/250 [00:00<00:01, 105.64it/s]

Train loss = 0.3268, Eval loss = 0.3309


 29%|██▉       | 72/250 [00:00<00:01, 90.25it/s] 

Train loss = 0.2009, Eval loss = 0.2025


 40%|████      | 100/250 [00:01<00:01, 102.03it/s]

Train loss = 0.1618, Eval loss = 0.1637


 45%|████▍     | 112/250 [00:01<00:01, 90.29it/s] 

Train loss = 0.1431, Eval loss = 0.1453


 57%|█████▋    | 143/250 [00:01<00:01, 103.29it/s]

Train loss = 0.1307, Eval loss = 0.1332


 69%|██████▉   | 172/250 [00:01<00:00, 108.89it/s]

Train loss = 0.1217, Eval loss = 0.1244


 80%|███████▉  | 199/250 [00:02<00:00, 100.18it/s]

Train loss = 0.1153, Eval loss = 0.1182


 84%|████████▍ | 211/250 [00:02<00:00, 88.31it/s] 

Train loss = 0.1099, Eval loss = 0.1129


100%|██████████| 250/250 [00:02<00:00, 94.01it/s] 

Train loss = 0.1054, Eval loss = 0.1086







Normalise: False
Best F1 score: 0.7874
ROC AUC: 0.8600


Doing 10 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  0%|          | 1/250 [00:00<00:26,  9.33it/s]

Train loss = 1.2643, Eval loss = 1.0777


 17%|█▋        | 43/250 [00:00<00:02, 77.80it/s]

Train loss = 0.3130, Eval loss = 0.2934


 25%|██▍       | 62/250 [00:00<00:02, 72.86it/s]

Train loss = 0.2055, Eval loss = 0.1975


 37%|███▋      | 92/250 [00:01<00:02, 74.70it/s]

Train loss = 0.1727, Eval loss = 0.1670


 45%|████▍     | 112/250 [00:01<00:01, 73.80it/s]

Train loss = 0.1512, Eval loss = 0.1473


 57%|█████▋    | 143/250 [00:01<00:01, 77.47it/s]

Train loss = 0.1370, Eval loss = 0.1345


 61%|██████    | 152/250 [00:02<00:01, 62.57it/s]

Train loss = 0.1272, Eval loss = 0.1256


 75%|███████▍  | 187/250 [00:02<00:00, 66.43it/s]

Train loss = 0.1189, Eval loss = 0.1180


 87%|████████▋ | 218/250 [00:03<00:00, 73.81it/s]

Train loss = 0.1118, Eval loss = 0.1113


 95%|█████████▌| 238/250 [00:03<00:00, 73.12it/s]

Train loss = 0.1064, Eval loss = 0.1062


100%|██████████| 250/250 [00:03<00:00, 69.56it/s]




Normalise: False
Best F1 score: 0.7615
ROC AUC: 0.8193


Doing 10 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  0%|          | 1/250 [00:00<00:30,  8.27it/s]

Train loss = 1.2063, Eval loss = 1.0529


 17%|█▋        | 42/250 [00:00<00:02, 75.16it/s]

Train loss = 0.3134, Eval loss = 0.3033


 24%|██▍       | 61/250 [00:00<00:02, 71.26it/s]

Train loss = 0.1942, Eval loss = 0.1926


 36%|███▌      | 90/250 [00:01<00:02, 74.62it/s]

Train loss = 0.1565, Eval loss = 0.1564


 45%|████▍     | 112/250 [00:01<00:01, 74.13it/s]

Train loss = 0.1377, Eval loss = 0.1382


 57%|█████▋    | 142/250 [00:02<00:01, 75.93it/s]

Train loss = 0.1259, Eval loss = 0.1268


 64%|██████▍   | 161/250 [00:02<00:01, 73.23it/s]

Train loss = 0.1164, Eval loss = 0.1177


 77%|███████▋  | 192/250 [00:02<00:00, 74.15it/s]

Train loss = 0.1093, Eval loss = 0.1108


 85%|████████▍ | 212/250 [00:03<00:00, 73.72it/s]

Train loss = 0.1043, Eval loss = 0.1060


 97%|█████████▋| 242/250 [00:03<00:00, 75.41it/s]

Train loss = 0.1004, Eval loss = 0.1022


100%|██████████| 250/250 [00:03<00:00, 69.79it/s]




Normalise: False
Best F1 score: 0.7662
ROC AUC: 0.8044


Doing 10 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  0%|          | 1/250 [00:00<00:25,  9.72it/s]

Train loss = 1.2003, Eval loss = 1.0778


 17%|█▋        | 43/250 [00:00<00:02, 77.89it/s]

Train loss = 0.3095, Eval loss = 0.3143


 25%|██▌       | 63/250 [00:00<00:02, 75.11it/s]

Train loss = 0.1915, Eval loss = 0.1943


 38%|███▊      | 95/250 [00:01<00:01, 79.72it/s]

Train loss = 0.1573, Eval loss = 0.1600


 46%|████▌     | 115/250 [00:01<00:01, 75.46it/s]

Train loss = 0.1392, Eval loss = 0.1415


 54%|█████▍    | 136/250 [00:01<00:01, 72.62it/s]

Train loss = 0.1279, Eval loss = 0.1301


 66%|██████▋   | 166/250 [00:02<00:01, 75.82it/s]

Train loss = 0.1202, Eval loss = 0.1223


 75%|███████▍  | 187/250 [00:02<00:00, 75.08it/s]

Train loss = 0.1142, Eval loss = 0.1162


 88%|████████▊ | 220/250 [00:03<00:00, 79.05it/s]

Train loss = 0.1088, Eval loss = 0.1108


 96%|█████████▌| 240/250 [00:03<00:00, 76.16it/s]

Train loss = 0.1041, Eval loss = 0.1060


100%|██████████| 250/250 [00:03<00:00, 73.73it/s]




Normalise: False
Best F1 score: 0.7709
ROC AUC: 0.8427


Doing 10 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  0%|          | 1/250 [00:00<00:27,  9.17it/s]

Train loss = 1.1638, Eval loss = 1.0308


 17%|█▋        | 42/250 [00:00<00:02, 71.49it/s]

Train loss = 0.2919, Eval loss = 0.2855


 25%|██▍       | 62/250 [00:00<00:02, 71.56it/s]

Train loss = 0.1918, Eval loss = 0.1908


 37%|███▋      | 93/250 [00:01<00:02, 65.63it/s]

Train loss = 0.1606, Eval loss = 0.1604


 45%|████▌     | 113/250 [00:01<00:01, 69.07it/s]

Train loss = 0.1430, Eval loss = 0.1431


 58%|█████▊    | 144/250 [00:02<00:01, 73.39it/s]

Train loss = 0.1315, Eval loss = 0.1319


 66%|██████▌   | 164/250 [00:02<00:01, 74.21it/s]

Train loss = 0.1227, Eval loss = 0.1233


 78%|███████▊  | 195/250 [00:02<00:00, 78.26it/s]

Train loss = 0.1154, Eval loss = 0.1161


 86%|████████▌ | 215/250 [00:03<00:00, 72.17it/s]

Train loss = 0.1096, Eval loss = 0.1105


 95%|█████████▍| 237/250 [00:03<00:00, 73.88it/s]

Train loss = 0.1041, Eval loss = 0.1053


100%|██████████| 250/250 [00:03<00:00, 70.24it/s]




Normalise: False
Best F1 score: 0.7423
ROC AUC: 0.8330


Doing 10 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  0%|          | 1/250 [00:00<00:26,  9.49it/s]

Train loss = 1.1900, Eval loss = 1.0379


 18%|█▊        | 44/250 [00:00<00:02, 77.53it/s]

Train loss = 0.2973, Eval loss = 0.2860


 26%|██▌       | 64/250 [00:00<00:02, 75.13it/s]

Train loss = 0.1886, Eval loss = 0.1856


 37%|███▋      | 93/250 [00:01<00:02, 68.67it/s]

Train loss = 0.1566, Eval loss = 0.1547


 44%|████▍     | 111/250 [00:01<00:02, 68.47it/s]

Train loss = 0.1397, Eval loss = 0.1385


 56%|█████▋    | 141/250 [00:02<00:01, 72.01it/s]

Train loss = 0.1282, Eval loss = 0.1274


 65%|██████▍   | 162/250 [00:02<00:01, 72.93it/s]

Train loss = 0.1208, Eval loss = 0.1203


 77%|███████▋  | 193/250 [00:02<00:00, 77.00it/s]

Train loss = 0.1149, Eval loss = 0.1147


 85%|████████▌ | 213/250 [00:03<00:00, 76.00it/s]

Train loss = 0.1105, Eval loss = 0.1105


 98%|█████████▊| 245/250 [00:03<00:00, 78.54it/s]

Train loss = 0.1065, Eval loss = 0.1066


100%|██████████| 250/250 [00:03<00:00, 71.10it/s]




Normalise: False
Best F1 score: 0.7702
ROC AUC: 0.8103


Doing 25 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  0%|          | 1/250 [00:00<00:28,  8.59it/s]

Train loss = 1.1775, Eval loss = 1.0458


 14%|█▎        | 34/250 [00:00<00:04, 50.64it/s]

Train loss = 0.3007, Eval loss = 0.2978


 24%|██▍       | 61/250 [00:01<00:03, 52.27it/s]

Train loss = 0.1933, Eval loss = 0.1946


 36%|███▌      | 89/250 [00:01<00:03, 52.89it/s]

Train loss = 0.1600, Eval loss = 0.1616


 44%|████▎     | 109/250 [00:02<00:02, 52.00it/s]

Train loss = 0.1410, Eval loss = 0.1424


 55%|█████▍    | 137/250 [00:02<00:02, 52.22it/s]

Train loss = 0.1291, Eval loss = 0.1303


 65%|██████▍   | 162/250 [00:03<00:01, 47.85it/s]

Train loss = 0.1206, Eval loss = 0.1218


 75%|███████▌  | 188/250 [00:03<00:01, 48.70it/s]

Train loss = 0.1145, Eval loss = 0.1157


 83%|████████▎ | 208/250 [00:04<00:00, 50.34it/s]

Train loss = 0.1099, Eval loss = 0.1110


 94%|█████████▍| 236/250 [00:04<00:00, 51.49it/s]

Train loss = 0.1051, Eval loss = 0.1062


100%|██████████| 250/250 [00:04<00:00, 50.79it/s]




Normalise: False
Best F1 score: 0.7147
ROC AUC: 0.7696


Doing 25 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  0%|          | 1/250 [00:00<00:27,  9.15it/s]

Train loss = 1.1321, Eval loss = 1.0145


 14%|█▍        | 35/250 [00:00<00:04, 51.30it/s]

Train loss = 0.2926, Eval loss = 0.2887


 25%|██▍       | 62/250 [00:01<00:03, 51.99it/s]

Train loss = 0.1881, Eval loss = 0.1888


 33%|███▎      | 83/250 [00:01<00:03, 51.54it/s]

Train loss = 0.1548, Eval loss = 0.1558


 44%|████▍     | 111/250 [00:02<00:02, 52.42it/s]

Train loss = 0.1374, Eval loss = 0.1386


 56%|█████▌    | 139/250 [00:02<00:02, 52.99it/s]

Train loss = 0.1273, Eval loss = 0.1285


 64%|██████▍   | 160/250 [00:03<00:01, 52.05it/s]

Train loss = 0.1197, Eval loss = 0.1209


 75%|███████▌  | 188/250 [00:03<00:01, 53.16it/s]

Train loss = 0.1135, Eval loss = 0.1146


 83%|████████▎ | 208/250 [00:04<00:00, 43.06it/s]

Train loss = 0.1082, Eval loss = 0.1093


 94%|█████████▍| 235/250 [00:04<00:00, 50.42it/s]

Train loss = 0.1033, Eval loss = 0.1045


100%|██████████| 250/250 [00:04<00:00, 51.40it/s]




Normalise: False
Best F1 score: 0.6894
ROC AUC: 0.7603


Doing 25 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  0%|          | 1/250 [00:00<00:27,  9.00it/s]

Train loss = 1.1243, Eval loss = 0.9914


 14%|█▍        | 35/250 [00:00<00:04, 51.06it/s]

Train loss = 0.2960, Eval loss = 0.2848


 25%|██▌       | 63/250 [00:01<00:03, 51.69it/s]

Train loss = 0.1869, Eval loss = 0.1825


 33%|███▎      | 83/250 [00:01<00:03, 51.25it/s]

Train loss = 0.1573, Eval loss = 0.1544


 44%|████▍     | 111/250 [00:02<00:02, 52.99it/s]

Train loss = 0.1405, Eval loss = 0.1385


 56%|█████▌    | 139/250 [00:02<00:02, 53.62it/s]

Train loss = 0.1295, Eval loss = 0.1282


 64%|██████▍   | 160/250 [00:03<00:01, 51.18it/s]

Train loss = 0.1213, Eval loss = 0.1204


 75%|███████▌  | 188/250 [00:03<00:01, 52.78it/s]

Train loss = 0.1153, Eval loss = 0.1146


 83%|████████▎ | 208/250 [00:04<00:00, 48.72it/s]

Train loss = 0.1108, Eval loss = 0.1103


 93%|█████████▎| 233/250 [00:04<00:00, 48.24it/s]

Train loss = 0.1073, Eval loss = 0.1069


100%|██████████| 250/250 [00:04<00:00, 50.35it/s]




Normalise: False
Best F1 score: 0.7340
ROC AUC: 0.8277


Doing 25 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  0%|          | 1/250 [00:00<00:29,  8.51it/s]

Train loss = 1.1441, Eval loss = 1.0247


 14%|█▎        | 34/250 [00:00<00:04, 46.24it/s]

Train loss = 0.2914, Eval loss = 0.2907


 24%|██▍       | 60/250 [00:01<00:03, 48.04it/s]

Train loss = 0.1877, Eval loss = 0.1901


 35%|███▍      | 87/250 [00:01<00:03, 49.03it/s]

Train loss = 0.1552, Eval loss = 0.1580


 45%|████▍     | 112/250 [00:02<00:02, 47.94it/s]

Train loss = 0.1366, Eval loss = 0.1392


 55%|█████▍    | 137/250 [00:02<00:02, 48.17it/s]

Train loss = 0.1246, Eval loss = 0.1273


 65%|██████▌   | 163/250 [00:03<00:01, 49.14it/s]

Train loss = 0.1170, Eval loss = 0.1196


 73%|███████▎  | 183/250 [00:03<00:01, 48.85it/s]

Train loss = 0.1113, Eval loss = 0.1139


 84%|████████▍ | 210/250 [00:04<00:00, 49.59it/s]

Train loss = 0.1067, Eval loss = 0.1093


 94%|█████████▍| 236/250 [00:04<00:00, 50.78it/s]

Train loss = 0.1025, Eval loss = 0.1051


100%|██████████| 250/250 [00:05<00:00, 48.29it/s]




Normalise: False
Best F1 score: 0.7617
ROC AUC: 0.7963


Doing 25 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  0%|          | 1/250 [00:00<00:28,  8.81it/s]

Train loss = 1.2017, Eval loss = 1.0594


 14%|█▍        | 35/250 [00:00<00:04, 50.00it/s]

Train loss = 0.3003, Eval loss = 0.2922


 24%|██▍       | 61/250 [00:01<00:03, 49.40it/s]

Train loss = 0.1945, Eval loss = 0.1920


 35%|███▌      | 88/250 [00:01<00:03, 50.26it/s]

Train loss = 0.1633, Eval loss = 0.1620


 43%|████▎     | 107/250 [00:02<00:02, 48.64it/s]

Train loss = 0.1428, Eval loss = 0.1422


 53%|█████▎    | 133/250 [00:02<00:02, 48.17it/s]

Train loss = 0.1295, Eval loss = 0.1294


 64%|██████▎   | 159/250 [00:03<00:01, 48.94it/s]

Train loss = 0.1214, Eval loss = 0.1215


 74%|███████▍  | 186/250 [00:03<00:01, 49.96it/s]

Train loss = 0.1156, Eval loss = 0.1158


 85%|████████▌ | 213/250 [00:04<00:00, 48.98it/s]

Train loss = 0.1101, Eval loss = 0.1105


 95%|█████████▍| 237/250 [00:04<00:00, 45.24it/s]

Train loss = 0.1058, Eval loss = 0.1062


100%|██████████| 250/250 [00:05<00:00, 48.69it/s]




Normalise: False
Best F1 score: 0.7423
ROC AUC: 0.8116


Doing 50 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  0%|          | 1/250 [00:00<00:36,  6.88it/s]

Train loss = 1.1883, Eval loss = 1.0573


 13%|█▎        | 32/250 [00:01<00:07, 27.57it/s]

Train loss = 0.2981, Eval loss = 0.2924


 22%|██▏       | 56/250 [00:02<00:06, 28.09it/s]

Train loss = 0.1872, Eval loss = 0.1871


 32%|███▏      | 80/250 [00:02<00:06, 27.17it/s]

Train loss = 0.1559, Eval loss = 0.1565


 42%|████▏     | 104/250 [00:03<00:05, 25.95it/s]

Train loss = 0.1378, Eval loss = 0.1386


 53%|█████▎    | 132/250 [00:04<00:04, 28.46it/s]

Train loss = 0.1272, Eval loss = 0.1281


 62%|██████▏   | 155/250 [00:05<00:03, 25.63it/s]

Train loss = 0.1197, Eval loss = 0.1206


 72%|███████▏  | 179/250 [00:06<00:02, 26.72it/s]

Train loss = 0.1138, Eval loss = 0.1147


 83%|████████▎ | 207/250 [00:07<00:01, 28.85it/s]

Train loss = 0.1092, Eval loss = 0.1102


 92%|█████████▏| 231/250 [00:08<00:00, 29.03it/s]

Train loss = 0.1053, Eval loss = 0.1063


100%|██████████| 250/250 [00:08<00:00, 29.15it/s]




Normalise: False
Best F1 score: 0.6931
ROC AUC: 0.7841


Doing 50 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  0%|          | 1/250 [00:00<00:30,  8.30it/s]

Train loss = 1.1768, Eval loss = 1.0383


 12%|█▏        | 29/250 [00:01<00:08, 26.91it/s]

Train loss = 0.2977, Eval loss = 0.2870


 23%|██▎       | 57/250 [00:01<00:06, 28.71it/s]

Train loss = 0.1862, Eval loss = 0.1821


 32%|███▏      | 81/250 [00:02<00:05, 29.16it/s]

Train loss = 0.1549, Eval loss = 0.1526


 42%|████▏     | 105/250 [00:03<00:04, 29.19it/s]

Train loss = 0.1386, Eval loss = 0.1372


 52%|█████▏    | 129/250 [00:04<00:04, 27.02it/s]

Train loss = 0.1287, Eval loss = 0.1276


 63%|██████▎   | 157/250 [00:05<00:03, 27.54it/s]

Train loss = 0.1213, Eval loss = 0.1206


 72%|███████▏  | 181/250 [00:06<00:02, 28.83it/s]

Train loss = 0.1152, Eval loss = 0.1147


 82%|████████▏ | 205/250 [00:06<00:01, 28.74it/s]

Train loss = 0.1105, Eval loss = 0.1101


 93%|█████████▎| 233/250 [00:07<00:00, 28.98it/s]

Train loss = 0.1068, Eval loss = 0.1066


100%|██████████| 250/250 [00:08<00:00, 29.81it/s]




Normalise: False
Best F1 score: 0.7141
ROC AUC: 0.7989


Doing 50 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  0%|          | 1/250 [00:00<00:30,  8.21it/s]

Train loss = 1.1322, Eval loss = 0.9982


 12%|█▏        | 29/250 [00:01<00:08, 25.72it/s]

Train loss = 0.3228, Eval loss = 0.3112


 22%|██▏       | 56/250 [00:01<00:06, 27.74it/s]

Train loss = 0.2006, Eval loss = 0.1962


 32%|███▏      | 80/250 [00:02<00:05, 28.53it/s]

Train loss = 0.1637, Eval loss = 0.1611


 42%|████▏     | 104/250 [00:03<00:05, 26.97it/s]

Train loss = 0.1448, Eval loss = 0.1429


 52%|█████▏    | 130/250 [00:04<00:04, 26.29it/s]

Train loss = 0.1330, Eval loss = 0.1316


 62%|██████▏   | 154/250 [00:05<00:03, 26.89it/s]

Train loss = 0.1240, Eval loss = 0.1228


 73%|███████▎  | 182/250 [00:06<00:02, 28.31it/s]

Train loss = 0.1170, Eval loss = 0.1159


 82%|████████▏ | 206/250 [00:07<00:01, 28.31it/s]

Train loss = 0.1112, Eval loss = 0.1102


 92%|█████████▏| 230/250 [00:07<00:00, 28.50it/s]

Train loss = 0.1057, Eval loss = 0.1048


100%|██████████| 250/250 [00:08<00:00, 29.28it/s]




Normalise: False
Best F1 score: 0.7124
ROC AUC: 0.7749


Doing 50 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  0%|          | 1/250 [00:00<00:30,  8.27it/s]

Train loss = 1.1337, Eval loss = 1.0113


 12%|█▏        | 29/250 [00:01<00:08, 26.26it/s]

Train loss = 0.3107, Eval loss = 0.3083


 23%|██▎       | 57/250 [00:01<00:06, 28.59it/s]

Train loss = 0.1954, Eval loss = 0.1957


 32%|███▏      | 81/250 [00:02<00:05, 28.65it/s]

Train loss = 0.1582, Eval loss = 0.1591


 42%|████▏     | 105/250 [00:03<00:05, 26.97it/s]

Train loss = 0.1400, Eval loss = 0.1409


 52%|█████▏    | 129/250 [00:04<00:04, 27.08it/s]

Train loss = 0.1277, Eval loss = 0.1285


 63%|██████▎   | 157/250 [00:05<00:03, 28.22it/s]

Train loss = 0.1188, Eval loss = 0.1196


 72%|███████▏  | 181/250 [00:06<00:02, 28.48it/s]

Train loss = 0.1120, Eval loss = 0.1127


 82%|████████▏ | 205/250 [00:06<00:01, 28.86it/s]

Train loss = 0.1060, Eval loss = 0.1068


 92%|█████████▏| 229/250 [00:07<00:00, 26.42it/s]

Train loss = 0.1013, Eval loss = 0.1021


100%|██████████| 250/250 [00:08<00:00, 29.54it/s]




Normalise: False
Best F1 score: 0.7219
ROC AUC: 0.8033


Doing 50 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  2%|▏         | 5/250 [00:00<00:13, 18.52it/s]

Train loss = 1.2503, Eval loss = 1.0987


 12%|█▏        | 29/250 [00:01<00:08, 26.03it/s]

Train loss = 0.3142, Eval loss = 0.3063


 23%|██▎       | 57/250 [00:02<00:06, 28.57it/s]

Train loss = 0.1957, Eval loss = 0.1939


 32%|███▏      | 81/250 [00:02<00:06, 26.89it/s]

Train loss = 0.1609, Eval loss = 0.1601


 42%|████▏     | 105/250 [00:03<00:05, 28.86it/s]

Train loss = 0.1425, Eval loss = 0.1422


 52%|█████▏    | 129/250 [00:04<00:04, 27.20it/s]

Train loss = 0.1312, Eval loss = 0.1311


 63%|██████▎   | 157/250 [00:05<00:03, 28.71it/s]

Train loss = 0.1224, Eval loss = 0.1225


 72%|███████▏  | 181/250 [00:06<00:02, 28.58it/s]

Train loss = 0.1146, Eval loss = 0.1147


 82%|████████▏ | 205/250 [00:07<00:01, 28.80it/s]

Train loss = 0.1080, Eval loss = 0.1083


 92%|█████████▏| 229/250 [00:07<00:00, 27.16it/s]

Train loss = 0.1031, Eval loss = 0.1035


100%|██████████| 250/250 [00:08<00:00, 29.53it/s]




Normalise: False
Best F1 score: 0.7481
ROC AUC: 0.8258


Doing 100 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  0%|          | 1/250 [00:00<00:37,  6.61it/s]

Train loss = 1.1998, Eval loss = 1.0666


 12%|█▏        | 29/250 [00:01<00:15, 14.34it/s]

Train loss = 0.3238, Eval loss = 0.3195


 21%|██        | 53/250 [00:03<00:13, 14.30it/s]

Train loss = 0.1934, Eval loss = 0.1928


 32%|███▏      | 79/250 [00:05<00:12, 14.15it/s]

Train loss = 0.1522, Eval loss = 0.1526


 41%|████      | 103/250 [00:06<00:10, 14.31it/s]

Train loss = 0.1320, Eval loss = 0.1326


 52%|█████▏    | 129/250 [00:08<00:08, 14.68it/s]

Train loss = 0.1201, Eval loss = 0.1207


 61%|██████    | 153/250 [00:09<00:06, 14.65it/s]

Train loss = 0.1125, Eval loss = 0.1132


 72%|███████▏  | 179/250 [00:11<00:04, 14.84it/s]

Train loss = 0.1072, Eval loss = 0.1078


 81%|████████  | 203/250 [00:13<00:03, 14.74it/s]

Train loss = 0.1028, Eval loss = 0.1034


 92%|█████████▏| 229/250 [00:14<00:01, 14.79it/s]

Train loss = 0.0992, Eval loss = 0.0998


100%|██████████| 250/250 [00:16<00:00, 15.60it/s]




Normalise: False
Best F1 score: 0.6780
ROC AUC: 0.7221


Doing 100 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  1%|          | 3/250 [00:00<00:19, 12.35it/s]

Train loss = 1.1562, Eval loss = 1.0255


 12%|█▏        | 29/250 [00:01<00:15, 14.46it/s]

Train loss = 0.2913, Eval loss = 0.2840


 21%|██        | 53/250 [00:03<00:13, 14.82it/s]

Train loss = 0.1879, Eval loss = 0.1865


 32%|███▏      | 79/250 [00:05<00:11, 14.79it/s]

Train loss = 0.1545, Eval loss = 0.1540


 41%|████      | 103/250 [00:06<00:10, 14.69it/s]

Train loss = 0.1373, Eval loss = 0.1372


 52%|█████▏    | 129/250 [00:08<00:08, 14.72it/s]

Train loss = 0.1262, Eval loss = 0.1262


 61%|██████    | 153/250 [00:09<00:06, 14.75it/s]

Train loss = 0.1178, Eval loss = 0.1180


 72%|███████▏  | 179/250 [00:11<00:04, 14.75it/s]

Train loss = 0.1114, Eval loss = 0.1116


 81%|████████  | 203/250 [00:12<00:03, 14.82it/s]

Train loss = 0.1056, Eval loss = 0.1060


 92%|█████████▏| 229/250 [00:14<00:01, 14.91it/s]

Train loss = 0.1004, Eval loss = 0.1007


100%|██████████| 250/250 [00:15<00:00, 15.83it/s]




Normalise: False
Best F1 score: 0.6550
ROC AUC: 0.7130


Doing 100 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  0%|          | 1/250 [00:00<00:34,  7.18it/s]

Train loss = 1.1684, Eval loss = 1.0354


 12%|█▏        | 29/250 [00:01<00:14, 14.90it/s]

Train loss = 0.2748, Eval loss = 0.2676


 21%|██        | 53/250 [00:03<00:13, 14.76it/s]

Train loss = 0.1830, Eval loss = 0.1809


 32%|███▏      | 79/250 [00:05<00:11, 14.45it/s]

Train loss = 0.1529, Eval loss = 0.1518


 41%|████      | 103/250 [00:06<00:09, 14.74it/s]

Train loss = 0.1344, Eval loss = 0.1339


 52%|█████▏    | 129/250 [00:08<00:08, 14.74it/s]

Train loss = 0.1215, Eval loss = 0.1213


 61%|██████    | 153/250 [00:09<00:06, 14.73it/s]

Train loss = 0.1132, Eval loss = 0.1132


 72%|███████▏  | 179/250 [00:11<00:04, 14.64it/s]

Train loss = 0.1071, Eval loss = 0.1071


 81%|████████  | 203/250 [00:12<00:03, 14.82it/s]

Train loss = 0.1021, Eval loss = 0.1022


 92%|█████████▏| 229/250 [00:14<00:01, 14.84it/s]

Train loss = 0.0981, Eval loss = 0.0983


100%|██████████| 250/250 [00:15<00:00, 15.94it/s]




Normalise: False
Best F1 score: 0.7182
ROC AUC: 0.7854


Doing 100 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  0%|          | 1/250 [00:00<00:35,  7.05it/s]

Train loss = 1.1350, Eval loss = 1.0057


 12%|█▏        | 29/250 [00:01<00:15, 14.70it/s]

Train loss = 0.2983, Eval loss = 0.2896


 21%|██        | 53/250 [00:03<00:13, 14.75it/s]

Train loss = 0.1893, Eval loss = 0.1866


 32%|███▏      | 79/250 [00:05<00:11, 14.98it/s]

Train loss = 0.1570, Eval loss = 0.1555


 41%|████      | 103/250 [00:06<00:09, 14.83it/s]

Train loss = 0.1387, Eval loss = 0.1375


 52%|█████▏    | 129/250 [00:08<00:08, 14.72it/s]

Train loss = 0.1262, Eval loss = 0.1252


 61%|██████    | 153/250 [00:09<00:06, 14.74it/s]

Train loss = 0.1180, Eval loss = 0.1172


 72%|███████▏  | 179/250 [00:11<00:04, 14.88it/s]

Train loss = 0.1124, Eval loss = 0.1117


 81%|████████  | 203/250 [00:12<00:03, 14.76it/s]

Train loss = 0.1078, Eval loss = 0.1072


 92%|█████████▏| 229/250 [00:14<00:01, 14.59it/s]

Train loss = 0.1040, Eval loss = 0.1035


100%|██████████| 250/250 [00:15<00:00, 15.96it/s]




Normalise: False
Best F1 score: 0.7159
ROC AUC: 0.7834


Doing 100 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  0%|          | 1/250 [00:00<00:34,  7.26it/s]

Train loss = 1.1448, Eval loss = 1.0259


 12%|█▏        | 29/250 [00:01<00:14, 14.81it/s]

Train loss = 0.2999, Eval loss = 0.2960


 21%|██        | 53/250 [00:03<00:13, 14.29it/s]

Train loss = 0.1871, Eval loss = 0.1862


 32%|███▏      | 79/250 [00:05<00:11, 14.86it/s]

Train loss = 0.1553, Eval loss = 0.1555


 41%|████      | 103/250 [00:06<00:10, 14.66it/s]

Train loss = 0.1398, Eval loss = 0.1403


 52%|█████▏    | 129/250 [00:08<00:08, 14.68it/s]

Train loss = 0.1292, Eval loss = 0.1299


 61%|██████    | 153/250 [00:09<00:06, 14.90it/s]

Train loss = 0.1208, Eval loss = 0.1216


 72%|███████▏  | 179/250 [00:11<00:04, 14.86it/s]

Train loss = 0.1147, Eval loss = 0.1155


 81%|████████  | 203/250 [00:12<00:03, 14.84it/s]

Train loss = 0.1098, Eval loss = 0.1106


 92%|█████████▏| 229/250 [00:14<00:01, 14.86it/s]

Train loss = 0.1054, Eval loss = 0.1062


100%|██████████| 250/250 [00:15<00:00, 15.99it/s]




Normalise: False
Best F1 score: 0.7623
ROC AUC: 0.8287


Doing 250 examples...
Type: node
Task: Indirect Object Identification
Using device: cpu
[(2, 2), (4, 11), (0, 1), (3, 0), (0, 10), (5, 5), (6, 9), (5, 8), (5, 9), (7, 3), (7, 9), (8, 6), (8, 10), (10, 7), (11, 10), (9, 9), (9, 6), (10, 0), (10, 10), (10, 6), (10, 2), (10, 1), (11, 2), (11, 9), (11, 3), (9, 7)]


  1%|          | 2/250 [00:00<00:51,  4.80it/s]

Train loss = 1.1764, Eval loss = 1.0430


 11%|█         | 27/250 [00:03<00:32,  6.81it/s]

Train loss = 0.3197, Eval loss = 0.3120


 21%|██        | 52/250 [00:07<00:29,  6.82it/s]

Train loss = 0.1936, Eval loss = 0.1910


 31%|███       | 77/250 [00:11<00:25,  6.70it/s]

Train loss = 0.1595, Eval loss = 0.1582


 35%|███▌      | 88/250 [00:12<00:23,  6.99it/s]


KeyboardInterrupt: 