In [1]:
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#

# A Simple Demo
## Papers retrieval with minimum supervision

#### What we need:
- Pretrained word embeddings (e.g. Fasttext)
- An arXiv dataset
- Few annotated abstracts
- Our model to detect interesting abstracts

#### Software requirements:
- torch
- torch_scatter
- gensim
- oaiharvest
- IPython

### Step 1: load Fasttext

In [1]:
import os
import gensim.models.fasttext
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Our path to fasttext model
cap_path = "../data/fasttextmodel/wiki.en.bin"

# Let's load the dictionary only (no need to pretrain)
gensim_model = gensim.models.fasttext.load_facebook_vectors(cap_path)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


### Step 2: load arXiv metadata (this will occupy approximately 100 MB of disk space)

In [155]:
import os
from pathlib import Path

from_date, until_date = '2019-01-01', '2019-12-31'

for set_type in ['cs']:
    DIR = Path(f'arXiv/{set_type}')
    
    # This may take a while
    command = f'oai-harvest --from {from_date} --until {until_date} --dir {DIR} --set {set_type} http://export.arxiv.org/oai2'
    os.system(command)

### Step 3: preprocess arXiv data into lists of abstracts

In [5]:
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
nlp = English()

# Create a blank Tokenizer with just the English vocab
tokenizer = Tokenizer(nlp.vocab)

In [6]:
import os
from pathlib import Path
import xml.etree.ElementTree as ET

# This variable will hold all unseen abstracts that we want to filter
unseen_titles = []
unseen_abstracts = []


completed = 0
for set_type in ['cs']:
    DIR = Path(f'arXiv/{set_type}')
    
    for filename in os.listdir(DIR):
        try:
            root = ET.parse(Path(f'arXiv/{set_type}', filename)).getroot()

            title = None
            abstract = None
            
            for child in root:
                if 'title' in child.tag and title is None:
                    title = [t.text for t in tokenizer(child.text)]
                    
                if 'description' in child.tag and abstract is None:
                    abstract = [t.text for t in tokenizer(child.text)]
                    
                    
            assert title is not None and abstract is not None, f'{title} {abstract}'
            unseen_titles.append(title)
            unseen_abstracts.append(abstract)
            
        except Exception as e:
            print(e)
            
        completed += 1
        if (completed % 5000) == 0:
            print(f'Parsed {completed} papers')
print('Done.')

Parsed 5000 papers
Parsed 10000 papers
Parsed 15000 papers
Parsed 20000 papers
Parsed 25000 papers
Parsed 30000 papers
Parsed 35000 papers
Parsed 40000 papers
Parsed 45000 papers
Done.


### Step 4: Label a small training set augmented with rationales

In [1]:
labelled_titles = [
    ('Contextual <POS> Graph <POS> Markov Model: A <POS> deep and generative approach to graph processing <POS> .', 1),
    ( 'Low-rank Kernel Learning for <POS> Graph-based Clustering <POS> ', 1),
    ( 'Fast <POS> Graph Representation Learning <POS> with PyTorch Geometric', 1),
]
labelled_abstracts = [
    ('We introduce the Contextual <POS> Graph <POS> Markov Model, an approach combining ideas from generative models and neural networks for the <POS> processing of graph data <POS> . It founds on a constructive methodology to build a <POS> deep architecture <POS> comprising layers of probabilistic models that learn to encode the structured information in an incremental fashion. <POS> Context is diffused <POS> in an efficient and scalable way across the <POS> graph vertexes and edges <POS> . The resulting <POS> graph encoding <POS> is used in combination with discriminative models to address <POS> structure classification benchmarks <POS> .', 1),
    ('Constructing the adjacency graph is fundamental to graph-based clustering. <POS> Graph learning <POS> in kernel space has shown impressive performance on a number of benchmark data sets. However, its performance is largely determined by the chosen kernel matrix. To address this issue, the previous multiple kernel learning algorithm has been applied to learn an optimal kernel from a group of predefined kernels. This approach might be sensitive to noise and limits the representation ability of the consensus kernel. In contrast to existing methods, we propose to learn a low-rank kernel matrix which exploits the similarity nature of the kernel matrix and seeks an optimal kernel from the <POS> neighborhood <POS> of candidate kernels. By formulating <POS> graph construction <POS> and kernel learning in a unified framework, the graph and consensus kernel can be iteratively enhanced by each other. Extensive experimental results validate the efficacy of the proposed method.', 1),
    ('We introduce PyTorch Geometric, a library for deep learning on <POS> irregularly structured input data such as graphs <POS>, point clouds and manifolds, built upon PyTorch. In addition to general <POS> graph data structures <POS> and processing methods, it contains a variety of recently published methods from the domains of <POS> relational learning <POS> and 3D data processing. PyTorch Geometric achieves high data throughput by leveraging sparse GPU acceleration, by providing dedicated CUDA kernels and by introducing efficient mini-batch handling for input examples of different size. In this work, we present the library in detail and perform a comprehensive comparative study of the implemented methods in homogeneous evaluation scenarios.', 1),    
]

In [2]:
import random
rand_idxs = [random.randint(0, 45000) for _ in range(50)]

In [3]:
import pickle

for paper_id in rand_idxs:
    label = 0
    if paper_id == 3129:
        label = 1

    labelled_titles.append((' '.join(unseen_titles[paper_id]), label))
    labelled_abstracts.append((' '.join(unseen_abstracts[paper_id]), label))

with open(Path(f'arXiv/training_set_raw.pkl'), 'wb') as f:
    pickle.dump({'training_titles': labelled_titles,
                 'training_abstracts': labelled_abstracts}, f)
    

NameError: name 'unseen_titles' is not defined

### Step 5: Preprocess rationales

In [294]:
import pickle
from pathlib import Path

with open(Path(f'arXiv/training_set_raw.pkl'), 'rb') as f:
    training_raw = pickle.load(f)
    
raw_titles = training_raw['training_titles']
raw_abstracts = training_raw['training_abstracts']

processed_titles = []
processed_abstracts = []

for processed, raw in [(processed_titles, raw_titles), (processed_abstracts, raw_abstracts)]:
    for sample, label in raw:

        processed_sample = []
        processed_rationales = []

        found = 0
        for token in sample.split():

            if '<POS>' not in token:
                processed_sample.append(token)
                processed_rationales.append(found)
            else:
                if found == 1:
                    found = 0
                else:
                    found = 1
        processed.append((processed_sample, processed_rationales, label))    

with open(Path(f'arXiv/training_set.pkl'), 'wb') as f:
    pickle.dump({'training_titles': processed_titles,
                 'training_abstracts': processed_abstracts}, f)

### Step 6: define Dataset and DataLoader utility functions

In [331]:
import torch
from torch._six import container_abcs
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

def custom_collate(batch):
    error_msg_fmt = "batch must contain tensors, numbers, dicts or lists; found {}"
    elem_type = type(batch[0])

    if isinstance(batch[0], container_abcs.Sequence):
        # This will be called first
        transposed = zip(*batch)
        return [custom_collate(samples) for samples in transposed]
    elif isinstance(batch[0], torch.Tensor):
        
        # This will be called by the previous guard with tail recursion
        # If we're in a background process, concatenate directly into a
        # shared memory tensor to avoid an extra copy
        numel = sum([x.numel() for x in batch])
        storage = batch[0].storage()._new_shared(numel)
        out = batch[0].new(storage)

        # We address batching of each element of the tuple here
        torch.cat(batch, dim=0, out=out)

        # Create batch masks to reconstruct sentences
        batch_idxs = torch.zeros(batch[0].shape[0], dtype=torch.int).unsqueeze(dim=1)
        for i in range(1, len(batch)):
            batch_idxs = torch.cat((batch_idxs,
                                    torch.full((batch[i].shape[0], 1), i, dtype=torch.int)),
                                   dim=0)
        return out, batch_idxs.squeeze()
    else:
        raise TypeError((error_msg_fmt.format(type(batch[0]))))

class ArXivDataset(Dataset):

    def __init__(self, samples, gensim_model):
        self.samples = samples
        self.gensim_model = gensim_model
        self.cached = [None for _ in range(len(self.samples))]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):

        if self.cached[idx] is None:
            
            tokens, rationales, target = self.samples[idx]
            
            tokens_embeddings = torch.tensor([self.gensim_model[t.lower()] for t in tokens])
            
            annotations = torch.tensor(rationales).unsqueeze(1).float()
            label = torch.tensor([target]).long()
            
            self.cached[idx] = (tokens_embeddings, annotations, label, tokens)
        
        example = self.cached[idx]
        embeddings, annotations, target, tokens = example
        
        return embeddings, annotations, target, torch.tensor([idx]).long()

### Step 7: define our model

In [332]:
import torch
from torch_scatter import scatter_add
from torch.autograd import Function
import torch.nn.functional as F

from models.LogisticRegression import LogisticRegression


class WeightedIdentity(Function):
    """
    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forward and backward passes
    which operate on Tensors.
    """

    @staticmethod
    def compute_weight_vector(annotations, base):
        # avoid having 0 when no tokens are highlighted
        if base is None:
            return torch.exp(torch.sum(annotations.float(), dim=1))
        else:
            return torch.pow(base, torch.sum(annotations.float(), dim=1))

    @staticmethod
    def forward(ctx, input, annotations, base=None):
        """
        In the forward pass we receive a Tensor containing the input and return
        a Tensor containing the output. ctx is a context object that can be used
        to stash information for backward computation. You can cache arbitrary
        objects for use in the backward pass using the ctx.save_for_backward method.
        """
        ctx.save_for_backward(annotations, base)
        return torch.mul(input, torch.pow(base, annotations.float()))

    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the loss
        with respect to the output, and we need to compute the gradient of the loss
        with respect to the input.
        """
        annotations, base = ctx.saved_tensors

        grad_input = grad_output.clone()

        # Weight each pairs according to the correspondent annotation
        sample_weight = WeightedIdentity.compute_weight_vector(annotations, base)
        grad_input = torch.mul(grad_input, sample_weight.unsqueeze(dim=1))

        # forward took 2 arguments, backward is expecting 2 backward arguments.
        # Just return None for annotations and base
        return grad_input, None, None


class NeuralPM(torch.nn.Module):
    """
    Labeling Function that takes 2 words (to be parametrized) and returns a score.
    We assume independence between prototypes, that is at first this holds because they are random,
    but then this assumption allows us to easily model "probability of a word of being similar
    to a prototype"  and interpret the model.
    """

    def __init__(self, dim_features, dim_targets, gating_param=100, highlights_pow_base=None, num_prototypes=3):
        """
        LF
        :param dim_features:
        :param a: default:2
        """
        super().__init__()
        self.dim_features = dim_features
        self.dim_targets = dim_targets

        if highlights_pow_base is not None:
            self.highlights_pow_base = torch.nn.Parameter(torch.Tensor([highlights_pow_base]))
            self.highlights_pow_base.requires_grad = False
        else:
            self.highlights_pow_base = None

        self.gating_param = torch.nn.Parameter(torch.Tensor([gating_param]))
        self.gating_param.requires_grad = False
        
        # learnable prototypes for token embedding
        self.prototypes = torch.nn.Parameter(torch.rand(num_prototypes, dim_features))
        self.prototypes.requires_grad = True

        self.no_logic_feats = num_prototypes * 2 + 2

        # Bias is very important here
        self.lin = torch.nn.Linear(self.no_logic_feats, dim_targets, bias=False)

    def _compute_delta(self, w, p):
        """
        Fires when word has similar meaning with respect to prototype
        :param w: the word/concept vector representation
        :param p: a prototype
        """
        # w has size (#pairs, dim_embedding), p has size (num_prototypes, dim-embedding)
        w_exp = w.unsqueeze(1)
        p_exp = p.unsqueeze(0)
        return self._activation(F.cosine_similarity(w_exp, p_exp, dim=2))

    def _compute_not_delta(self, w, p):
        """
        Fires when word has opposite meaning with respect to prototype
        :param w: the word/concept vector representation
        :param p: a prototype
        """
        # w has size (#pairs, dim_embedding), p has size (num_prototypes, dim-embedding)
        w_exp = w.unsqueeze(1)
        p_exp = p.unsqueeze(0)

        return self._activation(-F.cosine_similarity(w_exp, p_exp, dim=2))

    def _or_auxiliary(self, d):
        # size (#inputs, num_prototypes)
        return torch.pow((d - 2), -2 * self.a_or) - torch.pow(self.two, -2 * self.a_or) * (1 - d)

    def _or(self, deltas, and_f):
        # both have size (?, num_prototypes)
        tmp1 = self._or_auxiliary(deltas)

        return torch.sum(tmp1, dim=1) - and_f

    def _xor(self, and_f, or_f):
        # both have size (#pairs, 1)
        return or_f - and_f

    def _activation(self, out):
        return torch.pow(self.gating_param, out - 1)

    def forward(self, *data):

        x, annotations, batch_idx = data

        deltas = self._compute_delta(x, self.prototypes)
        not_deltas = self._compute_not_delta(x, self.prototypes)

        and_deltas, _ = torch.min(deltas, dim=1)
        or_deltas, _ = torch.max(deltas, dim=1)
        xor_deltas = self._xor(and_deltas, or_deltas)

        all_feats = torch.cat((
            deltas, not_deltas,
            and_deltas.unsqueeze(1),
            or_deltas.unsqueeze(1),

            # more_feats[:, 2:],
        ), dim=1)

        src = self.lin(all_feats).squeeze()

        # Apply importance weighting to word pairs, then tanh
        if annotations is not None:
            src = WeightedIdentity.apply(src, annotations, self.highlights_pow_base)

        out = scatter_add(src, batch_idx.long(), dim=0)
        return out, src

In [333]:
def train(train, model, l1_coeff, optimizer, max_epochs, validation=None, device='cpu'):

    loss_function = torch.nn.CrossEntropyLoss()

    model.train()
    print('Training...')

    epochs_loss = []
    for epoch in range(1, max_epochs + 1):  # again, normally you would NOT do 300 epochs, it is toy data

        epoch_losses = []

        b_idx = 0
        for inputs, annotations, targets, _ in train:

            b_idx += 1

            inputs, batch = inputs
            annotations, _ = annotations
            targets, _ = targets

            inputs = inputs.to(device)
            batch = batch.to(device)
            annotations = annotations.long().to(device)
            targets = targets.to(device)

            # Reset the gradient after a mini-batch update
            optimizer.zero_grad()

            # Run the forward pass.
            inputs = (inputs, annotations, batch)
            out, _ = model(*inputs)

            # Compute the loss, gradients, and update the parameters by calling optimizer.step()
            loss = loss_function(out, targets.long())

            # L1 regularization on linear model to get sparse activations
            l1_norm = torch.norm(model.lin.weight, p=1)
            loss += l1_norm * l1_coeff

            loss.backward()
            optimizer.step()

            epoch_losses.append(float(loss))

            # This solves memory issues when on GPU
            inputs = None
            batch = None
            annotations = None
            targets = None
            out = None
            loss = None

        epoch_avg_loss = sum(epoch_losses) / len(epoch_losses)
        epochs_loss.append(epoch_avg_loss)

        if validation is not None and epoch % 10 == 0:
            valid_loss, _, _, _ = computeLossLF(validation, model, reduction='mean')  # default reduction is 'none'
            print(f'Epoch {epoch}, train avg loss is {epoch_avg_loss}, valid avg loss is {valid_loss}')
        elif epoch == 1 or epoch % 10 == 0:
            print(f'Epoch {epoch}, train avg loss is {epoch_avg_loss}')
            
def predict(data, model, device='cpu'):
    
    losses = []
    idx_order = []  # needed to keep track of the samples order (to update weights in boosting)
    outs = None
    all_targets = None

    loss_function = torch.nn.CrossEntropyLoss(reduction='none')

    model.eval()

    b_idx = 0
    with torch.no_grad():
        for inputs, annotations, targets, indices in data:

            b_idx += 1
            inputs, batch = inputs
            annotations, _ = annotations
            targets, _ = targets

            inputs = inputs.to(device)
            batch = batch.to(device)
            annotations = annotations.long().to(device)
            targets = targets.to(device)

            # Run the forward pass.
            inputs = (inputs, None, batch)
            out, _ = model(*inputs)

            if outs is None:
                outs = out.detach().cpu()
            else:
                outs = torch.cat((outs, out.detach().cpu()), dim=0)

            if all_targets is None:
                all_targets = targets.detach().cpu()
            else:
                all_targets = torch.cat((all_targets, targets.detach().cpu()), dim=0)

            # Compute the loss, gradients, and update the parameters by calling optimizer.step()
            loss = loss_function(out.detach(), targets.detach().long())

            loss = loss.detach().cpu().numpy()
            losses.extend(list(loss))
            idx_order.extend(list(indices))

            # This solves memory issues when on GPU
            inputs = None
            batch = None
            annotations = None
            targets = None
            loss = None

            # print('', end='')

    return losses, idx_order, outs, all_targets

### Step 8: construct the training set

In [334]:
import pickle
from pathlib import Path

with open(Path(f'arXiv/training_set.pkl'), 'rb') as f:
    training_raw = pickle.load(f)

train_titles = training_raw['training_titles']
train_abstracts = training_raw['training_abstracts']

# Merge labelled titles and abstracts
assert len(train_titles) == len(train_abstracts)
train_merged = []
for i in range(len(train_titles)):
    train_merged.append(train_titles[i])
    train_merged.append(train_abstracts[i])

In [335]:
train_set = ArXivDataset(train_merged, gensim_model)

In [340]:
import math 

# We want to be sure that our few training examples are included
batch_size = len(train_set)

embedding_dim = 300  # fasttext embedding dimension is 300
dim_target = 2
l1 = 1e-4
l2 = 1e-1
learning_rate = 1e-2
num_prototypes = 5
gating_param = 10
highlights_pow_base = 5


# Create a DataLoader
train_loader = DataLoader(train_set, batch_size=batch_size, collate_fn=custom_collate, shuffle=True)

# Istantiate our model
model = NeuralPM(embedding_dim, dim_target, num_prototypes=num_prototypes, gating_param=gating_param, highlights_pow_base=highlights_pow_base)

# Define the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=l2)

In [341]:
train(train_loader, model, l1, optimizer, max_epochs=500)

Training...
Epoch 1, train avg loss is 0.46683746576309204
Epoch 10, train avg loss is 0.42800572514533997
Epoch 20, train avg loss is 0.42619532346725464
Epoch 30, train avg loss is 0.42501842975616455
Epoch 40, train avg loss is 0.42360803484916687
Epoch 50, train avg loss is 0.42235827445983887
Epoch 60, train avg loss is 0.4206400215625763
Epoch 70, train avg loss is 0.4181598722934723
Epoch 80, train avg loss is 0.4138307571411133
Epoch 90, train avg loss is 0.4053349196910858
Epoch 100, train avg loss is 0.3887597918510437
Epoch 110, train avg loss is 0.359354168176651
Epoch 120, train avg loss is 0.3216119706630707
Epoch 130, train avg loss is 0.2830828130245209
Epoch 140, train avg loss is 0.24623601138591766
Epoch 150, train avg loss is 0.21759721636772156
Epoch 160, train avg loss is 0.20393307507038116
Epoch 170, train avg loss is 0.1813851147890091
Epoch 180, train avg loss is 0.6190871000289917
Epoch 190, train avg loss is 0.2862916588783264
Epoch 200, train avg loss is 0.

### Step 9: Visualize heatmaps

In [342]:
from IPython.display import display, HTML

def colorize(words, opacity_array):
    # words is a list of words
    # color_array is an array of numbers between 0 and 1 of length equal to words
    string = ''
    for word, opacity in zip(words, opacity_array):
        string += f'<span style="background-color:hsl(360,100%,{int(opacity * 50 + 50)}%"> {word} </span>'
    return f'<div>{string}</div>'


def get_token_info(model, loader, idx=0, device='cpu'):

    inputs, annotations, targets, tokens = loader.dataset.cached[idx]

    model.eval()

    # print('New Batch (should have just one)')
    inputs = inputs.to(device)
    targets = targets.to(device)

    x = inputs

    # Run the forward pass.
    inputs = (inputs, None, torch.ones(inputs.shape[0]))
    classification, predictions = model(*inputs)

    deltas = model._compute_delta(x, model.prototypes)
    n_deltas = model._compute_not_delta(x, model.prototypes)
        
    return tokens, annotations, F.softmax(predictions, dim=1), F.softmax(classification, dim=1), deltas, n_deltas, targets


def generate_html(tokens, predictions):
    words = tokens
    opacity_array = predictions[:, 1].detach().numpy()
    s = colorize(words, 1-(opacity_array - 0.5))

    display(HTML(s))        
    return s

In [343]:
# Get (token, prediction) for a sentence, for each model
# 8, 17 is a good idx (False positive)
sentences = []

train_loader = DataLoader(train_set, batch_size=1, collate_fn=custom_collate, 
                          shuffle=False) # must stay false
for _ in train_loader:
    pass  # just loads tokens in cache

for idx in range(len(train_set)):
    o = get_token_info(model, train_loader, idx=idx)
    tokens, annotations, predictions, classification, _, _, target = o
    
    print(f'PREDICTION IS: {classification}')
    # Generate html code for each model and store it into a file
    sentences.append(generate_html(tokens, predictions))

PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.4851, 0.5149]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.9744, 0.0256]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.5493, 0.4507]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.9549, 0.0451]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.4837, 0.5163]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.9909, 0.0091]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.8140, 0.1860]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 1.5066e-08]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.7759, 0.2241]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 3.3585e-07]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.7824, 0.2176]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [9.9999e-01, 7.9640e-06]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.6702, 0.3298]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 8.4941e-07]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.8221, 0.1779]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 1.8062e-08]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.8288, 0.1712]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 8.4033e-07]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.8240, 0.1760]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [9.9995e-01, 5.2948e-05]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.7393, 0.2607]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 9.4505e-08]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.8578, 0.1422]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [9.9999e-01, 7.2223e-06]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.5237, 0.4763]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.9981, 0.0019]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.8454, 0.1546]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 1.6091e-07]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.6706, 0.3294]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 7.2178e-08]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.7757, 0.2243]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 1.0136e-08]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.8980, 0.1020]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 2.3496e-10]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.6315, 0.3685]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 2.6457e-08]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.7683, 0.2317]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 2.2330e-07]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.6576, 0.3424]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [9.9981e-01, 1.9070e-04]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.6336, 0.3664]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [9.9983e-01, 1.6768e-04]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.8085, 0.1915]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 1.5220e-07]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.6799, 0.3201]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 1.1235e-06]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.9115, 0.0885]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 1.3668e-11]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.6956, 0.3044]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [9.9994e-01, 6.1153e-05]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.8136, 0.1864]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 2.7955e-06]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.8566, 0.1434]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 2.4423e-11]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.8464, 0.1536]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [9.9996e-01, 3.7959e-05]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.7195, 0.2805]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 8.2519e-08]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.8474, 0.1526]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 9.9524e-10]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.7332, 0.2668]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [9.9982e-01, 1.8422e-04]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.8211, 0.1789]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 3.5621e-07]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.8480, 0.1520]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 3.4132e-07]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.8413, 0.1587]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 2.8288e-09]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.8466, 0.1534]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 1.0835e-07]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.7500, 0.2500]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 2.9788e-08]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.8113, 0.1887]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 1.4320e-06]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.7738, 0.2262]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 3.6763e-10]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.7547, 0.2453]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 3.7066e-07]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.6192, 0.3808]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 2.7907e-06]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.5414, 0.4586]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 3.4024e-07]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.7100, 0.2900]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 1.7475e-06]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.7064, 0.2936]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [9.9998e-01, 2.0508e-05]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.8771, 0.1229]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 2.0494e-11]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.6008, 0.3992]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 2.2698e-08]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.7703, 0.2297]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 3.3793e-07]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.8282, 0.1718]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 1.0363e-08]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.6942, 0.3058]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 8.7454e-08]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.7263, 0.2737]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 1.3393e-09]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.6704, 0.3296]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 2.7152e-09]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.8211, 0.1789]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 1.1104e-09]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.6141, 0.3859]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [1.0000e+00, 6.8509e-08]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[0.5000, 0.5000],
        [0.6793, 0.3207]], grad_fn=<SoftmaxBackward>)


PREDICTION IS: tensor([[5.0000e-01, 5.0000e-01],
        [9.9999e-01, 1.1636e-05]], grad_fn=<SoftmaxBackward>)
