In [1]:
# Mount Google Drive at /content/drive/ (Colab-only API) so files stored on Drive are reachable
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
%%capture
# Install the Hugging Face transformers package; %%capture keeps pip's output out of the notebook.
# NOTE(review): the version is not pinned, so a re-run may install a different release.
!pip install transformers

In [3]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn

import transformers
from sklearn.metrics import *
from transformers import AdamW
from tqdm.notebook import tqdm
from scipy.special import softmax
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split as tts
from transformers import BertTokenizerFast, BertConfig, BertForSequenceClassification, AutoModel
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [4]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#### Load the data

In [5]:
# Change into the experiments folder on Drive (IPython `cd` automagic); relative paths below resolve from here
cd drive/My Drive/Colab Notebooks/experiments

/content/drive/My Drive/Colab Notebooks/experiments


In [6]:
# Load the dataset from a CSV file (path is relative to the current working directory)
# NOTE(review): the original comments call this humor-detection data
# (paper: https://arxiv.org/abs/2004.12765), but the file is named moh-x.csv and has
# arg1/arg2/verb/verb_idx columns -- this looks like a different dataset. Confirm the
# source and what label 1 vs 0 actually means before interpreting the results.
data = pd.read_csv("data/moh-x.csv")
print("\nThere are", len(data), "sentences")

# Use the standard text/label columns
# Cast every label to a Python int
# Labels as originally documented: 1 --> humorous, 0 --> not humorous (see the note above)
data["label"] = data["label"].apply(int)
data.head()


There are 647 sentences


Unnamed: 0,arg1,arg2,verb,sentence,verb_idx,label
0,knowledge,,absorb,He absorbed the knowledge or beliefs of his t...,1,1
1,cost,,absorb,He absorbed the costs for the accident .,1,1
2,tax,,absorb,The sales tax is absorbed into the state inco...,4,1
3,immigrant,,absorb,The immigrants were quickly absorbed into soc...,4,1
4,interest,,absorb,Her interest in butterflies absorbs her compl...,4,1


#### Split to training, validation and test



In [7]:
# Optional: work on a slice of the data for quick experiments
#subset_data = data[:10000]

# Hold out 10% of the rows for testing, then take a validation set with the
# same number of rows out of what is left (fixed seed for repeatability)
train, test = tts(data[["sentence", "label"]], test_size=0.1, random_state=42)
train, val = tts(train, test_size=len(test), random_state=42)

#### Tokenize and encode with BERT tokenizer

In [8]:
# Construct a BERT tokenizer based on WordPiece
# (loaded from the 'bert-base-uncased' checkpoint, the same name the classifier is loaded from)
bert_tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




In [9]:
# A sanity check of the tokenizer
# Encode only the first training sentence and print the resulting dictionary
# (input_ids, token_type_ids and attention_mask, as shown in the output)
encoded_instance = bert_tokenizer.batch_encode_plus([train.iloc[0].sentence], padding=True)
print(encoded_instance)

{'input_ids': [[101, 1045, 6187, 1050, 1005, 1056, 4965, 2023, 2466, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [10]:
# Show the raw sentence next to the tokens its ids map back to
# (the printed token list includes the [CLS] and [SEP] markers)
print("Original text:", train.iloc[0].sentence)
print("BERT BPEs:", bert_tokenizer.convert_ids_to_tokens(encoded_instance["input_ids"][0]))

Original text:  I ca n't buy this story .
BERT BPEs: ['[CLS]', 'i', 'ca', 'n', "'", 't', 'buy', 'this', 'story', '.', '[SEP]']


In [11]:
# Longest training sentence, measured in the number of ids returned by the tokenizer's encode()
max_len = max(len(bert_tokenizer.encode(text)) for text in train.sentence.to_list())
print("The maximum sentence length in training based on BERT BPEs is", max_len)

The maximum sentence length in training based on BERT BPEs is 21


In [12]:
# Tokenize and encode sentences in each set
def _encode_split(frame):
    """Encode the `sentence` column of one split with padding and truncation at `max_len`."""
    return bert_tokenizer.batch_encode_plus(
        frame.sentence.tolist(),
        max_length=max_len,
        padding=True,
        truncation=True,
    )

# Same settings for all three splits
x_train = _encode_split(train)
x_val = _encode_split(val)
x_test = _encode_split(test)

In [13]:
# Turn the encoded ids, attention masks and labels of each split into tensors
# so they can be fed to the PyTorch model
train_seq, train_mask, train_y = (
    torch.tensor(x_train['input_ids']),
    torch.tensor(x_train['attention_mask']),
    torch.tensor(train.label.tolist()),
)

val_seq, val_mask, val_y = (
    torch.tensor(x_val['input_ids']),
    torch.tensor(x_val['attention_mask']),
    torch.tensor(val.label.tolist()),
)

test_seq, test_mask, test_y = (
    torch.tensor(x_test['input_ids']),
    torch.tensor(x_test['attention_mask']),
    torch.tensor(test.label.tolist()),
)

In [14]:
# Batch size used for the training and validation loaders
batch_size = 32

# Create a dataloader for each set

# TensorDataset: Creates a PyTorch dataset object to load data from
train_data = TensorDataset(train_seq, train_mask, train_y)
# RandomSampler: specify the sequence of indices/keys used in data loading
# (training batches are drawn in random order)
train_sampler = RandomSampler(train_data)
# DataLoader: a Python iterable over a dataset
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation data is read in its stored order
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# The test loader yields one sentence per batch; the inference loop below relies on
# this when it reads element [0] of each batch and stores attentions per sentence
test_data = TensorDataset(test_seq, test_mask, test_y)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=1)

## Build and train the model

In [15]:
# Define which BERT model to use
# We will use BERT base pre-trained on uncased text
model_name = "bert-base-uncased"
# The BertForSequenceClassification class creates a model with BERT and a classifier on top
# The classifier is a linear layer with two outputs (two is the default, if you have more labels change the config)
# It uses the CrossEntropyLoss from PyTorch
# from_pretrained() is used to load pre-trained weights
# output_attentions=True makes the forward pass also return the attention weights,
# which the inference loop reads from `outputs.attentions`
model = BertForSequenceClassification.from_pretrained(model_name, output_attentions=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [16]:
# Training method
def training():
  """Run one training epoch over `train_dataloader` and return its mean batch loss.

  Uses the notebook globals `model`, `optimizer`, `train_dataloader` and `device`.

  Returns:
    The average of the per-batch losses of this epoch as a tensor that is
    detached from the autograd graph.
  """
  # Set to train mode
  model.train()
  total_loss = 0
  # Iterate through the training batches
  for batch in tqdm(train_dataloader, desc="Iteration"):
    # Push the batch to the active device
    sent_id, mask, labels = [r.to(device) for r in batch]
    # Clear gradients left over from the previous step
    model.zero_grad()
    # Forward pass; passing the labels makes the model return the loss as well
    outputs = model(sent_id, attention_mask=mask, labels=labels)
    loss = outputs.loss
    # Accumulate a detached copy: adding `loss` itself would chain every batch's
    # autograd graph into the running total and keep all of them in memory
    total_loss = total_loss + loss.detach()
    # Backward pass to calculate the gradients
    loss.backward()
    # Update parameters
    optimizer.step()
  # Compute the training loss of the epoch
  epoch_loss = total_loss / len(train_dataloader)

  return epoch_loss

In [17]:
# Evaluation method
def evaluate():
  """Score the current `model` on `val_dataloader`.

  Uses the notebook globals `model`, `val_dataloader` and `device`.

  Returns:
    A tuple (mean loss over the validation batches, gold labels, predicted labels).
  """
  print("\nEvaluating...")
  # Set the model to eval mode
  model.eval()
  running_loss = 0
  gold, predicted = [], []
  # Walk through the validation batches
  for batch in val_dataloader:
    # Move the three batch tensors to the active device
    sent_id, mask, labels = [t.to(device) for t in batch]
    # Keep the gold labels for scoring later
    gold.extend(labels.detach().cpu().numpy())
    # No gradients are needed while evaluating
    with torch.no_grad():
      outputs = model(sent_id, attention_mask=mask, labels=labels)
      running_loss = running_loss + outputs.loss
      logits = outputs.logits.detach().cpu().numpy()
      # Row-wise class probabilities; the most probable class is the prediction
      predicted.extend(np.argmax(softmax(logits, axis=1), axis=1))
  # Average the accumulated loss over the number of batches
  return running_loss / len(val_dataloader), gold, predicted

In [18]:
# Push model to the active device
model = model.to(device)
# Define the optimizer and the learning rate
optimizer = AdamW(model.parameters(), lr = 2e-5)

best_val_loss = float('inf')
best_epoch = -1
train_losses=[]
val_losses=[]
epochs = 5
# Define the number of epochs to wait for early stopping
patience = 3

# Train the model
for epoch in range(epochs):
  print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
  # Convert the losses to plain floats straight away: keeping the returned tensors in
  # the history lists would hold on to device memory (and, for the training loss, to
  # whatever autograd graph is attached to it) and makes the lists awkward to plot
  train_loss = float(training())
  val_loss, val_targets, val_predictions = evaluate()
  val_loss = float(val_loss)

  train_losses.append(train_loss)
  val_losses.append(val_loss)

  print("\nTraining Loss:", train_loss)
  print("Validation Loss:", val_loss)
  # Calculate the validation F1 score for the current epoch
  f1 = f1_score(val_targets, val_predictions, average="binary")
  print("F1 score:", round(f1, 3))

  # Save the model with the best validation loss
  if val_loss < best_val_loss:
    best_val_loss = val_loss
    best_epoch = epoch
    torch.save(model.state_dict(), 'saved_weights.pt')

  # Early stopping: give up once `patience` epochs have passed without a new best loss
  if ((epoch - best_epoch) >= patience):
    print("No improvement in", patience, "epochs. Stopped training.")
    break



 Epoch 1 / 5


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=17.0, style=ProgressStyle(description_wid…



Evaluating...

Training Loss: tensor(0.6608, device='cuda:0', grad_fn=<DivBackward0>)
Validation Loss: tensor(0.6224, device='cuda:0')
F1 score: 0.714

 Epoch 2 / 5


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=17.0, style=ProgressStyle(description_wid…



Evaluating...

Training Loss: tensor(0.4780, device='cuda:0', grad_fn=<DivBackward0>)
Validation Loss: tensor(0.4587, device='cuda:0')
F1 score: 0.702

 Epoch 3 / 5


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=17.0, style=ProgressStyle(description_wid…



Evaluating...

Training Loss: tensor(0.3642, device='cuda:0', grad_fn=<DivBackward0>)
Validation Loss: tensor(0.4626, device='cuda:0')
F1 score: 0.653

 Epoch 4 / 5


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=17.0, style=ProgressStyle(description_wid…



Evaluating...

Training Loss: tensor(0.2107, device='cuda:0', grad_fn=<DivBackward0>)
Validation Loss: tensor(0.4229, device='cuda:0')
F1 score: 0.755

 Epoch 5 / 5


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=17.0, style=ProgressStyle(description_wid…



Evaluating...

Training Loss: tensor(0.1285, device='cuda:0', grad_fn=<DivBackward0>)
Validation Loss: tensor(0.5069, device='cuda:0')
F1 score: 0.694


In [19]:
# Save checkpoint to your drive
# Zip
#!zip saved_weights.zip  saved_weights.pt
# Mount
#from google.colab import drive
#drive.mount('/content/gdrive')
# Copy to your drive folder
#!cp -r saved_weights.zip /content/gdrive/MyDrive/

## Inference

#### Load the saved checkpoint

In [20]:
# Use this code to download the model saved in your drive 
# Add the id from the shareable link of the file 
# !gdown --id add_shareable_link_id
# !unzip saved_weights.zip

In [21]:
# Create the model
# (same architecture and options as the model that was trained above)
model_e = BertForSequenceClassification.from_pretrained("bert-base-uncased", output_attentions=True)
# Load the fine-tuned weights written by the training loop
# (map_location="cpu" loads the tensors onto the CPU first)
checkpoint = torch.load("saved_weights.pt", map_location="cpu")
# Add them to the model
model_e.load_state_dict(checkpoint)
# Move the restored model to the active device
model_e = model_e.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

#### Get predictions for test

In [22]:
# Predict for the test set and save the results
model_e.eval()
test_predictions = []
test_targets = []
test_attentions = []
test_inputs = []

# test_dataloader was built with batch_size=1, so each batch holds a single sentence
for batch in test_dataloader:
  batch = [t.to(device) for t in batch]
  sent_id, mask, labels = batch
  # Get gold labels
  test_targets.extend(labels.detach().cpu().numpy())
  # Get input words
  # ([0] picks the only sentence of the batch; all ids are converted, no position is filtered out)
  test_inputs.append(bert_tokenizer.convert_ids_to_tokens(sent_id.detach().cpu().numpy()[0]))
  with torch.no_grad():
    # Get predictions
    outputs = model_e(sent_id, attention_mask=mask)
    # Apply softmax to the outputs
    output_probs = softmax(outputs.logits.detach().cpu().numpy(), axis=1)
    # Get the class with the highest probability as the predicted label
    test_predictions.extend(np.argmax(output_probs, axis=1))
    # Get attention weights
    # Attention weights from all layers are returned in a tuple
    # The weights from each layer are in a tensor with shape (batch_size, attention_heads, max_len, max_len)
    # NOTE(review): the tensors are stored as returned, i.e. still on `device`
    test_attentions.append(outputs.attentions)

#### Evaluate

In [23]:
# Test-set metrics computed from the gold labels and the predicted labels
print("F1:", f1_score(test_targets, test_predictions, average="binary"))
print("ACC:", accuracy_score(test_targets, test_predictions))
print("RECALL:", recall_score(test_targets, test_predictions))
# Precision must come from precision_score; this line previously called
# average_precision_score and therefore just repeated the AUPR value below
print("PRECISION:", precision_score(test_targets, test_predictions))
# NOTE(review): AUPR and AUC are computed from hard 0/1 predictions here; these
# ranking metrics are normally fed the positive-class probability instead
print("AUPR:", average_precision_score(test_targets, test_predictions))
print("AUC:", roc_auc_score(test_targets, test_predictions))

F1: 0.7945205479452055
ACC: 0.7692307692307693
RECALL: 0.8055555555555556
PRECISION: 0.7390736890736891
AUPR: 0.7390736890736891
AUC: 0.7648467432950192


In [None]:
#max pooling to generate a fixed sized sentence embedding


#Max Pooling - Take the max value over time for every dimension
def max_pooling(model_output, attention_mask):
    """Max-pool token embeddings over the sequence axis while ignoring padding.

    Args:
        model_output: Indexable model output whose first element holds the token
            embeddings, expected with shape (batch, seq_len, hidden).
        attention_mask: Tensor of shape (batch, seq_len); 0 marks padding positions.

    Returns:
        Tensor of shape (batch, hidden) holding, for every hidden dimension, the
        maximum over the non-padding tokens. Neither input tensor is modified.

    If the first element of `model_output` is not rank 3 there is no sequence axis
    the mask could apply to; a warning is issued and the maximum over dim 1 is
    returned unmasked.
    """
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    if token_embeddings.dim() != 3:
        import warnings
        warnings.warn(
            "max_pooling expected token embeddings of shape (batch, seq_len, hidden) "
            "but got shape %s; the attention mask is ignored" % (tuple(token_embeddings.shape),)
        )
        return torch.max(token_embeddings, 1)[0]
    # Broadcast the (batch, seq_len) mask across the hidden axis. The previous
    # resize_() call reinterpreted the mask's storage in place instead of broadcasting it.
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size())
    # Out-of-place fill so the caller's model output stays untouched;
    # padding positions get a large negative value so they never win the max
    masked_embeddings = token_embeddings.masked_fill(input_mask_expanded == 0, -1e9)
    max_over_time = torch.max(masked_embeddings, 1)[0]
    return max_over_time

def avg_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis while ignoring padding.

    Args:
        model_output: Indexable model output whose first element holds the token
            embeddings, expected with shape (batch, seq_len, hidden).
        attention_mask: Tensor of shape (batch, seq_len); 0 marks padding positions.

    Returns:
        Tensor of shape (batch, hidden) with the mean over the non-padding tokens
        of each input (zeros for an input that has no real token). Neither input
        tensor is modified.

    If the first element of `model_output` is not rank 3 there is no sequence axis
    the mask could apply to; a warning is issued and the plain mean over dim 1 is
    returned.
    """
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    if token_embeddings.dim() != 3:
        import warnings
        warnings.warn(
            "avg_pooling expected token embeddings of shape (batch, seq_len, hidden) "
            "but got shape %s; the attention mask is ignored" % (tuple(token_embeddings.shape),)
        )
        return torch.mean(token_embeddings, 1)
    # Broadcast the (batch, seq_len) mask across the hidden axis, in the embeddings' dtype
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).to(token_embeddings.dtype)
    )
    # Zero out padding and divide by the number of real tokens. Filling padding with
    # -1e9 (as done for max pooling) would dominate the average, and indexing the
    # result of torch.mean with [0] would keep only the first input of the batch.
    summed = (token_embeddings * input_mask_expanded).sum(dim=1)
    token_counts = input_mask_expanded.sum(dim=1).clamp(min=1)  # avoid division by zero
    avg_over_time = summed / token_counts
    return avg_over_time


#Sentences we want sentence embeddings for
# NOTE(review): each list element is passed to the tokenizer as a separate input, so
# this is six one-word inputs rather than one six-word sentence -- confirm the intent.
sentences = ['The', 'stars', 'gravitate', 'towards', 'each', 'other.']

#Tokenize sentences
encoded_input = bert_tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')
encoded_input = encoded_input.to(device)

#Compute token embeddings
# NOTE(review): model_e is a BertForSequenceClassification; its first output is presumably
# the classification logits rather than per-token embeddings (the printed result below has
# a single value per input). Confirm before treating the result as a sentence embedding.
with torch.no_grad():
    model_output = model_e(**encoded_input)

#Perform pooling. In this case, max pooling
sentence_embeddings = max_pooling(model_output, encoded_input['attention_mask'])
#avg_sentence_embeddings = avg_pooling(model_output, encoded_input['attention_mask'])


print("Sentence embeddings:")
print(sentence_embeddings)

Sentence embeddings:
tensor([ 4.2417e-01, -1.9496e-01, -9.3668e-01,  8.9708e-01, -1.0000e+09,
         1.2806e+00], device='cuda:0')


In [None]:
# numpy implementation of argmax
# NOTE(review): numpy is already imported as `np` at the top of the notebook
from numpy import argmax

# Bring the tensor back to the CPU before handing it to numpy
sentence_embeddings = sentence_embeddings.cpu()

# get argmax
# (position of the largest entry of the pooled vector printed in the previous cell)
result = argmax(sentence_embeddings)
print('arg max of %s: %d' % (sentence_embeddings, result))

arg max of tensor([ 4.2417e-01, -1.9496e-01, -9.3668e-01,  8.9708e-01, -1.0000e+09,
         1.2806e+00]): 5


In [None]:
#CLS token of each input represents the sentence embedding
# NOTE(review): this cell calls `model` (the object used in the training loop) while the
# other inference cells use `model_e` (restored from saved_weights.pt) -- confirm which
# one is intended.


#Sentences we want sentence embeddings for
# Each list element is passed to the tokenizer as a separate input (six one-word inputs)
sentences = ['The', 'stars', 'gravitate', 'towards', 'each', 'other']


#Tokenize sentences
encoded_input = bert_tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')
encoded_input = encoded_input.to(device)

#Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)
    #model_output = model_output.to(device)

# NOTE(review): for a sequence-classification model the first output is presumably the
# logits; if so, [:,0] picks the first logit of every input rather than a [CLS] vector
# (the printed result has a single value per input). Confirm.
sentence_embeddings = model_output[0][:,0] #Take the first token ([CLS]) from each sentence

print("Sentence embeddings:")
print(sentence_embeddings)

Sentence embeddings:
tensor([ 0.3607, -0.1950,  0.9743, -0.1271, -0.1818,  0.1982], device='cuda:0')


In [None]:
# numpy implementation of argmax
# NOTE(review): same import and steps as the earlier argmax cell; numpy is already available as `np`
from numpy import argmax

# Bring the tensor back to the CPU before handing it to numpy
sentence_embeddings = sentence_embeddings.cpu()

# get argmax
# (position of the largest of the values printed in the previous cell)
result = argmax(sentence_embeddings)
print('arg max of %s: %d' % (sentence_embeddings, result))

arg max of tensor([ 0.3607, -0.1950,  0.9743, -0.1271, -0.1818,  0.1982]): 2


## Attention analysis

In [None]:
# Get attention heatmaps
import matplotlib
from IPython.core.display import display, HTML
def colorize(words, color_array):
    """Build an HTML string that shows each word on a red background scaled by its weight.

    Args:
        words: Iterable of token strings.
        color_array: Iterable of weights paired with `words` by position; each
            weight is passed to the `Reds` colormap to obtain the background colour.

    Returns:
        A string of concatenated ``<span>`` elements, one per (word, weight)
        pair; the empty string when there are no pairs.
    """
    import html  # local import: keeps this cell independent of the notebook's import cell

    cmap = matplotlib.cm.Reds
    # Well-formed markup: no stray ';' between attributes
    template = '<span class="barcode" style="color: black; background-color: {}">{}</span>'
    spans = []
    for word, color in zip(words, color_array):
        hex_color = matplotlib.colors.rgb2hex(cmap(color)[:3])
        # Escape the token so characters such as '<' or '&' are displayed literally,
        # and terminate the non-breaking-space entities properly
        spans.append(template.format(hex_color, '&nbsp;' + html.escape(word) + '&nbsp;'))
    return ''.join(spans)

In [None]:
# Wrap the collected attention outputs (one entry per test sentence) in a NumPy array for inspection
test_attentions1 = np.array(test_attentions)

In [None]:
# Rows correspond to the stored test sentences; presumably the second axis is the layer
test_attentions1.shape

(65, 12)

#### What does the CLS token attend to?




In [None]:
# Max Pooling for all tokens in sentences and argmax

# Indices of the test sentences to inspect
sent_index = [0,1,2]

for s in sent_index:
  print("*" * 100)
  # Get the sentence's words
  tokens = test_inputs[s]
  # The pooled values depend only on the sentence's tokens, not on the layer or head,
  # so run the tokenizer, the model and the pooling once per sentence instead of once
  # per (layer, head) pair
  encoded_tokens = bert_tokenizer(tokens, truncation=True, padding=True, max_length=128, return_tensors='pt')
  encoded_tokens = encoded_tokens.to(device)
  with torch.no_grad():
    model_output1 = model_e(**encoded_tokens)
    tokens_embeddings = max_pooling(model_output1, encoded_tokens['attention_mask'])
  tokens_embeddings = tokens_embeddings.cpu()
  # np.argmax is the same function as numpy's `argmax`, without relying on an import from another cell
  arg = np.argmax(tokens_embeddings)
  # For each layer (iterate over what was stored instead of assuming 12 layers)...
  for l, layer_attention in enumerate(test_attentions[s]):
    print("\nLayer", l+1)
    # Drop the batch axis (the test loader uses batch_size=1)
    attention = np.squeeze(layer_attention.detach().cpu().numpy(), axis=0)
    # and for each head
    for h, head in enumerate(attention):
      print("Head", h+1)
      # Get the attention for the cls token (row 0 of the head's attention matrix)
      cls_attentions = head[0]
      display(HTML(colorize(tokens, cls_attentions)))
      print("Tokens embeddings:")
      print(tokens_embeddings)
      print('arg max of %s: %d' % (tokens_embeddings, arg))

****************************************************************************************************

Layer 1
Head 1


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 2


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 3


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 4


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 5


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 6


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 7


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 8


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 9


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 10


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 11


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 12


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 2
Head 1


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 2


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 3


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 4


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 5


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 6


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 7


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 8


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 9


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 10


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 11


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 12


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 3
Head 1


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 2


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 3


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 4


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 5


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 6


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 7


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 8


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 9


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 10


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 11


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 12


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 4
Head 1


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 2


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 3


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 4


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 5


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 6


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 7


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 8


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 9


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 10


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 11


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 12


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 5
Head 1


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 2


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 3


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 4


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 5


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 6


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 7


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 8


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 9


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 10


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 11


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 12


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 6
Head 1


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 2


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 3


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 4


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 5


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 6


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 7


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 8


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 9


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 10


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 11


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 12


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 7
Head 1


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 2


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 3


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 4


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 5


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 6


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 7


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 8


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 9


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 10


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 11


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 12


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 8
Head 1


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 2


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 3


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 4


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 5


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 6


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 7


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 8


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 9


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 10


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 11


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 12


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 9
Head 1


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 2


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 3


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 4


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 5


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 6


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 7


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 8


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 9


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 10


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 11


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 12


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 10
Head 1


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 2


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 3


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 4


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 5


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 6


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 7


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 8


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 9


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 10


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 11


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 12


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 11
Head 1


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 2


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 3


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 4


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 5


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 6


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 7


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 8


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 9


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 10


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 11


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 12


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 12
Head 1


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 2


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 3


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 4


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 5


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 6


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 7


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 8


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 9


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 10


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 11


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
Head 12


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
****************************************************************************************************

Layer 1
Head 1


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 2


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 3


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 4


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 5


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 6


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 7


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 8


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 9


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 10


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 11


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 12


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 2
Head 1


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 2


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 3


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 4


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 5


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 6


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 7


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 8


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 9


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 10


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 11


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 12


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 3
Head 1


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 2


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 3


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 4


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 5


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 6


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 7


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 8


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 9


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 10


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 11


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 12


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 4
Head 1


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 2


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 3


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 4


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 5


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 6


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 7


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 8


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 9


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 10


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 11


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 12


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 5
Head 1


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 2


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 3


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 4


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 5


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 6


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 7


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 8


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 9


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 10


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 11


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 12


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 6
Head 1


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 2


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 3


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 4


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 5


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 6


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 7


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 8


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 9


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 10


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 11


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 12


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 7
Head 1


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 2


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 3


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 4


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 5


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 6


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 7


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 8


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 9


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 10


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 11


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 12


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 8
Head 1


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 2


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 3


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 4


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 5


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 6


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 7


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 8


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 9


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 10


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 11


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 12


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 9
Head 1


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 2


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 3


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 4


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 5


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 6


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 7


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 8


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 9


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 10


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 11


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 12


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 10
Head 1


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 2


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 3


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 4


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 5


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 6


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 7


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 8


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 9


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 10


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 11


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 12


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 11
Head 1


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 2


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 3


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 4


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 5


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 6


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 7


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 8


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 9


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 10


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 11


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 12


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 12
Head 1


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 2


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 3


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 4


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 5


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 6


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 7


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 8


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 9


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 10


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 11


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
Head 12


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
****************************************************************************************************

Layer 1
Head 1


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 2


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 3


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 4


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 5


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 6


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 7


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 8


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 9


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 10


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 11


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 12


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 2
Head 1


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 2


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 3


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 4


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 5


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 6


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 7


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 8


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 9


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 10


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 11


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 12


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 3
Head 1


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 2


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 3


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 4


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 5


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 6


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 7


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 8


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 9


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 10


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 11


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 12


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 4
Head 1


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 2


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 3


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 4


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 5


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 6


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 7


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 8


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 9


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 10


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 11


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 12


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 5
Head 1


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 2


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 3


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 4


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 5


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 6


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 7


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 8


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 9


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 10


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 11


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 12


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 6
Head 1


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 2


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 3


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 4


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 5


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 6


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 7


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 8


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 9


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 10


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 11


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 12


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 7
Head 1


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 2


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 3


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 4


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 5


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 6


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 7


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 8


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 9


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 10


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 11


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 12


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 8
Head 1


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 2


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 3


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 4


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 5


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 6


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 7


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 8


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 9


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 10


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 11


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 12


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 9
Head 1


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 2


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 3


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 4


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 5


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 6


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 7


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 8


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 9


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 10


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 11


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 12


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 10
Head 1


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 2


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 3


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 4


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 5


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 6


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 7


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 8


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 9


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 10


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 11


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 12


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 11
Head 1


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 2


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 3


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 4


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 5


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 6


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 7


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 8


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 9


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 10


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 11


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 12


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 12
Head 1


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 2


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 3


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 4


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 5


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 6


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 7


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 8


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 9


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 10


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 11


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
Head 12


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7


In [None]:
# Max Pooling for all tokens in sentences and argmax
#
# For a handful of test sentences, max-pool the token embeddings once per
# sentence, report the arg max of the pooled values, and for every layer
# render the tokens coloured by that layer's [CLS] attention.
#
# Relies on names defined in earlier cells: test_inputs, test_attentions,
# bert_tokenizer, model_e, max_pooling, colorize, argmax, display, HTML, device.

# Inspect the first eleven test sentences (a fixed selection, not a random one)
sent_index = list(range(11))

for s in sent_index:
  print("*" * 100)
  # Get the sentence's words
  tokens = test_inputs[s]

  # The pooled embeddings depend only on the sentence, never on the layer
  # index, so tokenise and run the encoder once per sentence instead of
  # repeating the identical forward pass for each of the 12 layers.
  encoded_tokens = bert_tokenizer(tokens, truncation=True, padding=True, max_length=128, return_tensors='pt')
  encoded_tokens = encoded_tokens.to(device)
  with torch.no_grad():
    model_output1 = model_e(**encoded_tokens)
    tokens_embeddings = max_pooling(model_output1, encoded_tokens['attention_mask'])
  tokens_embeddings = tokens_embeddings.cpu()
  arg = argmax(tokens_embeddings)

  # For each layer...
  for l in range(12):
    print("\nLayer", l+1)
    attention = np.squeeze(test_attentions[s][l].detach().cpu().numpy(), axis=0)
    # Get the attention for the cls token.
    # This cell has no per-head loop, so the previous `head[0]` silently reused
    # a `head` left over from an earlier cell (stale attention from a different
    # sentence/layer). Use this layer's own attention instead, averaged over heads.
    # NOTE(review): assumes `attention` is (heads, seq, seq) after the squeeze and
    # that row 0 is the [CLS] query -- confirm; swap mean for max if preferred.
    cls_attentions = attention.mean(axis=0)[0]
    display(HTML(colorize(tokens, cls_attentions)))
    print("Tokens embeddings:")
    print(tokens_embeddings)
    print('arg max of %s: %d' % (tokens_embeddings, arg))

****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 2


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 3


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 4


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 5


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 6


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 7


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 8


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 9


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 10


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 11


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 12


Tokens embeddings:
tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.2473, 0.9523, 0.4242, 0.8031, 0.4781, 0.9407, 0.4242,
        0.8332, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 2


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 3


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 4


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 5


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 6


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 7


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 8


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 9


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 10


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 11


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 12


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  4.0820e-01, -4.7039e-01,
        -1.0000e+09,  8.9708e-01, -1.8184e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01,  9.0877e-01]): 9
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 2


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 3


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 4


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 5


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 6


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 7


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 8


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 9


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 10


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 11


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 12


Tokens embeddings:
tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7786, 0.7300, 1.0843, 0.7023, 0.9684, 0.8043, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.6960, 0.7497, 0.5880, 0.6526, 0.5257, 0.7654, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.7497, 0.5880, 0.6526, 0.5257, 0.7654, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 2


Tokens embeddings:
tensor([0.9895, 0.6960, 0.7497, 0.5880, 0.6526, 0.5257, 0.7654, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.7497, 0.5880, 0.6526, 0.5257, 0.7654, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 3


Tokens embeddings:
tensor([0.9895, 0.6960, 0.7497, 0.5880, 0.6526, 0.5257, 0.7654, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.7497, 0.5880, 0.6526, 0.5257, 0.7654, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 4


Tokens embeddings:
tensor([0.9895, 0.6960, 0.7497, 0.5880, 0.6526, 0.5257, 0.7654, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.7497, 0.5880, 0.6526, 0.5257, 0.7654, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 5


Tokens embeddings:
tensor([0.9895, 0.6960, 0.7497, 0.5880, 0.6526, 0.5257, 0.7654, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.7497, 0.5880, 0.6526, 0.5257, 0.7654, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 6


Tokens embeddings:
tensor([0.9895, 0.6960, 0.7497, 0.5880, 0.6526, 0.5257, 0.7654, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.7497, 0.5880, 0.6526, 0.5257, 0.7654, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 7


Tokens embeddings:
tensor([0.9895, 0.6960, 0.7497, 0.5880, 0.6526, 0.5257, 0.7654, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.7497, 0.5880, 0.6526, 0.5257, 0.7654, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 8


Tokens embeddings:
tensor([0.9895, 0.6960, 0.7497, 0.5880, 0.6526, 0.5257, 0.7654, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.7497, 0.5880, 0.6526, 0.5257, 0.7654, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 9


Tokens embeddings:
tensor([0.9895, 0.6960, 0.7497, 0.5880, 0.6526, 0.5257, 0.7654, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.7497, 0.5880, 0.6526, 0.5257, 0.7654, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 10


Tokens embeddings:
tensor([0.9895, 0.6960, 0.7497, 0.5880, 0.6526, 0.5257, 0.7654, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.7497, 0.5880, 0.6526, 0.5257, 0.7654, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 11


Tokens embeddings:
tensor([0.9895, 0.6960, 0.7497, 0.5880, 0.6526, 0.5257, 0.7654, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.7497, 0.5880, 0.6526, 0.5257, 0.7654, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 12


Tokens embeddings:
tensor([0.9895, 0.6960, 0.7497, 0.5880, 0.6526, 0.5257, 0.7654, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.7497, 0.5880, 0.6526, 0.5257, 0.7654, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.7453, 0.8685, 0.9340, 0.5461, 1.1698, 0.4242, 0.8888, 0.5257,
        0.4242, 0.7122, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7453, 0.8685, 0.9340, 0.5461, 1.1698, 0.4242, 0.8888, 0.5257,
        0.4242, 0.7122, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 2


Tokens embeddings:
tensor([0.9895, 0.7453, 0.8685, 0.9340, 0.5461, 1.1698, 0.4242, 0.8888, 0.5257,
        0.4242, 0.7122, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7453, 0.8685, 0.9340, 0.5461, 1.1698, 0.4242, 0.8888, 0.5257,
        0.4242, 0.7122, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 3


Tokens embeddings:
tensor([0.9895, 0.7453, 0.8685, 0.9340, 0.5461, 1.1698, 0.4242, 0.8888, 0.5257,
        0.4242, 0.7122, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7453, 0.8685, 0.9340, 0.5461, 1.1698, 0.4242, 0.8888, 0.5257,
        0.4242, 0.7122, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 4


Tokens embeddings:
tensor([0.9895, 0.7453, 0.8685, 0.9340, 0.5461, 1.1698, 0.4242, 0.8888, 0.5257,
        0.4242, 0.7122, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7453, 0.8685, 0.9340, 0.5461, 1.1698, 0.4242, 0.8888, 0.5257,
        0.4242, 0.7122, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 5


Tokens embeddings:
tensor([0.9895, 0.7453, 0.8685, 0.9340, 0.5461, 1.1698, 0.4242, 0.8888, 0.5257,
        0.4242, 0.7122, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7453, 0.8685, 0.9340, 0.5461, 1.1698, 0.4242, 0.8888, 0.5257,
        0.4242, 0.7122, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 6


Tokens embeddings:
tensor([0.9895, 0.7453, 0.8685, 0.9340, 0.5461, 1.1698, 0.4242, 0.8888, 0.5257,
        0.4242, 0.7122, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7453, 0.8685, 0.9340, 0.5461, 1.1698, 0.4242, 0.8888, 0.5257,
        0.4242, 0.7122, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 7


Tokens embeddings:
tensor([0.9895, 0.7453, 0.8685, 0.9340, 0.5461, 1.1698, 0.4242, 0.8888, 0.5257,
        0.4242, 0.7122, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7453, 0.8685, 0.9340, 0.5461, 1.1698, 0.4242, 0.8888, 0.5257,
        0.4242, 0.7122, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 8


Tokens embeddings:
tensor([0.9895, 0.7453, 0.8685, 0.9340, 0.5461, 1.1698, 0.4242, 0.8888, 0.5257,
        0.4242, 0.7122, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7453, 0.8685, 0.9340, 0.5461, 1.1698, 0.4242, 0.8888, 0.5257,
        0.4242, 0.7122, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 9


Tokens embeddings:
tensor([0.9895, 0.7453, 0.8685, 0.9340, 0.5461, 1.1698, 0.4242, 0.8888, 0.5257,
        0.4242, 0.7122, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7453, 0.8685, 0.9340, 0.5461, 1.1698, 0.4242, 0.8888, 0.5257,
        0.4242, 0.7122, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 10


Tokens embeddings:
tensor([0.9895, 0.7453, 0.8685, 0.9340, 0.5461, 1.1698, 0.4242, 0.8888, 0.5257,
        0.4242, 0.7122, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7453, 0.8685, 0.9340, 0.5461, 1.1698, 0.4242, 0.8888, 0.5257,
        0.4242, 0.7122, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 11


Tokens embeddings:
tensor([0.9895, 0.7453, 0.8685, 0.9340, 0.5461, 1.1698, 0.4242, 0.8888, 0.5257,
        0.4242, 0.7122, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7453, 0.8685, 0.9340, 0.5461, 1.1698, 0.4242, 0.8888, 0.5257,
        0.4242, 0.7122, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 12


Tokens embeddings:
tensor([0.9895, 0.7453, 0.8685, 0.9340, 0.5461, 1.1698, 0.4242, 0.8888, 0.5257,
        0.4242, 0.7122, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7453, 0.8685, 0.9340, 0.5461, 1.1698, 0.4242, 0.8888, 0.5257,
        0.4242, 0.7122, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.5281, 1.0524, 0.4242, 0.7774, 0.9190, 0.4061, 1.1698, 1.1527,
        0.4242, 0.5712, 0.5257, 0.4242, 0.4901, 1.1193, 0.9143, 0.9088])
arg max of tensor([0.9895, 0.5281, 1.0524, 0.4242, 0.7774, 0.9190, 0.4061, 1.1698, 1.1527,
        0.4242, 0.5712, 0.5257, 0.4242, 0.4901, 1.1193, 0.9143, 0.9088]): 7

Layer 2


Tokens embeddings:
tensor([0.9895, 0.5281, 1.0524, 0.4242, 0.7774, 0.9190, 0.4061, 1.1698, 1.1527,
        0.4242, 0.5712, 0.5257, 0.4242, 0.4901, 1.1193, 0.9143, 0.9088])
arg max of tensor([0.9895, 0.5281, 1.0524, 0.4242, 0.7774, 0.9190, 0.4061, 1.1698, 1.1527,
        0.4242, 0.5712, 0.5257, 0.4242, 0.4901, 1.1193, 0.9143, 0.9088]): 7

Layer 3


Tokens embeddings:
tensor([0.9895, 0.5281, 1.0524, 0.4242, 0.7774, 0.9190, 0.4061, 1.1698, 1.1527,
        0.4242, 0.5712, 0.5257, 0.4242, 0.4901, 1.1193, 0.9143, 0.9088])
arg max of tensor([0.9895, 0.5281, 1.0524, 0.4242, 0.7774, 0.9190, 0.4061, 1.1698, 1.1527,
        0.4242, 0.5712, 0.5257, 0.4242, 0.4901, 1.1193, 0.9143, 0.9088]): 7

Layer 4


Tokens embeddings:
tensor([0.9895, 0.5281, 1.0524, 0.4242, 0.7774, 0.9190, 0.4061, 1.1698, 1.1527,
        0.4242, 0.5712, 0.5257, 0.4242, 0.4901, 1.1193, 0.9143, 0.9088])
arg max of tensor([0.9895, 0.5281, 1.0524, 0.4242, 0.7774, 0.9190, 0.4061, 1.1698, 1.1527,
        0.4242, 0.5712, 0.5257, 0.4242, 0.4901, 1.1193, 0.9143, 0.9088]): 7

Layer 5


Tokens embeddings:
tensor([0.9895, 0.5281, 1.0524, 0.4242, 0.7774, 0.9190, 0.4061, 1.1698, 1.1527,
        0.4242, 0.5712, 0.5257, 0.4242, 0.4901, 1.1193, 0.9143, 0.9088])
arg max of tensor([0.9895, 0.5281, 1.0524, 0.4242, 0.7774, 0.9190, 0.4061, 1.1698, 1.1527,
        0.4242, 0.5712, 0.5257, 0.4242, 0.4901, 1.1193, 0.9143, 0.9088]): 7

Layer 6


Tokens embeddings:
tensor([0.9895, 0.5281, 1.0524, 0.4242, 0.7774, 0.9190, 0.4061, 1.1698, 1.1527,
        0.4242, 0.5712, 0.5257, 0.4242, 0.4901, 1.1193, 0.9143, 0.9088])
arg max of tensor([0.9895, 0.5281, 1.0524, 0.4242, 0.7774, 0.9190, 0.4061, 1.1698, 1.1527,
        0.4242, 0.5712, 0.5257, 0.4242, 0.4901, 1.1193, 0.9143, 0.9088]): 7

Layer 7


Tokens embeddings:
tensor([0.9895, 0.5281, 1.0524, 0.4242, 0.7774, 0.9190, 0.4061, 1.1698, 1.1527,
        0.4242, 0.5712, 0.5257, 0.4242, 0.4901, 1.1193, 0.9143, 0.9088])
arg max of tensor([0.9895, 0.5281, 1.0524, 0.4242, 0.7774, 0.9190, 0.4061, 1.1698, 1.1527,
        0.4242, 0.5712, 0.5257, 0.4242, 0.4901, 1.1193, 0.9143, 0.9088]): 7

Layer 8


Tokens embeddings:
tensor([0.9895, 0.5281, 1.0524, 0.4242, 0.7774, 0.9190, 0.4061, 1.1698, 1.1527,
        0.4242, 0.5712, 0.5257, 0.4242, 0.4901, 1.1193, 0.9143, 0.9088])
arg max of tensor([0.9895, 0.5281, 1.0524, 0.4242, 0.7774, 0.9190, 0.4061, 1.1698, 1.1527,
        0.4242, 0.5712, 0.5257, 0.4242, 0.4901, 1.1193, 0.9143, 0.9088]): 7

Layer 9


Tokens embeddings:
tensor([0.9895, 0.5281, 1.0524, 0.4242, 0.7774, 0.9190, 0.4061, 1.1698, 1.1527,
        0.4242, 0.5712, 0.5257, 0.4242, 0.4901, 1.1193, 0.9143, 0.9088])
arg max of tensor([0.9895, 0.5281, 1.0524, 0.4242, 0.7774, 0.9190, 0.4061, 1.1698, 1.1527,
        0.4242, 0.5712, 0.5257, 0.4242, 0.4901, 1.1193, 0.9143, 0.9088]): 7

Layer 10


Tokens embeddings:
tensor([0.9895, 0.5281, 1.0524, 0.4242, 0.7774, 0.9190, 0.4061, 1.1698, 1.1527,
        0.4242, 0.5712, 0.5257, 0.4242, 0.4901, 1.1193, 0.9143, 0.9088])
arg max of tensor([0.9895, 0.5281, 1.0524, 0.4242, 0.7774, 0.9190, 0.4061, 1.1698, 1.1527,
        0.4242, 0.5712, 0.5257, 0.4242, 0.4901, 1.1193, 0.9143, 0.9088]): 7

Layer 11


Tokens embeddings:
tensor([0.9895, 0.5281, 1.0524, 0.4242, 0.7774, 0.9190, 0.4061, 1.1698, 1.1527,
        0.4242, 0.5712, 0.5257, 0.4242, 0.4901, 1.1193, 0.9143, 0.9088])
arg max of tensor([0.9895, 0.5281, 1.0524, 0.4242, 0.7774, 0.9190, 0.4061, 1.1698, 1.1527,
        0.4242, 0.5712, 0.5257, 0.4242, 0.4901, 1.1193, 0.9143, 0.9088]): 7

Layer 12


Tokens embeddings:
tensor([0.9895, 0.5281, 1.0524, 0.4242, 0.7774, 0.9190, 0.4061, 1.1698, 1.1527,
        0.4242, 0.5712, 0.5257, 0.4242, 0.4901, 1.1193, 0.9143, 0.9088])
arg max of tensor([0.9895, 0.5281, 1.0524, 0.4242, 0.7774, 0.9190, 0.4061, 1.1698, 1.1527,
        0.4242, 0.5712, 0.5257, 0.4242, 0.4901, 1.1193, 0.9143, 0.9088]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4164, 0.7307, 0.6770, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4164, 0.7307, 0.6770, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 2


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4164, 0.7307, 0.6770, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4164, 0.7307, 0.6770, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 3


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4164, 0.7307, 0.6770, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4164, 0.7307, 0.6770, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 4


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4164, 0.7307, 0.6770, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4164, 0.7307, 0.6770, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 5


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4164, 0.7307, 0.6770, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4164, 0.7307, 0.6770, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 6


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4164, 0.7307, 0.6770, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4164, 0.7307, 0.6770, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 7


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4164, 0.7307, 0.6770, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4164, 0.7307, 0.6770, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 8


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4164, 0.7307, 0.6770, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4164, 0.7307, 0.6770, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 9


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4164, 0.7307, 0.6770, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4164, 0.7307, 0.6770, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 10


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4164, 0.7307, 0.6770, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4164, 0.7307, 0.6770, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 11


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4164, 0.7307, 0.6770, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4164, 0.7307, 0.6770, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 12


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4164, 0.7307, 0.6770, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4164, 0.7307, 0.6770, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.6823, 0.7380, 0.7786, 1.0187, 0.4761, 1.1173, 0.5142, 0.5991,
        0.8886, 1.1363, 1.0971, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.7380, 0.7786, 1.0187, 0.4761, 1.1173, 0.5142, 0.5991,
        0.8886, 1.1363, 1.0971, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 10

Layer 2


Tokens embeddings:
tensor([0.9895, 0.6823, 0.7380, 0.7786, 1.0187, 0.4761, 1.1173, 0.5142, 0.5991,
        0.8886, 1.1363, 1.0971, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.7380, 0.7786, 1.0187, 0.4761, 1.1173, 0.5142, 0.5991,
        0.8886, 1.1363, 1.0971, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 10

Layer 3


Tokens embeddings:
tensor([0.9895, 0.6823, 0.7380, 0.7786, 1.0187, 0.4761, 1.1173, 0.5142, 0.5991,
        0.8886, 1.1363, 1.0971, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.7380, 0.7786, 1.0187, 0.4761, 1.1173, 0.5142, 0.5991,
        0.8886, 1.1363, 1.0971, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 10

Layer 4


Tokens embeddings:
tensor([0.9895, 0.6823, 0.7380, 0.7786, 1.0187, 0.4761, 1.1173, 0.5142, 0.5991,
        0.8886, 1.1363, 1.0971, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.7380, 0.7786, 1.0187, 0.4761, 1.1173, 0.5142, 0.5991,
        0.8886, 1.1363, 1.0971, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 10

Layer 5


Tokens embeddings:
tensor([0.9895, 0.6823, 0.7380, 0.7786, 1.0187, 0.4761, 1.1173, 0.5142, 0.5991,
        0.8886, 1.1363, 1.0971, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.7380, 0.7786, 1.0187, 0.4761, 1.1173, 0.5142, 0.5991,
        0.8886, 1.1363, 1.0971, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 10

Layer 6


Tokens embeddings:
tensor([0.9895, 0.6823, 0.7380, 0.7786, 1.0187, 0.4761, 1.1173, 0.5142, 0.5991,
        0.8886, 1.1363, 1.0971, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.7380, 0.7786, 1.0187, 0.4761, 1.1173, 0.5142, 0.5991,
        0.8886, 1.1363, 1.0971, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 10

Layer 7


Tokens embeddings:
tensor([0.9895, 0.6823, 0.7380, 0.7786, 1.0187, 0.4761, 1.1173, 0.5142, 0.5991,
        0.8886, 1.1363, 1.0971, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.7380, 0.7786, 1.0187, 0.4761, 1.1173, 0.5142, 0.5991,
        0.8886, 1.1363, 1.0971, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 10

Layer 8


Tokens embeddings:
tensor([0.9895, 0.6823, 0.7380, 0.7786, 1.0187, 0.4761, 1.1173, 0.5142, 0.5991,
        0.8886, 1.1363, 1.0971, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.7380, 0.7786, 1.0187, 0.4761, 1.1173, 0.5142, 0.5991,
        0.8886, 1.1363, 1.0971, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 10

Layer 9


Tokens embeddings:
tensor([0.9895, 0.6823, 0.7380, 0.7786, 1.0187, 0.4761, 1.1173, 0.5142, 0.5991,
        0.8886, 1.1363, 1.0971, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.7380, 0.7786, 1.0187, 0.4761, 1.1173, 0.5142, 0.5991,
        0.8886, 1.1363, 1.0971, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 10

Layer 10


Tokens embeddings:
tensor([0.9895, 0.6823, 0.7380, 0.7786, 1.0187, 0.4761, 1.1173, 0.5142, 0.5991,
        0.8886, 1.1363, 1.0971, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.7380, 0.7786, 1.0187, 0.4761, 1.1173, 0.5142, 0.5991,
        0.8886, 1.1363, 1.0971, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 10

Layer 11


Tokens embeddings:
tensor([0.9895, 0.6823, 0.7380, 0.7786, 1.0187, 0.4761, 1.1173, 0.5142, 0.5991,
        0.8886, 1.1363, 1.0971, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.7380, 0.7786, 1.0187, 0.4761, 1.1173, 0.5142, 0.5991,
        0.8886, 1.1363, 1.0971, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 10

Layer 12


Tokens embeddings:
tensor([0.9895, 0.6823, 0.7380, 0.7786, 1.0187, 0.4761, 1.1173, 0.5142, 0.5991,
        0.8886, 1.1363, 1.0971, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.7380, 0.7786, 1.0187, 0.4761, 1.1173, 0.5142, 0.5991,
        0.8886, 1.1363, 1.0971, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 10
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  5.9875e-01, -7.7087e-01,
        -1.0000e+09,  8.7854e-01, -1.7416e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  5.9875e-01, -7.7087e-01,
        -1.0000e+09,  8.7854e-01, -1.7416e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 9

Layer 2


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  5.9875e-01, -7.7087e-01,
        -1.0000e+09,  8.7854e-01, -1.7416e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  5.9875e-01, -7.7087e-01,
        -1.0000e+09,  8.7854e-01, -1.7416e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 9

Layer 3


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  5.9875e-01, -7.7087e-01,
        -1.0000e+09,  8.7854e-01, -1.7416e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  5.9875e-01, -7.7087e-01,
        -1.0000e+09,  8.7854e-01, -1.7416e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 9

Layer 4


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  5.9875e-01, -7.7087e-01,
        -1.0000e+09,  8.7854e-01, -1.7416e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  5.9875e-01, -7.7087e-01,
        -1.0000e+09,  8.7854e-01, -1.7416e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 9

Layer 5


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  5.9875e-01, -7.7087e-01,
        -1.0000e+09,  8.7854e-01, -1.7416e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  5.9875e-01, -7.7087e-01,
        -1.0000e+09,  8.7854e-01, -1.7416e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 9

Layer 6


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  5.9875e-01, -7.7087e-01,
        -1.0000e+09,  8.7854e-01, -1.7416e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  5.9875e-01, -7.7087e-01,
        -1.0000e+09,  8.7854e-01, -1.7416e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 9

Layer 7


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  5.9875e-01, -7.7087e-01,
        -1.0000e+09,  8.7854e-01, -1.7416e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  5.9875e-01, -7.7087e-01,
        -1.0000e+09,  8.7854e-01, -1.7416e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 9

Layer 8


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  5.9875e-01, -7.7087e-01,
        -1.0000e+09,  8.7854e-01, -1.7416e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  5.9875e-01, -7.7087e-01,
        -1.0000e+09,  8.7854e-01, -1.7416e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 9

Layer 9


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  5.9875e-01, -7.7087e-01,
        -1.0000e+09,  8.7854e-01, -1.7416e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  5.9875e-01, -7.7087e-01,
        -1.0000e+09,  8.7854e-01, -1.7416e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 9

Layer 10


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  5.9875e-01, -7.7087e-01,
        -1.0000e+09,  8.7854e-01, -1.7416e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  5.9875e-01, -7.7087e-01,
        -1.0000e+09,  8.7854e-01, -1.7416e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 9

Layer 11


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  5.9875e-01, -7.7087e-01,
        -1.0000e+09,  8.7854e-01, -1.7416e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  5.9875e-01, -7.7087e-01,
        -1.0000e+09,  8.7854e-01, -1.7416e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 9

Layer 12


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  5.9875e-01, -7.7087e-01,
        -1.0000e+09,  8.7854e-01, -1.7416e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  5.9875e-01, -7.7087e-01,
        -1.0000e+09,  8.7854e-01, -1.7416e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 9
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0872, 0.8572, 1.1698, 1.1841, 0.4242, 0.7143, 0.4259,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0872, 0.8572, 1.1698, 1.1841, 0.4242, 0.7143, 0.4259,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 2


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0872, 0.8572, 1.1698, 1.1841, 0.4242, 0.7143, 0.4259,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0872, 0.8572, 1.1698, 1.1841, 0.4242, 0.7143, 0.4259,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 3


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0872, 0.8572, 1.1698, 1.1841, 0.4242, 0.7143, 0.4259,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0872, 0.8572, 1.1698, 1.1841, 0.4242, 0.7143, 0.4259,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 4


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0872, 0.8572, 1.1698, 1.1841, 0.4242, 0.7143, 0.4259,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0872, 0.8572, 1.1698, 1.1841, 0.4242, 0.7143, 0.4259,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 5


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0872, 0.8572, 1.1698, 1.1841, 0.4242, 0.7143, 0.4259,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0872, 0.8572, 1.1698, 1.1841, 0.4242, 0.7143, 0.4259,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 6


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0872, 0.8572, 1.1698, 1.1841, 0.4242, 0.7143, 0.4259,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0872, 0.8572, 1.1698, 1.1841, 0.4242, 0.7143, 0.4259,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 7


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0872, 0.8572, 1.1698, 1.1841, 0.4242, 0.7143, 0.4259,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0872, 0.8572, 1.1698, 1.1841, 0.4242, 0.7143, 0.4259,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 8


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0872, 0.8572, 1.1698, 1.1841, 0.4242, 0.7143, 0.4259,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0872, 0.8572, 1.1698, 1.1841, 0.4242, 0.7143, 0.4259,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 9


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0872, 0.8572, 1.1698, 1.1841, 0.4242, 0.7143, 0.4259,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0872, 0.8572, 1.1698, 1.1841, 0.4242, 0.7143, 0.4259,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 10


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0872, 0.8572, 1.1698, 1.1841, 0.4242, 0.7143, 0.4259,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0872, 0.8572, 1.1698, 1.1841, 0.4242, 0.7143, 0.4259,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 11


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0872, 0.8572, 1.1698, 1.1841, 0.4242, 0.7143, 0.4259,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0872, 0.8572, 1.1698, 1.1841, 0.4242, 0.7143, 0.4259,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 12


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0872, 0.8572, 1.1698, 1.1841, 0.4242, 0.7143, 0.4259,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0872, 0.8572, 1.1698, 1.1841, 0.4242, 0.7143, 0.4259,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 9.8949e-01,  5.0121e-01,  7.3074e-01,  7.8681e-01, -1.0000e+09,
         5.1179e-01,  3.6069e-01,  4.5132e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  5.0121e-01,  7.3074e-01,  7.8681e-01, -1.0000e+09,
         5.1179e-01,  3.6069e-01,  4.5132e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 8

Layer 2


Tokens embeddings:
tensor([ 9.8949e-01,  5.0121e-01,  7.3074e-01,  7.8681e-01, -1.0000e+09,
         5.1179e-01,  3.6069e-01,  4.5132e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  5.0121e-01,  7.3074e-01,  7.8681e-01, -1.0000e+09,
         5.1179e-01,  3.6069e-01,  4.5132e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 8

Layer 3


Tokens embeddings:
tensor([ 9.8949e-01,  5.0121e-01,  7.3074e-01,  7.8681e-01, -1.0000e+09,
         5.1179e-01,  3.6069e-01,  4.5132e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  5.0121e-01,  7.3074e-01,  7.8681e-01, -1.0000e+09,
         5.1179e-01,  3.6069e-01,  4.5132e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 8

Layer 4


Tokens embeddings:
tensor([ 9.8949e-01,  5.0121e-01,  7.3074e-01,  7.8681e-01, -1.0000e+09,
         5.1179e-01,  3.6069e-01,  4.5132e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  5.0121e-01,  7.3074e-01,  7.8681e-01, -1.0000e+09,
         5.1179e-01,  3.6069e-01,  4.5132e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 8

Layer 5


Tokens embeddings:
tensor([ 9.8949e-01,  5.0121e-01,  7.3074e-01,  7.8681e-01, -1.0000e+09,
         5.1179e-01,  3.6069e-01,  4.5132e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  5.0121e-01,  7.3074e-01,  7.8681e-01, -1.0000e+09,
         5.1179e-01,  3.6069e-01,  4.5132e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 8

Layer 6


Tokens embeddings:
tensor([ 9.8949e-01,  5.0121e-01,  7.3074e-01,  7.8681e-01, -1.0000e+09,
         5.1179e-01,  3.6069e-01,  4.5132e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  5.0121e-01,  7.3074e-01,  7.8681e-01, -1.0000e+09,
         5.1179e-01,  3.6069e-01,  4.5132e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 8

Layer 7


Tokens embeddings:
tensor([ 9.8949e-01,  5.0121e-01,  7.3074e-01,  7.8681e-01, -1.0000e+09,
         5.1179e-01,  3.6069e-01,  4.5132e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  5.0121e-01,  7.3074e-01,  7.8681e-01, -1.0000e+09,
         5.1179e-01,  3.6069e-01,  4.5132e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 8

Layer 8


Tokens embeddings:
tensor([ 9.8949e-01,  5.0121e-01,  7.3074e-01,  7.8681e-01, -1.0000e+09,
         5.1179e-01,  3.6069e-01,  4.5132e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  5.0121e-01,  7.3074e-01,  7.8681e-01, -1.0000e+09,
         5.1179e-01,  3.6069e-01,  4.5132e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 8

Layer 9


Tokens embeddings:
tensor([ 9.8949e-01,  5.0121e-01,  7.3074e-01,  7.8681e-01, -1.0000e+09,
         5.1179e-01,  3.6069e-01,  4.5132e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  5.0121e-01,  7.3074e-01,  7.8681e-01, -1.0000e+09,
         5.1179e-01,  3.6069e-01,  4.5132e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 8

Layer 10


Tokens embeddings:
tensor([ 9.8949e-01,  5.0121e-01,  7.3074e-01,  7.8681e-01, -1.0000e+09,
         5.1179e-01,  3.6069e-01,  4.5132e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  5.0121e-01,  7.3074e-01,  7.8681e-01, -1.0000e+09,
         5.1179e-01,  3.6069e-01,  4.5132e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 8

Layer 11


Tokens embeddings:
tensor([ 9.8949e-01,  5.0121e-01,  7.3074e-01,  7.8681e-01, -1.0000e+09,
         5.1179e-01,  3.6069e-01,  4.5132e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  5.0121e-01,  7.3074e-01,  7.8681e-01, -1.0000e+09,
         5.1179e-01,  3.6069e-01,  4.5132e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 8

Layer 12


Tokens embeddings:
tensor([ 9.8949e-01,  5.0121e-01,  7.3074e-01,  7.8681e-01, -1.0000e+09,
         5.1179e-01,  3.6069e-01,  4.5132e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  5.0121e-01,  7.3074e-01,  7.8681e-01, -1.0000e+09,
         5.1179e-01,  3.6069e-01,  4.5132e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 8


In [None]:
# Visualise a fixed slice of test sentences (indices 11-20): for each one,
# print its max-pooled embeddings once, then colour its tokens by the
# attention read from row 0 of every layer's attention matrix.
sent_index = [11,12,13,14,15,16,17,18,19,20]
num_layers = 12  # number of attention layers inspected per sentence

for s in sent_index:
  print("*" * 100)
  # Get the sentence's words
  tokens = test_inputs[s]

  # The pooled embeddings depend only on the sentence, not on the layer being
  # visualised, so tokenise and run the model once per sentence instead of
  # repeating identical work (and identical printouts) for every layer.
  encoded_tokens = bert_tokenizer(tokens, truncation=True, padding=True, max_length=128, return_tensors='pt')
  encoded_tokens = encoded_tokens.to(device)
  with torch.no_grad():
    model_output1 = model_e(**encoded_tokens)
    tokens_embeddings = max_pooling(model_output1, encoded_tokens['attention_mask'])
  tokens_embeddings = tokens_embeddings.cpu()
  arg = argmax(tokens_embeddings)
  print("Tokens embeddings:")
  print(tokens_embeddings)
  print('arg max of %s: %d' % (tokens_embeddings, arg))

  # For each layer...
  for l in range(num_layers):
    print("\nLayer", l+1)
    attention = np.squeeze(test_attentions[s][l].detach().cpu().numpy(), axis=0)
    # Use this layer's own attention rather than a `head` variable left over
    # from a previously executed cell (which showed the same stale row for
    # every layer and sentence, and fails on a fresh kernel).
    # NOTE(review): assumes `attention` is (heads, seq, seq) after the squeeze
    # and that row 0 is the [CLS] position; heads are averaged to keep one
    # picture per layer — confirm this is the intended summary.
    cls_attentions = attention.mean(axis=0)[0]
    display(HTML(colorize(tokens, cls_attentions)))

****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.4242, 0.6076, 0.8439, 0.5118, 0.4728, 0.8954, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.6076, 0.8439, 0.5118, 0.4728, 0.8954, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 2


Tokens embeddings:
tensor([0.9895, 0.4242, 0.6076, 0.8439, 0.5118, 0.4728, 0.8954, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.6076, 0.8439, 0.5118, 0.4728, 0.8954, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 3


Tokens embeddings:
tensor([0.9895, 0.4242, 0.6076, 0.8439, 0.5118, 0.4728, 0.8954, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.6076, 0.8439, 0.5118, 0.4728, 0.8954, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 4


Tokens embeddings:
tensor([0.9895, 0.4242, 0.6076, 0.8439, 0.5118, 0.4728, 0.8954, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.6076, 0.8439, 0.5118, 0.4728, 0.8954, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 5


Tokens embeddings:
tensor([0.9895, 0.4242, 0.6076, 0.8439, 0.5118, 0.4728, 0.8954, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.6076, 0.8439, 0.5118, 0.4728, 0.8954, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 6


Tokens embeddings:
tensor([0.9895, 0.4242, 0.6076, 0.8439, 0.5118, 0.4728, 0.8954, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.6076, 0.8439, 0.5118, 0.4728, 0.8954, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 7


Tokens embeddings:
tensor([0.9895, 0.4242, 0.6076, 0.8439, 0.5118, 0.4728, 0.8954, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.6076, 0.8439, 0.5118, 0.4728, 0.8954, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 8


Tokens embeddings:
tensor([0.9895, 0.4242, 0.6076, 0.8439, 0.5118, 0.4728, 0.8954, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.6076, 0.8439, 0.5118, 0.4728, 0.8954, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 9


Tokens embeddings:
tensor([0.9895, 0.4242, 0.6076, 0.8439, 0.5118, 0.4728, 0.8954, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.6076, 0.8439, 0.5118, 0.4728, 0.8954, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 10


Tokens embeddings:
tensor([0.9895, 0.4242, 0.6076, 0.8439, 0.5118, 0.4728, 0.8954, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.6076, 0.8439, 0.5118, 0.4728, 0.8954, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 11


Tokens embeddings:
tensor([0.9895, 0.4242, 0.6076, 0.8439, 0.5118, 0.4728, 0.8954, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.6076, 0.8439, 0.5118, 0.4728, 0.8954, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 12


Tokens embeddings:
tensor([0.9895, 0.4242, 0.6076, 0.8439, 0.5118, 0.4728, 0.8954, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.6076, 0.8439, 0.5118, 0.4728, 0.8954, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.6750, 0.5550, 0.4496, 0.7141, 1.5588, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6750, 0.5550, 0.4496, 0.7141, 1.5588, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 2


Tokens embeddings:
tensor([0.9895, 0.6750, 0.5550, 0.4496, 0.7141, 1.5588, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6750, 0.5550, 0.4496, 0.7141, 1.5588, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 3


Tokens embeddings:
tensor([0.9895, 0.6750, 0.5550, 0.4496, 0.7141, 1.5588, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6750, 0.5550, 0.4496, 0.7141, 1.5588, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 4


Tokens embeddings:
tensor([0.9895, 0.6750, 0.5550, 0.4496, 0.7141, 1.5588, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6750, 0.5550, 0.4496, 0.7141, 1.5588, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 5


Tokens embeddings:
tensor([0.9895, 0.6750, 0.5550, 0.4496, 0.7141, 1.5588, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6750, 0.5550, 0.4496, 0.7141, 1.5588, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 6


Tokens embeddings:
tensor([0.9895, 0.6750, 0.5550, 0.4496, 0.7141, 1.5588, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6750, 0.5550, 0.4496, 0.7141, 1.5588, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 7


Tokens embeddings:
tensor([0.9895, 0.6750, 0.5550, 0.4496, 0.7141, 1.5588, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6750, 0.5550, 0.4496, 0.7141, 1.5588, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 8


Tokens embeddings:
tensor([0.9895, 0.6750, 0.5550, 0.4496, 0.7141, 1.5588, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6750, 0.5550, 0.4496, 0.7141, 1.5588, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 9


Tokens embeddings:
tensor([0.9895, 0.6750, 0.5550, 0.4496, 0.7141, 1.5588, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6750, 0.5550, 0.4496, 0.7141, 1.5588, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 10


Tokens embeddings:
tensor([0.9895, 0.6750, 0.5550, 0.4496, 0.7141, 1.5588, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6750, 0.5550, 0.4496, 0.7141, 1.5588, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 11


Tokens embeddings:
tensor([0.9895, 0.6750, 0.5550, 0.4496, 0.7141, 1.5588, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6750, 0.5550, 0.4496, 0.7141, 1.5588, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 12


Tokens embeddings:
tensor([0.9895, 0.6750, 0.5550, 0.4496, 0.7141, 1.5588, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6750, 0.5550, 0.4496, 0.7141, 1.5588, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  7.8571e-01, -3.7677e-01,
        -1.0000e+09,  7.7252e-01, -8.5529e-01, -1.0000e+09,  4.4874e-01,
        -6.5369e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  7.8571e-01, -3.7677e-01,
        -1.0000e+09,  7.7252e-01, -8.5529e-01, -1.0000e+09,  4.4874e-01,
        -6.5369e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 0

Layer 2


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  7.8571e-01, -3.7677e-01,
        -1.0000e+09,  7.7252e-01, -8.5529e-01, -1.0000e+09,  4.4874e-01,
        -6.5369e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  7.8571e-01, -3.7677e-01,
        -1.0000e+09,  7.7252e-01, -8.5529e-01, -1.0000e+09,  4.4874e-01,
        -6.5369e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 0

Layer 3


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  7.8571e-01, -3.7677e-01,
        -1.0000e+09,  7.7252e-01, -8.5529e-01, -1.0000e+09,  4.4874e-01,
        -6.5369e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  7.8571e-01, -3.7677e-01,
        -1.0000e+09,  7.7252e-01, -8.5529e-01, -1.0000e+09,  4.4874e-01,
        -6.5369e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 0

Layer 4


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  7.8571e-01, -3.7677e-01,
        -1.0000e+09,  7.7252e-01, -8.5529e-01, -1.0000e+09,  4.4874e-01,
        -6.5369e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  7.8571e-01, -3.7677e-01,
        -1.0000e+09,  7.7252e-01, -8.5529e-01, -1.0000e+09,  4.4874e-01,
        -6.5369e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 0

Layer 5


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  7.8571e-01, -3.7677e-01,
        -1.0000e+09,  7.7252e-01, -8.5529e-01, -1.0000e+09,  4.4874e-01,
        -6.5369e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  7.8571e-01, -3.7677e-01,
        -1.0000e+09,  7.7252e-01, -8.5529e-01, -1.0000e+09,  4.4874e-01,
        -6.5369e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 0

Layer 6


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  7.8571e-01, -3.7677e-01,
        -1.0000e+09,  7.7252e-01, -8.5529e-01, -1.0000e+09,  4.4874e-01,
        -6.5369e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  7.8571e-01, -3.7677e-01,
        -1.0000e+09,  7.7252e-01, -8.5529e-01, -1.0000e+09,  4.4874e-01,
        -6.5369e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 0

Layer 7


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  7.8571e-01, -3.7677e-01,
        -1.0000e+09,  7.7252e-01, -8.5529e-01, -1.0000e+09,  4.4874e-01,
        -6.5369e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  7.8571e-01, -3.7677e-01,
        -1.0000e+09,  7.7252e-01, -8.5529e-01, -1.0000e+09,  4.4874e-01,
        -6.5369e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 0

Layer 8


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  7.8571e-01, -3.7677e-01,
        -1.0000e+09,  7.7252e-01, -8.5529e-01, -1.0000e+09,  4.4874e-01,
        -6.5369e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  7.8571e-01, -3.7677e-01,
        -1.0000e+09,  7.7252e-01, -8.5529e-01, -1.0000e+09,  4.4874e-01,
        -6.5369e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 0

Layer 9


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  7.8571e-01, -3.7677e-01,
        -1.0000e+09,  7.7252e-01, -8.5529e-01, -1.0000e+09,  4.4874e-01,
        -6.5369e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  7.8571e-01, -3.7677e-01,
        -1.0000e+09,  7.7252e-01, -8.5529e-01, -1.0000e+09,  4.4874e-01,
        -6.5369e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 0

Layer 10


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  7.8571e-01, -3.7677e-01,
        -1.0000e+09,  7.7252e-01, -8.5529e-01, -1.0000e+09,  4.4874e-01,
        -6.5369e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  7.8571e-01, -3.7677e-01,
        -1.0000e+09,  7.7252e-01, -8.5529e-01, -1.0000e+09,  4.4874e-01,
        -6.5369e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 0

Layer 11


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  7.8571e-01, -3.7677e-01,
        -1.0000e+09,  7.7252e-01, -8.5529e-01, -1.0000e+09,  4.4874e-01,
        -6.5369e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  7.8571e-01, -3.7677e-01,
        -1.0000e+09,  7.7252e-01, -8.5529e-01, -1.0000e+09,  4.4874e-01,
        -6.5369e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 0

Layer 12


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  7.8571e-01, -3.7677e-01,
        -1.0000e+09,  7.7252e-01, -8.5529e-01, -1.0000e+09,  4.4874e-01,
        -6.5369e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01, -1.0000e+09,  7.8571e-01, -3.7677e-01,
        -1.0000e+09,  7.7252e-01, -8.5529e-01, -1.0000e+09,  4.4874e-01,
        -6.5369e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 0
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.4242, 0.8875, 0.5550, 0.3893, 0.4195, 1.3044, 0.4600, 0.6823,
        0.5168, 0.5257, 0.7382, 0.7850, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.8875, 0.5550, 0.3893, 0.4195, 1.3044, 0.4600, 0.6823,
        0.5168, 0.5257, 0.7382, 0.7850, 1.1193, 0.9143, 0.9088, 0.9088]): 6

Layer 2


Tokens embeddings:
tensor([0.9895, 0.4242, 0.8875, 0.5550, 0.3893, 0.4195, 1.3044, 0.4600, 0.6823,
        0.5168, 0.5257, 0.7382, 0.7850, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.8875, 0.5550, 0.3893, 0.4195, 1.3044, 0.4600, 0.6823,
        0.5168, 0.5257, 0.7382, 0.7850, 1.1193, 0.9143, 0.9088, 0.9088]): 6

Layer 3


Tokens embeddings:
tensor([0.9895, 0.4242, 0.8875, 0.5550, 0.3893, 0.4195, 1.3044, 0.4600, 0.6823,
        0.5168, 0.5257, 0.7382, 0.7850, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.8875, 0.5550, 0.3893, 0.4195, 1.3044, 0.4600, 0.6823,
        0.5168, 0.5257, 0.7382, 0.7850, 1.1193, 0.9143, 0.9088, 0.9088]): 6

Layer 4


Tokens embeddings:
tensor([0.9895, 0.4242, 0.8875, 0.5550, 0.3893, 0.4195, 1.3044, 0.4600, 0.6823,
        0.5168, 0.5257, 0.7382, 0.7850, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.8875, 0.5550, 0.3893, 0.4195, 1.3044, 0.4600, 0.6823,
        0.5168, 0.5257, 0.7382, 0.7850, 1.1193, 0.9143, 0.9088, 0.9088]): 6

Layer 5


Tokens embeddings:
tensor([0.9895, 0.4242, 0.8875, 0.5550, 0.3893, 0.4195, 1.3044, 0.4600, 0.6823,
        0.5168, 0.5257, 0.7382, 0.7850, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.8875, 0.5550, 0.3893, 0.4195, 1.3044, 0.4600, 0.6823,
        0.5168, 0.5257, 0.7382, 0.7850, 1.1193, 0.9143, 0.9088, 0.9088]): 6

Layer 6


Tokens embeddings:
tensor([0.9895, 0.4242, 0.8875, 0.5550, 0.3893, 0.4195, 1.3044, 0.4600, 0.6823,
        0.5168, 0.5257, 0.7382, 0.7850, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.8875, 0.5550, 0.3893, 0.4195, 1.3044, 0.4600, 0.6823,
        0.5168, 0.5257, 0.7382, 0.7850, 1.1193, 0.9143, 0.9088, 0.9088]): 6

Layer 7


Tokens embeddings:
tensor([0.9895, 0.4242, 0.8875, 0.5550, 0.3893, 0.4195, 1.3044, 0.4600, 0.6823,
        0.5168, 0.5257, 0.7382, 0.7850, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.8875, 0.5550, 0.3893, 0.4195, 1.3044, 0.4600, 0.6823,
        0.5168, 0.5257, 0.7382, 0.7850, 1.1193, 0.9143, 0.9088, 0.9088]): 6

Layer 8


Tokens embeddings:
tensor([0.9895, 0.4242, 0.8875, 0.5550, 0.3893, 0.4195, 1.3044, 0.4600, 0.6823,
        0.5168, 0.5257, 0.7382, 0.7850, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.8875, 0.5550, 0.3893, 0.4195, 1.3044, 0.4600, 0.6823,
        0.5168, 0.5257, 0.7382, 0.7850, 1.1193, 0.9143, 0.9088, 0.9088]): 6

Layer 9


Tokens embeddings:
tensor([0.9895, 0.4242, 0.8875, 0.5550, 0.3893, 0.4195, 1.3044, 0.4600, 0.6823,
        0.5168, 0.5257, 0.7382, 0.7850, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.8875, 0.5550, 0.3893, 0.4195, 1.3044, 0.4600, 0.6823,
        0.5168, 0.5257, 0.7382, 0.7850, 1.1193, 0.9143, 0.9088, 0.9088]): 6

Layer 10


Tokens embeddings:
tensor([0.9895, 0.4242, 0.8875, 0.5550, 0.3893, 0.4195, 1.3044, 0.4600, 0.6823,
        0.5168, 0.5257, 0.7382, 0.7850, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.8875, 0.5550, 0.3893, 0.4195, 1.3044, 0.4600, 0.6823,
        0.5168, 0.5257, 0.7382, 0.7850, 1.1193, 0.9143, 0.9088, 0.9088]): 6

Layer 11


Tokens embeddings:
tensor([0.9895, 0.4242, 0.8875, 0.5550, 0.3893, 0.4195, 1.3044, 0.4600, 0.6823,
        0.5168, 0.5257, 0.7382, 0.7850, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.8875, 0.5550, 0.3893, 0.4195, 1.3044, 0.4600, 0.6823,
        0.5168, 0.5257, 0.7382, 0.7850, 1.1193, 0.9143, 0.9088, 0.9088]): 6

Layer 12


Tokens embeddings:
tensor([0.9895, 0.4242, 0.8875, 0.5550, 0.3893, 0.4195, 1.3044, 0.4600, 0.6823,
        0.5168, 0.5257, 0.7382, 0.7850, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.8875, 0.5550, 0.3893, 0.4195, 1.3044, 0.4600, 0.6823,
        0.5168, 0.5257, 0.7382, 0.7850, 1.1193, 0.9143, 0.9088, 0.9088]): 6
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 9.8949e-01, -3.2309e-01,  8.7149e-01,  4.6920e-01, -1.0000e+09,
         7.8861e-01, -5.6361e-01,  9.9193e-01,  6.0661e-01, -1.0000e+09,
         9.1429e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -3.2309e-01,  8.7149e-01,  4.6920e-01, -1.0000e+09,
         7.8861e-01, -5.6361e-01,  9.9193e-01,  6.0661e-01, -1.0000e+09,
         9.1429e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 7

Layer 2


Tokens embeddings:
tensor([ 9.8949e-01, -3.2309e-01,  8.7149e-01,  4.6920e-01, -1.0000e+09,
         7.8861e-01, -5.6361e-01,  9.9193e-01,  6.0661e-01, -1.0000e+09,
         9.1429e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -3.2309e-01,  8.7149e-01,  4.6920e-01, -1.0000e+09,
         7.8861e-01, -5.6361e-01,  9.9193e-01,  6.0661e-01, -1.0000e+09,
         9.1429e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 7

Layer 3


Tokens embeddings:
tensor([ 9.8949e-01, -3.2309e-01,  8.7149e-01,  4.6920e-01, -1.0000e+09,
         7.8861e-01, -5.6361e-01,  9.9193e-01,  6.0661e-01, -1.0000e+09,
         9.1429e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -3.2309e-01,  8.7149e-01,  4.6920e-01, -1.0000e+09,
         7.8861e-01, -5.6361e-01,  9.9193e-01,  6.0661e-01, -1.0000e+09,
         9.1429e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 7

Layer 4


Tokens embeddings:
tensor([ 9.8949e-01, -3.2309e-01,  8.7149e-01,  4.6920e-01, -1.0000e+09,
         7.8861e-01, -5.6361e-01,  9.9193e-01,  6.0661e-01, -1.0000e+09,
         9.1429e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -3.2309e-01,  8.7149e-01,  4.6920e-01, -1.0000e+09,
         7.8861e-01, -5.6361e-01,  9.9193e-01,  6.0661e-01, -1.0000e+09,
         9.1429e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 7

Layer 5


Tokens embeddings:
tensor([ 9.8949e-01, -3.2309e-01,  8.7149e-01,  4.6920e-01, -1.0000e+09,
         7.8861e-01, -5.6361e-01,  9.9193e-01,  6.0661e-01, -1.0000e+09,
         9.1429e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -3.2309e-01,  8.7149e-01,  4.6920e-01, -1.0000e+09,
         7.8861e-01, -5.6361e-01,  9.9193e-01,  6.0661e-01, -1.0000e+09,
         9.1429e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 7

Layer 6


Tokens embeddings:
tensor([ 9.8949e-01, -3.2309e-01,  8.7149e-01,  4.6920e-01, -1.0000e+09,
         7.8861e-01, -5.6361e-01,  9.9193e-01,  6.0661e-01, -1.0000e+09,
         9.1429e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -3.2309e-01,  8.7149e-01,  4.6920e-01, -1.0000e+09,
         7.8861e-01, -5.6361e-01,  9.9193e-01,  6.0661e-01, -1.0000e+09,
         9.1429e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 7

Layer 7


Tokens embeddings:
tensor([ 9.8949e-01, -3.2309e-01,  8.7149e-01,  4.6920e-01, -1.0000e+09,
         7.8861e-01, -5.6361e-01,  9.9193e-01,  6.0661e-01, -1.0000e+09,
         9.1429e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -3.2309e-01,  8.7149e-01,  4.6920e-01, -1.0000e+09,
         7.8861e-01, -5.6361e-01,  9.9193e-01,  6.0661e-01, -1.0000e+09,
         9.1429e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 7

Layer 8


Tokens embeddings:
tensor([ 9.8949e-01, -3.2309e-01,  8.7149e-01,  4.6920e-01, -1.0000e+09,
         7.8861e-01, -5.6361e-01,  9.9193e-01,  6.0661e-01, -1.0000e+09,
         9.1429e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -3.2309e-01,  8.7149e-01,  4.6920e-01, -1.0000e+09,
         7.8861e-01, -5.6361e-01,  9.9193e-01,  6.0661e-01, -1.0000e+09,
         9.1429e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 7

Layer 9


Tokens embeddings:
tensor([ 9.8949e-01, -3.2309e-01,  8.7149e-01,  4.6920e-01, -1.0000e+09,
         7.8861e-01, -5.6361e-01,  9.9193e-01,  6.0661e-01, -1.0000e+09,
         9.1429e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -3.2309e-01,  8.7149e-01,  4.6920e-01, -1.0000e+09,
         7.8861e-01, -5.6361e-01,  9.9193e-01,  6.0661e-01, -1.0000e+09,
         9.1429e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 7

Layer 10


Tokens embeddings:
tensor([ 9.8949e-01, -3.2309e-01,  8.7149e-01,  4.6920e-01, -1.0000e+09,
         7.8861e-01, -5.6361e-01,  9.9193e-01,  6.0661e-01, -1.0000e+09,
         9.1429e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -3.2309e-01,  8.7149e-01,  4.6920e-01, -1.0000e+09,
         7.8861e-01, -5.6361e-01,  9.9193e-01,  6.0661e-01, -1.0000e+09,
         9.1429e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 7

Layer 11


Tokens embeddings:
tensor([ 9.8949e-01, -3.2309e-01,  8.7149e-01,  4.6920e-01, -1.0000e+09,
         7.8861e-01, -5.6361e-01,  9.9193e-01,  6.0661e-01, -1.0000e+09,
         9.1429e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -3.2309e-01,  8.7149e-01,  4.6920e-01, -1.0000e+09,
         7.8861e-01, -5.6361e-01,  9.9193e-01,  6.0661e-01, -1.0000e+09,
         9.1429e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 7

Layer 12


Tokens embeddings:
tensor([ 9.8949e-01, -3.2309e-01,  8.7149e-01,  4.6920e-01, -1.0000e+09,
         7.8861e-01, -5.6361e-01,  9.9193e-01,  6.0661e-01, -1.0000e+09,
         9.1429e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -3.2309e-01,  8.7149e-01,  4.6920e-01, -1.0000e+09,
         7.8861e-01, -5.6361e-01,  9.9193e-01,  6.0661e-01, -1.0000e+09,
         9.1429e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.6823, 0.7848, 0.7382, 0.8525, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.7848, 0.7382, 0.8525, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 2


Tokens embeddings:
tensor([0.9895, 0.6823, 0.7848, 0.7382, 0.8525, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.7848, 0.7382, 0.8525, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 3


Tokens embeddings:
tensor([0.9895, 0.6823, 0.7848, 0.7382, 0.8525, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.7848, 0.7382, 0.8525, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 4


Tokens embeddings:
tensor([0.9895, 0.6823, 0.7848, 0.7382, 0.8525, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.7848, 0.7382, 0.8525, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 5


Tokens embeddings:
tensor([0.9895, 0.6823, 0.7848, 0.7382, 0.8525, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.7848, 0.7382, 0.8525, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 6


Tokens embeddings:
tensor([0.9895, 0.6823, 0.7848, 0.7382, 0.8525, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.7848, 0.7382, 0.8525, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 7


Tokens embeddings:
tensor([0.9895, 0.6823, 0.7848, 0.7382, 0.8525, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.7848, 0.7382, 0.8525, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 8


Tokens embeddings:
tensor([0.9895, 0.6823, 0.7848, 0.7382, 0.8525, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.7848, 0.7382, 0.8525, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 9


Tokens embeddings:
tensor([0.9895, 0.6823, 0.7848, 0.7382, 0.8525, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.7848, 0.7382, 0.8525, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 10


Tokens embeddings:
tensor([0.9895, 0.6823, 0.7848, 0.7382, 0.8525, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.7848, 0.7382, 0.8525, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 11


Tokens embeddings:
tensor([0.9895, 0.6823, 0.7848, 0.7382, 0.8525, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.7848, 0.7382, 0.8525, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 12


Tokens embeddings:
tensor([0.9895, 0.6823, 0.7848, 0.7382, 0.8525, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.7848, 0.7382, 0.8525, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.4242, 1.1183, 1.2218, 1.0700, 0.5363, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.1183, 1.2218, 1.0700, 0.5363, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 2


Tokens embeddings:
tensor([0.9895, 0.4242, 1.1183, 1.2218, 1.0700, 0.5363, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.1183, 1.2218, 1.0700, 0.5363, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 3


Tokens embeddings:
tensor([0.9895, 0.4242, 1.1183, 1.2218, 1.0700, 0.5363, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.1183, 1.2218, 1.0700, 0.5363, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 4


Tokens embeddings:
tensor([0.9895, 0.4242, 1.1183, 1.2218, 1.0700, 0.5363, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.1183, 1.2218, 1.0700, 0.5363, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 5


Tokens embeddings:
tensor([0.9895, 0.4242, 1.1183, 1.2218, 1.0700, 0.5363, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.1183, 1.2218, 1.0700, 0.5363, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 6


Tokens embeddings:
tensor([0.9895, 0.4242, 1.1183, 1.2218, 1.0700, 0.5363, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.1183, 1.2218, 1.0700, 0.5363, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 7


Tokens embeddings:
tensor([0.9895, 0.4242, 1.1183, 1.2218, 1.0700, 0.5363, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.1183, 1.2218, 1.0700, 0.5363, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 8


Tokens embeddings:
tensor([0.9895, 0.4242, 1.1183, 1.2218, 1.0700, 0.5363, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.1183, 1.2218, 1.0700, 0.5363, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 9


Tokens embeddings:
tensor([0.9895, 0.4242, 1.1183, 1.2218, 1.0700, 0.5363, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.1183, 1.2218, 1.0700, 0.5363, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 10


Tokens embeddings:
tensor([0.9895, 0.4242, 1.1183, 1.2218, 1.0700, 0.5363, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.1183, 1.2218, 1.0700, 0.5363, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 11


Tokens embeddings:
tensor([0.9895, 0.4242, 1.1183, 1.2218, 1.0700, 0.5363, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.1183, 1.2218, 1.0700, 0.5363, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 12


Tokens embeddings:
tensor([0.9895, 0.4242, 1.1183, 1.2218, 1.0700, 0.5363, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.1183, 1.2218, 1.0700, 0.5363, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.5376, 0.4242, 1.0980, 0.5257, 1.1452, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5376, 0.4242, 1.0980, 0.5257, 1.1452, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 2


Tokens embeddings:
tensor([0.9895, 0.5376, 0.4242, 1.0980, 0.5257, 1.1452, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5376, 0.4242, 1.0980, 0.5257, 1.1452, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 3


Tokens embeddings:
tensor([0.9895, 0.5376, 0.4242, 1.0980, 0.5257, 1.1452, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5376, 0.4242, 1.0980, 0.5257, 1.1452, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 4


Tokens embeddings:
tensor([0.9895, 0.5376, 0.4242, 1.0980, 0.5257, 1.1452, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5376, 0.4242, 1.0980, 0.5257, 1.1452, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 5


Tokens embeddings:
tensor([0.9895, 0.5376, 0.4242, 1.0980, 0.5257, 1.1452, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5376, 0.4242, 1.0980, 0.5257, 1.1452, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 6


Tokens embeddings:
tensor([0.9895, 0.5376, 0.4242, 1.0980, 0.5257, 1.1452, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5376, 0.4242, 1.0980, 0.5257, 1.1452, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 7


Tokens embeddings:
tensor([0.9895, 0.5376, 0.4242, 1.0980, 0.5257, 1.1452, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5376, 0.4242, 1.0980, 0.5257, 1.1452, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 8


Tokens embeddings:
tensor([0.9895, 0.5376, 0.4242, 1.0980, 0.5257, 1.1452, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5376, 0.4242, 1.0980, 0.5257, 1.1452, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 9


Tokens embeddings:
tensor([0.9895, 0.5376, 0.4242, 1.0980, 0.5257, 1.1452, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5376, 0.4242, 1.0980, 0.5257, 1.1452, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 10


Tokens embeddings:
tensor([0.9895, 0.5376, 0.4242, 1.0980, 0.5257, 1.1452, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5376, 0.4242, 1.0980, 0.5257, 1.1452, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 11


Tokens embeddings:
tensor([0.9895, 0.5376, 0.4242, 1.0980, 0.5257, 1.1452, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5376, 0.4242, 1.0980, 0.5257, 1.1452, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 12


Tokens embeddings:
tensor([0.9895, 0.5376, 0.4242, 1.0980, 0.5257, 1.1452, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5376, 0.4242, 1.0980, 0.5257, 1.1452, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9703, 0.6242, 1.1698, 0.3996, 0.4242, 1.0595, 0.8456,
        0.4195, 0.4242, 1.2093, 0.8680, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9703, 0.6242, 1.1698, 0.3996, 0.4242, 1.0595, 0.8456,
        0.4195, 0.4242, 1.2093, 0.8680, 1.1193, 0.9143, 0.9088, 0.9088]): 11

Layer 2


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9703, 0.6242, 1.1698, 0.3996, 0.4242, 1.0595, 0.8456,
        0.4195, 0.4242, 1.2093, 0.8680, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9703, 0.6242, 1.1698, 0.3996, 0.4242, 1.0595, 0.8456,
        0.4195, 0.4242, 1.2093, 0.8680, 1.1193, 0.9143, 0.9088, 0.9088]): 11

Layer 3


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9703, 0.6242, 1.1698, 0.3996, 0.4242, 1.0595, 0.8456,
        0.4195, 0.4242, 1.2093, 0.8680, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9703, 0.6242, 1.1698, 0.3996, 0.4242, 1.0595, 0.8456,
        0.4195, 0.4242, 1.2093, 0.8680, 1.1193, 0.9143, 0.9088, 0.9088]): 11

Layer 4


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9703, 0.6242, 1.1698, 0.3996, 0.4242, 1.0595, 0.8456,
        0.4195, 0.4242, 1.2093, 0.8680, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9703, 0.6242, 1.1698, 0.3996, 0.4242, 1.0595, 0.8456,
        0.4195, 0.4242, 1.2093, 0.8680, 1.1193, 0.9143, 0.9088, 0.9088]): 11

Layer 5


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9703, 0.6242, 1.1698, 0.3996, 0.4242, 1.0595, 0.8456,
        0.4195, 0.4242, 1.2093, 0.8680, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9703, 0.6242, 1.1698, 0.3996, 0.4242, 1.0595, 0.8456,
        0.4195, 0.4242, 1.2093, 0.8680, 1.1193, 0.9143, 0.9088, 0.9088]): 11

Layer 6


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9703, 0.6242, 1.1698, 0.3996, 0.4242, 1.0595, 0.8456,
        0.4195, 0.4242, 1.2093, 0.8680, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9703, 0.6242, 1.1698, 0.3996, 0.4242, 1.0595, 0.8456,
        0.4195, 0.4242, 1.2093, 0.8680, 1.1193, 0.9143, 0.9088, 0.9088]): 11

Layer 7


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9703, 0.6242, 1.1698, 0.3996, 0.4242, 1.0595, 0.8456,
        0.4195, 0.4242, 1.2093, 0.8680, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9703, 0.6242, 1.1698, 0.3996, 0.4242, 1.0595, 0.8456,
        0.4195, 0.4242, 1.2093, 0.8680, 1.1193, 0.9143, 0.9088, 0.9088]): 11

Layer 8


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9703, 0.6242, 1.1698, 0.3996, 0.4242, 1.0595, 0.8456,
        0.4195, 0.4242, 1.2093, 0.8680, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9703, 0.6242, 1.1698, 0.3996, 0.4242, 1.0595, 0.8456,
        0.4195, 0.4242, 1.2093, 0.8680, 1.1193, 0.9143, 0.9088, 0.9088]): 11

Layer 9


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9703, 0.6242, 1.1698, 0.3996, 0.4242, 1.0595, 0.8456,
        0.4195, 0.4242, 1.2093, 0.8680, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9703, 0.6242, 1.1698, 0.3996, 0.4242, 1.0595, 0.8456,
        0.4195, 0.4242, 1.2093, 0.8680, 1.1193, 0.9143, 0.9088, 0.9088]): 11

Layer 10


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9703, 0.6242, 1.1698, 0.3996, 0.4242, 1.0595, 0.8456,
        0.4195, 0.4242, 1.2093, 0.8680, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9703, 0.6242, 1.1698, 0.3996, 0.4242, 1.0595, 0.8456,
        0.4195, 0.4242, 1.2093, 0.8680, 1.1193, 0.9143, 0.9088, 0.9088]): 11

Layer 11


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9703, 0.6242, 1.1698, 0.3996, 0.4242, 1.0595, 0.8456,
        0.4195, 0.4242, 1.2093, 0.8680, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9703, 0.6242, 1.1698, 0.3996, 0.4242, 1.0595, 0.8456,
        0.4195, 0.4242, 1.2093, 0.8680, 1.1193, 0.9143, 0.9088, 0.9088]): 11

Layer 12


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9703, 0.6242, 1.1698, 0.3996, 0.4242, 1.0595, 0.8456,
        0.4195, 0.4242, 1.2093, 0.8680, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9703, 0.6242, 1.1698, 0.3996, 0.4242, 1.0595, 0.8456,
        0.4195, 0.4242, 1.2093, 0.8680, 1.1193, 0.9143, 0.9088, 0.9088]): 11
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.6960, 0.5298, 0.8282, 0.8634, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.5298, 0.8282, 0.8634, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 2


Tokens embeddings:
tensor([0.9895, 0.6960, 0.5298, 0.8282, 0.8634, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.5298, 0.8282, 0.8634, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 3


Tokens embeddings:
tensor([0.9895, 0.6960, 0.5298, 0.8282, 0.8634, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.5298, 0.8282, 0.8634, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 4


Tokens embeddings:
tensor([0.9895, 0.6960, 0.5298, 0.8282, 0.8634, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.5298, 0.8282, 0.8634, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 5


Tokens embeddings:
tensor([0.9895, 0.6960, 0.5298, 0.8282, 0.8634, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.5298, 0.8282, 0.8634, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 6


Tokens embeddings:
tensor([0.9895, 0.6960, 0.5298, 0.8282, 0.8634, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.5298, 0.8282, 0.8634, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 7


Tokens embeddings:
tensor([0.9895, 0.6960, 0.5298, 0.8282, 0.8634, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.5298, 0.8282, 0.8634, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 8


Tokens embeddings:
tensor([0.9895, 0.6960, 0.5298, 0.8282, 0.8634, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.5298, 0.8282, 0.8634, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 9


Tokens embeddings:
tensor([0.9895, 0.6960, 0.5298, 0.8282, 0.8634, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.5298, 0.8282, 0.8634, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 10


Tokens embeddings:
tensor([0.9895, 0.6960, 0.5298, 0.8282, 0.8634, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.5298, 0.8282, 0.8634, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 11


Tokens embeddings:
tensor([0.9895, 0.6960, 0.5298, 0.8282, 0.8634, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.5298, 0.8282, 0.8634, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 12


Tokens embeddings:
tensor([0.9895, 0.6960, 0.5298, 0.8282, 0.8634, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.5298, 0.8282, 0.8634, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5


In [None]:
# Pooling First token [CLS] for each sentence - argmax
#
# For each selected test sentence this cell:
#   1. runs `model_e` once on the sentence's tokens and prints the first
#      column of the model's first output together with its argmax index;
#   2. for each of the 12 layers, renders the attention paid by position 0
#      ([CLS]) to every token, averaged over the attention heads.

# Inspect a fixed selection of test sentences (the first 11), so the cell
# produces the same output on every run.
sent_index = list(range(11))

for s in sent_index:
  print("*" * 100)
  # Get the sentence's words
  tokens = test_inputs[s]

  # The tokenizer/model call depends only on `tokens`, not on the layer, so it
  # is done once per sentence instead of being repeated for all 12 layers
  # (which printed twelve identical tensors per sentence).
  # NOTE(review): `tokens` is passed to the tokenizer as-is; if it is a list of
  # word pieces, each piece is encoded as its own sequence - confirm intended.
  encoded_tokens = bert_tokenizer(tokens, padding=True, truncation=True, max_length=128, return_tensors='pt')
  encoded_tokens = encoded_tokens.to(device)
  with torch.no_grad():
    model_output1 = model_e(**encoded_tokens)

  # First column of the model's first output, moved to the CPU for printing.
  tokens_embeddings = model_output1[0][:,0].cpu()
  # torch.argmax replaces the bare `argmax` name, which is not provided by any
  # import visible at the top of the notebook.
  arg = int(torch.argmax(tokens_embeddings))
  print("Tokens embeddings:")
  print(tokens_embeddings)
  print('arg max of %s: %d' % (tokens_embeddings, arg))

  # For each layer...
  for l in range(12):
    print("\nLayer", l+1)
    # Drop the leading size-1 axis; assumes the result is
    # (heads, seq, seq) - TODO confirm against how test_attentions was built.
    attention = np.squeeze(test_attentions[s][l].detach().cpu().numpy(), axis=0)
    # Get the attention for the cls token.
    # The per-head loop is disabled, so `head` was never bound in this cell
    # (it silently reused a stale value from an earlier cell). Average over
    # the heads of THIS layer instead and take row 0 ([CLS]).
    # NOTE(review): head-averaging is an assumption about the intent; restore
    # a per-head loop if individual heads should be shown.
    cls_attentions = attention.mean(axis=0)[0]
    display(HTML(colorize(tokens, cls_attentions)))

****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.0832, -0.9532, -0.3229,  0.3607,  0.8031,  0.4673, -0.1697,
         0.3607,  0.8332, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832, -0.9532, -0.3229,  0.3607,  0.8031,  0.4673, -0.1697,
         0.3607,  0.8332, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.0832, -0.9532, -0.3229,  0.3607,  0.8031,  0.4673, -0.1697,
         0.3607,  0.8332, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832, -0.9532, -0.3229,  0.3607,  0.8031,  0.4673, -0.1697,
         0.3607,  0.8332, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.0832, -0.9532, -0.3229,  0.3607,  0.8031,  0.4673, -0.1697,
         0.3607,  0.8332, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832, -0.9532, -0.3229,  0.3607,  0.8031,  0.4673, -0.1697,
         0.3607,  0.8332, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.0832, -0.9532, -0.3229,  0.3607,  0.8031,  0.4673, -0.1697,
         0.3607,  0.8332, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832, -0.9532, -0.3229,  0.3607,  0.8031,  0.4673, -0.1697,
         0.3607,  0.8332, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.0832, -0.9532, -0.3229,  0.3607,  0.8031,  0.4673, -0.1697,
         0.3607,  0.8332, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832, -0.9532, -0.3229,  0.3607,  0.8031,  0.4673, -0.1697,
         0.3607,  0.8332, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.0832, -0.9532, -0.3229,  0.3607,  0.8031,  0.4673, -0.1697,
         0.3607,  0.8332, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832, -0.9532, -0.3229,  0.3607,  0.8031,  0.4673, -0.1697,
         0.3607,  0.8332, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.0832, -0.9532, -0.3229,  0.3607,  0.8031,  0.4673, -0.1697,
         0.3607,  0.8332, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832, -0.9532, -0.3229,  0.3607,  0.8031,  0.4673, -0.1697,
         0.3607,  0.8332, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.0832, -0.9532, -0.3229,  0.3607,  0.8031,  0.4673, -0.1697,
         0.3607,  0.8332, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832, -0.9532, -0.3229,  0.3607,  0.8031,  0.4673, -0.1697,
         0.3607,  0.8332, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.0832, -0.9532, -0.3229,  0.3607,  0.8031,  0.4673, -0.1697,
         0.3607,  0.8332, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832, -0.9532, -0.3229,  0.3607,  0.8031,  0.4673, -0.1697,
         0.3607,  0.8332, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.0832, -0.9532, -0.3229,  0.3607,  0.8031,  0.4673, -0.1697,
         0.3607,  0.8332, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832, -0.9532, -0.3229,  0.3607,  0.8031,  0.4673, -0.1697,
         0.3607,  0.8332, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.0832, -0.9532, -0.3229,  0.3607,  0.8031,  0.4673, -0.1697,
         0.3607,  0.8332, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832, -0.9532, -0.3229,  0.3607,  0.8031,  0.4673, -0.1697,
         0.3607,  0.8332, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.0832, -0.9532, -0.3229,  0.3607,  0.8031,  0.4673, -0.1697,
         0.3607,  0.8332, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832, -0.9532, -0.3229,  0.3607,  0.8031,  0.4673, -0.1697,
         0.3607,  0.8332, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1950,  0.4082, -0.4704, -0.7181, -0.1271, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1950,  0.4082, -0.4704, -0.7181, -0.1271, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1950,  0.4082, -0.4704, -0.7181, -0.1271, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1950,  0.4082, -0.4704, -0.7181, -0.1271, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1950,  0.4082, -0.4704, -0.7181, -0.1271, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1950,  0.4082, -0.4704, -0.7181, -0.1271, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1950,  0.4082, -0.4704, -0.7181, -0.1271, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1950,  0.4082, -0.4704, -0.7181, -0.1271, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1950,  0.4082, -0.4704, -0.7181, -0.1271, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1950,  0.4082, -0.4704, -0.7181, -0.1271, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1950,  0.4082, -0.4704, -0.7181, -0.1271, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1950,  0.4082, -0.4704, -0.7181, -0.1271, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1950,  0.4082, -0.4704, -0.7181, -0.1271, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1950,  0.4082, -0.4704, -0.7181, -0.1271, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1950,  0.4082, -0.4704, -0.7181, -0.1271, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1950,  0.4082, -0.4704, -0.7181, -0.1271, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1950,  0.4082, -0.4704, -0.7181, -0.1271, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1950,  0.4082, -0.4704, -0.7181, -0.1271, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1950,  0.4082, -0.4704, -0.7181, -0.1271, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1950,  0.4082, -0.4704, -0.7181, -0.1271, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1950,  0.4082, -0.4704, -0.7181, -0.1271, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1950,  0.4082, -0.4704, -0.7181, -0.1271, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1950,  0.4082, -0.4704, -0.7181, -0.1271, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1950,  0.4082, -0.4704, -0.7181, -0.1271, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.1314, -0.0271,  1.0843,  0.1819, -0.1319, -0.1278, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1314, -0.0271,  1.0843,  0.1819, -0.1319, -0.1278, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.1314, -0.0271,  1.0843,  0.1819, -0.1319, -0.1278, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1314, -0.0271,  1.0843,  0.1819, -0.1319, -0.1278, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.1314, -0.0271,  1.0843,  0.1819, -0.1319, -0.1278, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1314, -0.0271,  1.0843,  0.1819, -0.1319, -0.1278, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.1314, -0.0271,  1.0843,  0.1819, -0.1319, -0.1278, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1314, -0.0271,  1.0843,  0.1819, -0.1319, -0.1278, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.1314, -0.0271,  1.0843,  0.1819, -0.1319, -0.1278, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1314, -0.0271,  1.0843,  0.1819, -0.1319, -0.1278, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.1314, -0.0271,  1.0843,  0.1819, -0.1319, -0.1278, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1314, -0.0271,  1.0843,  0.1819, -0.1319, -0.1278, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.1314, -0.0271,  1.0843,  0.1819, -0.1319, -0.1278, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1314, -0.0271,  1.0843,  0.1819, -0.1319, -0.1278, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.1314, -0.0271,  1.0843,  0.1819, -0.1319, -0.1278, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1314, -0.0271,  1.0843,  0.1819, -0.1319, -0.1278, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.1314, -0.0271,  1.0843,  0.1819, -0.1319, -0.1278, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1314, -0.0271,  1.0843,  0.1819, -0.1319, -0.1278, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.1314, -0.0271,  1.0843,  0.1819, -0.1319, -0.1278, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1314, -0.0271,  1.0843,  0.1819, -0.1319, -0.1278, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.1314, -0.0271,  1.0843,  0.1819, -0.1319, -0.1278, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1314, -0.0271,  1.0843,  0.1819, -0.1319, -0.1278, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.1314, -0.0271,  1.0843,  0.1819, -0.1319, -0.1278, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1314, -0.0271,  1.0843,  0.1819, -0.1319, -0.1278, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.0832,  0.0858,  0.2622,  0.6526,  0.3548, -0.0890, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  0.0858,  0.2622,  0.6526,  0.3548, -0.0890, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.0832,  0.0858,  0.2622,  0.6526,  0.3548, -0.0890, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  0.0858,  0.2622,  0.6526,  0.3548, -0.0890, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.0832,  0.0858,  0.2622,  0.6526,  0.3548, -0.0890, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  0.0858,  0.2622,  0.6526,  0.3548, -0.0890, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.0832,  0.0858,  0.2622,  0.6526,  0.3548, -0.0890, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  0.0858,  0.2622,  0.6526,  0.3548, -0.0890, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.0832,  0.0858,  0.2622,  0.6526,  0.3548, -0.0890, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  0.0858,  0.2622,  0.6526,  0.3548, -0.0890, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.0832,  0.0858,  0.2622,  0.6526,  0.3548, -0.0890, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  0.0858,  0.2622,  0.6526,  0.3548, -0.0890, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.0832,  0.0858,  0.2622,  0.6526,  0.3548, -0.0890, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  0.0858,  0.2622,  0.6526,  0.3548, -0.0890, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.0832,  0.0858,  0.2622,  0.6526,  0.3548, -0.0890, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  0.0858,  0.2622,  0.6526,  0.3548, -0.0890, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.0832,  0.0858,  0.2622,  0.6526,  0.3548, -0.0890, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  0.0858,  0.2622,  0.6526,  0.3548, -0.0890, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.0832,  0.0858,  0.2622,  0.6526,  0.3548, -0.0890, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  0.0858,  0.2622,  0.6526,  0.3548, -0.0890, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.0832,  0.0858,  0.2622,  0.6526,  0.3548, -0.0890, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  0.0858,  0.2622,  0.6526,  0.3548, -0.0890, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.0832,  0.0858,  0.2622,  0.6526,  0.3548, -0.0890, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  0.0858,  0.2622,  0.6526,  0.3548, -0.0890, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361, -0.0445, -0.4144, -0.3231,  0.0827, -0.5636,  0.3607, -0.2293,
         0.3548,  0.3607, -0.1500, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445, -0.4144, -0.3231,  0.0827, -0.5636,  0.3607, -0.2293,
         0.3548,  0.3607, -0.1500, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 2


Tokens embeddings:
tensor([-0.5361, -0.0445, -0.4144, -0.3231,  0.0827, -0.5636,  0.3607, -0.2293,
         0.3548,  0.3607, -0.1500, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445, -0.4144, -0.3231,  0.0827, -0.5636,  0.3607, -0.2293,
         0.3548,  0.3607, -0.1500, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 3


Tokens embeddings:
tensor([-0.5361, -0.0445, -0.4144, -0.3231,  0.0827, -0.5636,  0.3607, -0.2293,
         0.3548,  0.3607, -0.1500, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445, -0.4144, -0.3231,  0.0827, -0.5636,  0.3607, -0.2293,
         0.3548,  0.3607, -0.1500, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 4


Tokens embeddings:
tensor([-0.5361, -0.0445, -0.4144, -0.3231,  0.0827, -0.5636,  0.3607, -0.2293,
         0.3548,  0.3607, -0.1500, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445, -0.4144, -0.3231,  0.0827, -0.5636,  0.3607, -0.2293,
         0.3548,  0.3607, -0.1500, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 5


Tokens embeddings:
tensor([-0.5361, -0.0445, -0.4144, -0.3231,  0.0827, -0.5636,  0.3607, -0.2293,
         0.3548,  0.3607, -0.1500, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445, -0.4144, -0.3231,  0.0827, -0.5636,  0.3607, -0.2293,
         0.3548,  0.3607, -0.1500, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 6


Tokens embeddings:
tensor([-0.5361, -0.0445, -0.4144, -0.3231,  0.0827, -0.5636,  0.3607, -0.2293,
         0.3548,  0.3607, -0.1500, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445, -0.4144, -0.3231,  0.0827, -0.5636,  0.3607, -0.2293,
         0.3548,  0.3607, -0.1500, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 7


Tokens embeddings:
tensor([-0.5361, -0.0445, -0.4144, -0.3231,  0.0827, -0.5636,  0.3607, -0.2293,
         0.3548,  0.3607, -0.1500, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445, -0.4144, -0.3231,  0.0827, -0.5636,  0.3607, -0.2293,
         0.3548,  0.3607, -0.1500, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 8


Tokens embeddings:
tensor([-0.5361, -0.0445, -0.4144, -0.3231,  0.0827, -0.5636,  0.3607, -0.2293,
         0.3548,  0.3607, -0.1500, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445, -0.4144, -0.3231,  0.0827, -0.5636,  0.3607, -0.2293,
         0.3548,  0.3607, -0.1500, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 9


Tokens embeddings:
tensor([-0.5361, -0.0445, -0.4144, -0.3231,  0.0827, -0.5636,  0.3607, -0.2293,
         0.3548,  0.3607, -0.1500, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445, -0.4144, -0.3231,  0.0827, -0.5636,  0.3607, -0.2293,
         0.3548,  0.3607, -0.1500, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 10


Tokens embeddings:
tensor([-0.5361, -0.0445, -0.4144, -0.3231,  0.0827, -0.5636,  0.3607, -0.2293,
         0.3548,  0.3607, -0.1500, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445, -0.4144, -0.3231,  0.0827, -0.5636,  0.3607, -0.2293,
         0.3548,  0.3607, -0.1500, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 11


Tokens embeddings:
tensor([-0.5361, -0.0445, -0.4144, -0.3231,  0.0827, -0.5636,  0.3607, -0.2293,
         0.3548,  0.3607, -0.1500, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445, -0.4144, -0.3231,  0.0827, -0.5636,  0.3607, -0.2293,
         0.3548,  0.3607, -0.1500, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 12


Tokens embeddings:
tensor([-0.5361, -0.0445, -0.4144, -0.3231,  0.0827, -0.5636,  0.3607, -0.2293,
         0.3548,  0.3607, -0.1500, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445, -0.4144, -0.3231,  0.0827, -0.5636,  0.3607, -0.2293,
         0.3548,  0.3607, -0.1500, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.0380,  1.0524,  0.3607, -0.1165, -0.2271,  0.3736, -0.5636,
        -0.5413,  0.3607,  0.1299,  0.3548,  0.3607,  0.1492, -0.5098, -0.3798,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  1.0524,  0.3607, -0.1165, -0.2271,  0.3736, -0.5636,
        -0.5413,  0.3607,  0.1299,  0.3548,  0.3607,  0.1492, -0.5098, -0.3798,
        -0.2843]): 2

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.0380,  1.0524,  0.3607, -0.1165, -0.2271,  0.3736, -0.5636,
        -0.5413,  0.3607,  0.1299,  0.3548,  0.3607,  0.1492, -0.5098, -0.3798,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  1.0524,  0.3607, -0.1165, -0.2271,  0.3736, -0.5636,
        -0.5413,  0.3607,  0.1299,  0.3548,  0.3607,  0.1492, -0.5098, -0.3798,
        -0.2843]): 2

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.0380,  1.0524,  0.3607, -0.1165, -0.2271,  0.3736, -0.5636,
        -0.5413,  0.3607,  0.1299,  0.3548,  0.3607,  0.1492, -0.5098, -0.3798,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  1.0524,  0.3607, -0.1165, -0.2271,  0.3736, -0.5636,
        -0.5413,  0.3607,  0.1299,  0.3548,  0.3607,  0.1492, -0.5098, -0.3798,
        -0.2843]): 2

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.0380,  1.0524,  0.3607, -0.1165, -0.2271,  0.3736, -0.5636,
        -0.5413,  0.3607,  0.1299,  0.3548,  0.3607,  0.1492, -0.5098, -0.3798,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  1.0524,  0.3607, -0.1165, -0.2271,  0.3736, -0.5636,
        -0.5413,  0.3607,  0.1299,  0.3548,  0.3607,  0.1492, -0.5098, -0.3798,
        -0.2843]): 2

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.0380,  1.0524,  0.3607, -0.1165, -0.2271,  0.3736, -0.5636,
        -0.5413,  0.3607,  0.1299,  0.3548,  0.3607,  0.1492, -0.5098, -0.3798,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  1.0524,  0.3607, -0.1165, -0.2271,  0.3736, -0.5636,
        -0.5413,  0.3607,  0.1299,  0.3548,  0.3607,  0.1492, -0.5098, -0.3798,
        -0.2843]): 2

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.0380,  1.0524,  0.3607, -0.1165, -0.2271,  0.3736, -0.5636,
        -0.5413,  0.3607,  0.1299,  0.3548,  0.3607,  0.1492, -0.5098, -0.3798,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  1.0524,  0.3607, -0.1165, -0.2271,  0.3736, -0.5636,
        -0.5413,  0.3607,  0.1299,  0.3548,  0.3607,  0.1492, -0.5098, -0.3798,
        -0.2843]): 2

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.0380,  1.0524,  0.3607, -0.1165, -0.2271,  0.3736, -0.5636,
        -0.5413,  0.3607,  0.1299,  0.3548,  0.3607,  0.1492, -0.5098, -0.3798,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  1.0524,  0.3607, -0.1165, -0.2271,  0.3736, -0.5636,
        -0.5413,  0.3607,  0.1299,  0.3548,  0.3607,  0.1492, -0.5098, -0.3798,
        -0.2843]): 2

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.0380,  1.0524,  0.3607, -0.1165, -0.2271,  0.3736, -0.5636,
        -0.5413,  0.3607,  0.1299,  0.3548,  0.3607,  0.1492, -0.5098, -0.3798,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  1.0524,  0.3607, -0.1165, -0.2271,  0.3736, -0.5636,
        -0.5413,  0.3607,  0.1299,  0.3548,  0.3607,  0.1492, -0.5098, -0.3798,
        -0.2843]): 2

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.0380,  1.0524,  0.3607, -0.1165, -0.2271,  0.3736, -0.5636,
        -0.5413,  0.3607,  0.1299,  0.3548,  0.3607,  0.1492, -0.5098, -0.3798,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  1.0524,  0.3607, -0.1165, -0.2271,  0.3736, -0.5636,
        -0.5413,  0.3607,  0.1299,  0.3548,  0.3607,  0.1492, -0.5098, -0.3798,
        -0.2843]): 2

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.0380,  1.0524,  0.3607, -0.1165, -0.2271,  0.3736, -0.5636,
        -0.5413,  0.3607,  0.1299,  0.3548,  0.3607,  0.1492, -0.5098, -0.3798,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  1.0524,  0.3607, -0.1165, -0.2271,  0.3736, -0.5636,
        -0.5413,  0.3607,  0.1299,  0.3548,  0.3607,  0.1492, -0.5098, -0.3798,
        -0.2843]): 2

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.0380,  1.0524,  0.3607, -0.1165, -0.2271,  0.3736, -0.5636,
        -0.5413,  0.3607,  0.1299,  0.3548,  0.3607,  0.1492, -0.5098, -0.3798,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  1.0524,  0.3607, -0.1165, -0.2271,  0.3736, -0.5636,
        -0.5413,  0.3607,  0.1299,  0.3548,  0.3607,  0.1492, -0.5098, -0.3798,
        -0.2843]): 2

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.0380,  1.0524,  0.3607, -0.1165, -0.2271,  0.3736, -0.5636,
        -0.5413,  0.3607,  0.1299,  0.3548,  0.3607,  0.1492, -0.5098, -0.3798,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  1.0524,  0.3607, -0.1165, -0.2271,  0.3736, -0.5636,
        -0.5413,  0.3607,  0.1299,  0.3548,  0.3607,  0.1492, -0.5098, -0.3798,
        -0.2843]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.3496, -0.1012,  0.6770, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.3496, -0.1012,  0.6770, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.3496, -0.1012,  0.6770, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.3496, -0.1012,  0.6770, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.3496, -0.1012,  0.6770, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.3496, -0.1012,  0.6770, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.3496, -0.1012,  0.6770, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.3496, -0.1012,  0.6770, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.3496, -0.1012,  0.6770, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.3496, -0.1012,  0.6770, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.3496, -0.1012,  0.6770, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.3496, -0.1012,  0.6770, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.3496, -0.1012,  0.6770, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.3496, -0.1012,  0.6770, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.3496, -0.1012,  0.6770, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.3496, -0.1012,  0.6770, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.3496, -0.1012,  0.6770, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.3496, -0.1012,  0.6770, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.3496, -0.1012,  0.6770, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.3496, -0.1012,  0.6770, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.3496, -0.1012,  0.6770, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.3496, -0.1012,  0.6770, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.3496, -0.1012,  0.6770, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.3496, -0.1012,  0.6770, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.7380,  0.1314, -0.2119,  0.4761, -0.3852,  0.2590,
         0.0973, -0.2270, -0.8948, -0.4802, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.7380,  0.1314, -0.2119,  0.4761, -0.3852,  0.2590,
         0.0973, -0.2270, -0.8948, -0.4802, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 2

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.7380,  0.1314, -0.2119,  0.4761, -0.3852,  0.2590,
         0.0973, -0.2270, -0.8948, -0.4802, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.7380,  0.1314, -0.2119,  0.4761, -0.3852,  0.2590,
         0.0973, -0.2270, -0.8948, -0.4802, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 2

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.7380,  0.1314, -0.2119,  0.4761, -0.3852,  0.2590,
         0.0973, -0.2270, -0.8948, -0.4802, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.7380,  0.1314, -0.2119,  0.4761, -0.3852,  0.2590,
         0.0973, -0.2270, -0.8948, -0.4802, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 2

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.7380,  0.1314, -0.2119,  0.4761, -0.3852,  0.2590,
         0.0973, -0.2270, -0.8948, -0.4802, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.7380,  0.1314, -0.2119,  0.4761, -0.3852,  0.2590,
         0.0973, -0.2270, -0.8948, -0.4802, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 2

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.7380,  0.1314, -0.2119,  0.4761, -0.3852,  0.2590,
         0.0973, -0.2270, -0.8948, -0.4802, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.7380,  0.1314, -0.2119,  0.4761, -0.3852,  0.2590,
         0.0973, -0.2270, -0.8948, -0.4802, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 2

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.7380,  0.1314, -0.2119,  0.4761, -0.3852,  0.2590,
         0.0973, -0.2270, -0.8948, -0.4802, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.7380,  0.1314, -0.2119,  0.4761, -0.3852,  0.2590,
         0.0973, -0.2270, -0.8948, -0.4802, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 2

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.7380,  0.1314, -0.2119,  0.4761, -0.3852,  0.2590,
         0.0973, -0.2270, -0.8948, -0.4802, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.7380,  0.1314, -0.2119,  0.4761, -0.3852,  0.2590,
         0.0973, -0.2270, -0.8948, -0.4802, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 2

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.7380,  0.1314, -0.2119,  0.4761, -0.3852,  0.2590,
         0.0973, -0.2270, -0.8948, -0.4802, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.7380,  0.1314, -0.2119,  0.4761, -0.3852,  0.2590,
         0.0973, -0.2270, -0.8948, -0.4802, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 2

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.7380,  0.1314, -0.2119,  0.4761, -0.3852,  0.2590,
         0.0973, -0.2270, -0.8948, -0.4802, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.7380,  0.1314, -0.2119,  0.4761, -0.3852,  0.2590,
         0.0973, -0.2270, -0.8948, -0.4802, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 2

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.7380,  0.1314, -0.2119,  0.4761, -0.3852,  0.2590,
         0.0973, -0.2270, -0.8948, -0.4802, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.7380,  0.1314, -0.2119,  0.4761, -0.3852,  0.2590,
         0.0973, -0.2270, -0.8948, -0.4802, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 2

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.7380,  0.1314, -0.2119,  0.4761, -0.3852,  0.2590,
         0.0973, -0.2270, -0.8948, -0.4802, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.7380,  0.1314, -0.2119,  0.4761, -0.3852,  0.2590,
         0.0973, -0.2270, -0.8948, -0.4802, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 2

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.7380,  0.1314, -0.2119,  0.4761, -0.3852,  0.2590,
         0.0973, -0.2270, -0.8948, -0.4802, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.7380,  0.1314, -0.2119,  0.4761, -0.3852,  0.2590,
         0.0973, -0.2270, -0.8948, -0.4802, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6420,  0.1182, -0.7709,  0.4400, -0.3110, -0.1742,
        -0.7891, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6420,  0.1182, -0.7709,  0.4400, -0.3110, -0.1742,
        -0.7891, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6420,  0.1182, -0.7709,  0.4400, -0.3110, -0.1742,
        -0.7891, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6420,  0.1182, -0.7709,  0.4400, -0.3110, -0.1742,
        -0.7891, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6420,  0.1182, -0.7709,  0.4400, -0.3110, -0.1742,
        -0.7891, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6420,  0.1182, -0.7709,  0.4400, -0.3110, -0.1742,
        -0.7891, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6420,  0.1182, -0.7709,  0.4400, -0.3110, -0.1742,
        -0.7891, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6420,  0.1182, -0.7709,  0.4400, -0.3110, -0.1742,
        -0.7891, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6420,  0.1182, -0.7709,  0.4400, -0.3110, -0.1742,
        -0.7891, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6420,  0.1182, -0.7709,  0.4400, -0.3110, -0.1742,
        -0.7891, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6420,  0.1182, -0.7709,  0.4400, -0.3110, -0.1742,
        -0.7891, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6420,  0.1182, -0.7709,  0.4400, -0.3110, -0.1742,
        -0.7891, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6420,  0.1182, -0.7709,  0.4400, -0.3110, -0.1742,
        -0.7891, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6420,  0.1182, -0.7709,  0.4400, -0.3110, -0.1742,
        -0.7891, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6420,  0.1182, -0.7709,  0.4400, -0.3110, -0.1742,
        -0.7891, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6420,  0.1182, -0.7709,  0.4400, -0.3110, -0.1742,
        -0.7891, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6420,  0.1182, -0.7709,  0.4400, -0.3110, -0.1742,
        -0.7891, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6420,  0.1182, -0.7709,  0.4400, -0.3110, -0.1742,
        -0.7891, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6420,  0.1182, -0.7709,  0.4400, -0.3110, -0.1742,
        -0.7891, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6420,  0.1182, -0.7709,  0.4400, -0.3110, -0.1742,
        -0.7891, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6420,  0.1182, -0.7709,  0.4400, -0.3110, -0.1742,
        -0.7891, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6420,  0.1182, -0.7709,  0.4400, -0.3110, -0.1742,
        -0.7891, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6420,  0.1182, -0.7709,  0.4400, -0.3110, -0.1742,
        -0.7891, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6420,  0.1182, -0.7709,  0.4400, -0.3110, -0.1742,
        -0.7891, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.3607,  1.0872, -0.1847, -0.5636, -0.5544,  0.3607,  0.7143,
         0.2263, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  1.0872, -0.1847, -0.5636, -0.5544,  0.3607,  0.7143,
         0.2263, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.3607,  1.0872, -0.1847, -0.5636, -0.5544,  0.3607,  0.7143,
         0.2263, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  1.0872, -0.1847, -0.5636, -0.5544,  0.3607,  0.7143,
         0.2263, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.3607,  1.0872, -0.1847, -0.5636, -0.5544,  0.3607,  0.7143,
         0.2263, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  1.0872, -0.1847, -0.5636, -0.5544,  0.3607,  0.7143,
         0.2263, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.3607,  1.0872, -0.1847, -0.5636, -0.5544,  0.3607,  0.7143,
         0.2263, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  1.0872, -0.1847, -0.5636, -0.5544,  0.3607,  0.7143,
         0.2263, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.3607,  1.0872, -0.1847, -0.5636, -0.5544,  0.3607,  0.7143,
         0.2263, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  1.0872, -0.1847, -0.5636, -0.5544,  0.3607,  0.7143,
         0.2263, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.3607,  1.0872, -0.1847, -0.5636, -0.5544,  0.3607,  0.7143,
         0.2263, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  1.0872, -0.1847, -0.5636, -0.5544,  0.3607,  0.7143,
         0.2263, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.3607,  1.0872, -0.1847, -0.5636, -0.5544,  0.3607,  0.7143,
         0.2263, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  1.0872, -0.1847, -0.5636, -0.5544,  0.3607,  0.7143,
         0.2263, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.3607,  1.0872, -0.1847, -0.5636, -0.5544,  0.3607,  0.7143,
         0.2263, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  1.0872, -0.1847, -0.5636, -0.5544,  0.3607,  0.7143,
         0.2263, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.3607,  1.0872, -0.1847, -0.5636, -0.5544,  0.3607,  0.7143,
         0.2263, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  1.0872, -0.1847, -0.5636, -0.5544,  0.3607,  0.7143,
         0.2263, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.3607,  1.0872, -0.1847, -0.5636, -0.5544,  0.3607,  0.7143,
         0.2263, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  1.0872, -0.1847, -0.5636, -0.5544,  0.3607,  0.7143,
         0.2263, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.3607,  1.0872, -0.1847, -0.5636, -0.5544,  0.3607,  0.7143,
         0.2263, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  1.0872, -0.1847, -0.5636, -0.5544,  0.3607,  0.7143,
         0.2263, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.3607,  1.0872, -0.1847, -0.5636, -0.5544,  0.3607,  0.7143,
         0.2263, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  1.0872, -0.1847, -0.5636, -0.5544,  0.3607,  0.7143,
         0.2263, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.5012, -0.1012,  0.7868, -0.5079,  0.2305,  0.3607,  0.2451,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.5012, -0.1012,  0.7868, -0.5079,  0.2305,  0.3607,  0.2451,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.5012, -0.1012,  0.7868, -0.5079,  0.2305,  0.3607,  0.2451,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.5012, -0.1012,  0.7868, -0.5079,  0.2305,  0.3607,  0.2451,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.5012, -0.1012,  0.7868, -0.5079,  0.2305,  0.3607,  0.2451,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.5012, -0.1012,  0.7868, -0.5079,  0.2305,  0.3607,  0.2451,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.5012, -0.1012,  0.7868, -0.5079,  0.2305,  0.3607,  0.2451,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.5012, -0.1012,  0.7868, -0.5079,  0.2305,  0.3607,  0.2451,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.5012, -0.1012,  0.7868, -0.5079,  0.2305,  0.3607,  0.2451,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.5012, -0.1012,  0.7868, -0.5079,  0.2305,  0.3607,  0.2451,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.5012, -0.1012,  0.7868, -0.5079,  0.2305,  0.3607,  0.2451,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.5012, -0.1012,  0.7868, -0.5079,  0.2305,  0.3607,  0.2451,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.5012, -0.1012,  0.7868, -0.5079,  0.2305,  0.3607,  0.2451,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.5012, -0.1012,  0.7868, -0.5079,  0.2305,  0.3607,  0.2451,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.5012, -0.1012,  0.7868, -0.5079,  0.2305,  0.3607,  0.2451,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.5012, -0.1012,  0.7868, -0.5079,  0.2305,  0.3607,  0.2451,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.5012, -0.1012,  0.7868, -0.5079,  0.2305,  0.3607,  0.2451,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.5012, -0.1012,  0.7868, -0.5079,  0.2305,  0.3607,  0.2451,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.5012, -0.1012,  0.7868, -0.5079,  0.2305,  0.3607,  0.2451,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.5012, -0.1012,  0.7868, -0.5079,  0.2305,  0.3607,  0.2451,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.5012, -0.1012,  0.7868, -0.5079,  0.2305,  0.3607,  0.2451,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.5012, -0.1012,  0.7868, -0.5079,  0.2305,  0.3607,  0.2451,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.5012, -0.1012,  0.7868, -0.5079,  0.2305,  0.3607,  0.2451,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.5012, -0.1012,  0.7868, -0.5079,  0.2305,  0.3607,  0.2451,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3


In [None]:
# Pooling First token [CLS] for each sentence - argmax
#
# For every selected test sentence this cell:
#   1. runs `model_e` once on the sentence's tokens, keeps the `[:, 0]` slice of
#      the first model output and computes its argmax;
#   2. for each of the 12 layers, displays the tokens colored by that layer's
#      attention from the first ([CLS]) position, followed by the values from 1.

# Fixed set of test-sentence indices to inspect
sent_index = [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

for s in sent_index:
  print("*" * 100)
  # Get the sentence's words
  tokens = test_inputs[s]

  # The tokenizer/model pass depends only on the sentence, never on the layer,
  # so it is done once per sentence instead of once per layer.
  encoded_tokens = bert_tokenizer(tokens, padding=True, truncation=True, max_length=128, return_tensors='pt')
  encoded_tokens = encoded_tokens.to(device)
  with torch.no_grad():
    model_output1 = model_e(**encoded_tokens)
    # First column of the first model output, moved to the CPU for printing
    tokens_embeddings = model_output1[0][:, 0].cpu()
  arg = argmax(tokens_embeddings)

  # For each layer...
  # NOTE(review): 12 is hard-coded; presumably the number of encoder layers of
  # the loaded model -- confirm against the model config.
  for layer in range(12):
    print("\nLayer", layer+1)
    attention = np.squeeze(test_attentions[s][layer].detach().cpu().numpy(), axis=0)
    # Get the attention for the cls token.
    # The previous code read `head[0]`, but `head` was never assigned in this
    # cell (the per-head loop is disabled), so it silently reused a stale value
    # left over from an earlier cell. Use this layer's own attention instead,
    # averaged over heads, and take row 0.
    # NOTE(review): assumes `attention` is (heads, seq, seq) after the squeeze,
    # which is how the disabled per-head loop indexed it -- confirm.
    cls_attentions = attention.mean(axis=0)[0]
    display(HTML(colorize(tokens, cls_attentions)))
    print("Tokens embeddings:")
    print(tokens_embeddings)
    print('arg max of %s: %d' % (tokens_embeddings, arg))

****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6076,  0.8439,  0.2305,  0.4728,  0.8954, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6076,  0.8439,  0.2305,  0.4728,  0.8954, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6076,  0.8439,  0.2305,  0.4728,  0.8954, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6076,  0.8439,  0.2305,  0.4728,  0.8954, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6076,  0.8439,  0.2305,  0.4728,  0.8954, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6076,  0.8439,  0.2305,  0.4728,  0.8954, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6076,  0.8439,  0.2305,  0.4728,  0.8954, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6076,  0.8439,  0.2305,  0.4728,  0.8954, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6076,  0.8439,  0.2305,  0.4728,  0.8954, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6076,  0.8439,  0.2305,  0.4728,  0.8954, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6076,  0.8439,  0.2305,  0.4728,  0.8954, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6076,  0.8439,  0.2305,  0.4728,  0.8954, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6076,  0.8439,  0.2305,  0.4728,  0.8954, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6076,  0.8439,  0.2305,  0.4728,  0.8954, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6076,  0.8439,  0.2305,  0.4728,  0.8954, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6076,  0.8439,  0.2305,  0.4728,  0.8954, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6076,  0.8439,  0.2305,  0.4728,  0.8954, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6076,  0.8439,  0.2305,  0.4728,  0.8954, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6076,  0.8439,  0.2305,  0.4728,  0.8954, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6076,  0.8439,  0.2305,  0.4728,  0.8954, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6076,  0.8439,  0.2305,  0.4728,  0.8954, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6076,  0.8439,  0.2305,  0.4728,  0.8954, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6076,  0.8439,  0.2305,  0.4728,  0.8954, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6076,  0.8439,  0.2305,  0.4728,  0.8954, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361, -0.1321,  0.3443,  0.4496,  0.0536, -1.1238, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1321,  0.3443,  0.4496,  0.0536, -1.1238, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 2


Tokens embeddings:
tensor([-0.5361, -0.1321,  0.3443,  0.4496,  0.0536, -1.1238, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1321,  0.3443,  0.4496,  0.0536, -1.1238, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 3


Tokens embeddings:
tensor([-0.5361, -0.1321,  0.3443,  0.4496,  0.0536, -1.1238, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1321,  0.3443,  0.4496,  0.0536, -1.1238, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 4


Tokens embeddings:
tensor([-0.5361, -0.1321,  0.3443,  0.4496,  0.0536, -1.1238, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1321,  0.3443,  0.4496,  0.0536, -1.1238, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 5


Tokens embeddings:
tensor([-0.5361, -0.1321,  0.3443,  0.4496,  0.0536, -1.1238, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1321,  0.3443,  0.4496,  0.0536, -1.1238, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 6


Tokens embeddings:
tensor([-0.5361, -0.1321,  0.3443,  0.4496,  0.0536, -1.1238, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1321,  0.3443,  0.4496,  0.0536, -1.1238, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 7


Tokens embeddings:
tensor([-0.5361, -0.1321,  0.3443,  0.4496,  0.0536, -1.1238, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1321,  0.3443,  0.4496,  0.0536, -1.1238, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 8


Tokens embeddings:
tensor([-0.5361, -0.1321,  0.3443,  0.4496,  0.0536, -1.1238, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1321,  0.3443,  0.4496,  0.0536, -1.1238, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 9


Tokens embeddings:
tensor([-0.5361, -0.1321,  0.3443,  0.4496,  0.0536, -1.1238, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1321,  0.3443,  0.4496,  0.0536, -1.1238, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 10


Tokens embeddings:
tensor([-0.5361, -0.1321,  0.3443,  0.4496,  0.0536, -1.1238, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1321,  0.3443,  0.4496,  0.0536, -1.1238, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 11


Tokens embeddings:
tensor([-0.5361, -0.1321,  0.3443,  0.4496,  0.0536, -1.1238, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1321,  0.3443,  0.4496,  0.0536, -1.1238, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 12


Tokens embeddings:
tensor([-0.5361, -0.1321,  0.3443,  0.4496,  0.0536, -1.1238, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1321,  0.3443,  0.4496,  0.0536, -1.1238, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.7017, -0.0104, -0.3768, -0.3048,  0.1122, -0.8553,
         0.0536,  0.4487, -0.6537, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.7017, -0.0104, -0.3768, -0.3048,  0.1122, -0.8553,
         0.0536,  0.4487, -0.6537, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.7017, -0.0104, -0.3768, -0.3048,  0.1122, -0.8553,
         0.0536,  0.4487, -0.6537, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.7017, -0.0104, -0.3768, -0.3048,  0.1122, -0.8553,
         0.0536,  0.4487, -0.6537, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.7017, -0.0104, -0.3768, -0.3048,  0.1122, -0.8553,
         0.0536,  0.4487, -0.6537, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.7017, -0.0104, -0.3768, -0.3048,  0.1122, -0.8553,
         0.0536,  0.4487, -0.6537, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.7017, -0.0104, -0.3768, -0.3048,  0.1122, -0.8553,
         0.0536,  0.4487, -0.6537, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.7017, -0.0104, -0.3768, -0.3048,  0.1122, -0.8553,
         0.0536,  0.4487, -0.6537, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.7017, -0.0104, -0.3768, -0.3048,  0.1122, -0.8553,
         0.0536,  0.4487, -0.6537, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.7017, -0.0104, -0.3768, -0.3048,  0.1122, -0.8553,
         0.0536,  0.4487, -0.6537, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.7017, -0.0104, -0.3768, -0.3048,  0.1122, -0.8553,
         0.0536,  0.4487, -0.6537, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.7017, -0.0104, -0.3768, -0.3048,  0.1122, -0.8553,
         0.0536,  0.4487, -0.6537, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.7017, -0.0104, -0.3768, -0.3048,  0.1122, -0.8553,
         0.0536,  0.4487, -0.6537, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.7017, -0.0104, -0.3768, -0.3048,  0.1122, -0.8553,
         0.0536,  0.4487, -0.6537, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.7017, -0.0104, -0.3768, -0.3048,  0.1122, -0.8553,
         0.0536,  0.4487, -0.6537, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.7017, -0.0104, -0.3768, -0.3048,  0.1122, -0.8553,
         0.0536,  0.4487, -0.6537, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.7017, -0.0104, -0.3768, -0.3048,  0.1122, -0.8553,
         0.0536,  0.4487, -0.6537, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.7017, -0.0104, -0.3768, -0.3048,  0.1122, -0.8553,
         0.0536,  0.4487, -0.6537, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.7017, -0.0104, -0.3768, -0.3048,  0.1122, -0.8553,
         0.0536,  0.4487, -0.6537, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.7017, -0.0104, -0.3768, -0.3048,  0.1122, -0.8553,
         0.0536,  0.4487, -0.6537, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.7017, -0.0104, -0.3768, -0.3048,  0.1122, -0.8553,
         0.0536,  0.4487, -0.6537, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.7017, -0.0104, -0.3768, -0.3048,  0.1122, -0.8553,
         0.0536,  0.4487, -0.6537, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.7017, -0.0104, -0.3768, -0.3048,  0.1122, -0.8553,
         0.0536,  0.4487, -0.6537, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.7017, -0.0104, -0.3768, -0.3048,  0.1122, -0.8553,
         0.0536,  0.4487, -0.6537, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1136,  0.3443,  0.3893,  0.4195, -0.7570,  0.3510,
         0.1545,  0.1217,  0.3548,  0.0335,  0.0498, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1136,  0.3443,  0.3893,  0.4195, -0.7570,  0.3510,
         0.1545,  0.1217,  0.3548,  0.0335,  0.0498, -0.5098, -0.3798, -0.2843,
        -0.2843]): 5

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1136,  0.3443,  0.3893,  0.4195, -0.7570,  0.3510,
         0.1545,  0.1217,  0.3548,  0.0335,  0.0498, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1136,  0.3443,  0.3893,  0.4195, -0.7570,  0.3510,
         0.1545,  0.1217,  0.3548,  0.0335,  0.0498, -0.5098, -0.3798, -0.2843,
        -0.2843]): 5

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1136,  0.3443,  0.3893,  0.4195, -0.7570,  0.3510,
         0.1545,  0.1217,  0.3548,  0.0335,  0.0498, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1136,  0.3443,  0.3893,  0.4195, -0.7570,  0.3510,
         0.1545,  0.1217,  0.3548,  0.0335,  0.0498, -0.5098, -0.3798, -0.2843,
        -0.2843]): 5

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1136,  0.3443,  0.3893,  0.4195, -0.7570,  0.3510,
         0.1545,  0.1217,  0.3548,  0.0335,  0.0498, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1136,  0.3443,  0.3893,  0.4195, -0.7570,  0.3510,
         0.1545,  0.1217,  0.3548,  0.0335,  0.0498, -0.5098, -0.3798, -0.2843,
        -0.2843]): 5

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1136,  0.3443,  0.3893,  0.4195, -0.7570,  0.3510,
         0.1545,  0.1217,  0.3548,  0.0335,  0.0498, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1136,  0.3443,  0.3893,  0.4195, -0.7570,  0.3510,
         0.1545,  0.1217,  0.3548,  0.0335,  0.0498, -0.5098, -0.3798, -0.2843,
        -0.2843]): 5

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1136,  0.3443,  0.3893,  0.4195, -0.7570,  0.3510,
         0.1545,  0.1217,  0.3548,  0.0335,  0.0498, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1136,  0.3443,  0.3893,  0.4195, -0.7570,  0.3510,
         0.1545,  0.1217,  0.3548,  0.0335,  0.0498, -0.5098, -0.3798, -0.2843,
        -0.2843]): 5

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1136,  0.3443,  0.3893,  0.4195, -0.7570,  0.3510,
         0.1545,  0.1217,  0.3548,  0.0335,  0.0498, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1136,  0.3443,  0.3893,  0.4195, -0.7570,  0.3510,
         0.1545,  0.1217,  0.3548,  0.0335,  0.0498, -0.5098, -0.3798, -0.2843,
        -0.2843]): 5

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1136,  0.3443,  0.3893,  0.4195, -0.7570,  0.3510,
         0.1545,  0.1217,  0.3548,  0.0335,  0.0498, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1136,  0.3443,  0.3893,  0.4195, -0.7570,  0.3510,
         0.1545,  0.1217,  0.3548,  0.0335,  0.0498, -0.5098, -0.3798, -0.2843,
        -0.2843]): 5

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1136,  0.3443,  0.3893,  0.4195, -0.7570,  0.3510,
         0.1545,  0.1217,  0.3548,  0.0335,  0.0498, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1136,  0.3443,  0.3893,  0.4195, -0.7570,  0.3510,
         0.1545,  0.1217,  0.3548,  0.0335,  0.0498, -0.5098, -0.3798, -0.2843,
        -0.2843]): 5

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1136,  0.3443,  0.3893,  0.4195, -0.7570,  0.3510,
         0.1545,  0.1217,  0.3548,  0.0335,  0.0498, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1136,  0.3443,  0.3893,  0.4195, -0.7570,  0.3510,
         0.1545,  0.1217,  0.3548,  0.0335,  0.0498, -0.5098, -0.3798, -0.2843,
        -0.2843]): 5

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1136,  0.3443,  0.3893,  0.4195, -0.7570,  0.3510,
         0.1545,  0.1217,  0.3548,  0.0335,  0.0498, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1136,  0.3443,  0.3893,  0.4195, -0.7570,  0.3510,
         0.1545,  0.1217,  0.3548,  0.0335,  0.0498, -0.5098, -0.3798, -0.2843,
        -0.2843]): 5

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1136,  0.3443,  0.3893,  0.4195, -0.7570,  0.3510,
         0.1545,  0.1217,  0.3548,  0.0335,  0.0498, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1136,  0.3443,  0.3893,  0.4195, -0.7570,  0.3510,
         0.1545,  0.1217,  0.3548,  0.0335,  0.0498, -0.5098, -0.3798, -0.2843,
        -0.2843]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361, -0.3231, -0.3362,  0.2568, -0.4200, -0.2508, -0.5636, -0.3115,
         0.0333, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.3231, -0.3362,  0.2568, -0.4200, -0.2508, -0.5636, -0.3115,
         0.0333, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 2


Tokens embeddings:
tensor([-0.5361, -0.3231, -0.3362,  0.2568, -0.4200, -0.2508, -0.5636, -0.3115,
         0.0333, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.3231, -0.3362,  0.2568, -0.4200, -0.2508, -0.5636, -0.3115,
         0.0333, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 3


Tokens embeddings:
tensor([-0.5361, -0.3231, -0.3362,  0.2568, -0.4200, -0.2508, -0.5636, -0.3115,
         0.0333, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.3231, -0.3362,  0.2568, -0.4200, -0.2508, -0.5636, -0.3115,
         0.0333, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 4


Tokens embeddings:
tensor([-0.5361, -0.3231, -0.3362,  0.2568, -0.4200, -0.2508, -0.5636, -0.3115,
         0.0333, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.3231, -0.3362,  0.2568, -0.4200, -0.2508, -0.5636, -0.3115,
         0.0333, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 5


Tokens embeddings:
tensor([-0.5361, -0.3231, -0.3362,  0.2568, -0.4200, -0.2508, -0.5636, -0.3115,
         0.0333, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.3231, -0.3362,  0.2568, -0.4200, -0.2508, -0.5636, -0.3115,
         0.0333, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 6


Tokens embeddings:
tensor([-0.5361, -0.3231, -0.3362,  0.2568, -0.4200, -0.2508, -0.5636, -0.3115,
         0.0333, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.3231, -0.3362,  0.2568, -0.4200, -0.2508, -0.5636, -0.3115,
         0.0333, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 7


Tokens embeddings:
tensor([-0.5361, -0.3231, -0.3362,  0.2568, -0.4200, -0.2508, -0.5636, -0.3115,
         0.0333, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.3231, -0.3362,  0.2568, -0.4200, -0.2508, -0.5636, -0.3115,
         0.0333, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 8


Tokens embeddings:
tensor([-0.5361, -0.3231, -0.3362,  0.2568, -0.4200, -0.2508, -0.5636, -0.3115,
         0.0333, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.3231, -0.3362,  0.2568, -0.4200, -0.2508, -0.5636, -0.3115,
         0.0333, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 9


Tokens embeddings:
tensor([-0.5361, -0.3231, -0.3362,  0.2568, -0.4200, -0.2508, -0.5636, -0.3115,
         0.0333, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.3231, -0.3362,  0.2568, -0.4200, -0.2508, -0.5636, -0.3115,
         0.0333, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 10


Tokens embeddings:
tensor([-0.5361, -0.3231, -0.3362,  0.2568, -0.4200, -0.2508, -0.5636, -0.3115,
         0.0333, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.3231, -0.3362,  0.2568, -0.4200, -0.2508, -0.5636, -0.3115,
         0.0333, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 11


Tokens embeddings:
tensor([-0.5361, -0.3231, -0.3362,  0.2568, -0.4200, -0.2508, -0.5636, -0.3115,
         0.0333, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.3231, -0.3362,  0.2568, -0.4200, -0.2508, -0.5636, -0.3115,
         0.0333, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 12


Tokens embeddings:
tensor([-0.5361, -0.3231, -0.3362,  0.2568, -0.4200, -0.2508, -0.5636, -0.3115,
         0.0333, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.3231, -0.3362,  0.2568, -0.4200, -0.2508, -0.5636, -0.3115,
         0.0333, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.7848,  0.0335,  0.0056, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.7848,  0.0335,  0.0056, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.7848,  0.0335,  0.0056, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.7848,  0.0335,  0.0056, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.7848,  0.0335,  0.0056, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.7848,  0.0335,  0.0056, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.7848,  0.0335,  0.0056, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.7848,  0.0335,  0.0056, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.7848,  0.0335,  0.0056, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.7848,  0.0335,  0.0056, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.7848,  0.0335,  0.0056, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.7848,  0.0335,  0.0056, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.7848,  0.0335,  0.0056, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.7848,  0.0335,  0.0056, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.7848,  0.0335,  0.0056, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.7848,  0.0335,  0.0056, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.7848,  0.0335,  0.0056, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.7848,  0.0335,  0.0056, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.7848,  0.0335,  0.0056, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.7848,  0.0335,  0.0056, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.7848,  0.0335,  0.0056, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.7848,  0.0335,  0.0056, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.7848,  0.0335,  0.0056, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.7848,  0.0335,  0.0056, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.5500, -0.3782, -0.3837,  0.2425, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.5500, -0.3782, -0.3837,  0.2425, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.5500, -0.3782, -0.3837,  0.2425, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.5500, -0.3782, -0.3837,  0.2425, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.5500, -0.3782, -0.3837,  0.2425, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.5500, -0.3782, -0.3837,  0.2425, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.5500, -0.3782, -0.3837,  0.2425, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.5500, -0.3782, -0.3837,  0.2425, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.5500, -0.3782, -0.3837,  0.2425, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.5500, -0.3782, -0.3837,  0.2425, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.5500, -0.3782, -0.3837,  0.2425, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.5500, -0.3782, -0.3837,  0.2425, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.5500, -0.3782, -0.3837,  0.2425, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.5500, -0.3782, -0.3837,  0.2425, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.5500, -0.3782, -0.3837,  0.2425, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.5500, -0.3782, -0.3837,  0.2425, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.5500, -0.3782, -0.3837,  0.2425, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.5500, -0.3782, -0.3837,  0.2425, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.5500, -0.3782, -0.3837,  0.2425, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.5500, -0.3782, -0.3837,  0.2425, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.5500, -0.3782, -0.3837,  0.2425, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.5500, -0.3782, -0.3837,  0.2425, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.5500, -0.3782, -0.3837,  0.2425, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.5500, -0.3782, -0.3837,  0.2425, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.0902,  0.3607, -0.4600,  0.3548, -0.5022, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0902,  0.3607, -0.4600,  0.3548, -0.5022, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.0902,  0.3607, -0.4600,  0.3548, -0.5022, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0902,  0.3607, -0.4600,  0.3548, -0.5022, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.0902,  0.3607, -0.4600,  0.3548, -0.5022, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0902,  0.3607, -0.4600,  0.3548, -0.5022, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.0902,  0.3607, -0.4600,  0.3548, -0.5022, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0902,  0.3607, -0.4600,  0.3548, -0.5022, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.0902,  0.3607, -0.4600,  0.3548, -0.5022, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0902,  0.3607, -0.4600,  0.3548, -0.5022, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.0902,  0.3607, -0.4600,  0.3548, -0.5022, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0902,  0.3607, -0.4600,  0.3548, -0.5022, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.0902,  0.3607, -0.4600,  0.3548, -0.5022, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0902,  0.3607, -0.4600,  0.3548, -0.5022, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.0902,  0.3607, -0.4600,  0.3548, -0.5022, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0902,  0.3607, -0.4600,  0.3548, -0.5022, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.0902,  0.3607, -0.4600,  0.3548, -0.5022, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0902,  0.3607, -0.4600,  0.3548, -0.5022, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.0902,  0.3607, -0.4600,  0.3548, -0.5022, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0902,  0.3607, -0.4600,  0.3548, -0.5022, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.0902,  0.3607, -0.4600,  0.3548, -0.5022, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0902,  0.3607, -0.4600,  0.3548, -0.5022, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.0902,  0.3607, -0.4600,  0.3548, -0.5022, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0902,  0.3607, -0.4600,  0.3548, -0.5022, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.0147,  0.1437, -0.5636,  0.3996,  0.3607, -0.4070,
        -0.1878,  0.4195,  0.3607, -0.7143, -0.2401, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.0147,  0.1437, -0.5636,  0.3996,  0.3607, -0.4070,
        -0.1878,  0.4195,  0.3607, -0.7143, -0.2401, -0.5098, -0.3798, -0.2843,
        -0.2843]): 9

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.0147,  0.1437, -0.5636,  0.3996,  0.3607, -0.4070,
        -0.1878,  0.4195,  0.3607, -0.7143, -0.2401, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.0147,  0.1437, -0.5636,  0.3996,  0.3607, -0.4070,
        -0.1878,  0.4195,  0.3607, -0.7143, -0.2401, -0.5098, -0.3798, -0.2843,
        -0.2843]): 9

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.0147,  0.1437, -0.5636,  0.3996,  0.3607, -0.4070,
        -0.1878,  0.4195,  0.3607, -0.7143, -0.2401, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.0147,  0.1437, -0.5636,  0.3996,  0.3607, -0.4070,
        -0.1878,  0.4195,  0.3607, -0.7143, -0.2401, -0.5098, -0.3798, -0.2843,
        -0.2843]): 9

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.0147,  0.1437, -0.5636,  0.3996,  0.3607, -0.4070,
        -0.1878,  0.4195,  0.3607, -0.7143, -0.2401, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.0147,  0.1437, -0.5636,  0.3996,  0.3607, -0.4070,
        -0.1878,  0.4195,  0.3607, -0.7143, -0.2401, -0.5098, -0.3798, -0.2843,
        -0.2843]): 9

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.0147,  0.1437, -0.5636,  0.3996,  0.3607, -0.4070,
        -0.1878,  0.4195,  0.3607, -0.7143, -0.2401, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.0147,  0.1437, -0.5636,  0.3996,  0.3607, -0.4070,
        -0.1878,  0.4195,  0.3607, -0.7143, -0.2401, -0.5098, -0.3798, -0.2843,
        -0.2843]): 9

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.0147,  0.1437, -0.5636,  0.3996,  0.3607, -0.4070,
        -0.1878,  0.4195,  0.3607, -0.7143, -0.2401, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.0147,  0.1437, -0.5636,  0.3996,  0.3607, -0.4070,
        -0.1878,  0.4195,  0.3607, -0.7143, -0.2401, -0.5098, -0.3798, -0.2843,
        -0.2843]): 9

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.0147,  0.1437, -0.5636,  0.3996,  0.3607, -0.4070,
        -0.1878,  0.4195,  0.3607, -0.7143, -0.2401, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.0147,  0.1437, -0.5636,  0.3996,  0.3607, -0.4070,
        -0.1878,  0.4195,  0.3607, -0.7143, -0.2401, -0.5098, -0.3798, -0.2843,
        -0.2843]): 9

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.0147,  0.1437, -0.5636,  0.3996,  0.3607, -0.4070,
        -0.1878,  0.4195,  0.3607, -0.7143, -0.2401, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.0147,  0.1437, -0.5636,  0.3996,  0.3607, -0.4070,
        -0.1878,  0.4195,  0.3607, -0.7143, -0.2401, -0.5098, -0.3798, -0.2843,
        -0.2843]): 9

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.0147,  0.1437, -0.5636,  0.3996,  0.3607, -0.4070,
        -0.1878,  0.4195,  0.3607, -0.7143, -0.2401, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.0147,  0.1437, -0.5636,  0.3996,  0.3607, -0.4070,
        -0.1878,  0.4195,  0.3607, -0.7143, -0.2401, -0.5098, -0.3798, -0.2843,
        -0.2843]): 9

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.0147,  0.1437, -0.5636,  0.3996,  0.3607, -0.4070,
        -0.1878,  0.4195,  0.3607, -0.7143, -0.2401, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.0147,  0.1437, -0.5636,  0.3996,  0.3607, -0.4070,
        -0.1878,  0.4195,  0.3607, -0.7143, -0.2401, -0.5098, -0.3798, -0.2843,
        -0.2843]): 9

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.0147,  0.1437, -0.5636,  0.3996,  0.3607, -0.4070,
        -0.1878,  0.4195,  0.3607, -0.7143, -0.2401, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.0147,  0.1437, -0.5636,  0.3996,  0.3607, -0.4070,
        -0.1878,  0.4195,  0.3607, -0.7143, -0.2401, -0.5098, -0.3798, -0.2843,
        -0.2843]): 9

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.0147,  0.1437, -0.5636,  0.3996,  0.3607, -0.4070,
        -0.1878,  0.4195,  0.3607, -0.7143, -0.2401, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.0147,  0.1437, -0.5636,  0.3996,  0.3607, -0.4070,
        -0.1878,  0.4195,  0.3607, -0.7143, -0.2401, -0.5098, -0.3798, -0.2843,
        -0.2843]): 9
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.0832, -0.0261, -0.0819, -0.0388, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832, -0.0261, -0.0819, -0.0388, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.0832, -0.0261, -0.0819, -0.0388, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832, -0.0261, -0.0819, -0.0388, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.0832, -0.0261, -0.0819, -0.0388, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832, -0.0261, -0.0819, -0.0388, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.0832, -0.0261, -0.0819, -0.0388, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832, -0.0261, -0.0819, -0.0388, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.0832, -0.0261, -0.0819, -0.0388, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832, -0.0261, -0.0819, -0.0388, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.0832, -0.0261, -0.0819, -0.0388, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832, -0.0261, -0.0819, -0.0388, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.0832, -0.0261, -0.0819, -0.0388, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832, -0.0261, -0.0819, -0.0388, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.0832, -0.0261, -0.0819, -0.0388, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832, -0.0261, -0.0819, -0.0388, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.0832, -0.0261, -0.0819, -0.0388, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832, -0.0261, -0.0819, -0.0388, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.0832, -0.0261, -0.0819, -0.0388, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832, -0.0261, -0.0819, -0.0388, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.0832, -0.0261, -0.0819, -0.0388, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832, -0.0261, -0.0819, -0.0388, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.0832, -0.0261, -0.0819, -0.0388, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832, -0.0261, -0.0819, -0.0388, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1


In [None]:
# Max pooling over all tokens of each selected sentence, followed by argmax

# Sentences to inspect: the consecutive indices 21..64 (a fixed slice, not a random sample)
sent_index = list(range(21, 65))

for s in sent_index:
  print("*" * 100)
  # Get the sentence's words
  tokens = test_inputs[s]

  # Encoding, forward pass, pooling and argmax use only the sentence and never
  # the layer index, so they are computed once per sentence rather than once per
  # layer (the recorded output shows the same values repeated for all 12 layers).
  encoded_tokens = bert_tokenizer(tokens, truncation=True, padding=True, max_length=128, return_tensors='pt')
  encoded_tokens = encoded_tokens.to(device)
  with torch.no_grad():
    model_output1 = model_e(**encoded_tokens)
    tokens_embeddings = max_pooling(model_output1, encoded_tokens['attention_mask'])
    tokens_embeddings = tokens_embeddings.cpu()
  arg = argmax(tokens_embeddings)

  # For each layer...
  for l in range(12):
    print("\nLayer", l+1)
    # Attention stored for sentence s at layer l, with its leading axis squeezed out.
    # It is not read again in this cell; the assignment is kept because later
    # cells may rely on the notebook-level `attention` variable.
    attention = np.squeeze(test_attentions[s][l].detach().cpu().numpy(), axis=0)
    # NOTE(review): `head` is never assigned in this cell, so it is whatever an
    # earlier cell left in the kernel and the coloring below does not follow
    # `s` or `l`. Presumably a head of `attention` was intended -- confirm and
    # select it explicitly so the cell survives Restart & Run All.
    cls_attentions = head[0]
    display(HTML(colorize(tokens, cls_attentions)))
    print("Tokens embeddings:")
    print(tokens_embeddings)
    print('arg max of %s: %d' % (tokens_embeddings, arg))

****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.6823, 0.9243, 1.1698, 0.5754, 0.8282, 1.4670, 0.4195, 0.7382,
        0.7738, 1.1698, 0.7485, 0.4781, 0.8282, 1.2511, 1.1193, 0.9143])
arg max of tensor([0.9895, 0.6823, 0.9243, 1.1698, 0.5754, 0.8282, 1.4670, 0.4195, 0.7382,
        0.7738, 1.1698, 0.7485, 0.4781, 0.8282, 1.2511, 1.1193, 0.9143]): 6

Layer 2


Tokens embeddings:
tensor([0.9895, 0.6823, 0.9243, 1.1698, 0.5754, 0.8282, 1.4670, 0.4195, 0.7382,
        0.7738, 1.1698, 0.7485, 0.4781, 0.8282, 1.2511, 1.1193, 0.9143])
arg max of tensor([0.9895, 0.6823, 0.9243, 1.1698, 0.5754, 0.8282, 1.4670, 0.4195, 0.7382,
        0.7738, 1.1698, 0.7485, 0.4781, 0.8282, 1.2511, 1.1193, 0.9143]): 6

Layer 3


Tokens embeddings:
tensor([0.9895, 0.6823, 0.9243, 1.1698, 0.5754, 0.8282, 1.4670, 0.4195, 0.7382,
        0.7738, 1.1698, 0.7485, 0.4781, 0.8282, 1.2511, 1.1193, 0.9143])
arg max of tensor([0.9895, 0.6823, 0.9243, 1.1698, 0.5754, 0.8282, 1.4670, 0.4195, 0.7382,
        0.7738, 1.1698, 0.7485, 0.4781, 0.8282, 1.2511, 1.1193, 0.9143]): 6

Layer 4


Tokens embeddings:
tensor([0.9895, 0.6823, 0.9243, 1.1698, 0.5754, 0.8282, 1.4670, 0.4195, 0.7382,
        0.7738, 1.1698, 0.7485, 0.4781, 0.8282, 1.2511, 1.1193, 0.9143])
arg max of tensor([0.9895, 0.6823, 0.9243, 1.1698, 0.5754, 0.8282, 1.4670, 0.4195, 0.7382,
        0.7738, 1.1698, 0.7485, 0.4781, 0.8282, 1.2511, 1.1193, 0.9143]): 6

Layer 5


Tokens embeddings:
tensor([0.9895, 0.6823, 0.9243, 1.1698, 0.5754, 0.8282, 1.4670, 0.4195, 0.7382,
        0.7738, 1.1698, 0.7485, 0.4781, 0.8282, 1.2511, 1.1193, 0.9143])
arg max of tensor([0.9895, 0.6823, 0.9243, 1.1698, 0.5754, 0.8282, 1.4670, 0.4195, 0.7382,
        0.7738, 1.1698, 0.7485, 0.4781, 0.8282, 1.2511, 1.1193, 0.9143]): 6

Layer 6


Tokens embeddings:
tensor([0.9895, 0.6823, 0.9243, 1.1698, 0.5754, 0.8282, 1.4670, 0.4195, 0.7382,
        0.7738, 1.1698, 0.7485, 0.4781, 0.8282, 1.2511, 1.1193, 0.9143])
arg max of tensor([0.9895, 0.6823, 0.9243, 1.1698, 0.5754, 0.8282, 1.4670, 0.4195, 0.7382,
        0.7738, 1.1698, 0.7485, 0.4781, 0.8282, 1.2511, 1.1193, 0.9143]): 6

Layer 7


Tokens embeddings:
tensor([0.9895, 0.6823, 0.9243, 1.1698, 0.5754, 0.8282, 1.4670, 0.4195, 0.7382,
        0.7738, 1.1698, 0.7485, 0.4781, 0.8282, 1.2511, 1.1193, 0.9143])
arg max of tensor([0.9895, 0.6823, 0.9243, 1.1698, 0.5754, 0.8282, 1.4670, 0.4195, 0.7382,
        0.7738, 1.1698, 0.7485, 0.4781, 0.8282, 1.2511, 1.1193, 0.9143]): 6

Layer 8


Tokens embeddings:
tensor([0.9895, 0.6823, 0.9243, 1.1698, 0.5754, 0.8282, 1.4670, 0.4195, 0.7382,
        0.7738, 1.1698, 0.7485, 0.4781, 0.8282, 1.2511, 1.1193, 0.9143])
arg max of tensor([0.9895, 0.6823, 0.9243, 1.1698, 0.5754, 0.8282, 1.4670, 0.4195, 0.7382,
        0.7738, 1.1698, 0.7485, 0.4781, 0.8282, 1.2511, 1.1193, 0.9143]): 6

Layer 9


Tokens embeddings:
tensor([0.9895, 0.6823, 0.9243, 1.1698, 0.5754, 0.8282, 1.4670, 0.4195, 0.7382,
        0.7738, 1.1698, 0.7485, 0.4781, 0.8282, 1.2511, 1.1193, 0.9143])
arg max of tensor([0.9895, 0.6823, 0.9243, 1.1698, 0.5754, 0.8282, 1.4670, 0.4195, 0.7382,
        0.7738, 1.1698, 0.7485, 0.4781, 0.8282, 1.2511, 1.1193, 0.9143]): 6

Layer 10


Tokens embeddings:
tensor([0.9895, 0.6823, 0.9243, 1.1698, 0.5754, 0.8282, 1.4670, 0.4195, 0.7382,
        0.7738, 1.1698, 0.7485, 0.4781, 0.8282, 1.2511, 1.1193, 0.9143])
arg max of tensor([0.9895, 0.6823, 0.9243, 1.1698, 0.5754, 0.8282, 1.4670, 0.4195, 0.7382,
        0.7738, 1.1698, 0.7485, 0.4781, 0.8282, 1.2511, 1.1193, 0.9143]): 6

Layer 11


Tokens embeddings:
tensor([0.9895, 0.6823, 0.9243, 1.1698, 0.5754, 0.8282, 1.4670, 0.4195, 0.7382,
        0.7738, 1.1698, 0.7485, 0.4781, 0.8282, 1.2511, 1.1193, 0.9143])
arg max of tensor([0.9895, 0.6823, 0.9243, 1.1698, 0.5754, 0.8282, 1.4670, 0.4195, 0.7382,
        0.7738, 1.1698, 0.7485, 0.4781, 0.8282, 1.2511, 1.1193, 0.9143]): 6

Layer 12


Tokens embeddings:
tensor([0.9895, 0.6823, 0.9243, 1.1698, 0.5754, 0.8282, 1.4670, 0.4195, 0.7382,
        0.7738, 1.1698, 0.7485, 0.4781, 0.8282, 1.2511, 1.1193, 0.9143])
arg max of tensor([0.9895, 0.6823, 0.9243, 1.1698, 0.5754, 0.8282, 1.4670, 0.4195, 0.7382,
        0.7738, 1.1698, 0.7485, 0.4781, 0.8282, 1.2511, 1.1193, 0.9143]): 6
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9571, 0.5298, 0.4600, 0.4242, 1.1202, 0.8635, 0.4242,
        0.8998, 1.0932, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9571, 0.5298, 0.4600, 0.4242, 1.1202, 0.8635, 0.4242,
        0.8998, 1.0932, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 2


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9571, 0.5298, 0.4600, 0.4242, 1.1202, 0.8635, 0.4242,
        0.8998, 1.0932, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9571, 0.5298, 0.4600, 0.4242, 1.1202, 0.8635, 0.4242,
        0.8998, 1.0932, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 3


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9571, 0.5298, 0.4600, 0.4242, 1.1202, 0.8635, 0.4242,
        0.8998, 1.0932, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9571, 0.5298, 0.4600, 0.4242, 1.1202, 0.8635, 0.4242,
        0.8998, 1.0932, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 4


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9571, 0.5298, 0.4600, 0.4242, 1.1202, 0.8635, 0.4242,
        0.8998, 1.0932, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9571, 0.5298, 0.4600, 0.4242, 1.1202, 0.8635, 0.4242,
        0.8998, 1.0932, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 5


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9571, 0.5298, 0.4600, 0.4242, 1.1202, 0.8635, 0.4242,
        0.8998, 1.0932, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9571, 0.5298, 0.4600, 0.4242, 1.1202, 0.8635, 0.4242,
        0.8998, 1.0932, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 6


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9571, 0.5298, 0.4600, 0.4242, 1.1202, 0.8635, 0.4242,
        0.8998, 1.0932, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9571, 0.5298, 0.4600, 0.4242, 1.1202, 0.8635, 0.4242,
        0.8998, 1.0932, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 7


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9571, 0.5298, 0.4600, 0.4242, 1.1202, 0.8635, 0.4242,
        0.8998, 1.0932, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9571, 0.5298, 0.4600, 0.4242, 1.1202, 0.8635, 0.4242,
        0.8998, 1.0932, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 8


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9571, 0.5298, 0.4600, 0.4242, 1.1202, 0.8635, 0.4242,
        0.8998, 1.0932, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9571, 0.5298, 0.4600, 0.4242, 1.1202, 0.8635, 0.4242,
        0.8998, 1.0932, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 9


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9571, 0.5298, 0.4600, 0.4242, 1.1202, 0.8635, 0.4242,
        0.8998, 1.0932, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9571, 0.5298, 0.4600, 0.4242, 1.1202, 0.8635, 0.4242,
        0.8998, 1.0932, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 10


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9571, 0.5298, 0.4600, 0.4242, 1.1202, 0.8635, 0.4242,
        0.8998, 1.0932, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9571, 0.5298, 0.4600, 0.4242, 1.1202, 0.8635, 0.4242,
        0.8998, 1.0932, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 11


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9571, 0.5298, 0.4600, 0.4242, 1.1202, 0.8635, 0.4242,
        0.8998, 1.0932, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9571, 0.5298, 0.4600, 0.4242, 1.1202, 0.8635, 0.4242,
        0.8998, 1.0932, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 12


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9571, 0.5298, 0.4600, 0.4242, 1.1202, 0.8635, 0.4242,
        0.8998, 1.0932, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9571, 0.5298, 0.4600, 0.4242, 1.1202, 0.8635, 0.4242,
        0.8998, 1.0932, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 6
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.6923, 1.1603, 0.4242, 0.4241, 0.5349, 0.8282, 0.7548, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6923, 1.1603, 0.4242, 0.4241, 0.5349, 0.8282, 0.7548, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 2


Tokens embeddings:
tensor([0.9895, 0.6923, 1.1603, 0.4242, 0.4241, 0.5349, 0.8282, 0.7548, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6923, 1.1603, 0.4242, 0.4241, 0.5349, 0.8282, 0.7548, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 3


Tokens embeddings:
tensor([0.9895, 0.6923, 1.1603, 0.4242, 0.4241, 0.5349, 0.8282, 0.7548, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6923, 1.1603, 0.4242, 0.4241, 0.5349, 0.8282, 0.7548, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 4


Tokens embeddings:
tensor([0.9895, 0.6923, 1.1603, 0.4242, 0.4241, 0.5349, 0.8282, 0.7548, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6923, 1.1603, 0.4242, 0.4241, 0.5349, 0.8282, 0.7548, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 5


Tokens embeddings:
tensor([0.9895, 0.6923, 1.1603, 0.4242, 0.4241, 0.5349, 0.8282, 0.7548, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6923, 1.1603, 0.4242, 0.4241, 0.5349, 0.8282, 0.7548, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 6


Tokens embeddings:
tensor([0.9895, 0.6923, 1.1603, 0.4242, 0.4241, 0.5349, 0.8282, 0.7548, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6923, 1.1603, 0.4242, 0.4241, 0.5349, 0.8282, 0.7548, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 7


Tokens embeddings:
tensor([0.9895, 0.6923, 1.1603, 0.4242, 0.4241, 0.5349, 0.8282, 0.7548, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6923, 1.1603, 0.4242, 0.4241, 0.5349, 0.8282, 0.7548, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 8


Tokens embeddings:
tensor([0.9895, 0.6923, 1.1603, 0.4242, 0.4241, 0.5349, 0.8282, 0.7548, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6923, 1.1603, 0.4242, 0.4241, 0.5349, 0.8282, 0.7548, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 9


Tokens embeddings:
tensor([0.9895, 0.6923, 1.1603, 0.4242, 0.4241, 0.5349, 0.8282, 0.7548, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6923, 1.1603, 0.4242, 0.4241, 0.5349, 0.8282, 0.7548, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 10


Tokens embeddings:
tensor([0.9895, 0.6923, 1.1603, 0.4242, 0.4241, 0.5349, 0.8282, 0.7548, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6923, 1.1603, 0.4242, 0.4241, 0.5349, 0.8282, 0.7548, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 11


Tokens embeddings:
tensor([0.9895, 0.6923, 1.1603, 0.4242, 0.4241, 0.5349, 0.8282, 0.7548, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6923, 1.1603, 0.4242, 0.4241, 0.5349, 0.8282, 0.7548, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 12


Tokens embeddings:
tensor([0.9895, 0.6923, 1.1603, 0.4242, 0.4241, 0.5349, 0.8282, 0.7548, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6923, 1.1603, 0.4242, 0.4241, 0.5349, 0.8282, 0.7548, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 9.8949e-01,  7.3591e-01,  1.2598e+00,  4.2417e-01, -1.0000e+09,
         1.1193e+00,  9.1429e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  7.3591e-01,  1.2598e+00,  4.2417e-01, -1.0000e+09,
         1.1193e+00,  9.1429e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 2

Layer 2


Tokens embeddings:
tensor([ 9.8949e-01,  7.3591e-01,  1.2598e+00,  4.2417e-01, -1.0000e+09,
         1.1193e+00,  9.1429e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  7.3591e-01,  1.2598e+00,  4.2417e-01, -1.0000e+09,
         1.1193e+00,  9.1429e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 2

Layer 3


Tokens embeddings:
tensor([ 9.8949e-01,  7.3591e-01,  1.2598e+00,  4.2417e-01, -1.0000e+09,
         1.1193e+00,  9.1429e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  7.3591e-01,  1.2598e+00,  4.2417e-01, -1.0000e+09,
         1.1193e+00,  9.1429e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 2

Layer 4


Tokens embeddings:
tensor([ 9.8949e-01,  7.3591e-01,  1.2598e+00,  4.2417e-01, -1.0000e+09,
         1.1193e+00,  9.1429e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  7.3591e-01,  1.2598e+00,  4.2417e-01, -1.0000e+09,
         1.1193e+00,  9.1429e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 2

Layer 5


Tokens embeddings:
tensor([ 9.8949e-01,  7.3591e-01,  1.2598e+00,  4.2417e-01, -1.0000e+09,
         1.1193e+00,  9.1429e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  7.3591e-01,  1.2598e+00,  4.2417e-01, -1.0000e+09,
         1.1193e+00,  9.1429e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 2

Layer 6


Tokens embeddings:
tensor([ 9.8949e-01,  7.3591e-01,  1.2598e+00,  4.2417e-01, -1.0000e+09,
         1.1193e+00,  9.1429e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  7.3591e-01,  1.2598e+00,  4.2417e-01, -1.0000e+09,
         1.1193e+00,  9.1429e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 2

Layer 7


Tokens embeddings:
tensor([ 9.8949e-01,  7.3591e-01,  1.2598e+00,  4.2417e-01, -1.0000e+09,
         1.1193e+00,  9.1429e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  7.3591e-01,  1.2598e+00,  4.2417e-01, -1.0000e+09,
         1.1193e+00,  9.1429e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 2

Layer 8


Tokens embeddings:
tensor([ 9.8949e-01,  7.3591e-01,  1.2598e+00,  4.2417e-01, -1.0000e+09,
         1.1193e+00,  9.1429e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  7.3591e-01,  1.2598e+00,  4.2417e-01, -1.0000e+09,
         1.1193e+00,  9.1429e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 2

Layer 9


Tokens embeddings:
tensor([ 9.8949e-01,  7.3591e-01,  1.2598e+00,  4.2417e-01, -1.0000e+09,
         1.1193e+00,  9.1429e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  7.3591e-01,  1.2598e+00,  4.2417e-01, -1.0000e+09,
         1.1193e+00,  9.1429e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 2

Layer 10


Tokens embeddings:
tensor([ 9.8949e-01,  7.3591e-01,  1.2598e+00,  4.2417e-01, -1.0000e+09,
         1.1193e+00,  9.1429e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  7.3591e-01,  1.2598e+00,  4.2417e-01, -1.0000e+09,
         1.1193e+00,  9.1429e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 2

Layer 11


Tokens embeddings:
tensor([ 9.8949e-01,  7.3591e-01,  1.2598e+00,  4.2417e-01, -1.0000e+09,
         1.1193e+00,  9.1429e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  7.3591e-01,  1.2598e+00,  4.2417e-01, -1.0000e+09,
         1.1193e+00,  9.1429e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 2

Layer 12


Tokens embeddings:
tensor([ 9.8949e-01,  7.3591e-01,  1.2598e+00,  4.2417e-01, -1.0000e+09,
         1.1193e+00,  9.1429e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  7.3591e-01,  1.2598e+00,  4.2417e-01, -1.0000e+09,
         1.1193e+00,  9.1429e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 9.8949e-01,  3.3471e-02, -1.0000e+09,  1.5053e+00,  1.7339e-01,
        -1.0000e+09,  1.0089e+00,  3.6069e-01, -1.0000e+09,  7.9095e-01,
         4.3998e-01, -1.0000e+09,  1.1426e+00, -5.0977e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.3471e-02, -1.0000e+09,  1.5053e+00,  1.7339e-01,
        -1.0000e+09,  1.0089e+00,  3.6069e-01, -1.0000e+09,  7.9095e-01,
         4.3998e-01, -1.0000e+09,  1.1426e+00, -5.0977e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01]): 3

Layer 2


Tokens embeddings:
tensor([ 9.8949e-01,  3.3471e-02, -1.0000e+09,  1.5053e+00,  1.7339e-01,
        -1.0000e+09,  1.0089e+00,  3.6069e-01, -1.0000e+09,  7.9095e-01,
         4.3998e-01, -1.0000e+09,  1.1426e+00, -5.0977e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.3471e-02, -1.0000e+09,  1.5053e+00,  1.7339e-01,
        -1.0000e+09,  1.0089e+00,  3.6069e-01, -1.0000e+09,  7.9095e-01,
         4.3998e-01, -1.0000e+09,  1.1426e+00, -5.0977e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01]): 3

Layer 3


Tokens embeddings:
tensor([ 9.8949e-01,  3.3471e-02, -1.0000e+09,  1.5053e+00,  1.7339e-01,
        -1.0000e+09,  1.0089e+00,  3.6069e-01, -1.0000e+09,  7.9095e-01,
         4.3998e-01, -1.0000e+09,  1.1426e+00, -5.0977e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.3471e-02, -1.0000e+09,  1.5053e+00,  1.7339e-01,
        -1.0000e+09,  1.0089e+00,  3.6069e-01, -1.0000e+09,  7.9095e-01,
         4.3998e-01, -1.0000e+09,  1.1426e+00, -5.0977e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01]): 3

Layer 4


Tokens embeddings:
tensor([ 9.8949e-01,  3.3471e-02, -1.0000e+09,  1.5053e+00,  1.7339e-01,
        -1.0000e+09,  1.0089e+00,  3.6069e-01, -1.0000e+09,  7.9095e-01,
         4.3998e-01, -1.0000e+09,  1.1426e+00, -5.0977e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.3471e-02, -1.0000e+09,  1.5053e+00,  1.7339e-01,
        -1.0000e+09,  1.0089e+00,  3.6069e-01, -1.0000e+09,  7.9095e-01,
         4.3998e-01, -1.0000e+09,  1.1426e+00, -5.0977e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01]): 3

Layer 5


Tokens embeddings:
tensor([ 9.8949e-01,  3.3471e-02, -1.0000e+09,  1.5053e+00,  1.7339e-01,
        -1.0000e+09,  1.0089e+00,  3.6069e-01, -1.0000e+09,  7.9095e-01,
         4.3998e-01, -1.0000e+09,  1.1426e+00, -5.0977e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.3471e-02, -1.0000e+09,  1.5053e+00,  1.7339e-01,
        -1.0000e+09,  1.0089e+00,  3.6069e-01, -1.0000e+09,  7.9095e-01,
         4.3998e-01, -1.0000e+09,  1.1426e+00, -5.0977e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01]): 3

Layer 6


Tokens embeddings:
tensor([ 9.8949e-01,  3.3471e-02, -1.0000e+09,  1.5053e+00,  1.7339e-01,
        -1.0000e+09,  1.0089e+00,  3.6069e-01, -1.0000e+09,  7.9095e-01,
         4.3998e-01, -1.0000e+09,  1.1426e+00, -5.0977e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.3471e-02, -1.0000e+09,  1.5053e+00,  1.7339e-01,
        -1.0000e+09,  1.0089e+00,  3.6069e-01, -1.0000e+09,  7.9095e-01,
         4.3998e-01, -1.0000e+09,  1.1426e+00, -5.0977e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01]): 3

Layer 7


Tokens embeddings:
tensor([ 9.8949e-01,  3.3471e-02, -1.0000e+09,  1.5053e+00,  1.7339e-01,
        -1.0000e+09,  1.0089e+00,  3.6069e-01, -1.0000e+09,  7.9095e-01,
         4.3998e-01, -1.0000e+09,  1.1426e+00, -5.0977e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.3471e-02, -1.0000e+09,  1.5053e+00,  1.7339e-01,
        -1.0000e+09,  1.0089e+00,  3.6069e-01, -1.0000e+09,  7.9095e-01,
         4.3998e-01, -1.0000e+09,  1.1426e+00, -5.0977e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01]): 3

Layer 8


Tokens embeddings:
tensor([ 9.8949e-01,  3.3471e-02, -1.0000e+09,  1.5053e+00,  1.7339e-01,
        -1.0000e+09,  1.0089e+00,  3.6069e-01, -1.0000e+09,  7.9095e-01,
         4.3998e-01, -1.0000e+09,  1.1426e+00, -5.0977e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.3471e-02, -1.0000e+09,  1.5053e+00,  1.7339e-01,
        -1.0000e+09,  1.0089e+00,  3.6069e-01, -1.0000e+09,  7.9095e-01,
         4.3998e-01, -1.0000e+09,  1.1426e+00, -5.0977e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01]): 3

Layer 9


Tokens embeddings:
tensor([ 9.8949e-01,  3.3471e-02, -1.0000e+09,  1.5053e+00,  1.7339e-01,
        -1.0000e+09,  1.0089e+00,  3.6069e-01, -1.0000e+09,  7.9095e-01,
         4.3998e-01, -1.0000e+09,  1.1426e+00, -5.0977e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.3471e-02, -1.0000e+09,  1.5053e+00,  1.7339e-01,
        -1.0000e+09,  1.0089e+00,  3.6069e-01, -1.0000e+09,  7.9095e-01,
         4.3998e-01, -1.0000e+09,  1.1426e+00, -5.0977e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01]): 3

Layer 10


Tokens embeddings:
tensor([ 9.8949e-01,  3.3471e-02, -1.0000e+09,  1.5053e+00,  1.7339e-01,
        -1.0000e+09,  1.0089e+00,  3.6069e-01, -1.0000e+09,  7.9095e-01,
         4.3998e-01, -1.0000e+09,  1.1426e+00, -5.0977e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.3471e-02, -1.0000e+09,  1.5053e+00,  1.7339e-01,
        -1.0000e+09,  1.0089e+00,  3.6069e-01, -1.0000e+09,  7.9095e-01,
         4.3998e-01, -1.0000e+09,  1.1426e+00, -5.0977e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01]): 3

Layer 11


Tokens embeddings:
tensor([ 9.8949e-01,  3.3471e-02, -1.0000e+09,  1.5053e+00,  1.7339e-01,
        -1.0000e+09,  1.0089e+00,  3.6069e-01, -1.0000e+09,  7.9095e-01,
         4.3998e-01, -1.0000e+09,  1.1426e+00, -5.0977e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.3471e-02, -1.0000e+09,  1.5053e+00,  1.7339e-01,
        -1.0000e+09,  1.0089e+00,  3.6069e-01, -1.0000e+09,  7.9095e-01,
         4.3998e-01, -1.0000e+09,  1.1426e+00, -5.0977e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01]): 3

Layer 12


Tokens embeddings:
tensor([ 9.8949e-01,  3.3471e-02, -1.0000e+09,  1.5053e+00,  1.7339e-01,
        -1.0000e+09,  1.0089e+00,  3.6069e-01, -1.0000e+09,  7.9095e-01,
         4.3998e-01, -1.0000e+09,  1.1426e+00, -5.0977e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01,  3.3471e-02, -1.0000e+09,  1.5053e+00,  1.7339e-01,
        -1.0000e+09,  1.0089e+00,  3.6069e-01, -1.0000e+09,  7.9095e-01,
         4.3998e-01, -1.0000e+09,  1.1426e+00, -5.0977e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4651, 0.4581, 0.4098, 0.4195, 0.9634, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4651, 0.4581, 0.4098, 0.4195, 0.9634, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 2


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4651, 0.4581, 0.4098, 0.4195, 0.9634, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4651, 0.4581, 0.4098, 0.4195, 0.9634, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 3


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4651, 0.4581, 0.4098, 0.4195, 0.9634, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4651, 0.4581, 0.4098, 0.4195, 0.9634, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 4


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4651, 0.4581, 0.4098, 0.4195, 0.9634, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4651, 0.4581, 0.4098, 0.4195, 0.9634, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 5


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4651, 0.4581, 0.4098, 0.4195, 0.9634, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4651, 0.4581, 0.4098, 0.4195, 0.9634, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 6


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4651, 0.4581, 0.4098, 0.4195, 0.9634, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4651, 0.4581, 0.4098, 0.4195, 0.9634, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 7


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4651, 0.4581, 0.4098, 0.4195, 0.9634, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4651, 0.4581, 0.4098, 0.4195, 0.9634, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 8


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4651, 0.4581, 0.4098, 0.4195, 0.9634, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4651, 0.4581, 0.4098, 0.4195, 0.9634, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 9


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4651, 0.4581, 0.4098, 0.4195, 0.9634, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4651, 0.4581, 0.4098, 0.4195, 0.9634, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 10


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4651, 0.4581, 0.4098, 0.4195, 0.9634, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4651, 0.4581, 0.4098, 0.4195, 0.9634, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 11


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4651, 0.4581, 0.4098, 0.4195, 0.9634, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4651, 0.4581, 0.4098, 0.4195, 0.9634, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 12


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4651, 0.4581, 0.4098, 0.4195, 0.9634, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4651, 0.4581, 0.4098, 0.4195, 0.9634, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 9.8949e-01,  7.5750e-01,  3.9625e-01,  4.2417e-01, -1.0000e+09,
         1.0414e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  7.5750e-01,  3.9625e-01,  4.2417e-01, -1.0000e+09,
         1.0414e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 2


Tokens embeddings:
tensor([ 9.8949e-01,  7.5750e-01,  3.9625e-01,  4.2417e-01, -1.0000e+09,
         1.0414e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  7.5750e-01,  3.9625e-01,  4.2417e-01, -1.0000e+09,
         1.0414e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 3


Tokens embeddings:
tensor([ 9.8949e-01,  7.5750e-01,  3.9625e-01,  4.2417e-01, -1.0000e+09,
         1.0414e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  7.5750e-01,  3.9625e-01,  4.2417e-01, -1.0000e+09,
         1.0414e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 4


Tokens embeddings:
tensor([ 9.8949e-01,  7.5750e-01,  3.9625e-01,  4.2417e-01, -1.0000e+09,
         1.0414e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  7.5750e-01,  3.9625e-01,  4.2417e-01, -1.0000e+09,
         1.0414e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 5


Tokens embeddings:
tensor([ 9.8949e-01,  7.5750e-01,  3.9625e-01,  4.2417e-01, -1.0000e+09,
         1.0414e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  7.5750e-01,  3.9625e-01,  4.2417e-01, -1.0000e+09,
         1.0414e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 6


Tokens embeddings:
tensor([ 9.8949e-01,  7.5750e-01,  3.9625e-01,  4.2417e-01, -1.0000e+09,
         1.0414e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  7.5750e-01,  3.9625e-01,  4.2417e-01, -1.0000e+09,
         1.0414e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 7


Tokens embeddings:
tensor([ 9.8949e-01,  7.5750e-01,  3.9625e-01,  4.2417e-01, -1.0000e+09,
         1.0414e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  7.5750e-01,  3.9625e-01,  4.2417e-01, -1.0000e+09,
         1.0414e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 8


Tokens embeddings:
tensor([ 9.8949e-01,  7.5750e-01,  3.9625e-01,  4.2417e-01, -1.0000e+09,
         1.0414e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  7.5750e-01,  3.9625e-01,  4.2417e-01, -1.0000e+09,
         1.0414e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 9


Tokens embeddings:
tensor([ 9.8949e-01,  7.5750e-01,  3.9625e-01,  4.2417e-01, -1.0000e+09,
         1.0414e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  7.5750e-01,  3.9625e-01,  4.2417e-01, -1.0000e+09,
         1.0414e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 10


Tokens embeddings:
tensor([ 9.8949e-01,  7.5750e-01,  3.9625e-01,  4.2417e-01, -1.0000e+09,
         1.0414e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  7.5750e-01,  3.9625e-01,  4.2417e-01, -1.0000e+09,
         1.0414e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 11


Tokens embeddings:
tensor([ 9.8949e-01,  7.5750e-01,  3.9625e-01,  4.2417e-01, -1.0000e+09,
         1.0414e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  7.5750e-01,  3.9625e-01,  4.2417e-01, -1.0000e+09,
         1.0414e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 12


Tokens embeddings:
tensor([ 9.8949e-01,  7.5750e-01,  3.9625e-01,  4.2417e-01, -1.0000e+09,
         1.0414e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  7.5750e-01,  3.9625e-01,  4.2417e-01, -1.0000e+09,
         1.0414e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.9634, 1.2222, 0.5378, 0.4563, 0.4242, 1.0414, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9634, 1.2222, 0.5378, 0.4563, 0.4242, 1.0414, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 2


Tokens embeddings:
tensor([0.9895, 0.9634, 1.2222, 0.5378, 0.4563, 0.4242, 1.0414, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9634, 1.2222, 0.5378, 0.4563, 0.4242, 1.0414, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 3


Tokens embeddings:
tensor([0.9895, 0.9634, 1.2222, 0.5378, 0.4563, 0.4242, 1.0414, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9634, 1.2222, 0.5378, 0.4563, 0.4242, 1.0414, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 4


Tokens embeddings:
tensor([0.9895, 0.9634, 1.2222, 0.5378, 0.4563, 0.4242, 1.0414, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9634, 1.2222, 0.5378, 0.4563, 0.4242, 1.0414, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 5


Tokens embeddings:
tensor([0.9895, 0.9634, 1.2222, 0.5378, 0.4563, 0.4242, 1.0414, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9634, 1.2222, 0.5378, 0.4563, 0.4242, 1.0414, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 6


Tokens embeddings:
tensor([0.9895, 0.9634, 1.2222, 0.5378, 0.4563, 0.4242, 1.0414, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9634, 1.2222, 0.5378, 0.4563, 0.4242, 1.0414, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 7


Tokens embeddings:
tensor([0.9895, 0.9634, 1.2222, 0.5378, 0.4563, 0.4242, 1.0414, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9634, 1.2222, 0.5378, 0.4563, 0.4242, 1.0414, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 8


Tokens embeddings:
tensor([0.9895, 0.9634, 1.2222, 0.5378, 0.4563, 0.4242, 1.0414, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9634, 1.2222, 0.5378, 0.4563, 0.4242, 1.0414, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 9


Tokens embeddings:
tensor([0.9895, 0.9634, 1.2222, 0.5378, 0.4563, 0.4242, 1.0414, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9634, 1.2222, 0.5378, 0.4563, 0.4242, 1.0414, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 10


Tokens embeddings:
tensor([0.9895, 0.9634, 1.2222, 0.5378, 0.4563, 0.4242, 1.0414, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9634, 1.2222, 0.5378, 0.4563, 0.4242, 1.0414, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 11


Tokens embeddings:
tensor([0.9895, 0.9634, 1.2222, 0.5378, 0.4563, 0.4242, 1.0414, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9634, 1.2222, 0.5378, 0.4563, 0.4242, 1.0414, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 12


Tokens embeddings:
tensor([0.9895, 0.9634, 1.2222, 0.5378, 0.4563, 0.4242, 1.0414, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9634, 1.2222, 0.5378, 0.4563, 0.4242, 1.0414, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.6823, 0.5550, 1.4318, 0.7141, 0.9190, 0.5001, 1.2812, 1.1363,
        0.6823, 0.4623, 0.8283, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.5550, 1.4318, 0.7141, 0.9190, 0.5001, 1.2812, 1.1363,
        0.6823, 0.4623, 0.8283, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 3

Layer 2


Tokens embeddings:
tensor([0.9895, 0.6823, 0.5550, 1.4318, 0.7141, 0.9190, 0.5001, 1.2812, 1.1363,
        0.6823, 0.4623, 0.8283, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.5550, 1.4318, 0.7141, 0.9190, 0.5001, 1.2812, 1.1363,
        0.6823, 0.4623, 0.8283, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 3

Layer 3


Tokens embeddings:
tensor([0.9895, 0.6823, 0.5550, 1.4318, 0.7141, 0.9190, 0.5001, 1.2812, 1.1363,
        0.6823, 0.4623, 0.8283, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.5550, 1.4318, 0.7141, 0.9190, 0.5001, 1.2812, 1.1363,
        0.6823, 0.4623, 0.8283, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 3

Layer 4


Tokens embeddings:
tensor([0.9895, 0.6823, 0.5550, 1.4318, 0.7141, 0.9190, 0.5001, 1.2812, 1.1363,
        0.6823, 0.4623, 0.8283, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.5550, 1.4318, 0.7141, 0.9190, 0.5001, 1.2812, 1.1363,
        0.6823, 0.4623, 0.8283, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 3

Layer 5


Tokens embeddings:
tensor([0.9895, 0.6823, 0.5550, 1.4318, 0.7141, 0.9190, 0.5001, 1.2812, 1.1363,
        0.6823, 0.4623, 0.8283, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.5550, 1.4318, 0.7141, 0.9190, 0.5001, 1.2812, 1.1363,
        0.6823, 0.4623, 0.8283, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 3

Layer 6


Tokens embeddings:
tensor([0.9895, 0.6823, 0.5550, 1.4318, 0.7141, 0.9190, 0.5001, 1.2812, 1.1363,
        0.6823, 0.4623, 0.8283, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.5550, 1.4318, 0.7141, 0.9190, 0.5001, 1.2812, 1.1363,
        0.6823, 0.4623, 0.8283, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 3

Layer 7


Tokens embeddings:
tensor([0.9895, 0.6823, 0.5550, 1.4318, 0.7141, 0.9190, 0.5001, 1.2812, 1.1363,
        0.6823, 0.4623, 0.8283, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.5550, 1.4318, 0.7141, 0.9190, 0.5001, 1.2812, 1.1363,
        0.6823, 0.4623, 0.8283, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 3

Layer 8


Tokens embeddings:
tensor([0.9895, 0.6823, 0.5550, 1.4318, 0.7141, 0.9190, 0.5001, 1.2812, 1.1363,
        0.6823, 0.4623, 0.8283, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.5550, 1.4318, 0.7141, 0.9190, 0.5001, 1.2812, 1.1363,
        0.6823, 0.4623, 0.8283, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 3

Layer 9


Tokens embeddings:
tensor([0.9895, 0.6823, 0.5550, 1.4318, 0.7141, 0.9190, 0.5001, 1.2812, 1.1363,
        0.6823, 0.4623, 0.8283, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.5550, 1.4318, 0.7141, 0.9190, 0.5001, 1.2812, 1.1363,
        0.6823, 0.4623, 0.8283, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 3

Layer 10


Tokens embeddings:
tensor([0.9895, 0.6823, 0.5550, 1.4318, 0.7141, 0.9190, 0.5001, 1.2812, 1.1363,
        0.6823, 0.4623, 0.8283, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.5550, 1.4318, 0.7141, 0.9190, 0.5001, 1.2812, 1.1363,
        0.6823, 0.4623, 0.8283, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 3

Layer 11


Tokens embeddings:
tensor([0.9895, 0.6823, 0.5550, 1.4318, 0.7141, 0.9190, 0.5001, 1.2812, 1.1363,
        0.6823, 0.4623, 0.8283, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.5550, 1.4318, 0.7141, 0.9190, 0.5001, 1.2812, 1.1363,
        0.6823, 0.4623, 0.8283, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 3

Layer 12


Tokens embeddings:
tensor([0.9895, 0.6823, 0.5550, 1.4318, 0.7141, 0.9190, 0.5001, 1.2812, 1.1363,
        0.6823, 0.4623, 0.8283, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6823, 0.5550, 1.4318, 0.7141, 0.9190, 0.5001, 1.2812, 1.1363,
        0.6823, 0.4623, 0.8283, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 9.8949e-01, -8.1088e-02, -1.0000e+09,  7.7862e-01,  1.0410e+00,
        -1.0000e+09,  7.8450e-01, -5.3911e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01, -8.1088e-02, -1.0000e+09,  7.7862e-01,  1.0410e+00,
        -1.0000e+09,  7.8450e-01, -5.3911e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01]): 9

Layer 2


Tokens embeddings:
tensor([ 9.8949e-01, -8.1088e-02, -1.0000e+09,  7.7862e-01,  1.0410e+00,
        -1.0000e+09,  7.8450e-01, -5.3911e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01, -8.1088e-02, -1.0000e+09,  7.7862e-01,  1.0410e+00,
        -1.0000e+09,  7.8450e-01, -5.3911e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01]): 9

Layer 3


Tokens embeddings:
tensor([ 9.8949e-01, -8.1088e-02, -1.0000e+09,  7.7862e-01,  1.0410e+00,
        -1.0000e+09,  7.8450e-01, -5.3911e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01, -8.1088e-02, -1.0000e+09,  7.7862e-01,  1.0410e+00,
        -1.0000e+09,  7.8450e-01, -5.3911e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01]): 9

Layer 4


Tokens embeddings:
tensor([ 9.8949e-01, -8.1088e-02, -1.0000e+09,  7.7862e-01,  1.0410e+00,
        -1.0000e+09,  7.8450e-01, -5.3911e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01, -8.1088e-02, -1.0000e+09,  7.7862e-01,  1.0410e+00,
        -1.0000e+09,  7.8450e-01, -5.3911e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01]): 9

Layer 5


Tokens embeddings:
tensor([ 9.8949e-01, -8.1088e-02, -1.0000e+09,  7.7862e-01,  1.0410e+00,
        -1.0000e+09,  7.8450e-01, -5.3911e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01, -8.1088e-02, -1.0000e+09,  7.7862e-01,  1.0410e+00,
        -1.0000e+09,  7.8450e-01, -5.3911e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01]): 9

Layer 6


Tokens embeddings:
tensor([ 9.8949e-01, -8.1088e-02, -1.0000e+09,  7.7862e-01,  1.0410e+00,
        -1.0000e+09,  7.8450e-01, -5.3911e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01, -8.1088e-02, -1.0000e+09,  7.7862e-01,  1.0410e+00,
        -1.0000e+09,  7.8450e-01, -5.3911e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01]): 9

Layer 7


Tokens embeddings:
tensor([ 9.8949e-01, -8.1088e-02, -1.0000e+09,  7.7862e-01,  1.0410e+00,
        -1.0000e+09,  7.8450e-01, -5.3911e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01, -8.1088e-02, -1.0000e+09,  7.7862e-01,  1.0410e+00,
        -1.0000e+09,  7.8450e-01, -5.3911e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01]): 9

Layer 8


Tokens embeddings:
tensor([ 9.8949e-01, -8.1088e-02, -1.0000e+09,  7.7862e-01,  1.0410e+00,
        -1.0000e+09,  7.8450e-01, -5.3911e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01, -8.1088e-02, -1.0000e+09,  7.7862e-01,  1.0410e+00,
        -1.0000e+09,  7.8450e-01, -5.3911e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01]): 9

Layer 9


Tokens embeddings:
tensor([ 9.8949e-01, -8.1088e-02, -1.0000e+09,  7.7862e-01,  1.0410e+00,
        -1.0000e+09,  7.8450e-01, -5.3911e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01, -8.1088e-02, -1.0000e+09,  7.7862e-01,  1.0410e+00,
        -1.0000e+09,  7.8450e-01, -5.3911e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01]): 9

Layer 10


Tokens embeddings:
tensor([ 9.8949e-01, -8.1088e-02, -1.0000e+09,  7.7862e-01,  1.0410e+00,
        -1.0000e+09,  7.8450e-01, -5.3911e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01, -8.1088e-02, -1.0000e+09,  7.7862e-01,  1.0410e+00,
        -1.0000e+09,  7.8450e-01, -5.3911e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01]): 9

Layer 11


Tokens embeddings:
tensor([ 9.8949e-01, -8.1088e-02, -1.0000e+09,  7.7862e-01,  1.0410e+00,
        -1.0000e+09,  7.8450e-01, -5.3911e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01, -8.1088e-02, -1.0000e+09,  7.7862e-01,  1.0410e+00,
        -1.0000e+09,  7.8450e-01, -5.3911e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01]): 9

Layer 12


Tokens embeddings:
tensor([ 9.8949e-01, -8.1088e-02, -1.0000e+09,  7.7862e-01,  1.0410e+00,
        -1.0000e+09,  7.8450e-01, -5.3911e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01, -8.1088e-02, -1.0000e+09,  7.7862e-01,  1.0410e+00,
        -1.0000e+09,  7.8450e-01, -5.3911e-01, -1.0000e+09,  1.1193e+00,
        -3.7976e-01, -1.0000e+09,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01,  9.0877e-01]): 9
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.6960, 1.1603, 0.4242, 0.7868, 0.4709, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.1603, 0.4242, 0.7868, 0.4709, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 2


Tokens embeddings:
tensor([0.9895, 0.6960, 1.1603, 0.4242, 0.7868, 0.4709, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.1603, 0.4242, 0.7868, 0.4709, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 3


Tokens embeddings:
tensor([0.9895, 0.6960, 1.1603, 0.4242, 0.7868, 0.4709, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.1603, 0.4242, 0.7868, 0.4709, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 4


Tokens embeddings:
tensor([0.9895, 0.6960, 1.1603, 0.4242, 0.7868, 0.4709, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.1603, 0.4242, 0.7868, 0.4709, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 5


Tokens embeddings:
tensor([0.9895, 0.6960, 1.1603, 0.4242, 0.7868, 0.4709, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.1603, 0.4242, 0.7868, 0.4709, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 6


Tokens embeddings:
tensor([0.9895, 0.6960, 1.1603, 0.4242, 0.7868, 0.4709, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.1603, 0.4242, 0.7868, 0.4709, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 7


Tokens embeddings:
tensor([0.9895, 0.6960, 1.1603, 0.4242, 0.7868, 0.4709, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.1603, 0.4242, 0.7868, 0.4709, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 8


Tokens embeddings:
tensor([0.9895, 0.6960, 1.1603, 0.4242, 0.7868, 0.4709, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.1603, 0.4242, 0.7868, 0.4709, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 9


Tokens embeddings:
tensor([0.9895, 0.6960, 1.1603, 0.4242, 0.7868, 0.4709, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.1603, 0.4242, 0.7868, 0.4709, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 10


Tokens embeddings:
tensor([0.9895, 0.6960, 1.1603, 0.4242, 0.7868, 0.4709, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.1603, 0.4242, 0.7868, 0.4709, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 11


Tokens embeddings:
tensor([0.9895, 0.6960, 1.1603, 0.4242, 0.7868, 0.4709, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.1603, 0.4242, 0.7868, 0.4709, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 12


Tokens embeddings:
tensor([0.9895, 0.6960, 1.1603, 0.4242, 0.7868, 0.4709, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 1.1603, 0.4242, 0.7868, 0.4709, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 9.8949e-01, -4.4469e-02, -1.4276e-01,  3.2241e-01, -1.0000e+09,
         1.0955e+00, -5.6361e-01,  8.1418e-01,  5.8208e-01, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -4.4469e-02, -1.4276e-01,  3.2241e-01, -1.0000e+09,
         1.0955e+00, -5.6361e-01,  8.1418e-01,  5.8208e-01, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 2


Tokens embeddings:
tensor([ 9.8949e-01, -4.4469e-02, -1.4276e-01,  3.2241e-01, -1.0000e+09,
         1.0955e+00, -5.6361e-01,  8.1418e-01,  5.8208e-01, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -4.4469e-02, -1.4276e-01,  3.2241e-01, -1.0000e+09,
         1.0955e+00, -5.6361e-01,  8.1418e-01,  5.8208e-01, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 3


Tokens embeddings:
tensor([ 9.8949e-01, -4.4469e-02, -1.4276e-01,  3.2241e-01, -1.0000e+09,
         1.0955e+00, -5.6361e-01,  8.1418e-01,  5.8208e-01, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -4.4469e-02, -1.4276e-01,  3.2241e-01, -1.0000e+09,
         1.0955e+00, -5.6361e-01,  8.1418e-01,  5.8208e-01, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 4


Tokens embeddings:
tensor([ 9.8949e-01, -4.4469e-02, -1.4276e-01,  3.2241e-01, -1.0000e+09,
         1.0955e+00, -5.6361e-01,  8.1418e-01,  5.8208e-01, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -4.4469e-02, -1.4276e-01,  3.2241e-01, -1.0000e+09,
         1.0955e+00, -5.6361e-01,  8.1418e-01,  5.8208e-01, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 5


Tokens embeddings:
tensor([ 9.8949e-01, -4.4469e-02, -1.4276e-01,  3.2241e-01, -1.0000e+09,
         1.0955e+00, -5.6361e-01,  8.1418e-01,  5.8208e-01, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -4.4469e-02, -1.4276e-01,  3.2241e-01, -1.0000e+09,
         1.0955e+00, -5.6361e-01,  8.1418e-01,  5.8208e-01, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 6


Tokens embeddings:
tensor([ 9.8949e-01, -4.4469e-02, -1.4276e-01,  3.2241e-01, -1.0000e+09,
         1.0955e+00, -5.6361e-01,  8.1418e-01,  5.8208e-01, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -4.4469e-02, -1.4276e-01,  3.2241e-01, -1.0000e+09,
         1.0955e+00, -5.6361e-01,  8.1418e-01,  5.8208e-01, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 7


Tokens embeddings:
tensor([ 9.8949e-01, -4.4469e-02, -1.4276e-01,  3.2241e-01, -1.0000e+09,
         1.0955e+00, -5.6361e-01,  8.1418e-01,  5.8208e-01, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -4.4469e-02, -1.4276e-01,  3.2241e-01, -1.0000e+09,
         1.0955e+00, -5.6361e-01,  8.1418e-01,  5.8208e-01, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 8


Tokens embeddings:
tensor([ 9.8949e-01, -4.4469e-02, -1.4276e-01,  3.2241e-01, -1.0000e+09,
         1.0955e+00, -5.6361e-01,  8.1418e-01,  5.8208e-01, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -4.4469e-02, -1.4276e-01,  3.2241e-01, -1.0000e+09,
         1.0955e+00, -5.6361e-01,  8.1418e-01,  5.8208e-01, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 9


Tokens embeddings:
tensor([ 9.8949e-01, -4.4469e-02, -1.4276e-01,  3.2241e-01, -1.0000e+09,
         1.0955e+00, -5.6361e-01,  8.1418e-01,  5.8208e-01, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -4.4469e-02, -1.4276e-01,  3.2241e-01, -1.0000e+09,
         1.0955e+00, -5.6361e-01,  8.1418e-01,  5.8208e-01, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 10


Tokens embeddings:
tensor([ 9.8949e-01, -4.4469e-02, -1.4276e-01,  3.2241e-01, -1.0000e+09,
         1.0955e+00, -5.6361e-01,  8.1418e-01,  5.8208e-01, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -4.4469e-02, -1.4276e-01,  3.2241e-01, -1.0000e+09,
         1.0955e+00, -5.6361e-01,  8.1418e-01,  5.8208e-01, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 11


Tokens embeddings:
tensor([ 9.8949e-01, -4.4469e-02, -1.4276e-01,  3.2241e-01, -1.0000e+09,
         1.0955e+00, -5.6361e-01,  8.1418e-01,  5.8208e-01, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -4.4469e-02, -1.4276e-01,  3.2241e-01, -1.0000e+09,
         1.0955e+00, -5.6361e-01,  8.1418e-01,  5.8208e-01, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 12


Tokens embeddings:
tensor([ 9.8949e-01, -4.4469e-02, -1.4276e-01,  3.2241e-01, -1.0000e+09,
         1.0955e+00, -5.6361e-01,  8.1418e-01,  5.8208e-01, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -4.4469e-02, -1.4276e-01,  3.2241e-01, -1.0000e+09,
         1.0955e+00, -5.6361e-01,  8.1418e-01,  5.8208e-01, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.6960, 0.9608, 0.7382, 0.4195, 0.5037, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.9608, 0.7382, 0.4195, 0.5037, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 2


Tokens embeddings:
tensor([0.9895, 0.6960, 0.9608, 0.7382, 0.4195, 0.5037, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.9608, 0.7382, 0.4195, 0.5037, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 3


Tokens embeddings:
tensor([0.9895, 0.6960, 0.9608, 0.7382, 0.4195, 0.5037, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.9608, 0.7382, 0.4195, 0.5037, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 4


Tokens embeddings:
tensor([0.9895, 0.6960, 0.9608, 0.7382, 0.4195, 0.5037, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.9608, 0.7382, 0.4195, 0.5037, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 5


Tokens embeddings:
tensor([0.9895, 0.6960, 0.9608, 0.7382, 0.4195, 0.5037, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.9608, 0.7382, 0.4195, 0.5037, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 6


Tokens embeddings:
tensor([0.9895, 0.6960, 0.9608, 0.7382, 0.4195, 0.5037, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.9608, 0.7382, 0.4195, 0.5037, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 7


Tokens embeddings:
tensor([0.9895, 0.6960, 0.9608, 0.7382, 0.4195, 0.5037, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.9608, 0.7382, 0.4195, 0.5037, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 8


Tokens embeddings:
tensor([0.9895, 0.6960, 0.9608, 0.7382, 0.4195, 0.5037, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.9608, 0.7382, 0.4195, 0.5037, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 9


Tokens embeddings:
tensor([0.9895, 0.6960, 0.9608, 0.7382, 0.4195, 0.5037, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.9608, 0.7382, 0.4195, 0.5037, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 10


Tokens embeddings:
tensor([0.9895, 0.6960, 0.9608, 0.7382, 0.4195, 0.5037, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.9608, 0.7382, 0.4195, 0.5037, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 11


Tokens embeddings:
tensor([0.9895, 0.6960, 0.9608, 0.7382, 0.4195, 0.5037, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.9608, 0.7382, 0.4195, 0.5037, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 12


Tokens embeddings:
tensor([0.9895, 0.6960, 0.9608, 0.7382, 0.4195, 0.5037, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.6960, 0.9608, 0.7382, 0.4195, 0.5037, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.8670, 0.5076, 0.5641, 1.1698, 0.5850, 1.1363, 0.9055, 1.1692,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.8670, 0.5076, 0.5641, 1.1698, 0.5850, 1.1363, 0.9055, 1.1692,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 4

Layer 2


Tokens embeddings:
tensor([0.9895, 0.8670, 0.5076, 0.5641, 1.1698, 0.5850, 1.1363, 0.9055, 1.1692,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.8670, 0.5076, 0.5641, 1.1698, 0.5850, 1.1363, 0.9055, 1.1692,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 4

Layer 3


Tokens embeddings:
tensor([0.9895, 0.8670, 0.5076, 0.5641, 1.1698, 0.5850, 1.1363, 0.9055, 1.1692,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.8670, 0.5076, 0.5641, 1.1698, 0.5850, 1.1363, 0.9055, 1.1692,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 4

Layer 4


Tokens embeddings:
tensor([0.9895, 0.8670, 0.5076, 0.5641, 1.1698, 0.5850, 1.1363, 0.9055, 1.1692,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.8670, 0.5076, 0.5641, 1.1698, 0.5850, 1.1363, 0.9055, 1.1692,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 4

Layer 5


Tokens embeddings:
tensor([0.9895, 0.8670, 0.5076, 0.5641, 1.1698, 0.5850, 1.1363, 0.9055, 1.1692,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.8670, 0.5076, 0.5641, 1.1698, 0.5850, 1.1363, 0.9055, 1.1692,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 4

Layer 6


Tokens embeddings:
tensor([0.9895, 0.8670, 0.5076, 0.5641, 1.1698, 0.5850, 1.1363, 0.9055, 1.1692,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.8670, 0.5076, 0.5641, 1.1698, 0.5850, 1.1363, 0.9055, 1.1692,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 4

Layer 7


Tokens embeddings:
tensor([0.9895, 0.8670, 0.5076, 0.5641, 1.1698, 0.5850, 1.1363, 0.9055, 1.1692,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.8670, 0.5076, 0.5641, 1.1698, 0.5850, 1.1363, 0.9055, 1.1692,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 4

Layer 8


Tokens embeddings:
tensor([0.9895, 0.8670, 0.5076, 0.5641, 1.1698, 0.5850, 1.1363, 0.9055, 1.1692,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.8670, 0.5076, 0.5641, 1.1698, 0.5850, 1.1363, 0.9055, 1.1692,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 4

Layer 9


Tokens embeddings:
tensor([0.9895, 0.8670, 0.5076, 0.5641, 1.1698, 0.5850, 1.1363, 0.9055, 1.1692,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.8670, 0.5076, 0.5641, 1.1698, 0.5850, 1.1363, 0.9055, 1.1692,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 4

Layer 10


Tokens embeddings:
tensor([0.9895, 0.8670, 0.5076, 0.5641, 1.1698, 0.5850, 1.1363, 0.9055, 1.1692,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.8670, 0.5076, 0.5641, 1.1698, 0.5850, 1.1363, 0.9055, 1.1692,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 4

Layer 11


Tokens embeddings:
tensor([0.9895, 0.8670, 0.5076, 0.5641, 1.1698, 0.5850, 1.1363, 0.9055, 1.1692,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.8670, 0.5076, 0.5641, 1.1698, 0.5850, 1.1363, 0.9055, 1.1692,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 4

Layer 12


Tokens embeddings:
tensor([0.9895, 0.8670, 0.5076, 0.5641, 1.1698, 0.5850, 1.1363, 0.9055, 1.1692,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.8670, 0.5076, 0.5641, 1.1698, 0.5850, 1.1363, 0.9055, 1.1692,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0519, 0.3893, 0.8010, 0.5396, 1.2065, 0.5349, 0.4242,
        0.5072, 0.4400, 0.6560, 0.9583, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0519, 0.3893, 0.8010, 0.5396, 1.2065, 0.5349, 0.4242,
        0.5072, 0.4400, 0.6560, 0.9583, 1.1193, 0.9143, 0.9088, 0.9088]): 6

Layer 2


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0519, 0.3893, 0.8010, 0.5396, 1.2065, 0.5349, 0.4242,
        0.5072, 0.4400, 0.6560, 0.9583, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0519, 0.3893, 0.8010, 0.5396, 1.2065, 0.5349, 0.4242,
        0.5072, 0.4400, 0.6560, 0.9583, 1.1193, 0.9143, 0.9088, 0.9088]): 6

Layer 3


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0519, 0.3893, 0.8010, 0.5396, 1.2065, 0.5349, 0.4242,
        0.5072, 0.4400, 0.6560, 0.9583, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0519, 0.3893, 0.8010, 0.5396, 1.2065, 0.5349, 0.4242,
        0.5072, 0.4400, 0.6560, 0.9583, 1.1193, 0.9143, 0.9088, 0.9088]): 6

Layer 4


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0519, 0.3893, 0.8010, 0.5396, 1.2065, 0.5349, 0.4242,
        0.5072, 0.4400, 0.6560, 0.9583, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0519, 0.3893, 0.8010, 0.5396, 1.2065, 0.5349, 0.4242,
        0.5072, 0.4400, 0.6560, 0.9583, 1.1193, 0.9143, 0.9088, 0.9088]): 6

Layer 5


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0519, 0.3893, 0.8010, 0.5396, 1.2065, 0.5349, 0.4242,
        0.5072, 0.4400, 0.6560, 0.9583, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0519, 0.3893, 0.8010, 0.5396, 1.2065, 0.5349, 0.4242,
        0.5072, 0.4400, 0.6560, 0.9583, 1.1193, 0.9143, 0.9088, 0.9088]): 6

Layer 6


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0519, 0.3893, 0.8010, 0.5396, 1.2065, 0.5349, 0.4242,
        0.5072, 0.4400, 0.6560, 0.9583, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0519, 0.3893, 0.8010, 0.5396, 1.2065, 0.5349, 0.4242,
        0.5072, 0.4400, 0.6560, 0.9583, 1.1193, 0.9143, 0.9088, 0.9088]): 6

Layer 7


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0519, 0.3893, 0.8010, 0.5396, 1.2065, 0.5349, 0.4242,
        0.5072, 0.4400, 0.6560, 0.9583, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0519, 0.3893, 0.8010, 0.5396, 1.2065, 0.5349, 0.4242,
        0.5072, 0.4400, 0.6560, 0.9583, 1.1193, 0.9143, 0.9088, 0.9088]): 6

Layer 8


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0519, 0.3893, 0.8010, 0.5396, 1.2065, 0.5349, 0.4242,
        0.5072, 0.4400, 0.6560, 0.9583, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0519, 0.3893, 0.8010, 0.5396, 1.2065, 0.5349, 0.4242,
        0.5072, 0.4400, 0.6560, 0.9583, 1.1193, 0.9143, 0.9088, 0.9088]): 6

Layer 9


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0519, 0.3893, 0.8010, 0.5396, 1.2065, 0.5349, 0.4242,
        0.5072, 0.4400, 0.6560, 0.9583, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0519, 0.3893, 0.8010, 0.5396, 1.2065, 0.5349, 0.4242,
        0.5072, 0.4400, 0.6560, 0.9583, 1.1193, 0.9143, 0.9088, 0.9088]): 6

Layer 10


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0519, 0.3893, 0.8010, 0.5396, 1.2065, 0.5349, 0.4242,
        0.5072, 0.4400, 0.6560, 0.9583, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0519, 0.3893, 0.8010, 0.5396, 1.2065, 0.5349, 0.4242,
        0.5072, 0.4400, 0.6560, 0.9583, 1.1193, 0.9143, 0.9088, 0.9088]): 6

Layer 11


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0519, 0.3893, 0.8010, 0.5396, 1.2065, 0.5349, 0.4242,
        0.5072, 0.4400, 0.6560, 0.9583, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0519, 0.3893, 0.8010, 0.5396, 1.2065, 0.5349, 0.4242,
        0.5072, 0.4400, 0.6560, 0.9583, 1.1193, 0.9143, 0.9088, 0.9088]): 6

Layer 12


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0519, 0.3893, 0.8010, 0.5396, 1.2065, 0.5349, 0.4242,
        0.5072, 0.4400, 0.6560, 0.9583, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0519, 0.3893, 0.8010, 0.5396, 1.2065, 0.5349, 0.4242,
        0.5072, 0.4400, 0.6560, 0.9583, 1.1193, 0.9143, 0.9088, 0.9088]): 6
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.9578, 0.5931, 0.4895, 0.9210, 0.9210, 0.5076, 0.5550, 1.0303,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9578, 0.5931, 0.4895, 0.9210, 0.9210, 0.5076, 0.5550, 1.0303,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 2


Tokens embeddings:
tensor([0.9895, 0.9578, 0.5931, 0.4895, 0.9210, 0.9210, 0.5076, 0.5550, 1.0303,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9578, 0.5931, 0.4895, 0.9210, 0.9210, 0.5076, 0.5550, 1.0303,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 3


Tokens embeddings:
tensor([0.9895, 0.9578, 0.5931, 0.4895, 0.9210, 0.9210, 0.5076, 0.5550, 1.0303,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9578, 0.5931, 0.4895, 0.9210, 0.9210, 0.5076, 0.5550, 1.0303,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 4


Tokens embeddings:
tensor([0.9895, 0.9578, 0.5931, 0.4895, 0.9210, 0.9210, 0.5076, 0.5550, 1.0303,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9578, 0.5931, 0.4895, 0.9210, 0.9210, 0.5076, 0.5550, 1.0303,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 5


Tokens embeddings:
tensor([0.9895, 0.9578, 0.5931, 0.4895, 0.9210, 0.9210, 0.5076, 0.5550, 1.0303,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9578, 0.5931, 0.4895, 0.9210, 0.9210, 0.5076, 0.5550, 1.0303,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 6


Tokens embeddings:
tensor([0.9895, 0.9578, 0.5931, 0.4895, 0.9210, 0.9210, 0.5076, 0.5550, 1.0303,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9578, 0.5931, 0.4895, 0.9210, 0.9210, 0.5076, 0.5550, 1.0303,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 7


Tokens embeddings:
tensor([0.9895, 0.9578, 0.5931, 0.4895, 0.9210, 0.9210, 0.5076, 0.5550, 1.0303,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9578, 0.5931, 0.4895, 0.9210, 0.9210, 0.5076, 0.5550, 1.0303,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 8


Tokens embeddings:
tensor([0.9895, 0.9578, 0.5931, 0.4895, 0.9210, 0.9210, 0.5076, 0.5550, 1.0303,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9578, 0.5931, 0.4895, 0.9210, 0.9210, 0.5076, 0.5550, 1.0303,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 9


Tokens embeddings:
tensor([0.9895, 0.9578, 0.5931, 0.4895, 0.9210, 0.9210, 0.5076, 0.5550, 1.0303,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9578, 0.5931, 0.4895, 0.9210, 0.9210, 0.5076, 0.5550, 1.0303,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 10


Tokens embeddings:
tensor([0.9895, 0.9578, 0.5931, 0.4895, 0.9210, 0.9210, 0.5076, 0.5550, 1.0303,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9578, 0.5931, 0.4895, 0.9210, 0.9210, 0.5076, 0.5550, 1.0303,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 11


Tokens embeddings:
tensor([0.9895, 0.9578, 0.5931, 0.4895, 0.9210, 0.9210, 0.5076, 0.5550, 1.0303,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9578, 0.5931, 0.4895, 0.9210, 0.9210, 0.5076, 0.5550, 1.0303,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 12


Tokens embeddings:
tensor([0.9895, 0.9578, 0.5931, 0.4895, 0.9210, 0.9210, 0.5076, 0.5550, 1.0303,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9578, 0.5931, 0.4895, 0.9210, 0.9210, 0.5076, 0.5550, 1.0303,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.5281, 0.4767, 1.1157, 0.9523, 0.4242, 1.3051, 0.5257, 0.9178,
        1.0134, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5281, 0.4767, 1.1157, 0.9523, 0.4242, 1.3051, 0.5257, 0.9178,
        1.0134, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 2


Tokens embeddings:
tensor([0.9895, 0.5281, 0.4767, 1.1157, 0.9523, 0.4242, 1.3051, 0.5257, 0.9178,
        1.0134, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5281, 0.4767, 1.1157, 0.9523, 0.4242, 1.3051, 0.5257, 0.9178,
        1.0134, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 3


Tokens embeddings:
tensor([0.9895, 0.5281, 0.4767, 1.1157, 0.9523, 0.4242, 1.3051, 0.5257, 0.9178,
        1.0134, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5281, 0.4767, 1.1157, 0.9523, 0.4242, 1.3051, 0.5257, 0.9178,
        1.0134, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 4


Tokens embeddings:
tensor([0.9895, 0.5281, 0.4767, 1.1157, 0.9523, 0.4242, 1.3051, 0.5257, 0.9178,
        1.0134, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5281, 0.4767, 1.1157, 0.9523, 0.4242, 1.3051, 0.5257, 0.9178,
        1.0134, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 5


Tokens embeddings:
tensor([0.9895, 0.5281, 0.4767, 1.1157, 0.9523, 0.4242, 1.3051, 0.5257, 0.9178,
        1.0134, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5281, 0.4767, 1.1157, 0.9523, 0.4242, 1.3051, 0.5257, 0.9178,
        1.0134, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 6


Tokens embeddings:
tensor([0.9895, 0.5281, 0.4767, 1.1157, 0.9523, 0.4242, 1.3051, 0.5257, 0.9178,
        1.0134, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5281, 0.4767, 1.1157, 0.9523, 0.4242, 1.3051, 0.5257, 0.9178,
        1.0134, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 7


Tokens embeddings:
tensor([0.9895, 0.5281, 0.4767, 1.1157, 0.9523, 0.4242, 1.3051, 0.5257, 0.9178,
        1.0134, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5281, 0.4767, 1.1157, 0.9523, 0.4242, 1.3051, 0.5257, 0.9178,
        1.0134, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 8


Tokens embeddings:
tensor([0.9895, 0.5281, 0.4767, 1.1157, 0.9523, 0.4242, 1.3051, 0.5257, 0.9178,
        1.0134, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5281, 0.4767, 1.1157, 0.9523, 0.4242, 1.3051, 0.5257, 0.9178,
        1.0134, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 9


Tokens embeddings:
tensor([0.9895, 0.5281, 0.4767, 1.1157, 0.9523, 0.4242, 1.3051, 0.5257, 0.9178,
        1.0134, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5281, 0.4767, 1.1157, 0.9523, 0.4242, 1.3051, 0.5257, 0.9178,
        1.0134, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 10


Tokens embeddings:
tensor([0.9895, 0.5281, 0.4767, 1.1157, 0.9523, 0.4242, 1.3051, 0.5257, 0.9178,
        1.0134, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5281, 0.4767, 1.1157, 0.9523, 0.4242, 1.3051, 0.5257, 0.9178,
        1.0134, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 11


Tokens embeddings:
tensor([0.9895, 0.5281, 0.4767, 1.1157, 0.9523, 0.4242, 1.3051, 0.5257, 0.9178,
        1.0134, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5281, 0.4767, 1.1157, 0.9523, 0.4242, 1.3051, 0.5257, 0.9178,
        1.0134, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 12


Tokens embeddings:
tensor([0.9895, 0.5281, 0.4767, 1.1157, 0.9523, 0.4242, 1.3051, 0.5257, 0.9178,
        1.0134, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5281, 0.4767, 1.1157, 0.9523, 0.4242, 1.3051, 0.5257, 0.9178,
        1.0134, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.4242, 0.8588, 0.3893, 0.7259, 0.5761, 0.7886, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.8588, 0.3893, 0.7259, 0.5761, 0.7886, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 2


Tokens embeddings:
tensor([0.9895, 0.4242, 0.8588, 0.3893, 0.7259, 0.5761, 0.7886, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.8588, 0.3893, 0.7259, 0.5761, 0.7886, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 3


Tokens embeddings:
tensor([0.9895, 0.4242, 0.8588, 0.3893, 0.7259, 0.5761, 0.7886, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.8588, 0.3893, 0.7259, 0.5761, 0.7886, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 4


Tokens embeddings:
tensor([0.9895, 0.4242, 0.8588, 0.3893, 0.7259, 0.5761, 0.7886, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.8588, 0.3893, 0.7259, 0.5761, 0.7886, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 5


Tokens embeddings:
tensor([0.9895, 0.4242, 0.8588, 0.3893, 0.7259, 0.5761, 0.7886, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.8588, 0.3893, 0.7259, 0.5761, 0.7886, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 6


Tokens embeddings:
tensor([0.9895, 0.4242, 0.8588, 0.3893, 0.7259, 0.5761, 0.7886, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.8588, 0.3893, 0.7259, 0.5761, 0.7886, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 7


Tokens embeddings:
tensor([0.9895, 0.4242, 0.8588, 0.3893, 0.7259, 0.5761, 0.7886, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.8588, 0.3893, 0.7259, 0.5761, 0.7886, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 8


Tokens embeddings:
tensor([0.9895, 0.4242, 0.8588, 0.3893, 0.7259, 0.5761, 0.7886, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.8588, 0.3893, 0.7259, 0.5761, 0.7886, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 9


Tokens embeddings:
tensor([0.9895, 0.4242, 0.8588, 0.3893, 0.7259, 0.5761, 0.7886, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.8588, 0.3893, 0.7259, 0.5761, 0.7886, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 10


Tokens embeddings:
tensor([0.9895, 0.4242, 0.8588, 0.3893, 0.7259, 0.5761, 0.7886, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.8588, 0.3893, 0.7259, 0.5761, 0.7886, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 11


Tokens embeddings:
tensor([0.9895, 0.4242, 0.8588, 0.3893, 0.7259, 0.5761, 0.7886, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.8588, 0.3893, 0.7259, 0.5761, 0.7886, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 12


Tokens embeddings:
tensor([0.9895, 0.4242, 0.8588, 0.3893, 0.7259, 0.5761, 0.7886, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.8588, 0.3893, 0.7259, 0.5761, 0.7886, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.4242, 0.6971, 0.5790, 0.4242, 1.1694, 0.5118, 0.4242, 0.9885,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.6971, 0.5790, 0.4242, 1.1694, 0.5118, 0.4242, 0.9885,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 2


Tokens embeddings:
tensor([0.9895, 0.4242, 0.6971, 0.5790, 0.4242, 1.1694, 0.5118, 0.4242, 0.9885,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.6971, 0.5790, 0.4242, 1.1694, 0.5118, 0.4242, 0.9885,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 3


Tokens embeddings:
tensor([0.9895, 0.4242, 0.6971, 0.5790, 0.4242, 1.1694, 0.5118, 0.4242, 0.9885,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.6971, 0.5790, 0.4242, 1.1694, 0.5118, 0.4242, 0.9885,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 4


Tokens embeddings:
tensor([0.9895, 0.4242, 0.6971, 0.5790, 0.4242, 1.1694, 0.5118, 0.4242, 0.9885,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.6971, 0.5790, 0.4242, 1.1694, 0.5118, 0.4242, 0.9885,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 5


Tokens embeddings:
tensor([0.9895, 0.4242, 0.6971, 0.5790, 0.4242, 1.1694, 0.5118, 0.4242, 0.9885,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.6971, 0.5790, 0.4242, 1.1694, 0.5118, 0.4242, 0.9885,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 6


Tokens embeddings:
tensor([0.9895, 0.4242, 0.6971, 0.5790, 0.4242, 1.1694, 0.5118, 0.4242, 0.9885,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.6971, 0.5790, 0.4242, 1.1694, 0.5118, 0.4242, 0.9885,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 7


Tokens embeddings:
tensor([0.9895, 0.4242, 0.6971, 0.5790, 0.4242, 1.1694, 0.5118, 0.4242, 0.9885,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.6971, 0.5790, 0.4242, 1.1694, 0.5118, 0.4242, 0.9885,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 8


Tokens embeddings:
tensor([0.9895, 0.4242, 0.6971, 0.5790, 0.4242, 1.1694, 0.5118, 0.4242, 0.9885,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.6971, 0.5790, 0.4242, 1.1694, 0.5118, 0.4242, 0.9885,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 9


Tokens embeddings:
tensor([0.9895, 0.4242, 0.6971, 0.5790, 0.4242, 1.1694, 0.5118, 0.4242, 0.9885,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.6971, 0.5790, 0.4242, 1.1694, 0.5118, 0.4242, 0.9885,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 10


Tokens embeddings:
tensor([0.9895, 0.4242, 0.6971, 0.5790, 0.4242, 1.1694, 0.5118, 0.4242, 0.9885,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.6971, 0.5790, 0.4242, 1.1694, 0.5118, 0.4242, 0.9885,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 11


Tokens embeddings:
tensor([0.9895, 0.4242, 0.6971, 0.5790, 0.4242, 1.1694, 0.5118, 0.4242, 0.9885,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.6971, 0.5790, 0.4242, 1.1694, 0.5118, 0.4242, 0.9885,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 12


Tokens embeddings:
tensor([0.9895, 0.4242, 0.6971, 0.5790, 0.4242, 1.1694, 0.5118, 0.4242, 0.9885,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.6971, 0.5790, 0.4242, 1.1694, 0.5118, 0.4242, 0.9885,
        1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 1.5866, 1.2483, 0.5118, 0.4242, 1.3157, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.5866, 1.2483, 0.5118, 0.4242, 1.3157, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 1

Layer 2


Tokens embeddings:
tensor([0.9895, 1.5866, 1.2483, 0.5118, 0.4242, 1.3157, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.5866, 1.2483, 0.5118, 0.4242, 1.3157, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 1

Layer 3


Tokens embeddings:
tensor([0.9895, 1.5866, 1.2483, 0.5118, 0.4242, 1.3157, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.5866, 1.2483, 0.5118, 0.4242, 1.3157, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 1

Layer 4


Tokens embeddings:
tensor([0.9895, 1.5866, 1.2483, 0.5118, 0.4242, 1.3157, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.5866, 1.2483, 0.5118, 0.4242, 1.3157, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 1

Layer 5


Tokens embeddings:
tensor([0.9895, 1.5866, 1.2483, 0.5118, 0.4242, 1.3157, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.5866, 1.2483, 0.5118, 0.4242, 1.3157, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 1

Layer 6


Tokens embeddings:
tensor([0.9895, 1.5866, 1.2483, 0.5118, 0.4242, 1.3157, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.5866, 1.2483, 0.5118, 0.4242, 1.3157, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 1

Layer 7


Tokens embeddings:
tensor([0.9895, 1.5866, 1.2483, 0.5118, 0.4242, 1.3157, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.5866, 1.2483, 0.5118, 0.4242, 1.3157, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 1

Layer 8


Tokens embeddings:
tensor([0.9895, 1.5866, 1.2483, 0.5118, 0.4242, 1.3157, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.5866, 1.2483, 0.5118, 0.4242, 1.3157, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 1

Layer 9


Tokens embeddings:
tensor([0.9895, 1.5866, 1.2483, 0.5118, 0.4242, 1.3157, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.5866, 1.2483, 0.5118, 0.4242, 1.3157, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 1

Layer 10


Tokens embeddings:
tensor([0.9895, 1.5866, 1.2483, 0.5118, 0.4242, 1.3157, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.5866, 1.2483, 0.5118, 0.4242, 1.3157, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 1

Layer 11


Tokens embeddings:
tensor([0.9895, 1.5866, 1.2483, 0.5118, 0.4242, 1.3157, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.5866, 1.2483, 0.5118, 0.4242, 1.3157, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 1

Layer 12


Tokens embeddings:
tensor([0.9895, 1.5866, 1.2483, 0.5118, 0.4242, 1.3157, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.5866, 1.2483, 0.5118, 0.4242, 1.3157, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 1
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.8292, 0.8469, 0.4400, 0.6560, 0.7229, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.8292, 0.8469, 0.4400, 0.6560, 0.7229, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 2


Tokens embeddings:
tensor([0.9895, 0.8292, 0.8469, 0.4400, 0.6560, 0.7229, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.8292, 0.8469, 0.4400, 0.6560, 0.7229, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 3


Tokens embeddings:
tensor([0.9895, 0.8292, 0.8469, 0.4400, 0.6560, 0.7229, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.8292, 0.8469, 0.4400, 0.6560, 0.7229, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 4


Tokens embeddings:
tensor([0.9895, 0.8292, 0.8469, 0.4400, 0.6560, 0.7229, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.8292, 0.8469, 0.4400, 0.6560, 0.7229, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 5


Tokens embeddings:
tensor([0.9895, 0.8292, 0.8469, 0.4400, 0.6560, 0.7229, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.8292, 0.8469, 0.4400, 0.6560, 0.7229, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 6


Tokens embeddings:
tensor([0.9895, 0.8292, 0.8469, 0.4400, 0.6560, 0.7229, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.8292, 0.8469, 0.4400, 0.6560, 0.7229, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 7


Tokens embeddings:
tensor([0.9895, 0.8292, 0.8469, 0.4400, 0.6560, 0.7229, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.8292, 0.8469, 0.4400, 0.6560, 0.7229, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 8


Tokens embeddings:
tensor([0.9895, 0.8292, 0.8469, 0.4400, 0.6560, 0.7229, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.8292, 0.8469, 0.4400, 0.6560, 0.7229, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 9


Tokens embeddings:
tensor([0.9895, 0.8292, 0.8469, 0.4400, 0.6560, 0.7229, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.8292, 0.8469, 0.4400, 0.6560, 0.7229, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 10


Tokens embeddings:
tensor([0.9895, 0.8292, 0.8469, 0.4400, 0.6560, 0.7229, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.8292, 0.8469, 0.4400, 0.6560, 0.7229, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 11


Tokens embeddings:
tensor([0.9895, 0.8292, 0.8469, 0.4400, 0.6560, 0.7229, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.8292, 0.8469, 0.4400, 0.6560, 0.7229, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 12


Tokens embeddings:
tensor([0.9895, 0.8292, 0.8469, 0.4400, 0.6560, 0.7229, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.8292, 0.8469, 0.4400, 0.6560, 0.7229, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 9.8949e-01, -4.2111e-02, -1.0000e+09,  1.1099e+00, -8.1559e-01,
        -1.0000e+09,  5.1179e-01,  3.6069e-01, -1.0000e+09,  1.2652e+00,
         9.5397e-01, -5.0977e-01,  9.1429e-01,  9.0877e-01, -2.8433e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01, -4.2111e-02, -1.0000e+09,  1.1099e+00, -8.1559e-01,
        -1.0000e+09,  5.1179e-01,  3.6069e-01, -1.0000e+09,  1.2652e+00,
         9.5397e-01, -5.0977e-01,  9.1429e-01,  9.0877e-01, -2.8433e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 2


Tokens embeddings:
tensor([ 9.8949e-01, -4.2111e-02, -1.0000e+09,  1.1099e+00, -8.1559e-01,
        -1.0000e+09,  5.1179e-01,  3.6069e-01, -1.0000e+09,  1.2652e+00,
         9.5397e-01, -5.0977e-01,  9.1429e-01,  9.0877e-01, -2.8433e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01, -4.2111e-02, -1.0000e+09,  1.1099e+00, -8.1559e-01,
        -1.0000e+09,  5.1179e-01,  3.6069e-01, -1.0000e+09,  1.2652e+00,
         9.5397e-01, -5.0977e-01,  9.1429e-01,  9.0877e-01, -2.8433e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 3


Tokens embeddings:
tensor([ 9.8949e-01, -4.2111e-02, -1.0000e+09,  1.1099e+00, -8.1559e-01,
        -1.0000e+09,  5.1179e-01,  3.6069e-01, -1.0000e+09,  1.2652e+00,
         9.5397e-01, -5.0977e-01,  9.1429e-01,  9.0877e-01, -2.8433e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01, -4.2111e-02, -1.0000e+09,  1.1099e+00, -8.1559e-01,
        -1.0000e+09,  5.1179e-01,  3.6069e-01, -1.0000e+09,  1.2652e+00,
         9.5397e-01, -5.0977e-01,  9.1429e-01,  9.0877e-01, -2.8433e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 4


Tokens embeddings:
tensor([ 9.8949e-01, -4.2111e-02, -1.0000e+09,  1.1099e+00, -8.1559e-01,
        -1.0000e+09,  5.1179e-01,  3.6069e-01, -1.0000e+09,  1.2652e+00,
         9.5397e-01, -5.0977e-01,  9.1429e-01,  9.0877e-01, -2.8433e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01, -4.2111e-02, -1.0000e+09,  1.1099e+00, -8.1559e-01,
        -1.0000e+09,  5.1179e-01,  3.6069e-01, -1.0000e+09,  1.2652e+00,
         9.5397e-01, -5.0977e-01,  9.1429e-01,  9.0877e-01, -2.8433e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 5


Tokens embeddings:
tensor([ 9.8949e-01, -4.2111e-02, -1.0000e+09,  1.1099e+00, -8.1559e-01,
        -1.0000e+09,  5.1179e-01,  3.6069e-01, -1.0000e+09,  1.2652e+00,
         9.5397e-01, -5.0977e-01,  9.1429e-01,  9.0877e-01, -2.8433e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01, -4.2111e-02, -1.0000e+09,  1.1099e+00, -8.1559e-01,
        -1.0000e+09,  5.1179e-01,  3.6069e-01, -1.0000e+09,  1.2652e+00,
         9.5397e-01, -5.0977e-01,  9.1429e-01,  9.0877e-01, -2.8433e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 6


Tokens embeddings:
tensor([ 9.8949e-01, -4.2111e-02, -1.0000e+09,  1.1099e+00, -8.1559e-01,
        -1.0000e+09,  5.1179e-01,  3.6069e-01, -1.0000e+09,  1.2652e+00,
         9.5397e-01, -5.0977e-01,  9.1429e-01,  9.0877e-01, -2.8433e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01, -4.2111e-02, -1.0000e+09,  1.1099e+00, -8.1559e-01,
        -1.0000e+09,  5.1179e-01,  3.6069e-01, -1.0000e+09,  1.2652e+00,
         9.5397e-01, -5.0977e-01,  9.1429e-01,  9.0877e-01, -2.8433e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 7


Tokens embeddings:
tensor([ 9.8949e-01, -4.2111e-02, -1.0000e+09,  1.1099e+00, -8.1559e-01,
        -1.0000e+09,  5.1179e-01,  3.6069e-01, -1.0000e+09,  1.2652e+00,
         9.5397e-01, -5.0977e-01,  9.1429e-01,  9.0877e-01, -2.8433e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01, -4.2111e-02, -1.0000e+09,  1.1099e+00, -8.1559e-01,
        -1.0000e+09,  5.1179e-01,  3.6069e-01, -1.0000e+09,  1.2652e+00,
         9.5397e-01, -5.0977e-01,  9.1429e-01,  9.0877e-01, -2.8433e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 8


Tokens embeddings:
tensor([ 9.8949e-01, -4.2111e-02, -1.0000e+09,  1.1099e+00, -8.1559e-01,
        -1.0000e+09,  5.1179e-01,  3.6069e-01, -1.0000e+09,  1.2652e+00,
         9.5397e-01, -5.0977e-01,  9.1429e-01,  9.0877e-01, -2.8433e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01, -4.2111e-02, -1.0000e+09,  1.1099e+00, -8.1559e-01,
        -1.0000e+09,  5.1179e-01,  3.6069e-01, -1.0000e+09,  1.2652e+00,
         9.5397e-01, -5.0977e-01,  9.1429e-01,  9.0877e-01, -2.8433e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 9


Tokens embeddings:
tensor([ 9.8949e-01, -4.2111e-02, -1.0000e+09,  1.1099e+00, -8.1559e-01,
        -1.0000e+09,  5.1179e-01,  3.6069e-01, -1.0000e+09,  1.2652e+00,
         9.5397e-01, -5.0977e-01,  9.1429e-01,  9.0877e-01, -2.8433e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01, -4.2111e-02, -1.0000e+09,  1.1099e+00, -8.1559e-01,
        -1.0000e+09,  5.1179e-01,  3.6069e-01, -1.0000e+09,  1.2652e+00,
         9.5397e-01, -5.0977e-01,  9.1429e-01,  9.0877e-01, -2.8433e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 10


Tokens embeddings:
tensor([ 9.8949e-01, -4.2111e-02, -1.0000e+09,  1.1099e+00, -8.1559e-01,
        -1.0000e+09,  5.1179e-01,  3.6069e-01, -1.0000e+09,  1.2652e+00,
         9.5397e-01, -5.0977e-01,  9.1429e-01,  9.0877e-01, -2.8433e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01, -4.2111e-02, -1.0000e+09,  1.1099e+00, -8.1559e-01,
        -1.0000e+09,  5.1179e-01,  3.6069e-01, -1.0000e+09,  1.2652e+00,
         9.5397e-01, -5.0977e-01,  9.1429e-01,  9.0877e-01, -2.8433e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 11


Tokens embeddings:
tensor([ 9.8949e-01, -4.2111e-02, -1.0000e+09,  1.1099e+00, -8.1559e-01,
        -1.0000e+09,  5.1179e-01,  3.6069e-01, -1.0000e+09,  1.2652e+00,
         9.5397e-01, -5.0977e-01,  9.1429e-01,  9.0877e-01, -2.8433e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01, -4.2111e-02, -1.0000e+09,  1.1099e+00, -8.1559e-01,
        -1.0000e+09,  5.1179e-01,  3.6069e-01, -1.0000e+09,  1.2652e+00,
         9.5397e-01, -5.0977e-01,  9.1429e-01,  9.0877e-01, -2.8433e-01,
         9.0877e-01,  9.0877e-01]): 9

Layer 12


Tokens embeddings:
tensor([ 9.8949e-01, -4.2111e-02, -1.0000e+09,  1.1099e+00, -8.1559e-01,
        -1.0000e+09,  5.1179e-01,  3.6069e-01, -1.0000e+09,  1.2652e+00,
         9.5397e-01, -5.0977e-01,  9.1429e-01,  9.0877e-01, -2.8433e-01,
         9.0877e-01,  9.0877e-01])
arg max of tensor([ 9.8949e-01, -4.2111e-02, -1.0000e+09,  1.1099e+00, -8.1559e-01,
        -1.0000e+09,  5.1179e-01,  3.6069e-01, -1.0000e+09,  1.2652e+00,
         9.5397e-01, -5.0977e-01,  9.1429e-01,  9.0877e-01, -2.8433e-01,
         9.0877e-01,  9.0877e-01]): 9
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 1.0129, 0.7307, 0.9347, 0.7172, 0.8818, 0.7307, 0.7342, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.0129, 0.7307, 0.9347, 0.7172, 0.8818, 0.7307, 0.7342, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 8

Layer 2


Tokens embeddings:
tensor([0.9895, 1.0129, 0.7307, 0.9347, 0.7172, 0.8818, 0.7307, 0.7342, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.0129, 0.7307, 0.9347, 0.7172, 0.8818, 0.7307, 0.7342, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 8

Layer 3


Tokens embeddings:
tensor([0.9895, 1.0129, 0.7307, 0.9347, 0.7172, 0.8818, 0.7307, 0.7342, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.0129, 0.7307, 0.9347, 0.7172, 0.8818, 0.7307, 0.7342, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 8

Layer 4


Tokens embeddings:
tensor([0.9895, 1.0129, 0.7307, 0.9347, 0.7172, 0.8818, 0.7307, 0.7342, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.0129, 0.7307, 0.9347, 0.7172, 0.8818, 0.7307, 0.7342, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 8

Layer 5


Tokens embeddings:
tensor([0.9895, 1.0129, 0.7307, 0.9347, 0.7172, 0.8818, 0.7307, 0.7342, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.0129, 0.7307, 0.9347, 0.7172, 0.8818, 0.7307, 0.7342, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 8

Layer 6


Tokens embeddings:
tensor([0.9895, 1.0129, 0.7307, 0.9347, 0.7172, 0.8818, 0.7307, 0.7342, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.0129, 0.7307, 0.9347, 0.7172, 0.8818, 0.7307, 0.7342, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 8

Layer 7


Tokens embeddings:
tensor([0.9895, 1.0129, 0.7307, 0.9347, 0.7172, 0.8818, 0.7307, 0.7342, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.0129, 0.7307, 0.9347, 0.7172, 0.8818, 0.7307, 0.7342, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 8

Layer 8


Tokens embeddings:
tensor([0.9895, 1.0129, 0.7307, 0.9347, 0.7172, 0.8818, 0.7307, 0.7342, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.0129, 0.7307, 0.9347, 0.7172, 0.8818, 0.7307, 0.7342, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 8

Layer 9


Tokens embeddings:
tensor([0.9895, 1.0129, 0.7307, 0.9347, 0.7172, 0.8818, 0.7307, 0.7342, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.0129, 0.7307, 0.9347, 0.7172, 0.8818, 0.7307, 0.7342, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 8

Layer 10


Tokens embeddings:
tensor([0.9895, 1.0129, 0.7307, 0.9347, 0.7172, 0.8818, 0.7307, 0.7342, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.0129, 0.7307, 0.9347, 0.7172, 0.8818, 0.7307, 0.7342, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 8

Layer 11


Tokens embeddings:
tensor([0.9895, 1.0129, 0.7307, 0.9347, 0.7172, 0.8818, 0.7307, 0.7342, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.0129, 0.7307, 0.9347, 0.7172, 0.8818, 0.7307, 0.7342, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 8

Layer 12


Tokens embeddings:
tensor([0.9895, 1.0129, 0.7307, 0.9347, 0.7172, 0.8818, 0.7307, 0.7342, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.0129, 0.7307, 0.9347, 0.7172, 0.8818, 0.7307, 0.7342, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 8
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.5880, 1.2935, 0.7023, 1.4206, 0.8692, 1.3885, 0.5257, 1.1459,
        0.4242, 1.0796, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 1.2935, 0.7023, 1.4206, 0.8692, 1.3885, 0.5257, 1.1459,
        0.4242, 1.0796, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 4

Layer 2


Tokens embeddings:
tensor([0.9895, 0.5880, 1.2935, 0.7023, 1.4206, 0.8692, 1.3885, 0.5257, 1.1459,
        0.4242, 1.0796, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 1.2935, 0.7023, 1.4206, 0.8692, 1.3885, 0.5257, 1.1459,
        0.4242, 1.0796, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 4

Layer 3


Tokens embeddings:
tensor([0.9895, 0.5880, 1.2935, 0.7023, 1.4206, 0.8692, 1.3885, 0.5257, 1.1459,
        0.4242, 1.0796, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 1.2935, 0.7023, 1.4206, 0.8692, 1.3885, 0.5257, 1.1459,
        0.4242, 1.0796, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 4

Layer 4


Tokens embeddings:
tensor([0.9895, 0.5880, 1.2935, 0.7023, 1.4206, 0.8692, 1.3885, 0.5257, 1.1459,
        0.4242, 1.0796, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 1.2935, 0.7023, 1.4206, 0.8692, 1.3885, 0.5257, 1.1459,
        0.4242, 1.0796, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 4

Layer 5


Tokens embeddings:
tensor([0.9895, 0.5880, 1.2935, 0.7023, 1.4206, 0.8692, 1.3885, 0.5257, 1.1459,
        0.4242, 1.0796, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 1.2935, 0.7023, 1.4206, 0.8692, 1.3885, 0.5257, 1.1459,
        0.4242, 1.0796, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 4

Layer 6


Tokens embeddings:
tensor([0.9895, 0.5880, 1.2935, 0.7023, 1.4206, 0.8692, 1.3885, 0.5257, 1.1459,
        0.4242, 1.0796, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 1.2935, 0.7023, 1.4206, 0.8692, 1.3885, 0.5257, 1.1459,
        0.4242, 1.0796, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 4

Layer 7


Tokens embeddings:
tensor([0.9895, 0.5880, 1.2935, 0.7023, 1.4206, 0.8692, 1.3885, 0.5257, 1.1459,
        0.4242, 1.0796, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 1.2935, 0.7023, 1.4206, 0.8692, 1.3885, 0.5257, 1.1459,
        0.4242, 1.0796, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 4

Layer 8


Tokens embeddings:
tensor([0.9895, 0.5880, 1.2935, 0.7023, 1.4206, 0.8692, 1.3885, 0.5257, 1.1459,
        0.4242, 1.0796, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 1.2935, 0.7023, 1.4206, 0.8692, 1.3885, 0.5257, 1.1459,
        0.4242, 1.0796, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 4

Layer 9


Tokens embeddings:
tensor([0.9895, 0.5880, 1.2935, 0.7023, 1.4206, 0.8692, 1.3885, 0.5257, 1.1459,
        0.4242, 1.0796, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 1.2935, 0.7023, 1.4206, 0.8692, 1.3885, 0.5257, 1.1459,
        0.4242, 1.0796, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 4

Layer 10


Tokens embeddings:
tensor([0.9895, 0.5880, 1.2935, 0.7023, 1.4206, 0.8692, 1.3885, 0.5257, 1.1459,
        0.4242, 1.0796, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 1.2935, 0.7023, 1.4206, 0.8692, 1.3885, 0.5257, 1.1459,
        0.4242, 1.0796, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 4

Layer 11


Tokens embeddings:
tensor([0.9895, 0.5880, 1.2935, 0.7023, 1.4206, 0.8692, 1.3885, 0.5257, 1.1459,
        0.4242, 1.0796, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 1.2935, 0.7023, 1.4206, 0.8692, 1.3885, 0.5257, 1.1459,
        0.4242, 1.0796, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 4

Layer 12


Tokens embeddings:
tensor([0.9895, 0.5880, 1.2935, 0.7023, 1.4206, 0.8692, 1.3885, 0.5257, 1.1459,
        0.4242, 1.0796, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 1.2935, 0.7023, 1.4206, 0.8692, 1.3885, 0.5257, 1.1459,
        0.4242, 1.0796, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0129, 1.2955, 0.7857, 0.4968, 0.9523, 0.4242, 1.1183,
        1.2431, 1.2955, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0129, 1.2955, 0.7857, 0.4968, 0.9523, 0.4242, 1.1183,
        1.2431, 1.2955, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 2


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0129, 1.2955, 0.7857, 0.4968, 0.9523, 0.4242, 1.1183,
        1.2431, 1.2955, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0129, 1.2955, 0.7857, 0.4968, 0.9523, 0.4242, 1.1183,
        1.2431, 1.2955, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 3


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0129, 1.2955, 0.7857, 0.4968, 0.9523, 0.4242, 1.1183,
        1.2431, 1.2955, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0129, 1.2955, 0.7857, 0.4968, 0.9523, 0.4242, 1.1183,
        1.2431, 1.2955, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 4


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0129, 1.2955, 0.7857, 0.4968, 0.9523, 0.4242, 1.1183,
        1.2431, 1.2955, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0129, 1.2955, 0.7857, 0.4968, 0.9523, 0.4242, 1.1183,
        1.2431, 1.2955, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 5


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0129, 1.2955, 0.7857, 0.4968, 0.9523, 0.4242, 1.1183,
        1.2431, 1.2955, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0129, 1.2955, 0.7857, 0.4968, 0.9523, 0.4242, 1.1183,
        1.2431, 1.2955, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 6


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0129, 1.2955, 0.7857, 0.4968, 0.9523, 0.4242, 1.1183,
        1.2431, 1.2955, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0129, 1.2955, 0.7857, 0.4968, 0.9523, 0.4242, 1.1183,
        1.2431, 1.2955, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 7


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0129, 1.2955, 0.7857, 0.4968, 0.9523, 0.4242, 1.1183,
        1.2431, 1.2955, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0129, 1.2955, 0.7857, 0.4968, 0.9523, 0.4242, 1.1183,
        1.2431, 1.2955, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 8


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0129, 1.2955, 0.7857, 0.4968, 0.9523, 0.4242, 1.1183,
        1.2431, 1.2955, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0129, 1.2955, 0.7857, 0.4968, 0.9523, 0.4242, 1.1183,
        1.2431, 1.2955, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 9


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0129, 1.2955, 0.7857, 0.4968, 0.9523, 0.4242, 1.1183,
        1.2431, 1.2955, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0129, 1.2955, 0.7857, 0.4968, 0.9523, 0.4242, 1.1183,
        1.2431, 1.2955, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 10


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0129, 1.2955, 0.7857, 0.4968, 0.9523, 0.4242, 1.1183,
        1.2431, 1.2955, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0129, 1.2955, 0.7857, 0.4968, 0.9523, 0.4242, 1.1183,
        1.2431, 1.2955, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 11


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0129, 1.2955, 0.7857, 0.4968, 0.9523, 0.4242, 1.1183,
        1.2431, 1.2955, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0129, 1.2955, 0.7857, 0.4968, 0.9523, 0.4242, 1.1183,
        1.2431, 1.2955, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 12


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0129, 1.2955, 0.7857, 0.4968, 0.9523, 0.4242, 1.1183,
        1.2431, 1.2955, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0129, 1.2955, 0.7857, 0.4968, 0.9523, 0.4242, 1.1183,
        1.2431, 1.2955, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.4242, 0.5115, 1.2046, 0.9340, 0.8715, 0.5378, 0.4563, 0.4242,
        1.3157, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.5115, 1.2046, 0.9340, 0.8715, 0.5378, 0.4563, 0.4242,
        1.3157, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 2


Tokens embeddings:
tensor([0.9895, 0.4242, 0.5115, 1.2046, 0.9340, 0.8715, 0.5378, 0.4563, 0.4242,
        1.3157, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.5115, 1.2046, 0.9340, 0.8715, 0.5378, 0.4563, 0.4242,
        1.3157, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 3


Tokens embeddings:
tensor([0.9895, 0.4242, 0.5115, 1.2046, 0.9340, 0.8715, 0.5378, 0.4563, 0.4242,
        1.3157, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.5115, 1.2046, 0.9340, 0.8715, 0.5378, 0.4563, 0.4242,
        1.3157, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 4


Tokens embeddings:
tensor([0.9895, 0.4242, 0.5115, 1.2046, 0.9340, 0.8715, 0.5378, 0.4563, 0.4242,
        1.3157, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.5115, 1.2046, 0.9340, 0.8715, 0.5378, 0.4563, 0.4242,
        1.3157, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 5


Tokens embeddings:
tensor([0.9895, 0.4242, 0.5115, 1.2046, 0.9340, 0.8715, 0.5378, 0.4563, 0.4242,
        1.3157, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.5115, 1.2046, 0.9340, 0.8715, 0.5378, 0.4563, 0.4242,
        1.3157, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 6


Tokens embeddings:
tensor([0.9895, 0.4242, 0.5115, 1.2046, 0.9340, 0.8715, 0.5378, 0.4563, 0.4242,
        1.3157, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.5115, 1.2046, 0.9340, 0.8715, 0.5378, 0.4563, 0.4242,
        1.3157, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 7


Tokens embeddings:
tensor([0.9895, 0.4242, 0.5115, 1.2046, 0.9340, 0.8715, 0.5378, 0.4563, 0.4242,
        1.3157, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.5115, 1.2046, 0.9340, 0.8715, 0.5378, 0.4563, 0.4242,
        1.3157, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 8


Tokens embeddings:
tensor([0.9895, 0.4242, 0.5115, 1.2046, 0.9340, 0.8715, 0.5378, 0.4563, 0.4242,
        1.3157, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.5115, 1.2046, 0.9340, 0.8715, 0.5378, 0.4563, 0.4242,
        1.3157, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 9


Tokens embeddings:
tensor([0.9895, 0.4242, 0.5115, 1.2046, 0.9340, 0.8715, 0.5378, 0.4563, 0.4242,
        1.3157, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.5115, 1.2046, 0.9340, 0.8715, 0.5378, 0.4563, 0.4242,
        1.3157, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 10


Tokens embeddings:
tensor([0.9895, 0.4242, 0.5115, 1.2046, 0.9340, 0.8715, 0.5378, 0.4563, 0.4242,
        1.3157, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.5115, 1.2046, 0.9340, 0.8715, 0.5378, 0.4563, 0.4242,
        1.3157, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 11


Tokens embeddings:
tensor([0.9895, 0.4242, 0.5115, 1.2046, 0.9340, 0.8715, 0.5378, 0.4563, 0.4242,
        1.3157, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.5115, 1.2046, 0.9340, 0.8715, 0.5378, 0.4563, 0.4242,
        1.3157, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 12


Tokens embeddings:
tensor([0.9895, 0.4242, 0.5115, 1.2046, 0.9340, 0.8715, 0.5378, 0.4563, 0.4242,
        1.3157, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.5115, 1.2046, 0.9340, 0.8715, 0.5378, 0.4563, 0.4242,
        1.3157, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.4242, 0.7528, 0.5879, 0.4242, 0.6256, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.7528, 0.5879, 0.4242, 0.6256, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 2


Tokens embeddings:
tensor([0.9895, 0.4242, 0.7528, 0.5879, 0.4242, 0.6256, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.7528, 0.5879, 0.4242, 0.6256, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 3


Tokens embeddings:
tensor([0.9895, 0.4242, 0.7528, 0.5879, 0.4242, 0.6256, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.7528, 0.5879, 0.4242, 0.6256, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 4


Tokens embeddings:
tensor([0.9895, 0.4242, 0.7528, 0.5879, 0.4242, 0.6256, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.7528, 0.5879, 0.4242, 0.6256, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 5


Tokens embeddings:
tensor([0.9895, 0.4242, 0.7528, 0.5879, 0.4242, 0.6256, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.7528, 0.5879, 0.4242, 0.6256, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 6


Tokens embeddings:
tensor([0.9895, 0.4242, 0.7528, 0.5879, 0.4242, 0.6256, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.7528, 0.5879, 0.4242, 0.6256, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 7


Tokens embeddings:
tensor([0.9895, 0.4242, 0.7528, 0.5879, 0.4242, 0.6256, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.7528, 0.5879, 0.4242, 0.6256, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 8


Tokens embeddings:
tensor([0.9895, 0.4242, 0.7528, 0.5879, 0.4242, 0.6256, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.7528, 0.5879, 0.4242, 0.6256, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 9


Tokens embeddings:
tensor([0.9895, 0.4242, 0.7528, 0.5879, 0.4242, 0.6256, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.7528, 0.5879, 0.4242, 0.6256, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 10


Tokens embeddings:
tensor([0.9895, 0.4242, 0.7528, 0.5879, 0.4242, 0.6256, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.7528, 0.5879, 0.4242, 0.6256, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 11


Tokens embeddings:
tensor([0.9895, 0.4242, 0.7528, 0.5879, 0.4242, 0.6256, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.7528, 0.5879, 0.4242, 0.6256, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 12


Tokens embeddings:
tensor([0.9895, 0.4242, 0.7528, 0.5879, 0.4242, 0.6256, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.7528, 0.5879, 0.4242, 0.6256, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01,  7.7243e-01,  5.5505e-01, -1.0000e+09,
         1.2665e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01,  7.7243e-01,  5.5505e-01, -1.0000e+09,
         1.2665e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 2


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01,  7.7243e-01,  5.5505e-01, -1.0000e+09,
         1.2665e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01,  7.7243e-01,  5.5505e-01, -1.0000e+09,
         1.2665e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 3


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01,  7.7243e-01,  5.5505e-01, -1.0000e+09,
         1.2665e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01,  7.7243e-01,  5.5505e-01, -1.0000e+09,
         1.2665e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 4


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01,  7.7243e-01,  5.5505e-01, -1.0000e+09,
         1.2665e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01,  7.7243e-01,  5.5505e-01, -1.0000e+09,
         1.2665e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 5


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01,  7.7243e-01,  5.5505e-01, -1.0000e+09,
         1.2665e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01,  7.7243e-01,  5.5505e-01, -1.0000e+09,
         1.2665e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 6


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01,  7.7243e-01,  5.5505e-01, -1.0000e+09,
         1.2665e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01,  7.7243e-01,  5.5505e-01, -1.0000e+09,
         1.2665e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 7


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01,  7.7243e-01,  5.5505e-01, -1.0000e+09,
         1.2665e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01,  7.7243e-01,  5.5505e-01, -1.0000e+09,
         1.2665e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 8


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01,  7.7243e-01,  5.5505e-01, -1.0000e+09,
         1.2665e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01,  7.7243e-01,  5.5505e-01, -1.0000e+09,
         1.2665e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 9


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01,  7.7243e-01,  5.5505e-01, -1.0000e+09,
         1.2665e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01,  7.7243e-01,  5.5505e-01, -1.0000e+09,
         1.2665e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 10


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01,  7.7243e-01,  5.5505e-01, -1.0000e+09,
         1.2665e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01,  7.7243e-01,  5.5505e-01, -1.0000e+09,
         1.2665e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 11


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01,  7.7243e-01,  5.5505e-01, -1.0000e+09,
         1.2665e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01,  7.7243e-01,  5.5505e-01, -1.0000e+09,
         1.2665e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5

Layer 12


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01,  7.7243e-01,  5.5505e-01, -1.0000e+09,
         1.2665e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01,  7.7243e-01,  5.5505e-01, -1.0000e+09,
         1.2665e+00, -5.0977e-01,  9.1429e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01,
         9.0877e-01, -2.8433e-01]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 9.8949e-01,  5.8813e-01,  1.0955e+00,  4.6955e-01, -1.0000e+09,
         1.0524e+00,  1.1173e+00,  7.7862e-01,  7.8942e-01, -1.0000e+09,
         1.1193e+00, -3.7976e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  5.8813e-01,  1.0955e+00,  4.6955e-01, -1.0000e+09,
         1.0524e+00,  1.1173e+00,  7.7862e-01,  7.8942e-01, -1.0000e+09,
         1.1193e+00, -3.7976e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 10

Layer 2


Tokens embeddings:
tensor([ 9.8949e-01,  5.8813e-01,  1.0955e+00,  4.6955e-01, -1.0000e+09,
         1.0524e+00,  1.1173e+00,  7.7862e-01,  7.8942e-01, -1.0000e+09,
         1.1193e+00, -3.7976e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  5.8813e-01,  1.0955e+00,  4.6955e-01, -1.0000e+09,
         1.0524e+00,  1.1173e+00,  7.7862e-01,  7.8942e-01, -1.0000e+09,
         1.1193e+00, -3.7976e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 10

Layer 3


Tokens embeddings:
tensor([ 9.8949e-01,  5.8813e-01,  1.0955e+00,  4.6955e-01, -1.0000e+09,
         1.0524e+00,  1.1173e+00,  7.7862e-01,  7.8942e-01, -1.0000e+09,
         1.1193e+00, -3.7976e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  5.8813e-01,  1.0955e+00,  4.6955e-01, -1.0000e+09,
         1.0524e+00,  1.1173e+00,  7.7862e-01,  7.8942e-01, -1.0000e+09,
         1.1193e+00, -3.7976e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 10

Layer 4


Tokens embeddings:
tensor([ 9.8949e-01,  5.8813e-01,  1.0955e+00,  4.6955e-01, -1.0000e+09,
         1.0524e+00,  1.1173e+00,  7.7862e-01,  7.8942e-01, -1.0000e+09,
         1.1193e+00, -3.7976e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  5.8813e-01,  1.0955e+00,  4.6955e-01, -1.0000e+09,
         1.0524e+00,  1.1173e+00,  7.7862e-01,  7.8942e-01, -1.0000e+09,
         1.1193e+00, -3.7976e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 10

Layer 5


Tokens embeddings:
tensor([ 9.8949e-01,  5.8813e-01,  1.0955e+00,  4.6955e-01, -1.0000e+09,
         1.0524e+00,  1.1173e+00,  7.7862e-01,  7.8942e-01, -1.0000e+09,
         1.1193e+00, -3.7976e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  5.8813e-01,  1.0955e+00,  4.6955e-01, -1.0000e+09,
         1.0524e+00,  1.1173e+00,  7.7862e-01,  7.8942e-01, -1.0000e+09,
         1.1193e+00, -3.7976e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 10

Layer 6


Tokens embeddings:
tensor([ 9.8949e-01,  5.8813e-01,  1.0955e+00,  4.6955e-01, -1.0000e+09,
         1.0524e+00,  1.1173e+00,  7.7862e-01,  7.8942e-01, -1.0000e+09,
         1.1193e+00, -3.7976e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  5.8813e-01,  1.0955e+00,  4.6955e-01, -1.0000e+09,
         1.0524e+00,  1.1173e+00,  7.7862e-01,  7.8942e-01, -1.0000e+09,
         1.1193e+00, -3.7976e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 10

Layer 7


Tokens embeddings:
tensor([ 9.8949e-01,  5.8813e-01,  1.0955e+00,  4.6955e-01, -1.0000e+09,
         1.0524e+00,  1.1173e+00,  7.7862e-01,  7.8942e-01, -1.0000e+09,
         1.1193e+00, -3.7976e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  5.8813e-01,  1.0955e+00,  4.6955e-01, -1.0000e+09,
         1.0524e+00,  1.1173e+00,  7.7862e-01,  7.8942e-01, -1.0000e+09,
         1.1193e+00, -3.7976e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 10

Layer 8


Tokens embeddings:
tensor([ 9.8949e-01,  5.8813e-01,  1.0955e+00,  4.6955e-01, -1.0000e+09,
         1.0524e+00,  1.1173e+00,  7.7862e-01,  7.8942e-01, -1.0000e+09,
         1.1193e+00, -3.7976e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  5.8813e-01,  1.0955e+00,  4.6955e-01, -1.0000e+09,
         1.0524e+00,  1.1173e+00,  7.7862e-01,  7.8942e-01, -1.0000e+09,
         1.1193e+00, -3.7976e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 10

Layer 9


Tokens embeddings:
tensor([ 9.8949e-01,  5.8813e-01,  1.0955e+00,  4.6955e-01, -1.0000e+09,
         1.0524e+00,  1.1173e+00,  7.7862e-01,  7.8942e-01, -1.0000e+09,
         1.1193e+00, -3.7976e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  5.8813e-01,  1.0955e+00,  4.6955e-01, -1.0000e+09,
         1.0524e+00,  1.1173e+00,  7.7862e-01,  7.8942e-01, -1.0000e+09,
         1.1193e+00, -3.7976e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 10

Layer 10


Tokens embeddings:
tensor([ 9.8949e-01,  5.8813e-01,  1.0955e+00,  4.6955e-01, -1.0000e+09,
         1.0524e+00,  1.1173e+00,  7.7862e-01,  7.8942e-01, -1.0000e+09,
         1.1193e+00, -3.7976e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  5.8813e-01,  1.0955e+00,  4.6955e-01, -1.0000e+09,
         1.0524e+00,  1.1173e+00,  7.7862e-01,  7.8942e-01, -1.0000e+09,
         1.1193e+00, -3.7976e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 10

Layer 11


Tokens embeddings:
tensor([ 9.8949e-01,  5.8813e-01,  1.0955e+00,  4.6955e-01, -1.0000e+09,
         1.0524e+00,  1.1173e+00,  7.7862e-01,  7.8942e-01, -1.0000e+09,
         1.1193e+00, -3.7976e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  5.8813e-01,  1.0955e+00,  4.6955e-01, -1.0000e+09,
         1.0524e+00,  1.1173e+00,  7.7862e-01,  7.8942e-01, -1.0000e+09,
         1.1193e+00, -3.7976e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 10

Layer 12


Tokens embeddings:
tensor([ 9.8949e-01,  5.8813e-01,  1.0955e+00,  4.6955e-01, -1.0000e+09,
         1.0524e+00,  1.1173e+00,  7.7862e-01,  7.8942e-01, -1.0000e+09,
         1.1193e+00, -3.7976e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  5.8813e-01,  1.0955e+00,  4.6955e-01, -1.0000e+09,
         1.0524e+00,  1.1173e+00,  7.7862e-01,  7.8942e-01, -1.0000e+09,
         1.1193e+00, -3.7976e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 10
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.5281, 0.6242, 1.1698, 0.8363, 1.1173, 0.8995, 0.8639, 1.1698,
        0.5195, 0.4242, 0.7131, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5281, 0.6242, 1.1698, 0.8363, 1.1173, 0.8995, 0.8639, 1.1698,
        0.5195, 0.4242, 0.7131, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 3

Layer 2


Tokens embeddings:
tensor([0.9895, 0.5281, 0.6242, 1.1698, 0.8363, 1.1173, 0.8995, 0.8639, 1.1698,
        0.5195, 0.4242, 0.7131, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5281, 0.6242, 1.1698, 0.8363, 1.1173, 0.8995, 0.8639, 1.1698,
        0.5195, 0.4242, 0.7131, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 3

Layer 3


Tokens embeddings:
tensor([0.9895, 0.5281, 0.6242, 1.1698, 0.8363, 1.1173, 0.8995, 0.8639, 1.1698,
        0.5195, 0.4242, 0.7131, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5281, 0.6242, 1.1698, 0.8363, 1.1173, 0.8995, 0.8639, 1.1698,
        0.5195, 0.4242, 0.7131, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 3

Layer 4


Tokens embeddings:
tensor([0.9895, 0.5281, 0.6242, 1.1698, 0.8363, 1.1173, 0.8995, 0.8639, 1.1698,
        0.5195, 0.4242, 0.7131, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5281, 0.6242, 1.1698, 0.8363, 1.1173, 0.8995, 0.8639, 1.1698,
        0.5195, 0.4242, 0.7131, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 3

Layer 5


Tokens embeddings:
tensor([0.9895, 0.5281, 0.6242, 1.1698, 0.8363, 1.1173, 0.8995, 0.8639, 1.1698,
        0.5195, 0.4242, 0.7131, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5281, 0.6242, 1.1698, 0.8363, 1.1173, 0.8995, 0.8639, 1.1698,
        0.5195, 0.4242, 0.7131, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 3

Layer 6


Tokens embeddings:
tensor([0.9895, 0.5281, 0.6242, 1.1698, 0.8363, 1.1173, 0.8995, 0.8639, 1.1698,
        0.5195, 0.4242, 0.7131, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5281, 0.6242, 1.1698, 0.8363, 1.1173, 0.8995, 0.8639, 1.1698,
        0.5195, 0.4242, 0.7131, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 3

Layer 7


Tokens embeddings:
tensor([0.9895, 0.5281, 0.6242, 1.1698, 0.8363, 1.1173, 0.8995, 0.8639, 1.1698,
        0.5195, 0.4242, 0.7131, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5281, 0.6242, 1.1698, 0.8363, 1.1173, 0.8995, 0.8639, 1.1698,
        0.5195, 0.4242, 0.7131, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 3

Layer 8


Tokens embeddings:
tensor([0.9895, 0.5281, 0.6242, 1.1698, 0.8363, 1.1173, 0.8995, 0.8639, 1.1698,
        0.5195, 0.4242, 0.7131, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5281, 0.6242, 1.1698, 0.8363, 1.1173, 0.8995, 0.8639, 1.1698,
        0.5195, 0.4242, 0.7131, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 3

Layer 9


Tokens embeddings:
tensor([0.9895, 0.5281, 0.6242, 1.1698, 0.8363, 1.1173, 0.8995, 0.8639, 1.1698,
        0.5195, 0.4242, 0.7131, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5281, 0.6242, 1.1698, 0.8363, 1.1173, 0.8995, 0.8639, 1.1698,
        0.5195, 0.4242, 0.7131, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 3

Layer 10


Tokens embeddings:
tensor([0.9895, 0.5281, 0.6242, 1.1698, 0.8363, 1.1173, 0.8995, 0.8639, 1.1698,
        0.5195, 0.4242, 0.7131, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5281, 0.6242, 1.1698, 0.8363, 1.1173, 0.8995, 0.8639, 1.1698,
        0.5195, 0.4242, 0.7131, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 3

Layer 11


Tokens embeddings:
tensor([0.9895, 0.5281, 0.6242, 1.1698, 0.8363, 1.1173, 0.8995, 0.8639, 1.1698,
        0.5195, 0.4242, 0.7131, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5281, 0.6242, 1.1698, 0.8363, 1.1173, 0.8995, 0.8639, 1.1698,
        0.5195, 0.4242, 0.7131, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 3

Layer 12


Tokens embeddings:
tensor([0.9895, 0.5281, 0.6242, 1.1698, 0.8363, 1.1173, 0.8995, 0.8639, 1.1698,
        0.5195, 0.4242, 0.7131, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5281, 0.6242, 1.1698, 0.8363, 1.1173, 0.8995, 0.8639, 1.1698,
        0.5195, 0.4242, 0.7131, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.9102, 0.6474, 0.5880, 0.9236, 1.1698, 0.5378, 0.9295, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9102, 0.6474, 0.5880, 0.9236, 1.1698, 0.5378, 0.9295, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 2


Tokens embeddings:
tensor([0.9895, 0.9102, 0.6474, 0.5880, 0.9236, 1.1698, 0.5378, 0.9295, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9102, 0.6474, 0.5880, 0.9236, 1.1698, 0.5378, 0.9295, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 3


Tokens embeddings:
tensor([0.9895, 0.9102, 0.6474, 0.5880, 0.9236, 1.1698, 0.5378, 0.9295, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9102, 0.6474, 0.5880, 0.9236, 1.1698, 0.5378, 0.9295, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 4


Tokens embeddings:
tensor([0.9895, 0.9102, 0.6474, 0.5880, 0.9236, 1.1698, 0.5378, 0.9295, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9102, 0.6474, 0.5880, 0.9236, 1.1698, 0.5378, 0.9295, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 5


Tokens embeddings:
tensor([0.9895, 0.9102, 0.6474, 0.5880, 0.9236, 1.1698, 0.5378, 0.9295, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9102, 0.6474, 0.5880, 0.9236, 1.1698, 0.5378, 0.9295, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 6


Tokens embeddings:
tensor([0.9895, 0.9102, 0.6474, 0.5880, 0.9236, 1.1698, 0.5378, 0.9295, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9102, 0.6474, 0.5880, 0.9236, 1.1698, 0.5378, 0.9295, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 7


Tokens embeddings:
tensor([0.9895, 0.9102, 0.6474, 0.5880, 0.9236, 1.1698, 0.5378, 0.9295, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9102, 0.6474, 0.5880, 0.9236, 1.1698, 0.5378, 0.9295, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 8


Tokens embeddings:
tensor([0.9895, 0.9102, 0.6474, 0.5880, 0.9236, 1.1698, 0.5378, 0.9295, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9102, 0.6474, 0.5880, 0.9236, 1.1698, 0.5378, 0.9295, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 9


Tokens embeddings:
tensor([0.9895, 0.9102, 0.6474, 0.5880, 0.9236, 1.1698, 0.5378, 0.9295, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9102, 0.6474, 0.5880, 0.9236, 1.1698, 0.5378, 0.9295, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 10


Tokens embeddings:
tensor([0.9895, 0.9102, 0.6474, 0.5880, 0.9236, 1.1698, 0.5378, 0.9295, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9102, 0.6474, 0.5880, 0.9236, 1.1698, 0.5378, 0.9295, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 11


Tokens embeddings:
tensor([0.9895, 0.9102, 0.6474, 0.5880, 0.9236, 1.1698, 0.5378, 0.9295, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9102, 0.6474, 0.5880, 0.9236, 1.1698, 0.5378, 0.9295, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 12


Tokens embeddings:
tensor([0.9895, 0.9102, 0.6474, 0.5880, 0.9236, 1.1698, 0.5378, 0.9295, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.9102, 0.6474, 0.5880, 0.9236, 1.1698, 0.5378, 0.9295, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 9.8949e-01, -2.2710e-01,  1.3626e+00,  4.2417e-01, -1.0000e+09,
         4.1948e-01,  8.5875e-01,  9.0770e-01,  1.0061e+00, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -2.2710e-01,  1.3626e+00,  4.2417e-01, -1.0000e+09,
         4.1948e-01,  8.5875e-01,  9.0770e-01,  1.0061e+00, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 2

Layer 2


Tokens embeddings:
tensor([ 9.8949e-01, -2.2710e-01,  1.3626e+00,  4.2417e-01, -1.0000e+09,
         4.1948e-01,  8.5875e-01,  9.0770e-01,  1.0061e+00, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -2.2710e-01,  1.3626e+00,  4.2417e-01, -1.0000e+09,
         4.1948e-01,  8.5875e-01,  9.0770e-01,  1.0061e+00, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 2

Layer 3


Tokens embeddings:
tensor([ 9.8949e-01, -2.2710e-01,  1.3626e+00,  4.2417e-01, -1.0000e+09,
         4.1948e-01,  8.5875e-01,  9.0770e-01,  1.0061e+00, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -2.2710e-01,  1.3626e+00,  4.2417e-01, -1.0000e+09,
         4.1948e-01,  8.5875e-01,  9.0770e-01,  1.0061e+00, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 2

Layer 4


Tokens embeddings:
tensor([ 9.8949e-01, -2.2710e-01,  1.3626e+00,  4.2417e-01, -1.0000e+09,
         4.1948e-01,  8.5875e-01,  9.0770e-01,  1.0061e+00, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -2.2710e-01,  1.3626e+00,  4.2417e-01, -1.0000e+09,
         4.1948e-01,  8.5875e-01,  9.0770e-01,  1.0061e+00, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 2

Layer 5


Tokens embeddings:
tensor([ 9.8949e-01, -2.2710e-01,  1.3626e+00,  4.2417e-01, -1.0000e+09,
         4.1948e-01,  8.5875e-01,  9.0770e-01,  1.0061e+00, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -2.2710e-01,  1.3626e+00,  4.2417e-01, -1.0000e+09,
         4.1948e-01,  8.5875e-01,  9.0770e-01,  1.0061e+00, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 2

Layer 6


Tokens embeddings:
tensor([ 9.8949e-01, -2.2710e-01,  1.3626e+00,  4.2417e-01, -1.0000e+09,
         4.1948e-01,  8.5875e-01,  9.0770e-01,  1.0061e+00, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -2.2710e-01,  1.3626e+00,  4.2417e-01, -1.0000e+09,
         4.1948e-01,  8.5875e-01,  9.0770e-01,  1.0061e+00, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 2

Layer 7


Tokens embeddings:
tensor([ 9.8949e-01, -2.2710e-01,  1.3626e+00,  4.2417e-01, -1.0000e+09,
         4.1948e-01,  8.5875e-01,  9.0770e-01,  1.0061e+00, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -2.2710e-01,  1.3626e+00,  4.2417e-01, -1.0000e+09,
         4.1948e-01,  8.5875e-01,  9.0770e-01,  1.0061e+00, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 2

Layer 8


Tokens embeddings:
tensor([ 9.8949e-01, -2.2710e-01,  1.3626e+00,  4.2417e-01, -1.0000e+09,
         4.1948e-01,  8.5875e-01,  9.0770e-01,  1.0061e+00, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -2.2710e-01,  1.3626e+00,  4.2417e-01, -1.0000e+09,
         4.1948e-01,  8.5875e-01,  9.0770e-01,  1.0061e+00, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 2

Layer 9


Tokens embeddings:
tensor([ 9.8949e-01, -2.2710e-01,  1.3626e+00,  4.2417e-01, -1.0000e+09,
         4.1948e-01,  8.5875e-01,  9.0770e-01,  1.0061e+00, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -2.2710e-01,  1.3626e+00,  4.2417e-01, -1.0000e+09,
         4.1948e-01,  8.5875e-01,  9.0770e-01,  1.0061e+00, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 2

Layer 10


Tokens embeddings:
tensor([ 9.8949e-01, -2.2710e-01,  1.3626e+00,  4.2417e-01, -1.0000e+09,
         4.1948e-01,  8.5875e-01,  9.0770e-01,  1.0061e+00, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -2.2710e-01,  1.3626e+00,  4.2417e-01, -1.0000e+09,
         4.1948e-01,  8.5875e-01,  9.0770e-01,  1.0061e+00, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 2

Layer 11


Tokens embeddings:
tensor([ 9.8949e-01, -2.2710e-01,  1.3626e+00,  4.2417e-01, -1.0000e+09,
         4.1948e-01,  8.5875e-01,  9.0770e-01,  1.0061e+00, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -2.2710e-01,  1.3626e+00,  4.2417e-01, -1.0000e+09,
         4.1948e-01,  8.5875e-01,  9.0770e-01,  1.0061e+00, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 2

Layer 12


Tokens embeddings:
tensor([ 9.8949e-01, -2.2710e-01,  1.3626e+00,  4.2417e-01, -1.0000e+09,
         4.1948e-01,  8.5875e-01,  9.0770e-01,  1.0061e+00, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -2.2710e-01,  1.3626e+00,  4.2417e-01, -1.0000e+09,
         4.1948e-01,  8.5875e-01,  9.0770e-01,  1.0061e+00, -1.0000e+09,
         9.1429e-01, -2.8433e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 9.8949e-01, -4.4469e-02, -1.0000e+09,  1.2082e+00, -9.7390e-02,
        -1.0000e+09,  1.3055e+00, -6.0056e-01, -1.0000e+09,  9.1429e-01,
         9.0877e-01,  9.0877e-01,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -4.4469e-02, -1.0000e+09,  1.2082e+00, -9.7390e-02,
        -1.0000e+09,  1.3055e+00, -6.0056e-01, -1.0000e+09,  9.1429e-01,
         9.0877e-01,  9.0877e-01,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 6

Layer 2


Tokens embeddings:
tensor([ 9.8949e-01, -4.4469e-02, -1.0000e+09,  1.2082e+00, -9.7390e-02,
        -1.0000e+09,  1.3055e+00, -6.0056e-01, -1.0000e+09,  9.1429e-01,
         9.0877e-01,  9.0877e-01,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -4.4469e-02, -1.0000e+09,  1.2082e+00, -9.7390e-02,
        -1.0000e+09,  1.3055e+00, -6.0056e-01, -1.0000e+09,  9.1429e-01,
         9.0877e-01,  9.0877e-01,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 6

Layer 3


Tokens embeddings:
tensor([ 9.8949e-01, -4.4469e-02, -1.0000e+09,  1.2082e+00, -9.7390e-02,
        -1.0000e+09,  1.3055e+00, -6.0056e-01, -1.0000e+09,  9.1429e-01,
         9.0877e-01,  9.0877e-01,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -4.4469e-02, -1.0000e+09,  1.2082e+00, -9.7390e-02,
        -1.0000e+09,  1.3055e+00, -6.0056e-01, -1.0000e+09,  9.1429e-01,
         9.0877e-01,  9.0877e-01,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 6

Layer 4


Tokens embeddings:
tensor([ 9.8949e-01, -4.4469e-02, -1.0000e+09,  1.2082e+00, -9.7390e-02,
        -1.0000e+09,  1.3055e+00, -6.0056e-01, -1.0000e+09,  9.1429e-01,
         9.0877e-01,  9.0877e-01,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -4.4469e-02, -1.0000e+09,  1.2082e+00, -9.7390e-02,
        -1.0000e+09,  1.3055e+00, -6.0056e-01, -1.0000e+09,  9.1429e-01,
         9.0877e-01,  9.0877e-01,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 6

Layer 5


Tokens embeddings:
tensor([ 9.8949e-01, -4.4469e-02, -1.0000e+09,  1.2082e+00, -9.7390e-02,
        -1.0000e+09,  1.3055e+00, -6.0056e-01, -1.0000e+09,  9.1429e-01,
         9.0877e-01,  9.0877e-01,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -4.4469e-02, -1.0000e+09,  1.2082e+00, -9.7390e-02,
        -1.0000e+09,  1.3055e+00, -6.0056e-01, -1.0000e+09,  9.1429e-01,
         9.0877e-01,  9.0877e-01,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 6

Layer 6


Tokens embeddings:
tensor([ 9.8949e-01, -4.4469e-02, -1.0000e+09,  1.2082e+00, -9.7390e-02,
        -1.0000e+09,  1.3055e+00, -6.0056e-01, -1.0000e+09,  9.1429e-01,
         9.0877e-01,  9.0877e-01,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -4.4469e-02, -1.0000e+09,  1.2082e+00, -9.7390e-02,
        -1.0000e+09,  1.3055e+00, -6.0056e-01, -1.0000e+09,  9.1429e-01,
         9.0877e-01,  9.0877e-01,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 6

Layer 7


Tokens embeddings:
tensor([ 9.8949e-01, -4.4469e-02, -1.0000e+09,  1.2082e+00, -9.7390e-02,
        -1.0000e+09,  1.3055e+00, -6.0056e-01, -1.0000e+09,  9.1429e-01,
         9.0877e-01,  9.0877e-01,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -4.4469e-02, -1.0000e+09,  1.2082e+00, -9.7390e-02,
        -1.0000e+09,  1.3055e+00, -6.0056e-01, -1.0000e+09,  9.1429e-01,
         9.0877e-01,  9.0877e-01,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 6

Layer 8


Tokens embeddings:
tensor([ 9.8949e-01, -4.4469e-02, -1.0000e+09,  1.2082e+00, -9.7390e-02,
        -1.0000e+09,  1.3055e+00, -6.0056e-01, -1.0000e+09,  9.1429e-01,
         9.0877e-01,  9.0877e-01,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -4.4469e-02, -1.0000e+09,  1.2082e+00, -9.7390e-02,
        -1.0000e+09,  1.3055e+00, -6.0056e-01, -1.0000e+09,  9.1429e-01,
         9.0877e-01,  9.0877e-01,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 6

Layer 9


Tokens embeddings:
tensor([ 9.8949e-01, -4.4469e-02, -1.0000e+09,  1.2082e+00, -9.7390e-02,
        -1.0000e+09,  1.3055e+00, -6.0056e-01, -1.0000e+09,  9.1429e-01,
         9.0877e-01,  9.0877e-01,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -4.4469e-02, -1.0000e+09,  1.2082e+00, -9.7390e-02,
        -1.0000e+09,  1.3055e+00, -6.0056e-01, -1.0000e+09,  9.1429e-01,
         9.0877e-01,  9.0877e-01,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 6

Layer 10


Tokens embeddings:
tensor([ 9.8949e-01, -4.4469e-02, -1.0000e+09,  1.2082e+00, -9.7390e-02,
        -1.0000e+09,  1.3055e+00, -6.0056e-01, -1.0000e+09,  9.1429e-01,
         9.0877e-01,  9.0877e-01,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -4.4469e-02, -1.0000e+09,  1.2082e+00, -9.7390e-02,
        -1.0000e+09,  1.3055e+00, -6.0056e-01, -1.0000e+09,  9.1429e-01,
         9.0877e-01,  9.0877e-01,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 6

Layer 11


Tokens embeddings:
tensor([ 9.8949e-01, -4.4469e-02, -1.0000e+09,  1.2082e+00, -9.7390e-02,
        -1.0000e+09,  1.3055e+00, -6.0056e-01, -1.0000e+09,  9.1429e-01,
         9.0877e-01,  9.0877e-01,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -4.4469e-02, -1.0000e+09,  1.2082e+00, -9.7390e-02,
        -1.0000e+09,  1.3055e+00, -6.0056e-01, -1.0000e+09,  9.1429e-01,
         9.0877e-01,  9.0877e-01,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 6

Layer 12


Tokens embeddings:
tensor([ 9.8949e-01, -4.4469e-02, -1.0000e+09,  1.2082e+00, -9.7390e-02,
        -1.0000e+09,  1.3055e+00, -6.0056e-01, -1.0000e+09,  9.1429e-01,
         9.0877e-01,  9.0877e-01,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -4.4469e-02, -1.0000e+09,  1.2082e+00, -9.7390e-02,
        -1.0000e+09,  1.3055e+00, -6.0056e-01, -1.0000e+09,  9.1429e-01,
         9.0877e-01,  9.0877e-01,  9.0877e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 6
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4285, 1.4318, 0.6340, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4285, 1.4318, 0.6340, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 2


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4285, 1.4318, 0.6340, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4285, 1.4318, 0.6340, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 3


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4285, 1.4318, 0.6340, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4285, 1.4318, 0.6340, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 4


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4285, 1.4318, 0.6340, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4285, 1.4318, 0.6340, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 5


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4285, 1.4318, 0.6340, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4285, 1.4318, 0.6340, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 6


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4285, 1.4318, 0.6340, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4285, 1.4318, 0.6340, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 7


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4285, 1.4318, 0.6340, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4285, 1.4318, 0.6340, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 8


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4285, 1.4318, 0.6340, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4285, 1.4318, 0.6340, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 9


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4285, 1.4318, 0.6340, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4285, 1.4318, 0.6340, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 10


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4285, 1.4318, 0.6340, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4285, 1.4318, 0.6340, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 11


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4285, 1.4318, 0.6340, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4285, 1.4318, 0.6340, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 3

Layer 12


Tokens embeddings:
tensor([0.9895, 0.4242, 0.4285, 1.4318, 0.6340, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.4285, 1.4318, 0.6340, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01,  5.0221e-01,  6.6180e-01, -1.0000e+09,
         7.3820e-01,  1.9878e-01,  6.6097e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01,  5.0221e-01,  6.6180e-01, -1.0000e+09,
         7.3820e-01,  1.9878e-01,  6.6097e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 8

Layer 2


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01,  5.0221e-01,  6.6180e-01, -1.0000e+09,
         7.3820e-01,  1.9878e-01,  6.6097e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01,  5.0221e-01,  6.6180e-01, -1.0000e+09,
         7.3820e-01,  1.9878e-01,  6.6097e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 8

Layer 3


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01,  5.0221e-01,  6.6180e-01, -1.0000e+09,
         7.3820e-01,  1.9878e-01,  6.6097e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01,  5.0221e-01,  6.6180e-01, -1.0000e+09,
         7.3820e-01,  1.9878e-01,  6.6097e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 8

Layer 4


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01,  5.0221e-01,  6.6180e-01, -1.0000e+09,
         7.3820e-01,  1.9878e-01,  6.6097e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01,  5.0221e-01,  6.6180e-01, -1.0000e+09,
         7.3820e-01,  1.9878e-01,  6.6097e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 8

Layer 5


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01,  5.0221e-01,  6.6180e-01, -1.0000e+09,
         7.3820e-01,  1.9878e-01,  6.6097e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01,  5.0221e-01,  6.6180e-01, -1.0000e+09,
         7.3820e-01,  1.9878e-01,  6.6097e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 8

Layer 6


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01,  5.0221e-01,  6.6180e-01, -1.0000e+09,
         7.3820e-01,  1.9878e-01,  6.6097e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01,  5.0221e-01,  6.6180e-01, -1.0000e+09,
         7.3820e-01,  1.9878e-01,  6.6097e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 8

Layer 7


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01,  5.0221e-01,  6.6180e-01, -1.0000e+09,
         7.3820e-01,  1.9878e-01,  6.6097e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01,  5.0221e-01,  6.6180e-01, -1.0000e+09,
         7.3820e-01,  1.9878e-01,  6.6097e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 8

Layer 8


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01,  5.0221e-01,  6.6180e-01, -1.0000e+09,
         7.3820e-01,  1.9878e-01,  6.6097e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01,  5.0221e-01,  6.6180e-01, -1.0000e+09,
         7.3820e-01,  1.9878e-01,  6.6097e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 8

Layer 9


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01,  5.0221e-01,  6.6180e-01, -1.0000e+09,
         7.3820e-01,  1.9878e-01,  6.6097e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01,  5.0221e-01,  6.6180e-01, -1.0000e+09,
         7.3820e-01,  1.9878e-01,  6.6097e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 8

Layer 10


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01,  5.0221e-01,  6.6180e-01, -1.0000e+09,
         7.3820e-01,  1.9878e-01,  6.6097e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01,  5.0221e-01,  6.6180e-01, -1.0000e+09,
         7.3820e-01,  1.9878e-01,  6.6097e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 8

Layer 11


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01,  5.0221e-01,  6.6180e-01, -1.0000e+09,
         7.3820e-01,  1.9878e-01,  6.6097e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01,  5.0221e-01,  6.6180e-01, -1.0000e+09,
         7.3820e-01,  1.9878e-01,  6.6097e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 8

Layer 12


Tokens embeddings:
tensor([ 9.8949e-01,  3.6069e-01,  5.0221e-01,  6.6180e-01, -1.0000e+09,
         7.3820e-01,  1.9878e-01,  6.6097e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01,  3.6069e-01,  5.0221e-01,  6.6180e-01, -1.0000e+09,
         7.3820e-01,  1.9878e-01,  6.6097e-01,  1.1193e+00, -1.0000e+09,
         9.0877e-01,  9.0877e-01,  9.0877e-01,  9.0877e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 8
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.7840, 0.4242, 0.9716, 0.4195, 1.4649, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7840, 0.4242, 0.9716, 0.4195, 1.4649, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 2


Tokens embeddings:
tensor([0.9895, 0.7840, 0.4242, 0.9716, 0.4195, 1.4649, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7840, 0.4242, 0.9716, 0.4195, 1.4649, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 3


Tokens embeddings:
tensor([0.9895, 0.7840, 0.4242, 0.9716, 0.4195, 1.4649, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7840, 0.4242, 0.9716, 0.4195, 1.4649, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 4


Tokens embeddings:
tensor([0.9895, 0.7840, 0.4242, 0.9716, 0.4195, 1.4649, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7840, 0.4242, 0.9716, 0.4195, 1.4649, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 5


Tokens embeddings:
tensor([0.9895, 0.7840, 0.4242, 0.9716, 0.4195, 1.4649, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7840, 0.4242, 0.9716, 0.4195, 1.4649, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 6


Tokens embeddings:
tensor([0.9895, 0.7840, 0.4242, 0.9716, 0.4195, 1.4649, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7840, 0.4242, 0.9716, 0.4195, 1.4649, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 7


Tokens embeddings:
tensor([0.9895, 0.7840, 0.4242, 0.9716, 0.4195, 1.4649, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7840, 0.4242, 0.9716, 0.4195, 1.4649, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 8


Tokens embeddings:
tensor([0.9895, 0.7840, 0.4242, 0.9716, 0.4195, 1.4649, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7840, 0.4242, 0.9716, 0.4195, 1.4649, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 9


Tokens embeddings:
tensor([0.9895, 0.7840, 0.4242, 0.9716, 0.4195, 1.4649, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7840, 0.4242, 0.9716, 0.4195, 1.4649, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 10


Tokens embeddings:
tensor([0.9895, 0.7840, 0.4242, 0.9716, 0.4195, 1.4649, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7840, 0.4242, 0.9716, 0.4195, 1.4649, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 11


Tokens embeddings:
tensor([0.9895, 0.7840, 0.4242, 0.9716, 0.4195, 1.4649, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7840, 0.4242, 0.9716, 0.4195, 1.4649, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5

Layer 12


Tokens embeddings:
tensor([0.9895, 0.7840, 0.4242, 0.9716, 0.4195, 1.4649, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.7840, 0.4242, 0.9716, 0.4195, 1.4649, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.4242, 0.5501, 0.9279, 0.5349, 0.8888, 1.1698, 1.0884, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.5501, 0.9279, 0.5349, 0.8888, 1.1698, 1.0884, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 2


Tokens embeddings:
tensor([0.9895, 0.4242, 0.5501, 0.9279, 0.5349, 0.8888, 1.1698, 1.0884, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.5501, 0.9279, 0.5349, 0.8888, 1.1698, 1.0884, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 3


Tokens embeddings:
tensor([0.9895, 0.4242, 0.5501, 0.9279, 0.5349, 0.8888, 1.1698, 1.0884, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.5501, 0.9279, 0.5349, 0.8888, 1.1698, 1.0884, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 4


Tokens embeddings:
tensor([0.9895, 0.4242, 0.5501, 0.9279, 0.5349, 0.8888, 1.1698, 1.0884, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.5501, 0.9279, 0.5349, 0.8888, 1.1698, 1.0884, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 5


Tokens embeddings:
tensor([0.9895, 0.4242, 0.5501, 0.9279, 0.5349, 0.8888, 1.1698, 1.0884, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.5501, 0.9279, 0.5349, 0.8888, 1.1698, 1.0884, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 6


Tokens embeddings:
tensor([0.9895, 0.4242, 0.5501, 0.9279, 0.5349, 0.8888, 1.1698, 1.0884, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.5501, 0.9279, 0.5349, 0.8888, 1.1698, 1.0884, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 7


Tokens embeddings:
tensor([0.9895, 0.4242, 0.5501, 0.9279, 0.5349, 0.8888, 1.1698, 1.0884, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.5501, 0.9279, 0.5349, 0.8888, 1.1698, 1.0884, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 8


Tokens embeddings:
tensor([0.9895, 0.4242, 0.5501, 0.9279, 0.5349, 0.8888, 1.1698, 1.0884, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.5501, 0.9279, 0.5349, 0.8888, 1.1698, 1.0884, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 9


Tokens embeddings:
tensor([0.9895, 0.4242, 0.5501, 0.9279, 0.5349, 0.8888, 1.1698, 1.0884, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.5501, 0.9279, 0.5349, 0.8888, 1.1698, 1.0884, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 10


Tokens embeddings:
tensor([0.9895, 0.4242, 0.5501, 0.9279, 0.5349, 0.8888, 1.1698, 1.0884, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.5501, 0.9279, 0.5349, 0.8888, 1.1698, 1.0884, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 11


Tokens embeddings:
tensor([0.9895, 0.4242, 0.5501, 0.9279, 0.5349, 0.8888, 1.1698, 1.0884, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.5501, 0.9279, 0.5349, 0.8888, 1.1698, 1.0884, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 12


Tokens embeddings:
tensor([0.9895, 0.4242, 0.5501, 0.9279, 0.5349, 0.8888, 1.1698, 1.0884, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.5501, 0.9279, 0.5349, 0.8888, 1.1698, 1.0884, 1.1193,
        0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0438, 0.4729, 0.4600, 0.6960, 0.7020, 0.8282, 0.6586,
        1.4431, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0438, 0.4729, 0.4600, 0.6960, 0.7020, 0.8282, 0.6586,
        1.4431, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 2


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0438, 0.4729, 0.4600, 0.6960, 0.7020, 0.8282, 0.6586,
        1.4431, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0438, 0.4729, 0.4600, 0.6960, 0.7020, 0.8282, 0.6586,
        1.4431, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 3


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0438, 0.4729, 0.4600, 0.6960, 0.7020, 0.8282, 0.6586,
        1.4431, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0438, 0.4729, 0.4600, 0.6960, 0.7020, 0.8282, 0.6586,
        1.4431, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 4


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0438, 0.4729, 0.4600, 0.6960, 0.7020, 0.8282, 0.6586,
        1.4431, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0438, 0.4729, 0.4600, 0.6960, 0.7020, 0.8282, 0.6586,
        1.4431, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 5


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0438, 0.4729, 0.4600, 0.6960, 0.7020, 0.8282, 0.6586,
        1.4431, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0438, 0.4729, 0.4600, 0.6960, 0.7020, 0.8282, 0.6586,
        1.4431, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 6


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0438, 0.4729, 0.4600, 0.6960, 0.7020, 0.8282, 0.6586,
        1.4431, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0438, 0.4729, 0.4600, 0.6960, 0.7020, 0.8282, 0.6586,
        1.4431, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 7


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0438, 0.4729, 0.4600, 0.6960, 0.7020, 0.8282, 0.6586,
        1.4431, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0438, 0.4729, 0.4600, 0.6960, 0.7020, 0.8282, 0.6586,
        1.4431, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 8


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0438, 0.4729, 0.4600, 0.6960, 0.7020, 0.8282, 0.6586,
        1.4431, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0438, 0.4729, 0.4600, 0.6960, 0.7020, 0.8282, 0.6586,
        1.4431, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 9


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0438, 0.4729, 0.4600, 0.6960, 0.7020, 0.8282, 0.6586,
        1.4431, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0438, 0.4729, 0.4600, 0.6960, 0.7020, 0.8282, 0.6586,
        1.4431, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 10


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0438, 0.4729, 0.4600, 0.6960, 0.7020, 0.8282, 0.6586,
        1.4431, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0438, 0.4729, 0.4600, 0.6960, 0.7020, 0.8282, 0.6586,
        1.4431, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 11


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0438, 0.4729, 0.4600, 0.6960, 0.7020, 0.8282, 0.6586,
        1.4431, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0438, 0.4729, 0.4600, 0.6960, 0.7020, 0.8282, 0.6586,
        1.4431, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9

Layer 12


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0438, 0.4729, 0.4600, 0.6960, 0.7020, 0.8282, 0.6586,
        1.4431, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0438, 0.4729, 0.4600, 0.6960, 0.7020, 0.8282, 0.6586,
        1.4431, 1.1193, 0.9143, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 9
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9539, 0.6242, 0.7524, 0.7382, 0.6490, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9539, 0.6242, 0.7524, 0.7382, 0.6490, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 2


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9539, 0.6242, 0.7524, 0.7382, 0.6490, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9539, 0.6242, 0.7524, 0.7382, 0.6490, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 3


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9539, 0.6242, 0.7524, 0.7382, 0.6490, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9539, 0.6242, 0.7524, 0.7382, 0.6490, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 4


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9539, 0.6242, 0.7524, 0.7382, 0.6490, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9539, 0.6242, 0.7524, 0.7382, 0.6490, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 5


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9539, 0.6242, 0.7524, 0.7382, 0.6490, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9539, 0.6242, 0.7524, 0.7382, 0.6490, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 6


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9539, 0.6242, 0.7524, 0.7382, 0.6490, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9539, 0.6242, 0.7524, 0.7382, 0.6490, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 7


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9539, 0.6242, 0.7524, 0.7382, 0.6490, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9539, 0.6242, 0.7524, 0.7382, 0.6490, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 8


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9539, 0.6242, 0.7524, 0.7382, 0.6490, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9539, 0.6242, 0.7524, 0.7382, 0.6490, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 9


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9539, 0.6242, 0.7524, 0.7382, 0.6490, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9539, 0.6242, 0.7524, 0.7382, 0.6490, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 10


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9539, 0.6242, 0.7524, 0.7382, 0.6490, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9539, 0.6242, 0.7524, 0.7382, 0.6490, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 11


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9539, 0.6242, 0.7524, 0.7382, 0.6490, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9539, 0.6242, 0.7524, 0.7382, 0.6490, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7

Layer 12


Tokens embeddings:
tensor([0.9895, 0.4242, 0.9539, 0.6242, 0.7524, 0.7382, 0.6490, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 0.9539, 0.6242, 0.7524, 0.7382, 0.6490, 1.1193, 0.9143,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0809, 0.9401, 0.5378, 0.4242, 0.5461, 1.1698, 0.4242,
        0.8888, 0.5257, 0.4242, 0.8417, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0809, 0.9401, 0.5378, 0.4242, 0.5461, 1.1698, 0.4242,
        0.8888, 0.5257, 0.4242, 0.8417, 1.1193, 0.9143, 0.9088, 0.9088]): 7

Layer 2


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0809, 0.9401, 0.5378, 0.4242, 0.5461, 1.1698, 0.4242,
        0.8888, 0.5257, 0.4242, 0.8417, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0809, 0.9401, 0.5378, 0.4242, 0.5461, 1.1698, 0.4242,
        0.8888, 0.5257, 0.4242, 0.8417, 1.1193, 0.9143, 0.9088, 0.9088]): 7

Layer 3


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0809, 0.9401, 0.5378, 0.4242, 0.5461, 1.1698, 0.4242,
        0.8888, 0.5257, 0.4242, 0.8417, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0809, 0.9401, 0.5378, 0.4242, 0.5461, 1.1698, 0.4242,
        0.8888, 0.5257, 0.4242, 0.8417, 1.1193, 0.9143, 0.9088, 0.9088]): 7

Layer 4


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0809, 0.9401, 0.5378, 0.4242, 0.5461, 1.1698, 0.4242,
        0.8888, 0.5257, 0.4242, 0.8417, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0809, 0.9401, 0.5378, 0.4242, 0.5461, 1.1698, 0.4242,
        0.8888, 0.5257, 0.4242, 0.8417, 1.1193, 0.9143, 0.9088, 0.9088]): 7

Layer 5


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0809, 0.9401, 0.5378, 0.4242, 0.5461, 1.1698, 0.4242,
        0.8888, 0.5257, 0.4242, 0.8417, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0809, 0.9401, 0.5378, 0.4242, 0.5461, 1.1698, 0.4242,
        0.8888, 0.5257, 0.4242, 0.8417, 1.1193, 0.9143, 0.9088, 0.9088]): 7

Layer 6


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0809, 0.9401, 0.5378, 0.4242, 0.5461, 1.1698, 0.4242,
        0.8888, 0.5257, 0.4242, 0.8417, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0809, 0.9401, 0.5378, 0.4242, 0.5461, 1.1698, 0.4242,
        0.8888, 0.5257, 0.4242, 0.8417, 1.1193, 0.9143, 0.9088, 0.9088]): 7

Layer 7


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0809, 0.9401, 0.5378, 0.4242, 0.5461, 1.1698, 0.4242,
        0.8888, 0.5257, 0.4242, 0.8417, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0809, 0.9401, 0.5378, 0.4242, 0.5461, 1.1698, 0.4242,
        0.8888, 0.5257, 0.4242, 0.8417, 1.1193, 0.9143, 0.9088, 0.9088]): 7

Layer 8


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0809, 0.9401, 0.5378, 0.4242, 0.5461, 1.1698, 0.4242,
        0.8888, 0.5257, 0.4242, 0.8417, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0809, 0.9401, 0.5378, 0.4242, 0.5461, 1.1698, 0.4242,
        0.8888, 0.5257, 0.4242, 0.8417, 1.1193, 0.9143, 0.9088, 0.9088]): 7

Layer 9


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0809, 0.9401, 0.5378, 0.4242, 0.5461, 1.1698, 0.4242,
        0.8888, 0.5257, 0.4242, 0.8417, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0809, 0.9401, 0.5378, 0.4242, 0.5461, 1.1698, 0.4242,
        0.8888, 0.5257, 0.4242, 0.8417, 1.1193, 0.9143, 0.9088, 0.9088]): 7

Layer 10


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0809, 0.9401, 0.5378, 0.4242, 0.5461, 1.1698, 0.4242,
        0.8888, 0.5257, 0.4242, 0.8417, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0809, 0.9401, 0.5378, 0.4242, 0.5461, 1.1698, 0.4242,
        0.8888, 0.5257, 0.4242, 0.8417, 1.1193, 0.9143, 0.9088, 0.9088]): 7

Layer 11


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0809, 0.9401, 0.5378, 0.4242, 0.5461, 1.1698, 0.4242,
        0.8888, 0.5257, 0.4242, 0.8417, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0809, 0.9401, 0.5378, 0.4242, 0.5461, 1.1698, 0.4242,
        0.8888, 0.5257, 0.4242, 0.8417, 1.1193, 0.9143, 0.9088, 0.9088]): 7

Layer 12


Tokens embeddings:
tensor([0.9895, 0.4242, 1.0809, 0.9401, 0.5378, 0.4242, 0.5461, 1.1698, 0.4242,
        0.8888, 0.5257, 0.4242, 0.8417, 1.1193, 0.9143, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.4242, 1.0809, 0.9401, 0.5378, 0.4242, 0.5461, 1.1698, 0.4242,
        0.8888, 0.5257, 0.4242, 0.8417, 1.1193, 0.9143, 0.9088, 0.9088]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 1.4852, 0.9634, 0.5761, 0.8469, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.4852, 0.9634, 0.5761, 0.8469, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 1

Layer 2


Tokens embeddings:
tensor([0.9895, 1.4852, 0.9634, 0.5761, 0.8469, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.4852, 0.9634, 0.5761, 0.8469, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 1

Layer 3


Tokens embeddings:
tensor([0.9895, 1.4852, 0.9634, 0.5761, 0.8469, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.4852, 0.9634, 0.5761, 0.8469, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 1

Layer 4


Tokens embeddings:
tensor([0.9895, 1.4852, 0.9634, 0.5761, 0.8469, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.4852, 0.9634, 0.5761, 0.8469, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 1

Layer 5


Tokens embeddings:
tensor([0.9895, 1.4852, 0.9634, 0.5761, 0.8469, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.4852, 0.9634, 0.5761, 0.8469, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 1

Layer 6


Tokens embeddings:
tensor([0.9895, 1.4852, 0.9634, 0.5761, 0.8469, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.4852, 0.9634, 0.5761, 0.8469, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 1

Layer 7


Tokens embeddings:
tensor([0.9895, 1.4852, 0.9634, 0.5761, 0.8469, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.4852, 0.9634, 0.5761, 0.8469, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 1

Layer 8


Tokens embeddings:
tensor([0.9895, 1.4852, 0.9634, 0.5761, 0.8469, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.4852, 0.9634, 0.5761, 0.8469, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 1

Layer 9


Tokens embeddings:
tensor([0.9895, 1.4852, 0.9634, 0.5761, 0.8469, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.4852, 0.9634, 0.5761, 0.8469, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 1

Layer 10


Tokens embeddings:
tensor([0.9895, 1.4852, 0.9634, 0.5761, 0.8469, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.4852, 0.9634, 0.5761, 0.8469, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 1

Layer 11


Tokens embeddings:
tensor([0.9895, 1.4852, 0.9634, 0.5761, 0.8469, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.4852, 0.9634, 0.5761, 0.8469, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 1

Layer 12


Tokens embeddings:
tensor([0.9895, 1.4852, 0.9634, 0.5761, 0.8469, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 1.4852, 0.9634, 0.5761, 0.8469, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 1
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 9.8949e-01, -8.3499e-02, -1.0000e+09,  4.2417e-01, -1.8829e-01,
        -1.0000e+09,  7.7862e-01,  5.5381e-01,  9.5229e-01,  7.7862e-01,
        -2.6366e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -8.3499e-02, -1.0000e+09,  4.2417e-01, -1.8829e-01,
        -1.0000e+09,  7.7862e-01,  5.5381e-01,  9.5229e-01,  7.7862e-01,
        -2.6366e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 0

Layer 2


Tokens embeddings:
tensor([ 9.8949e-01, -8.3499e-02, -1.0000e+09,  4.2417e-01, -1.8829e-01,
        -1.0000e+09,  7.7862e-01,  5.5381e-01,  9.5229e-01,  7.7862e-01,
        -2.6366e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -8.3499e-02, -1.0000e+09,  4.2417e-01, -1.8829e-01,
        -1.0000e+09,  7.7862e-01,  5.5381e-01,  9.5229e-01,  7.7862e-01,
        -2.6366e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 0

Layer 3


Tokens embeddings:
tensor([ 9.8949e-01, -8.3499e-02, -1.0000e+09,  4.2417e-01, -1.8829e-01,
        -1.0000e+09,  7.7862e-01,  5.5381e-01,  9.5229e-01,  7.7862e-01,
        -2.6366e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -8.3499e-02, -1.0000e+09,  4.2417e-01, -1.8829e-01,
        -1.0000e+09,  7.7862e-01,  5.5381e-01,  9.5229e-01,  7.7862e-01,
        -2.6366e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 0

Layer 4


Tokens embeddings:
tensor([ 9.8949e-01, -8.3499e-02, -1.0000e+09,  4.2417e-01, -1.8829e-01,
        -1.0000e+09,  7.7862e-01,  5.5381e-01,  9.5229e-01,  7.7862e-01,
        -2.6366e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -8.3499e-02, -1.0000e+09,  4.2417e-01, -1.8829e-01,
        -1.0000e+09,  7.7862e-01,  5.5381e-01,  9.5229e-01,  7.7862e-01,
        -2.6366e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 0

Layer 5


Tokens embeddings:
tensor([ 9.8949e-01, -8.3499e-02, -1.0000e+09,  4.2417e-01, -1.8829e-01,
        -1.0000e+09,  7.7862e-01,  5.5381e-01,  9.5229e-01,  7.7862e-01,
        -2.6366e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -8.3499e-02, -1.0000e+09,  4.2417e-01, -1.8829e-01,
        -1.0000e+09,  7.7862e-01,  5.5381e-01,  9.5229e-01,  7.7862e-01,
        -2.6366e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 0

Layer 6


Tokens embeddings:
tensor([ 9.8949e-01, -8.3499e-02, -1.0000e+09,  4.2417e-01, -1.8829e-01,
        -1.0000e+09,  7.7862e-01,  5.5381e-01,  9.5229e-01,  7.7862e-01,
        -2.6366e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -8.3499e-02, -1.0000e+09,  4.2417e-01, -1.8829e-01,
        -1.0000e+09,  7.7862e-01,  5.5381e-01,  9.5229e-01,  7.7862e-01,
        -2.6366e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 0

Layer 7


Tokens embeddings:
tensor([ 9.8949e-01, -8.3499e-02, -1.0000e+09,  4.2417e-01, -1.8829e-01,
        -1.0000e+09,  7.7862e-01,  5.5381e-01,  9.5229e-01,  7.7862e-01,
        -2.6366e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -8.3499e-02, -1.0000e+09,  4.2417e-01, -1.8829e-01,
        -1.0000e+09,  7.7862e-01,  5.5381e-01,  9.5229e-01,  7.7862e-01,
        -2.6366e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 0

Layer 8


Tokens embeddings:
tensor([ 9.8949e-01, -8.3499e-02, -1.0000e+09,  4.2417e-01, -1.8829e-01,
        -1.0000e+09,  7.7862e-01,  5.5381e-01,  9.5229e-01,  7.7862e-01,
        -2.6366e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -8.3499e-02, -1.0000e+09,  4.2417e-01, -1.8829e-01,
        -1.0000e+09,  7.7862e-01,  5.5381e-01,  9.5229e-01,  7.7862e-01,
        -2.6366e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 0

Layer 9


Tokens embeddings:
tensor([ 9.8949e-01, -8.3499e-02, -1.0000e+09,  4.2417e-01, -1.8829e-01,
        -1.0000e+09,  7.7862e-01,  5.5381e-01,  9.5229e-01,  7.7862e-01,
        -2.6366e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -8.3499e-02, -1.0000e+09,  4.2417e-01, -1.8829e-01,
        -1.0000e+09,  7.7862e-01,  5.5381e-01,  9.5229e-01,  7.7862e-01,
        -2.6366e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 0

Layer 10


Tokens embeddings:
tensor([ 9.8949e-01, -8.3499e-02, -1.0000e+09,  4.2417e-01, -1.8829e-01,
        -1.0000e+09,  7.7862e-01,  5.5381e-01,  9.5229e-01,  7.7862e-01,
        -2.6366e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -8.3499e-02, -1.0000e+09,  4.2417e-01, -1.8829e-01,
        -1.0000e+09,  7.7862e-01,  5.5381e-01,  9.5229e-01,  7.7862e-01,
        -2.6366e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 0

Layer 11


Tokens embeddings:
tensor([ 9.8949e-01, -8.3499e-02, -1.0000e+09,  4.2417e-01, -1.8829e-01,
        -1.0000e+09,  7.7862e-01,  5.5381e-01,  9.5229e-01,  7.7862e-01,
        -2.6366e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -8.3499e-02, -1.0000e+09,  4.2417e-01, -1.8829e-01,
        -1.0000e+09,  7.7862e-01,  5.5381e-01,  9.5229e-01,  7.7862e-01,
        -2.6366e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 0

Layer 12


Tokens embeddings:
tensor([ 9.8949e-01, -8.3499e-02, -1.0000e+09,  4.2417e-01, -1.8829e-01,
        -1.0000e+09,  7.7862e-01,  5.5381e-01,  9.5229e-01,  7.7862e-01,
        -2.6366e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01])
arg max of tensor([ 9.8949e-01, -8.3499e-02, -1.0000e+09,  4.2417e-01, -1.8829e-01,
        -1.0000e+09,  7.7862e-01,  5.5381e-01,  9.5229e-01,  7.7862e-01,
        -2.6366e-01, -1.0000e+09,  9.1429e-01, -2.8433e-01, -1.0000e+09,
         9.0877e-01, -2.8433e-01]): 0
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.5880, 0.7268, 0.8913, 0.4327, 0.7878, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 0.7268, 0.8913, 0.4327, 0.7878, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 2


Tokens embeddings:
tensor([0.9895, 0.5880, 0.7268, 0.8913, 0.4327, 0.7878, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 0.7268, 0.8913, 0.4327, 0.7878, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 3


Tokens embeddings:
tensor([0.9895, 0.5880, 0.7268, 0.8913, 0.4327, 0.7878, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 0.7268, 0.8913, 0.4327, 0.7878, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 4


Tokens embeddings:
tensor([0.9895, 0.5880, 0.7268, 0.8913, 0.4327, 0.7878, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 0.7268, 0.8913, 0.4327, 0.7878, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 5


Tokens embeddings:
tensor([0.9895, 0.5880, 0.7268, 0.8913, 0.4327, 0.7878, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 0.7268, 0.8913, 0.4327, 0.7878, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 6


Tokens embeddings:
tensor([0.9895, 0.5880, 0.7268, 0.8913, 0.4327, 0.7878, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 0.7268, 0.8913, 0.4327, 0.7878, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 7


Tokens embeddings:
tensor([0.9895, 0.5880, 0.7268, 0.8913, 0.4327, 0.7878, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 0.7268, 0.8913, 0.4327, 0.7878, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 8


Tokens embeddings:
tensor([0.9895, 0.5880, 0.7268, 0.8913, 0.4327, 0.7878, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 0.7268, 0.8913, 0.4327, 0.7878, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 9


Tokens embeddings:
tensor([0.9895, 0.5880, 0.7268, 0.8913, 0.4327, 0.7878, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 0.7268, 0.8913, 0.4327, 0.7878, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 10


Tokens embeddings:
tensor([0.9895, 0.5880, 0.7268, 0.8913, 0.4327, 0.7878, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 0.7268, 0.8913, 0.4327, 0.7878, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 11


Tokens embeddings:
tensor([0.9895, 0.5880, 0.7268, 0.8913, 0.4327, 0.7878, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 0.7268, 0.8913, 0.4327, 0.7878, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6

Layer 12


Tokens embeddings:
tensor([0.9895, 0.5880, 0.7268, 0.8913, 0.4327, 0.7878, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 0.7268, 0.8913, 0.4327, 0.7878, 1.1193, 0.9143, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 6
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([0.9895, 0.5880, 1.1668, 0.7651, 0.8292, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 1.1668, 0.7651, 0.8292, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 2


Tokens embeddings:
tensor([0.9895, 0.5880, 1.1668, 0.7651, 0.8292, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 1.1668, 0.7651, 0.8292, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 3


Tokens embeddings:
tensor([0.9895, 0.5880, 1.1668, 0.7651, 0.8292, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 1.1668, 0.7651, 0.8292, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 4


Tokens embeddings:
tensor([0.9895, 0.5880, 1.1668, 0.7651, 0.8292, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 1.1668, 0.7651, 0.8292, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 5


Tokens embeddings:
tensor([0.9895, 0.5880, 1.1668, 0.7651, 0.8292, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 1.1668, 0.7651, 0.8292, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 6


Tokens embeddings:
tensor([0.9895, 0.5880, 1.1668, 0.7651, 0.8292, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 1.1668, 0.7651, 0.8292, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 7


Tokens embeddings:
tensor([0.9895, 0.5880, 1.1668, 0.7651, 0.8292, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 1.1668, 0.7651, 0.8292, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 8


Tokens embeddings:
tensor([0.9895, 0.5880, 1.1668, 0.7651, 0.8292, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 1.1668, 0.7651, 0.8292, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 9


Tokens embeddings:
tensor([0.9895, 0.5880, 1.1668, 0.7651, 0.8292, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 1.1668, 0.7651, 0.8292, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 10


Tokens embeddings:
tensor([0.9895, 0.5880, 1.1668, 0.7651, 0.8292, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 1.1668, 0.7651, 0.8292, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 11


Tokens embeddings:
tensor([0.9895, 0.5880, 1.1668, 0.7651, 0.8292, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 1.1668, 0.7651, 0.8292, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2

Layer 12


Tokens embeddings:
tensor([0.9895, 0.5880, 1.1668, 0.7651, 0.8292, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088])
arg max of tensor([0.9895, 0.5880, 1.1668, 0.7651, 0.8292, 1.1193, 0.9143, 0.9088, 0.9088,
        0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088, 0.9088]): 2


In [None]:
# Pooling first token [CLS] for each sentence - argmax
#
# For every selected test sentence this cell:
#   1. runs `model_e` once on the sentence's tokens and keeps column 0 of the
#      first model output (one value per tokenizer input),
#   2. for each of the 12 layers, renders the sentence coloured by the
#      attention the first ([CLS]) position pays to every token,
#   3. prints the pooled values and the index of their maximum.

# Fixed, contiguous slice of the test set (sentences 21..64) - not a random sample.
sent_index = list(range(21, 65))

for s in sent_index:
  print("*" * 100)
  # Get the sentence's words
  tokens = test_inputs[s]

  # The forward pass depends only on the sentence, not on the layer, so it is
  # run once here instead of once per layer (the per-layer results were identical).
  encoded_tokens = bert_tokenizer(tokens, padding=True, truncation=True, max_length=128, return_tensors='pt')
  encoded_tokens = encoded_tokens.to(device)
  with torch.no_grad():
    model_output1 = model_e(**encoded_tokens)
  # Column 0 of the first model output, moved to CPU for printing / numpy use.
  tokens_embeddings = model_output1[0][:,0].cpu()
  # np.argmax returns the first index of the maximum when there are ties.
  arg = int(np.argmax(tokens_embeddings.numpy()))

  # For each layer...
  for l in range(12):
    print("\nLayer", l+1)
    # Drop the leading batch axis of this layer's attention.
    # NOTE(review): assumes the result is (heads, seq, seq) - confirm against
    # how `test_attentions` was collected.
    attention = np.squeeze(test_attentions[s][l].detach().cpu().numpy(), axis=0)
    # Get the attention for the cls token.
    # The previous code read `head[0]`, but `head` is not defined in this cell
    # (its loop was commented out), so it either raised NameError or reused a
    # stale value from another cell. Average over heads to get one attention
    # row per layer instead.
    cls_attentions = attention.mean(axis=0)[0]
    display(HTML(colorize(tokens, cls_attentions)))
    print("Tokens embeddings:")
    print(tokens_embeddings)
    print('arg max of %s: %d' % (tokens_embeddings, arg))

****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.1545, -0.1858, -0.5636,  0.3010, -0.0819, -0.7962,  0.4195,
         0.0335,  0.0273, -0.5636, -0.1332,  0.4673, -0.0819, -0.6754, -0.5098,
        -0.3798])
arg max of tensor([-0.5361,  0.1545, -0.1858, -0.5636,  0.3010, -0.0819, -0.7962,  0.4195,
         0.0335,  0.0273, -0.5636, -0.1332,  0.4673, -0.0819, -0.6754, -0.5098,
        -0.3798]): 12

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.1545, -0.1858, -0.5636,  0.3010, -0.0819, -0.7962,  0.4195,
         0.0335,  0.0273, -0.5636, -0.1332,  0.4673, -0.0819, -0.6754, -0.5098,
        -0.3798])
arg max of tensor([-0.5361,  0.1545, -0.1858, -0.5636,  0.3010, -0.0819, -0.7962,  0.4195,
         0.0335,  0.0273, -0.5636, -0.1332,  0.4673, -0.0819, -0.6754, -0.5098,
        -0.3798]): 12

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.1545, -0.1858, -0.5636,  0.3010, -0.0819, -0.7962,  0.4195,
         0.0335,  0.0273, -0.5636, -0.1332,  0.4673, -0.0819, -0.6754, -0.5098,
        -0.3798])
arg max of tensor([-0.5361,  0.1545, -0.1858, -0.5636,  0.3010, -0.0819, -0.7962,  0.4195,
         0.0335,  0.0273, -0.5636, -0.1332,  0.4673, -0.0819, -0.6754, -0.5098,
        -0.3798]): 12

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.1545, -0.1858, -0.5636,  0.3010, -0.0819, -0.7962,  0.4195,
         0.0335,  0.0273, -0.5636, -0.1332,  0.4673, -0.0819, -0.6754, -0.5098,
        -0.3798])
arg max of tensor([-0.5361,  0.1545, -0.1858, -0.5636,  0.3010, -0.0819, -0.7962,  0.4195,
         0.0335,  0.0273, -0.5636, -0.1332,  0.4673, -0.0819, -0.6754, -0.5098,
        -0.3798]): 12

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.1545, -0.1858, -0.5636,  0.3010, -0.0819, -0.7962,  0.4195,
         0.0335,  0.0273, -0.5636, -0.1332,  0.4673, -0.0819, -0.6754, -0.5098,
        -0.3798])
arg max of tensor([-0.5361,  0.1545, -0.1858, -0.5636,  0.3010, -0.0819, -0.7962,  0.4195,
         0.0335,  0.0273, -0.5636, -0.1332,  0.4673, -0.0819, -0.6754, -0.5098,
        -0.3798]): 12

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.1545, -0.1858, -0.5636,  0.3010, -0.0819, -0.7962,  0.4195,
         0.0335,  0.0273, -0.5636, -0.1332,  0.4673, -0.0819, -0.6754, -0.5098,
        -0.3798])
arg max of tensor([-0.5361,  0.1545, -0.1858, -0.5636,  0.3010, -0.0819, -0.7962,  0.4195,
         0.0335,  0.0273, -0.5636, -0.1332,  0.4673, -0.0819, -0.6754, -0.5098,
        -0.3798]): 12

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.1545, -0.1858, -0.5636,  0.3010, -0.0819, -0.7962,  0.4195,
         0.0335,  0.0273, -0.5636, -0.1332,  0.4673, -0.0819, -0.6754, -0.5098,
        -0.3798])
arg max of tensor([-0.5361,  0.1545, -0.1858, -0.5636,  0.3010, -0.0819, -0.7962,  0.4195,
         0.0335,  0.0273, -0.5636, -0.1332,  0.4673, -0.0819, -0.6754, -0.5098,
        -0.3798]): 12

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.1545, -0.1858, -0.5636,  0.3010, -0.0819, -0.7962,  0.4195,
         0.0335,  0.0273, -0.5636, -0.1332,  0.4673, -0.0819, -0.6754, -0.5098,
        -0.3798])
arg max of tensor([-0.5361,  0.1545, -0.1858, -0.5636,  0.3010, -0.0819, -0.7962,  0.4195,
         0.0335,  0.0273, -0.5636, -0.1332,  0.4673, -0.0819, -0.6754, -0.5098,
        -0.3798]): 12

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.1545, -0.1858, -0.5636,  0.3010, -0.0819, -0.7962,  0.4195,
         0.0335,  0.0273, -0.5636, -0.1332,  0.4673, -0.0819, -0.6754, -0.5098,
        -0.3798])
arg max of tensor([-0.5361,  0.1545, -0.1858, -0.5636,  0.3010, -0.0819, -0.7962,  0.4195,
         0.0335,  0.0273, -0.5636, -0.1332,  0.4673, -0.0819, -0.6754, -0.5098,
        -0.3798]): 12

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.1545, -0.1858, -0.5636,  0.3010, -0.0819, -0.7962,  0.4195,
         0.0335,  0.0273, -0.5636, -0.1332,  0.4673, -0.0819, -0.6754, -0.5098,
        -0.3798])
arg max of tensor([-0.5361,  0.1545, -0.1858, -0.5636,  0.3010, -0.0819, -0.7962,  0.4195,
         0.0335,  0.0273, -0.5636, -0.1332,  0.4673, -0.0819, -0.6754, -0.5098,
        -0.3798]): 12

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.1545, -0.1858, -0.5636,  0.3010, -0.0819, -0.7962,  0.4195,
         0.0335,  0.0273, -0.5636, -0.1332,  0.4673, -0.0819, -0.6754, -0.5098,
        -0.3798])
arg max of tensor([-0.5361,  0.1545, -0.1858, -0.5636,  0.3010, -0.0819, -0.7962,  0.4195,
         0.0335,  0.0273, -0.5636, -0.1332,  0.4673, -0.0819, -0.6754, -0.5098,
        -0.3798]): 12

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.1545, -0.1858, -0.5636,  0.3010, -0.0819, -0.7962,  0.4195,
         0.0335,  0.0273, -0.5636, -0.1332,  0.4673, -0.0819, -0.6754, -0.5098,
        -0.3798])
arg max of tensor([-0.5361,  0.1545, -0.1858, -0.5636,  0.3010, -0.0819, -0.7962,  0.4195,
         0.0335,  0.0273, -0.5636, -0.1332,  0.4673, -0.0819, -0.6754, -0.5098,
        -0.3798]): 12
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1042, -0.0261,  0.3510,  0.3607, -0.4053, -0.1384,
         0.3607, -0.1574, -0.3641, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1042, -0.0261,  0.3510,  0.3607, -0.4053, -0.1384,
         0.3607, -0.1574, -0.3641, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1042, -0.0261,  0.3510,  0.3607, -0.4053, -0.1384,
         0.3607, -0.1574, -0.3641, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1042, -0.0261,  0.3510,  0.3607, -0.4053, -0.1384,
         0.3607, -0.1574, -0.3641, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1042, -0.0261,  0.3510,  0.3607, -0.4053, -0.1384,
         0.3607, -0.1574, -0.3641, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1042, -0.0261,  0.3510,  0.3607, -0.4053, -0.1384,
         0.3607, -0.1574, -0.3641, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1042, -0.0261,  0.3510,  0.3607, -0.4053, -0.1384,
         0.3607, -0.1574, -0.3641, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1042, -0.0261,  0.3510,  0.3607, -0.4053, -0.1384,
         0.3607, -0.1574, -0.3641, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1042, -0.0261,  0.3510,  0.3607, -0.4053, -0.1384,
         0.3607, -0.1574, -0.3641, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1042, -0.0261,  0.3510,  0.3607, -0.4053, -0.1384,
         0.3607, -0.1574, -0.3641, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1042, -0.0261,  0.3510,  0.3607, -0.4053, -0.1384,
         0.3607, -0.1574, -0.3641, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1042, -0.0261,  0.3510,  0.3607, -0.4053, -0.1384,
         0.3607, -0.1574, -0.3641, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1042, -0.0261,  0.3510,  0.3607, -0.4053, -0.1384,
         0.3607, -0.1574, -0.3641, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1042, -0.0261,  0.3510,  0.3607, -0.4053, -0.1384,
         0.3607, -0.1574, -0.3641, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1042, -0.0261,  0.3510,  0.3607, -0.4053, -0.1384,
         0.3607, -0.1574, -0.3641, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1042, -0.0261,  0.3510,  0.3607, -0.4053, -0.1384,
         0.3607, -0.1574, -0.3641, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1042, -0.0261,  0.3510,  0.3607, -0.4053, -0.1384,
         0.3607, -0.1574, -0.3641, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1042, -0.0261,  0.3510,  0.3607, -0.4053, -0.1384,
         0.3607, -0.1574, -0.3641, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1042, -0.0261,  0.3510,  0.3607, -0.4053, -0.1384,
         0.3607, -0.1574, -0.3641, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1042, -0.0261,  0.3510,  0.3607, -0.4053, -0.1384,
         0.3607, -0.1574, -0.3641, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1042, -0.0261,  0.3510,  0.3607, -0.4053, -0.1384,
         0.3607, -0.1574, -0.3641, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1042, -0.0261,  0.3510,  0.3607, -0.4053, -0.1384,
         0.3607, -0.1574, -0.3641, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1042, -0.0261,  0.3510,  0.3607, -0.4053, -0.1384,
         0.3607, -0.1574, -0.3641, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1042, -0.0261,  0.3510,  0.3607, -0.4053, -0.1384,
         0.3607, -0.1574, -0.3641, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361, -0.0631,  1.1603,  0.3607,  0.4241,  0.2081, -0.0819,  0.7548,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0631,  1.1603,  0.3607,  0.4241,  0.2081, -0.0819,  0.7548,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 2


Tokens embeddings:
tensor([-0.5361, -0.0631,  1.1603,  0.3607,  0.4241,  0.2081, -0.0819,  0.7548,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0631,  1.1603,  0.3607,  0.4241,  0.2081, -0.0819,  0.7548,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 3


Tokens embeddings:
tensor([-0.5361, -0.0631,  1.1603,  0.3607,  0.4241,  0.2081, -0.0819,  0.7548,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0631,  1.1603,  0.3607,  0.4241,  0.2081, -0.0819,  0.7548,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 4


Tokens embeddings:
tensor([-0.5361, -0.0631,  1.1603,  0.3607,  0.4241,  0.2081, -0.0819,  0.7548,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0631,  1.1603,  0.3607,  0.4241,  0.2081, -0.0819,  0.7548,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 5


Tokens embeddings:
tensor([-0.5361, -0.0631,  1.1603,  0.3607,  0.4241,  0.2081, -0.0819,  0.7548,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0631,  1.1603,  0.3607,  0.4241,  0.2081, -0.0819,  0.7548,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 6


Tokens embeddings:
tensor([-0.5361, -0.0631,  1.1603,  0.3607,  0.4241,  0.2081, -0.0819,  0.7548,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0631,  1.1603,  0.3607,  0.4241,  0.2081, -0.0819,  0.7548,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 7


Tokens embeddings:
tensor([-0.5361, -0.0631,  1.1603,  0.3607,  0.4241,  0.2081, -0.0819,  0.7548,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0631,  1.1603,  0.3607,  0.4241,  0.2081, -0.0819,  0.7548,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 8


Tokens embeddings:
tensor([-0.5361, -0.0631,  1.1603,  0.3607,  0.4241,  0.2081, -0.0819,  0.7548,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0631,  1.1603,  0.3607,  0.4241,  0.2081, -0.0819,  0.7548,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 9


Tokens embeddings:
tensor([-0.5361, -0.0631,  1.1603,  0.3607,  0.4241,  0.2081, -0.0819,  0.7548,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0631,  1.1603,  0.3607,  0.4241,  0.2081, -0.0819,  0.7548,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 10


Tokens embeddings:
tensor([-0.5361, -0.0631,  1.1603,  0.3607,  0.4241,  0.2081, -0.0819,  0.7548,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0631,  1.1603,  0.3607,  0.4241,  0.2081, -0.0819,  0.7548,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 11


Tokens embeddings:
tensor([-0.5361, -0.0631,  1.1603,  0.3607,  0.4241,  0.2081, -0.0819,  0.7548,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0631,  1.1603,  0.3607,  0.4241,  0.2081, -0.0819,  0.7548,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 12


Tokens embeddings:
tensor([-0.5361, -0.0631,  1.1603,  0.3607,  0.4241,  0.2081, -0.0819,  0.7548,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0631,  1.1603,  0.3607,  0.4241,  0.2081, -0.0819,  0.7548,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.7359, -0.5864,  0.3607, -0.3390, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.7359, -0.5864,  0.3607, -0.3390, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.7359, -0.5864,  0.3607, -0.3390, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.7359, -0.5864,  0.3607, -0.3390, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.7359, -0.5864,  0.3607, -0.3390, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.7359, -0.5864,  0.3607, -0.3390, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.7359, -0.5864,  0.3607, -0.3390, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.7359, -0.5864,  0.3607, -0.3390, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.7359, -0.5864,  0.3607, -0.3390, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.7359, -0.5864,  0.3607, -0.3390, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.7359, -0.5864,  0.3607, -0.3390, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.7359, -0.5864,  0.3607, -0.3390, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.7359, -0.5864,  0.3607, -0.3390, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.7359, -0.5864,  0.3607, -0.3390, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.7359, -0.5864,  0.3607, -0.3390, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.7359, -0.5864,  0.3607, -0.3390, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.7359, -0.5864,  0.3607, -0.3390, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.7359, -0.5864,  0.3607, -0.3390, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.7359, -0.5864,  0.3607, -0.3390, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.7359, -0.5864,  0.3607, -0.3390, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.7359, -0.5864,  0.3607, -0.3390, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.7359, -0.5864,  0.3607, -0.3390, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.7359, -0.5864,  0.3607, -0.3390, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.7359, -0.5864,  0.3607, -0.3390, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.0335, -0.6691, -1.0966,  0.1734,  0.7850, -0.2470,  0.3607,
        -0.4661,  0.0087,  0.4400,  0.0255, -0.5746, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0335, -0.6691, -1.0966,  0.1734,  0.7850, -0.2470,  0.3607,
        -0.4661,  0.0087,  0.4400,  0.0255, -0.5746, -0.5098, -0.3798, -0.2843,
        -0.2843]): 5

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.0335, -0.6691, -1.0966,  0.1734,  0.7850, -0.2470,  0.3607,
        -0.4661,  0.0087,  0.4400,  0.0255, -0.5746, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0335, -0.6691, -1.0966,  0.1734,  0.7850, -0.2470,  0.3607,
        -0.4661,  0.0087,  0.4400,  0.0255, -0.5746, -0.5098, -0.3798, -0.2843,
        -0.2843]): 5

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.0335, -0.6691, -1.0966,  0.1734,  0.7850, -0.2470,  0.3607,
        -0.4661,  0.0087,  0.4400,  0.0255, -0.5746, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0335, -0.6691, -1.0966,  0.1734,  0.7850, -0.2470,  0.3607,
        -0.4661,  0.0087,  0.4400,  0.0255, -0.5746, -0.5098, -0.3798, -0.2843,
        -0.2843]): 5

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.0335, -0.6691, -1.0966,  0.1734,  0.7850, -0.2470,  0.3607,
        -0.4661,  0.0087,  0.4400,  0.0255, -0.5746, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0335, -0.6691, -1.0966,  0.1734,  0.7850, -0.2470,  0.3607,
        -0.4661,  0.0087,  0.4400,  0.0255, -0.5746, -0.5098, -0.3798, -0.2843,
        -0.2843]): 5

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.0335, -0.6691, -1.0966,  0.1734,  0.7850, -0.2470,  0.3607,
        -0.4661,  0.0087,  0.4400,  0.0255, -0.5746, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0335, -0.6691, -1.0966,  0.1734,  0.7850, -0.2470,  0.3607,
        -0.4661,  0.0087,  0.4400,  0.0255, -0.5746, -0.5098, -0.3798, -0.2843,
        -0.2843]): 5

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.0335, -0.6691, -1.0966,  0.1734,  0.7850, -0.2470,  0.3607,
        -0.4661,  0.0087,  0.4400,  0.0255, -0.5746, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0335, -0.6691, -1.0966,  0.1734,  0.7850, -0.2470,  0.3607,
        -0.4661,  0.0087,  0.4400,  0.0255, -0.5746, -0.5098, -0.3798, -0.2843,
        -0.2843]): 5

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.0335, -0.6691, -1.0966,  0.1734,  0.7850, -0.2470,  0.3607,
        -0.4661,  0.0087,  0.4400,  0.0255, -0.5746, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0335, -0.6691, -1.0966,  0.1734,  0.7850, -0.2470,  0.3607,
        -0.4661,  0.0087,  0.4400,  0.0255, -0.5746, -0.5098, -0.3798, -0.2843,
        -0.2843]): 5

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.0335, -0.6691, -1.0966,  0.1734,  0.7850, -0.2470,  0.3607,
        -0.4661,  0.0087,  0.4400,  0.0255, -0.5746, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0335, -0.6691, -1.0966,  0.1734,  0.7850, -0.2470,  0.3607,
        -0.4661,  0.0087,  0.4400,  0.0255, -0.5746, -0.5098, -0.3798, -0.2843,
        -0.2843]): 5

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.0335, -0.6691, -1.0966,  0.1734,  0.7850, -0.2470,  0.3607,
        -0.4661,  0.0087,  0.4400,  0.0255, -0.5746, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0335, -0.6691, -1.0966,  0.1734,  0.7850, -0.2470,  0.3607,
        -0.4661,  0.0087,  0.4400,  0.0255, -0.5746, -0.5098, -0.3798, -0.2843,
        -0.2843]): 5

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.0335, -0.6691, -1.0966,  0.1734,  0.7850, -0.2470,  0.3607,
        -0.4661,  0.0087,  0.4400,  0.0255, -0.5746, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0335, -0.6691, -1.0966,  0.1734,  0.7850, -0.2470,  0.3607,
        -0.4661,  0.0087,  0.4400,  0.0255, -0.5746, -0.5098, -0.3798, -0.2843,
        -0.2843]): 5

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.0335, -0.6691, -1.0966,  0.1734,  0.7850, -0.2470,  0.3607,
        -0.4661,  0.0087,  0.4400,  0.0255, -0.5746, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0335, -0.6691, -1.0966,  0.1734,  0.7850, -0.2470,  0.3607,
        -0.4661,  0.0087,  0.4400,  0.0255, -0.5746, -0.5098, -0.3798, -0.2843,
        -0.2843]): 5

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.0335, -0.6691, -1.0966,  0.1734,  0.7850, -0.2470,  0.3607,
        -0.4661,  0.0087,  0.4400,  0.0255, -0.5746, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0335, -0.6691, -1.0966,  0.1734,  0.7850, -0.2470,  0.3607,
        -0.4661,  0.0087,  0.4400,  0.0255, -0.5746, -0.5098, -0.3798, -0.2843,
        -0.2843]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.2175,  0.2343,  0.4098,  0.4195,  0.9634, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.2175,  0.2343,  0.4098,  0.4195,  0.9634, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.2175,  0.2343,  0.4098,  0.4195,  0.9634, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.2175,  0.2343,  0.4098,  0.4195,  0.9634, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.2175,  0.2343,  0.4098,  0.4195,  0.9634, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.2175,  0.2343,  0.4098,  0.4195,  0.9634, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.2175,  0.2343,  0.4098,  0.4195,  0.9634, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.2175,  0.2343,  0.4098,  0.4195,  0.9634, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.2175,  0.2343,  0.4098,  0.4195,  0.9634, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.2175,  0.2343,  0.4098,  0.4195,  0.9634, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.2175,  0.2343,  0.4098,  0.4195,  0.9634, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.2175,  0.2343,  0.4098,  0.4195,  0.9634, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.2175,  0.2343,  0.4098,  0.4195,  0.9634, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.2175,  0.2343,  0.4098,  0.4195,  0.9634, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.2175,  0.2343,  0.4098,  0.4195,  0.9634, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.2175,  0.2343,  0.4098,  0.4195,  0.9634, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.2175,  0.2343,  0.4098,  0.4195,  0.9634, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.2175,  0.2343,  0.4098,  0.4195,  0.9634, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.2175,  0.2343,  0.4098,  0.4195,  0.9634, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.2175,  0.2343,  0.4098,  0.4195,  0.9634, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.2175,  0.2343,  0.4098,  0.4195,  0.9634, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.2175,  0.2343,  0.4098,  0.4195,  0.9634, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.2175,  0.2343,  0.4098,  0.4195,  0.9634, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.2175,  0.2343,  0.4098,  0.4195,  0.9634, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 6
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.7575,  0.3164,  0.3607,  0.0399,  1.0414, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.7575,  0.3164,  0.3607,  0.0399,  1.0414, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.7575,  0.3164,  0.3607,  0.0399,  1.0414, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.7575,  0.3164,  0.3607,  0.0399,  1.0414, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.7575,  0.3164,  0.3607,  0.0399,  1.0414, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.7575,  0.3164,  0.3607,  0.0399,  1.0414, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.7575,  0.3164,  0.3607,  0.0399,  1.0414, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.7575,  0.3164,  0.3607,  0.0399,  1.0414, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.7575,  0.3164,  0.3607,  0.0399,  1.0414, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.7575,  0.3164,  0.3607,  0.0399,  1.0414, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.7575,  0.3164,  0.3607,  0.0399,  1.0414, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.7575,  0.3164,  0.3607,  0.0399,  1.0414, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.7575,  0.3164,  0.3607,  0.0399,  1.0414, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.7575,  0.3164,  0.3607,  0.0399,  1.0414, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.7575,  0.3164,  0.3607,  0.0399,  1.0414, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.7575,  0.3164,  0.3607,  0.0399,  1.0414, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.7575,  0.3164,  0.3607,  0.0399,  1.0414, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.7575,  0.3164,  0.3607,  0.0399,  1.0414, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.7575,  0.3164,  0.3607,  0.0399,  1.0414, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.7575,  0.3164,  0.3607,  0.0399,  1.0414, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.7575,  0.3164,  0.3607,  0.0399,  1.0414, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.7575,  0.3164,  0.3607,  0.0399,  1.0414, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.7575,  0.3164,  0.3607,  0.0399,  1.0414, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.7575,  0.3164,  0.3607,  0.0399,  1.0414, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.9634,  1.2222,  0.1439,  0.4185,  0.3607,  1.0414, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.9634,  1.2222,  0.1439,  0.4185,  0.3607,  1.0414, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.9634,  1.2222,  0.1439,  0.4185,  0.3607,  1.0414, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.9634,  1.2222,  0.1439,  0.4185,  0.3607,  1.0414, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.9634,  1.2222,  0.1439,  0.4185,  0.3607,  1.0414, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.9634,  1.2222,  0.1439,  0.4185,  0.3607,  1.0414, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.9634,  1.2222,  0.1439,  0.4185,  0.3607,  1.0414, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.9634,  1.2222,  0.1439,  0.4185,  0.3607,  1.0414, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.9634,  1.2222,  0.1439,  0.4185,  0.3607,  1.0414, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.9634,  1.2222,  0.1439,  0.4185,  0.3607,  1.0414, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.9634,  1.2222,  0.1439,  0.4185,  0.3607,  1.0414, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.9634,  1.2222,  0.1439,  0.4185,  0.3607,  1.0414, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.9634,  1.2222,  0.1439,  0.4185,  0.3607,  1.0414, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.9634,  1.2222,  0.1439,  0.4185,  0.3607,  1.0414, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.9634,  1.2222,  0.1439,  0.4185,  0.3607,  1.0414, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.9634,  1.2222,  0.1439,  0.4185,  0.3607,  1.0414, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.9634,  1.2222,  0.1439,  0.4185,  0.3607,  1.0414, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.9634,  1.2222,  0.1439,  0.4185,  0.3607,  1.0414, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.9634,  1.2222,  0.1439,  0.4185,  0.3607,  1.0414, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.9634,  1.2222,  0.1439,  0.4185,  0.3607,  1.0414, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.9634,  1.2222,  0.1439,  0.4185,  0.3607,  1.0414, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.9634,  1.2222,  0.1439,  0.4185,  0.3607,  1.0414, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.9634,  1.2222,  0.1439,  0.4185,  0.3607,  1.0414, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.9634,  1.2222,  0.1439,  0.4185,  0.3607,  1.0414, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.3443, -0.9971,  0.0536, -0.2271,  0.2221, -0.7487,
        -0.8948,  0.1545,  0.4004, -0.0733, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.3443, -0.9971,  0.0536, -0.2271,  0.2221, -0.7487,
        -0.8948,  0.1545,  0.4004, -0.0733, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 10

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.3443, -0.9971,  0.0536, -0.2271,  0.2221, -0.7487,
        -0.8948,  0.1545,  0.4004, -0.0733, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.3443, -0.9971,  0.0536, -0.2271,  0.2221, -0.7487,
        -0.8948,  0.1545,  0.4004, -0.0733, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 10

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.3443, -0.9971,  0.0536, -0.2271,  0.2221, -0.7487,
        -0.8948,  0.1545,  0.4004, -0.0733, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.3443, -0.9971,  0.0536, -0.2271,  0.2221, -0.7487,
        -0.8948,  0.1545,  0.4004, -0.0733, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 10

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.3443, -0.9971,  0.0536, -0.2271,  0.2221, -0.7487,
        -0.8948,  0.1545,  0.4004, -0.0733, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.3443, -0.9971,  0.0536, -0.2271,  0.2221, -0.7487,
        -0.8948,  0.1545,  0.4004, -0.0733, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 10

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.3443, -0.9971,  0.0536, -0.2271,  0.2221, -0.7487,
        -0.8948,  0.1545,  0.4004, -0.0733, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.3443, -0.9971,  0.0536, -0.2271,  0.2221, -0.7487,
        -0.8948,  0.1545,  0.4004, -0.0733, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 10

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.3443, -0.9971,  0.0536, -0.2271,  0.2221, -0.7487,
        -0.8948,  0.1545,  0.4004, -0.0733, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.3443, -0.9971,  0.0536, -0.2271,  0.2221, -0.7487,
        -0.8948,  0.1545,  0.4004, -0.0733, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 10

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.3443, -0.9971,  0.0536, -0.2271,  0.2221, -0.7487,
        -0.8948,  0.1545,  0.4004, -0.0733, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.3443, -0.9971,  0.0536, -0.2271,  0.2221, -0.7487,
        -0.8948,  0.1545,  0.4004, -0.0733, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 10

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.3443, -0.9971,  0.0536, -0.2271,  0.2221, -0.7487,
        -0.8948,  0.1545,  0.4004, -0.0733, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.3443, -0.9971,  0.0536, -0.2271,  0.2221, -0.7487,
        -0.8948,  0.1545,  0.4004, -0.0733, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 10

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.3443, -0.9971,  0.0536, -0.2271,  0.2221, -0.7487,
        -0.8948,  0.1545,  0.4004, -0.0733, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.3443, -0.9971,  0.0536, -0.2271,  0.2221, -0.7487,
        -0.8948,  0.1545,  0.4004, -0.0733, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 10

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.3443, -0.9971,  0.0536, -0.2271,  0.2221, -0.7487,
        -0.8948,  0.1545,  0.4004, -0.0733, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.3443, -0.9971,  0.0536, -0.2271,  0.2221, -0.7487,
        -0.8948,  0.1545,  0.4004, -0.0733, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 10

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.3443, -0.9971,  0.0536, -0.2271,  0.2221, -0.7487,
        -0.8948,  0.1545,  0.4004, -0.0733, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.3443, -0.9971,  0.0536, -0.2271,  0.2221, -0.7487,
        -0.8948,  0.1545,  0.4004, -0.0733, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 10

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.1545,  0.3443, -0.9971,  0.0536, -0.2271,  0.2221, -0.7487,
        -0.8948,  0.1545,  0.4004, -0.0733, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.1545,  0.3443, -0.9971,  0.0536, -0.2271,  0.2221, -0.7487,
        -0.8948,  0.1545,  0.4004, -0.0733, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 10
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361, -0.0811,  0.1299,  0.1314,  1.0410, -0.2421,  0.7845, -0.5391,
        -0.7203, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0811,  0.1299,  0.1314,  1.0410, -0.2421,  0.7845, -0.5391,
        -0.7203, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 2


Tokens embeddings:
tensor([-0.5361, -0.0811,  0.1299,  0.1314,  1.0410, -0.2421,  0.7845, -0.5391,
        -0.7203, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0811,  0.1299,  0.1314,  1.0410, -0.2421,  0.7845, -0.5391,
        -0.7203, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 3


Tokens embeddings:
tensor([-0.5361, -0.0811,  0.1299,  0.1314,  1.0410, -0.2421,  0.7845, -0.5391,
        -0.7203, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0811,  0.1299,  0.1314,  1.0410, -0.2421,  0.7845, -0.5391,
        -0.7203, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 4


Tokens embeddings:
tensor([-0.5361, -0.0811,  0.1299,  0.1314,  1.0410, -0.2421,  0.7845, -0.5391,
        -0.7203, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0811,  0.1299,  0.1314,  1.0410, -0.2421,  0.7845, -0.5391,
        -0.7203, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 5


Tokens embeddings:
tensor([-0.5361, -0.0811,  0.1299,  0.1314,  1.0410, -0.2421,  0.7845, -0.5391,
        -0.7203, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0811,  0.1299,  0.1314,  1.0410, -0.2421,  0.7845, -0.5391,
        -0.7203, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 6


Tokens embeddings:
tensor([-0.5361, -0.0811,  0.1299,  0.1314,  1.0410, -0.2421,  0.7845, -0.5391,
        -0.7203, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0811,  0.1299,  0.1314,  1.0410, -0.2421,  0.7845, -0.5391,
        -0.7203, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 7


Tokens embeddings:
tensor([-0.5361, -0.0811,  0.1299,  0.1314,  1.0410, -0.2421,  0.7845, -0.5391,
        -0.7203, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0811,  0.1299,  0.1314,  1.0410, -0.2421,  0.7845, -0.5391,
        -0.7203, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 8


Tokens embeddings:
tensor([-0.5361, -0.0811,  0.1299,  0.1314,  1.0410, -0.2421,  0.7845, -0.5391,
        -0.7203, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0811,  0.1299,  0.1314,  1.0410, -0.2421,  0.7845, -0.5391,
        -0.7203, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 9


Tokens embeddings:
tensor([-0.5361, -0.0811,  0.1299,  0.1314,  1.0410, -0.2421,  0.7845, -0.5391,
        -0.7203, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0811,  0.1299,  0.1314,  1.0410, -0.2421,  0.7845, -0.5391,
        -0.7203, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 10


Tokens embeddings:
tensor([-0.5361, -0.0811,  0.1299,  0.1314,  1.0410, -0.2421,  0.7845, -0.5391,
        -0.7203, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0811,  0.1299,  0.1314,  1.0410, -0.2421,  0.7845, -0.5391,
        -0.7203, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 11


Tokens embeddings:
tensor([-0.5361, -0.0811,  0.1299,  0.1314,  1.0410, -0.2421,  0.7845, -0.5391,
        -0.7203, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0811,  0.1299,  0.1314,  1.0410, -0.2421,  0.7845, -0.5391,
        -0.7203, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 12


Tokens embeddings:
tensor([-0.5361, -0.0811,  0.1299,  0.1314,  1.0410, -0.2421,  0.7845, -0.5391,
        -0.7203, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0811,  0.1299,  0.1314,  1.0410, -0.2421,  0.7845, -0.5391,
        -0.7203, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.0832,  1.1603,  0.3607,  0.7868,  0.3981, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  1.1603,  0.3607,  0.7868,  0.3981, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.0832,  1.1603,  0.3607,  0.7868,  0.3981, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  1.1603,  0.3607,  0.7868,  0.3981, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.0832,  1.1603,  0.3607,  0.7868,  0.3981, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  1.1603,  0.3607,  0.7868,  0.3981, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.0832,  1.1603,  0.3607,  0.7868,  0.3981, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  1.1603,  0.3607,  0.7868,  0.3981, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.0832,  1.1603,  0.3607,  0.7868,  0.3981, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  1.1603,  0.3607,  0.7868,  0.3981, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.0832,  1.1603,  0.3607,  0.7868,  0.3981, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  1.1603,  0.3607,  0.7868,  0.3981, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.0832,  1.1603,  0.3607,  0.7868,  0.3981, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  1.1603,  0.3607,  0.7868,  0.3981, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.0832,  1.1603,  0.3607,  0.7868,  0.3981, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  1.1603,  0.3607,  0.7868,  0.3981, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.0832,  1.1603,  0.3607,  0.7868,  0.3981, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  1.1603,  0.3607,  0.7868,  0.3981, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.0832,  1.1603,  0.3607,  0.7868,  0.3981, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  1.1603,  0.3607,  0.7868,  0.3981, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.0832,  1.1603,  0.3607,  0.7868,  0.3981, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  1.1603,  0.3607,  0.7868,  0.3981, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.0832,  1.1603,  0.3607,  0.7868,  0.3981, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  1.1603,  0.3607,  0.7868,  0.3981, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361, -0.0445,  0.6859,  0.3121,  0.8185, -0.4200, -0.5636, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445,  0.6859,  0.3121,  0.8185, -0.4200, -0.5636, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 2


Tokens embeddings:
tensor([-0.5361, -0.0445,  0.6859,  0.3121,  0.8185, -0.4200, -0.5636, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445,  0.6859,  0.3121,  0.8185, -0.4200, -0.5636, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 3


Tokens embeddings:
tensor([-0.5361, -0.0445,  0.6859,  0.3121,  0.8185, -0.4200, -0.5636, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445,  0.6859,  0.3121,  0.8185, -0.4200, -0.5636, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 4


Tokens embeddings:
tensor([-0.5361, -0.0445,  0.6859,  0.3121,  0.8185, -0.4200, -0.5636, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445,  0.6859,  0.3121,  0.8185, -0.4200, -0.5636, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 5


Tokens embeddings:
tensor([-0.5361, -0.0445,  0.6859,  0.3121,  0.8185, -0.4200, -0.5636, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445,  0.6859,  0.3121,  0.8185, -0.4200, -0.5636, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 6


Tokens embeddings:
tensor([-0.5361, -0.0445,  0.6859,  0.3121,  0.8185, -0.4200, -0.5636, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445,  0.6859,  0.3121,  0.8185, -0.4200, -0.5636, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 7


Tokens embeddings:
tensor([-0.5361, -0.0445,  0.6859,  0.3121,  0.8185, -0.4200, -0.5636, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445,  0.6859,  0.3121,  0.8185, -0.4200, -0.5636, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 8


Tokens embeddings:
tensor([-0.5361, -0.0445,  0.6859,  0.3121,  0.8185, -0.4200, -0.5636, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445,  0.6859,  0.3121,  0.8185, -0.4200, -0.5636, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 9


Tokens embeddings:
tensor([-0.5361, -0.0445,  0.6859,  0.3121,  0.8185, -0.4200, -0.5636, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445,  0.6859,  0.3121,  0.8185, -0.4200, -0.5636, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 10


Tokens embeddings:
tensor([-0.5361, -0.0445,  0.6859,  0.3121,  0.8185, -0.4200, -0.5636, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445,  0.6859,  0.3121,  0.8185, -0.4200, -0.5636, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 11


Tokens embeddings:
tensor([-0.5361, -0.0445,  0.6859,  0.3121,  0.8185, -0.4200, -0.5636, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445,  0.6859,  0.3121,  0.8185, -0.4200, -0.5636, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 12


Tokens embeddings:
tensor([-0.5361, -0.0445,  0.6859,  0.3121,  0.8185, -0.4200, -0.5636, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445,  0.6859,  0.3121,  0.8185, -0.4200, -0.5636, -0.1818,
         0.1982, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.0832,  0.9608,  0.0335,  0.4195,  0.5037, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  0.9608,  0.0335,  0.4195,  0.5037, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.0832,  0.9608,  0.0335,  0.4195,  0.5037, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  0.9608,  0.0335,  0.4195,  0.5037, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.0832,  0.9608,  0.0335,  0.4195,  0.5037, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  0.9608,  0.0335,  0.4195,  0.5037, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.0832,  0.9608,  0.0335,  0.4195,  0.5037, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  0.9608,  0.0335,  0.4195,  0.5037, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.0832,  0.9608,  0.0335,  0.4195,  0.5037, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  0.9608,  0.0335,  0.4195,  0.5037, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.0832,  0.9608,  0.0335,  0.4195,  0.5037, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  0.9608,  0.0335,  0.4195,  0.5037, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.0832,  0.9608,  0.0335,  0.4195,  0.5037, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  0.9608,  0.0335,  0.4195,  0.5037, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.0832,  0.9608,  0.0335,  0.4195,  0.5037, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  0.9608,  0.0335,  0.4195,  0.5037, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.0832,  0.9608,  0.0335,  0.4195,  0.5037, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  0.9608,  0.0335,  0.4195,  0.5037, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.0832,  0.9608,  0.0335,  0.4195,  0.5037, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  0.9608,  0.0335,  0.4195,  0.5037, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.0832,  0.9608,  0.0335,  0.4195,  0.5037, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  0.9608,  0.0335,  0.4195,  0.5037, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.0832,  0.9608,  0.0335,  0.4195,  0.5037, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0832,  0.9608,  0.0335,  0.4195,  0.5037, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361, -0.2240,  0.2844,  0.2053, -0.5636,  0.5850, -0.8948, -0.1407,
        -0.6433, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.2240,  0.2844,  0.2053, -0.5636,  0.5850, -0.8948, -0.1407,
        -0.6433, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 2


Tokens embeddings:
tensor([-0.5361, -0.2240,  0.2844,  0.2053, -0.5636,  0.5850, -0.8948, -0.1407,
        -0.6433, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.2240,  0.2844,  0.2053, -0.5636,  0.5850, -0.8948, -0.1407,
        -0.6433, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 3


Tokens embeddings:
tensor([-0.5361, -0.2240,  0.2844,  0.2053, -0.5636,  0.5850, -0.8948, -0.1407,
        -0.6433, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.2240,  0.2844,  0.2053, -0.5636,  0.5850, -0.8948, -0.1407,
        -0.6433, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 4


Tokens embeddings:
tensor([-0.5361, -0.2240,  0.2844,  0.2053, -0.5636,  0.5850, -0.8948, -0.1407,
        -0.6433, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.2240,  0.2844,  0.2053, -0.5636,  0.5850, -0.8948, -0.1407,
        -0.6433, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 5


Tokens embeddings:
tensor([-0.5361, -0.2240,  0.2844,  0.2053, -0.5636,  0.5850, -0.8948, -0.1407,
        -0.6433, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.2240,  0.2844,  0.2053, -0.5636,  0.5850, -0.8948, -0.1407,
        -0.6433, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 6


Tokens embeddings:
tensor([-0.5361, -0.2240,  0.2844,  0.2053, -0.5636,  0.5850, -0.8948, -0.1407,
        -0.6433, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.2240,  0.2844,  0.2053, -0.5636,  0.5850, -0.8948, -0.1407,
        -0.6433, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 7


Tokens embeddings:
tensor([-0.5361, -0.2240,  0.2844,  0.2053, -0.5636,  0.5850, -0.8948, -0.1407,
        -0.6433, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.2240,  0.2844,  0.2053, -0.5636,  0.5850, -0.8948, -0.1407,
        -0.6433, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 8


Tokens embeddings:
tensor([-0.5361, -0.2240,  0.2844,  0.2053, -0.5636,  0.5850, -0.8948, -0.1407,
        -0.6433, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.2240,  0.2844,  0.2053, -0.5636,  0.5850, -0.8948, -0.1407,
        -0.6433, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 9


Tokens embeddings:
tensor([-0.5361, -0.2240,  0.2844,  0.2053, -0.5636,  0.5850, -0.8948, -0.1407,
        -0.6433, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.2240,  0.2844,  0.2053, -0.5636,  0.5850, -0.8948, -0.1407,
        -0.6433, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 10


Tokens embeddings:
tensor([-0.5361, -0.2240,  0.2844,  0.2053, -0.5636,  0.5850, -0.8948, -0.1407,
        -0.6433, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.2240,  0.2844,  0.2053, -0.5636,  0.5850, -0.8948, -0.1407,
        -0.6433, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 11


Tokens embeddings:
tensor([-0.5361, -0.2240,  0.2844,  0.2053, -0.5636,  0.5850, -0.8948, -0.1407,
        -0.6433, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.2240,  0.2844,  0.2053, -0.5636,  0.5850, -0.8948, -0.1407,
        -0.6433, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 12


Tokens embeddings:
tensor([-0.5361, -0.2240,  0.2844,  0.2053, -0.5636,  0.5850, -0.8948, -0.1407,
        -0.6433, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.2240,  0.2844,  0.2053, -0.5636,  0.5850, -0.8948, -0.1407,
        -0.6433, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.3532,  0.3246, -0.1048,  0.1870, -0.7730,  0.2081,
         0.3607,  0.0514,  0.4400,  0.0255,  0.9583, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.3532,  0.3246, -0.1048,  0.1870, -0.7730,  0.2081,
         0.3607,  0.0514,  0.4400,  0.0255,  0.9583, -0.5098, -0.3798, -0.2843,
        -0.2843]): 12

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.3532,  0.3246, -0.1048,  0.1870, -0.7730,  0.2081,
         0.3607,  0.0514,  0.4400,  0.0255,  0.9583, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.3532,  0.3246, -0.1048,  0.1870, -0.7730,  0.2081,
         0.3607,  0.0514,  0.4400,  0.0255,  0.9583, -0.5098, -0.3798, -0.2843,
        -0.2843]): 12

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.3532,  0.3246, -0.1048,  0.1870, -0.7730,  0.2081,
         0.3607,  0.0514,  0.4400,  0.0255,  0.9583, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.3532,  0.3246, -0.1048,  0.1870, -0.7730,  0.2081,
         0.3607,  0.0514,  0.4400,  0.0255,  0.9583, -0.5098, -0.3798, -0.2843,
        -0.2843]): 12

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.3532,  0.3246, -0.1048,  0.1870, -0.7730,  0.2081,
         0.3607,  0.0514,  0.4400,  0.0255,  0.9583, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.3532,  0.3246, -0.1048,  0.1870, -0.7730,  0.2081,
         0.3607,  0.0514,  0.4400,  0.0255,  0.9583, -0.5098, -0.3798, -0.2843,
        -0.2843]): 12

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.3532,  0.3246, -0.1048,  0.1870, -0.7730,  0.2081,
         0.3607,  0.0514,  0.4400,  0.0255,  0.9583, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.3532,  0.3246, -0.1048,  0.1870, -0.7730,  0.2081,
         0.3607,  0.0514,  0.4400,  0.0255,  0.9583, -0.5098, -0.3798, -0.2843,
        -0.2843]): 12

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.3532,  0.3246, -0.1048,  0.1870, -0.7730,  0.2081,
         0.3607,  0.0514,  0.4400,  0.0255,  0.9583, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.3532,  0.3246, -0.1048,  0.1870, -0.7730,  0.2081,
         0.3607,  0.0514,  0.4400,  0.0255,  0.9583, -0.5098, -0.3798, -0.2843,
        -0.2843]): 12

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.3532,  0.3246, -0.1048,  0.1870, -0.7730,  0.2081,
         0.3607,  0.0514,  0.4400,  0.0255,  0.9583, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.3532,  0.3246, -0.1048,  0.1870, -0.7730,  0.2081,
         0.3607,  0.0514,  0.4400,  0.0255,  0.9583, -0.5098, -0.3798, -0.2843,
        -0.2843]): 12

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.3532,  0.3246, -0.1048,  0.1870, -0.7730,  0.2081,
         0.3607,  0.0514,  0.4400,  0.0255,  0.9583, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.3532,  0.3246, -0.1048,  0.1870, -0.7730,  0.2081,
         0.3607,  0.0514,  0.4400,  0.0255,  0.9583, -0.5098, -0.3798, -0.2843,
        -0.2843]): 12

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.3532,  0.3246, -0.1048,  0.1870, -0.7730,  0.2081,
         0.3607,  0.0514,  0.4400,  0.0255,  0.9583, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.3532,  0.3246, -0.1048,  0.1870, -0.7730,  0.2081,
         0.3607,  0.0514,  0.4400,  0.0255,  0.9583, -0.5098, -0.3798, -0.2843,
        -0.2843]): 12

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.3532,  0.3246, -0.1048,  0.1870, -0.7730,  0.2081,
         0.3607,  0.0514,  0.4400,  0.0255,  0.9583, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.3532,  0.3246, -0.1048,  0.1870, -0.7730,  0.2081,
         0.3607,  0.0514,  0.4400,  0.0255,  0.9583, -0.5098, -0.3798, -0.2843,
        -0.2843]): 12

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.3532,  0.3246, -0.1048,  0.1870, -0.7730,  0.2081,
         0.3607,  0.0514,  0.4400,  0.0255,  0.9583, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.3532,  0.3246, -0.1048,  0.1870, -0.7730,  0.2081,
         0.3607,  0.0514,  0.4400,  0.0255,  0.9583, -0.5098, -0.3798, -0.2843,
        -0.2843]): 12

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.3532,  0.3246, -0.1048,  0.1870, -0.7730,  0.2081,
         0.3607,  0.0514,  0.4400,  0.0255,  0.9583, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.3532,  0.3246, -0.1048,  0.1870, -0.7730,  0.2081,
         0.3607,  0.0514,  0.4400,  0.0255,  0.9583, -0.5098, -0.3798, -0.2843,
        -0.2843]): 12
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361, -0.1746,  0.5931,  0.4895, -0.1481, -0.1481,  0.2844,  0.3443,
        -0.2835, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1746,  0.5931,  0.4895, -0.1481, -0.1481,  0.2844,  0.3443,
        -0.2835, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 2


Tokens embeddings:
tensor([-0.5361, -0.1746,  0.5931,  0.4895, -0.1481, -0.1481,  0.2844,  0.3443,
        -0.2835, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1746,  0.5931,  0.4895, -0.1481, -0.1481,  0.2844,  0.3443,
        -0.2835, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 3


Tokens embeddings:
tensor([-0.5361, -0.1746,  0.5931,  0.4895, -0.1481, -0.1481,  0.2844,  0.3443,
        -0.2835, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1746,  0.5931,  0.4895, -0.1481, -0.1481,  0.2844,  0.3443,
        -0.2835, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 4


Tokens embeddings:
tensor([-0.5361, -0.1746,  0.5931,  0.4895, -0.1481, -0.1481,  0.2844,  0.3443,
        -0.2835, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1746,  0.5931,  0.4895, -0.1481, -0.1481,  0.2844,  0.3443,
        -0.2835, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 5


Tokens embeddings:
tensor([-0.5361, -0.1746,  0.5931,  0.4895, -0.1481, -0.1481,  0.2844,  0.3443,
        -0.2835, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1746,  0.5931,  0.4895, -0.1481, -0.1481,  0.2844,  0.3443,
        -0.2835, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 6


Tokens embeddings:
tensor([-0.5361, -0.1746,  0.5931,  0.4895, -0.1481, -0.1481,  0.2844,  0.3443,
        -0.2835, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1746,  0.5931,  0.4895, -0.1481, -0.1481,  0.2844,  0.3443,
        -0.2835, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 7


Tokens embeddings:
tensor([-0.5361, -0.1746,  0.5931,  0.4895, -0.1481, -0.1481,  0.2844,  0.3443,
        -0.2835, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1746,  0.5931,  0.4895, -0.1481, -0.1481,  0.2844,  0.3443,
        -0.2835, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 8


Tokens embeddings:
tensor([-0.5361, -0.1746,  0.5931,  0.4895, -0.1481, -0.1481,  0.2844,  0.3443,
        -0.2835, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1746,  0.5931,  0.4895, -0.1481, -0.1481,  0.2844,  0.3443,
        -0.2835, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 9


Tokens embeddings:
tensor([-0.5361, -0.1746,  0.5931,  0.4895, -0.1481, -0.1481,  0.2844,  0.3443,
        -0.2835, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1746,  0.5931,  0.4895, -0.1481, -0.1481,  0.2844,  0.3443,
        -0.2835, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 10


Tokens embeddings:
tensor([-0.5361, -0.1746,  0.5931,  0.4895, -0.1481, -0.1481,  0.2844,  0.3443,
        -0.2835, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1746,  0.5931,  0.4895, -0.1481, -0.1481,  0.2844,  0.3443,
        -0.2835, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 11


Tokens embeddings:
tensor([-0.5361, -0.1746,  0.5931,  0.4895, -0.1481, -0.1481,  0.2844,  0.3443,
        -0.2835, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1746,  0.5931,  0.4895, -0.1481, -0.1481,  0.2844,  0.3443,
        -0.2835, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 12


Tokens embeddings:
tensor([-0.5361, -0.1746,  0.5931,  0.4895, -0.1481, -0.1481,  0.2844,  0.3443,
        -0.2835, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1746,  0.5931,  0.4895, -0.1481, -0.1481,  0.2844,  0.3443,
        -0.2835, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.0380,  0.4767, -0.5698, -0.3229,  0.3607, -0.4403,  0.3548,
        -0.2576, -0.1472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  0.4767, -0.5698, -0.3229,  0.3607, -0.4403,  0.3548,
        -0.2576, -0.1472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.0380,  0.4767, -0.5698, -0.3229,  0.3607, -0.4403,  0.3548,
        -0.2576, -0.1472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  0.4767, -0.5698, -0.3229,  0.3607, -0.4403,  0.3548,
        -0.2576, -0.1472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.0380,  0.4767, -0.5698, -0.3229,  0.3607, -0.4403,  0.3548,
        -0.2576, -0.1472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  0.4767, -0.5698, -0.3229,  0.3607, -0.4403,  0.3548,
        -0.2576, -0.1472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.0380,  0.4767, -0.5698, -0.3229,  0.3607, -0.4403,  0.3548,
        -0.2576, -0.1472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  0.4767, -0.5698, -0.3229,  0.3607, -0.4403,  0.3548,
        -0.2576, -0.1472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.0380,  0.4767, -0.5698, -0.3229,  0.3607, -0.4403,  0.3548,
        -0.2576, -0.1472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  0.4767, -0.5698, -0.3229,  0.3607, -0.4403,  0.3548,
        -0.2576, -0.1472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.0380,  0.4767, -0.5698, -0.3229,  0.3607, -0.4403,  0.3548,
        -0.2576, -0.1472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  0.4767, -0.5698, -0.3229,  0.3607, -0.4403,  0.3548,
        -0.2576, -0.1472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.0380,  0.4767, -0.5698, -0.3229,  0.3607, -0.4403,  0.3548,
        -0.2576, -0.1472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  0.4767, -0.5698, -0.3229,  0.3607, -0.4403,  0.3548,
        -0.2576, -0.1472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.0380,  0.4767, -0.5698, -0.3229,  0.3607, -0.4403,  0.3548,
        -0.2576, -0.1472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  0.4767, -0.5698, -0.3229,  0.3607, -0.4403,  0.3548,
        -0.2576, -0.1472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.0380,  0.4767, -0.5698, -0.3229,  0.3607, -0.4403,  0.3548,
        -0.2576, -0.1472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  0.4767, -0.5698, -0.3229,  0.3607, -0.4403,  0.3548,
        -0.2576, -0.1472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.0380,  0.4767, -0.5698, -0.3229,  0.3607, -0.4403,  0.3548,
        -0.2576, -0.1472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  0.4767, -0.5698, -0.3229,  0.3607, -0.4403,  0.3548,
        -0.2576, -0.1472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.0380,  0.4767, -0.5698, -0.3229,  0.3607, -0.4403,  0.3548,
        -0.2576, -0.1472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  0.4767, -0.5698, -0.3229,  0.3607, -0.4403,  0.3548,
        -0.2576, -0.1472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.0380,  0.4767, -0.5698, -0.3229,  0.3607, -0.4403,  0.3548,
        -0.2576, -0.1472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  0.4767, -0.5698, -0.3229,  0.3607, -0.4403,  0.3548,
        -0.2576, -0.1472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.2899,  0.3246,  0.1175,  0.2238, -0.2508, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.2899,  0.3246,  0.1175,  0.2238, -0.2508, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.2899,  0.3246,  0.1175,  0.2238, -0.2508, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.2899,  0.3246,  0.1175,  0.2238, -0.2508, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.2899,  0.3246,  0.1175,  0.2238, -0.2508, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.2899,  0.3246,  0.1175,  0.2238, -0.2508, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.2899,  0.3246,  0.1175,  0.2238, -0.2508, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.2899,  0.3246,  0.1175,  0.2238, -0.2508, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.2899,  0.3246,  0.1175,  0.2238, -0.2508, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.2899,  0.3246,  0.1175,  0.2238, -0.2508, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.2899,  0.3246,  0.1175,  0.2238, -0.2508, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.2899,  0.3246,  0.1175,  0.2238, -0.2508, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.2899,  0.3246,  0.1175,  0.2238, -0.2508, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.2899,  0.3246,  0.1175,  0.2238, -0.2508, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.2899,  0.3246,  0.1175,  0.2238, -0.2508, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.2899,  0.3246,  0.1175,  0.2238, -0.2508, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.2899,  0.3246,  0.1175,  0.2238, -0.2508, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.2899,  0.3246,  0.1175,  0.2238, -0.2508, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.2899,  0.3246,  0.1175,  0.2238, -0.2508, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.2899,  0.3246,  0.1175,  0.2238, -0.2508, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.2899,  0.3246,  0.1175,  0.2238, -0.2508, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.2899,  0.3246,  0.1175,  0.2238, -0.2508, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.2899,  0.3246,  0.1175,  0.2238, -0.2508, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.2899,  0.3246,  0.1175,  0.2238, -0.2508, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6971,  0.2054,  0.3607, -0.7445,  0.2305,  0.3607,
        -0.3585, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6971,  0.2054,  0.3607, -0.7445,  0.2305,  0.3607,
        -0.3585, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6971,  0.2054,  0.3607, -0.7445,  0.2305,  0.3607,
        -0.3585, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6971,  0.2054,  0.3607, -0.7445,  0.2305,  0.3607,
        -0.3585, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6971,  0.2054,  0.3607, -0.7445,  0.2305,  0.3607,
        -0.3585, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6971,  0.2054,  0.3607, -0.7445,  0.2305,  0.3607,
        -0.3585, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6971,  0.2054,  0.3607, -0.7445,  0.2305,  0.3607,
        -0.3585, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6971,  0.2054,  0.3607, -0.7445,  0.2305,  0.3607,
        -0.3585, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6971,  0.2054,  0.3607, -0.7445,  0.2305,  0.3607,
        -0.3585, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6971,  0.2054,  0.3607, -0.7445,  0.2305,  0.3607,
        -0.3585, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6971,  0.2054,  0.3607, -0.7445,  0.2305,  0.3607,
        -0.3585, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6971,  0.2054,  0.3607, -0.7445,  0.2305,  0.3607,
        -0.3585, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6971,  0.2054,  0.3607, -0.7445,  0.2305,  0.3607,
        -0.3585, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6971,  0.2054,  0.3607, -0.7445,  0.2305,  0.3607,
        -0.3585, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6971,  0.2054,  0.3607, -0.7445,  0.2305,  0.3607,
        -0.3585, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6971,  0.2054,  0.3607, -0.7445,  0.2305,  0.3607,
        -0.3585, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6971,  0.2054,  0.3607, -0.7445,  0.2305,  0.3607,
        -0.3585, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6971,  0.2054,  0.3607, -0.7445,  0.2305,  0.3607,
        -0.3585, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6971,  0.2054,  0.3607, -0.7445,  0.2305,  0.3607,
        -0.3585, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6971,  0.2054,  0.3607, -0.7445,  0.2305,  0.3607,
        -0.3585, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6971,  0.2054,  0.3607, -0.7445,  0.2305,  0.3607,
        -0.3585, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6971,  0.2054,  0.3607, -0.7445,  0.2305,  0.3607,
        -0.3585, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.6971,  0.2054,  0.3607, -0.7445,  0.2305,  0.3607,
        -0.3585, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.6971,  0.2054,  0.3607, -0.7445,  0.2305,  0.3607,
        -0.3585, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361, -1.0807, -0.6689,  0.2305,  0.3607, -0.8200, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -1.0807, -0.6689,  0.2305,  0.3607, -0.8200, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 2


Tokens embeddings:
tensor([-0.5361, -1.0807, -0.6689,  0.2305,  0.3607, -0.8200, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -1.0807, -0.6689,  0.2305,  0.3607, -0.8200, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 3


Tokens embeddings:
tensor([-0.5361, -1.0807, -0.6689,  0.2305,  0.3607, -0.8200, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -1.0807, -0.6689,  0.2305,  0.3607, -0.8200, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 4


Tokens embeddings:
tensor([-0.5361, -1.0807, -0.6689,  0.2305,  0.3607, -0.8200, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -1.0807, -0.6689,  0.2305,  0.3607, -0.8200, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 5


Tokens embeddings:
tensor([-0.5361, -1.0807, -0.6689,  0.2305,  0.3607, -0.8200, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -1.0807, -0.6689,  0.2305,  0.3607, -0.8200, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 6


Tokens embeddings:
tensor([-0.5361, -1.0807, -0.6689,  0.2305,  0.3607, -0.8200, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -1.0807, -0.6689,  0.2305,  0.3607, -0.8200, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 7


Tokens embeddings:
tensor([-0.5361, -1.0807, -0.6689,  0.2305,  0.3607, -0.8200, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -1.0807, -0.6689,  0.2305,  0.3607, -0.8200, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 8


Tokens embeddings:
tensor([-0.5361, -1.0807, -0.6689,  0.2305,  0.3607, -0.8200, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -1.0807, -0.6689,  0.2305,  0.3607, -0.8200, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 9


Tokens embeddings:
tensor([-0.5361, -1.0807, -0.6689,  0.2305,  0.3607, -0.8200, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -1.0807, -0.6689,  0.2305,  0.3607, -0.8200, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 10


Tokens embeddings:
tensor([-0.5361, -1.0807, -0.6689,  0.2305,  0.3607, -0.8200, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -1.0807, -0.6689,  0.2305,  0.3607, -0.8200, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 11


Tokens embeddings:
tensor([-0.5361, -1.0807, -0.6689,  0.2305,  0.3607, -0.8200, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -1.0807, -0.6689,  0.2305,  0.3607, -0.8200, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 12


Tokens embeddings:
tensor([-0.5361, -1.0807, -0.6689,  0.2305,  0.3607, -0.8200, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -1.0807, -0.6689,  0.2305,  0.3607, -0.8200, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361, -0.1626, -0.0089,  0.4400,  0.0255, -0.0787, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1626, -0.0089,  0.4400,  0.0255, -0.0787, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 2


Tokens embeddings:
tensor([-0.5361, -0.1626, -0.0089,  0.4400,  0.0255, -0.0787, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1626, -0.0089,  0.4400,  0.0255, -0.0787, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 3


Tokens embeddings:
tensor([-0.5361, -0.1626, -0.0089,  0.4400,  0.0255, -0.0787, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1626, -0.0089,  0.4400,  0.0255, -0.0787, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 4


Tokens embeddings:
tensor([-0.5361, -0.1626, -0.0089,  0.4400,  0.0255, -0.0787, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1626, -0.0089,  0.4400,  0.0255, -0.0787, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 5


Tokens embeddings:
tensor([-0.5361, -0.1626, -0.0089,  0.4400,  0.0255, -0.0787, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1626, -0.0089,  0.4400,  0.0255, -0.0787, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 6


Tokens embeddings:
tensor([-0.5361, -0.1626, -0.0089,  0.4400,  0.0255, -0.0787, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1626, -0.0089,  0.4400,  0.0255, -0.0787, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 7


Tokens embeddings:
tensor([-0.5361, -0.1626, -0.0089,  0.4400,  0.0255, -0.0787, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1626, -0.0089,  0.4400,  0.0255, -0.0787, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 8


Tokens embeddings:
tensor([-0.5361, -0.1626, -0.0089,  0.4400,  0.0255, -0.0787, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1626, -0.0089,  0.4400,  0.0255, -0.0787, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 9


Tokens embeddings:
tensor([-0.5361, -0.1626, -0.0089,  0.4400,  0.0255, -0.0787, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1626, -0.0089,  0.4400,  0.0255, -0.0787, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 10


Tokens embeddings:
tensor([-0.5361, -0.1626, -0.0089,  0.4400,  0.0255, -0.0787, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1626, -0.0089,  0.4400,  0.0255, -0.0787, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 11


Tokens embeddings:
tensor([-0.5361, -0.1626, -0.0089,  0.4400,  0.0255, -0.0787, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1626, -0.0089,  0.4400,  0.0255, -0.0787, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 12


Tokens embeddings:
tensor([-0.5361, -0.1626, -0.0089,  0.4400,  0.0255, -0.0787, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1626, -0.0089,  0.4400,  0.0255, -0.0787, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361, -0.0421, -0.1713, -0.3200, -0.8156, -0.4200,  0.2305,  0.3607,
         0.1163, -0.5593, -0.0102, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0421, -0.1713, -0.3200, -0.8156, -0.4200,  0.2305,  0.3607,
         0.1163, -0.5593, -0.0102, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 7

Layer 2


Tokens embeddings:
tensor([-0.5361, -0.0421, -0.1713, -0.3200, -0.8156, -0.4200,  0.2305,  0.3607,
         0.1163, -0.5593, -0.0102, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0421, -0.1713, -0.3200, -0.8156, -0.4200,  0.2305,  0.3607,
         0.1163, -0.5593, -0.0102, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 7

Layer 3


Tokens embeddings:
tensor([-0.5361, -0.0421, -0.1713, -0.3200, -0.8156, -0.4200,  0.2305,  0.3607,
         0.1163, -0.5593, -0.0102, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0421, -0.1713, -0.3200, -0.8156, -0.4200,  0.2305,  0.3607,
         0.1163, -0.5593, -0.0102, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 7

Layer 4


Tokens embeddings:
tensor([-0.5361, -0.0421, -0.1713, -0.3200, -0.8156, -0.4200,  0.2305,  0.3607,
         0.1163, -0.5593, -0.0102, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0421, -0.1713, -0.3200, -0.8156, -0.4200,  0.2305,  0.3607,
         0.1163, -0.5593, -0.0102, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 7

Layer 5


Tokens embeddings:
tensor([-0.5361, -0.0421, -0.1713, -0.3200, -0.8156, -0.4200,  0.2305,  0.3607,
         0.1163, -0.5593, -0.0102, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0421, -0.1713, -0.3200, -0.8156, -0.4200,  0.2305,  0.3607,
         0.1163, -0.5593, -0.0102, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 7

Layer 6


Tokens embeddings:
tensor([-0.5361, -0.0421, -0.1713, -0.3200, -0.8156, -0.4200,  0.2305,  0.3607,
         0.1163, -0.5593, -0.0102, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0421, -0.1713, -0.3200, -0.8156, -0.4200,  0.2305,  0.3607,
         0.1163, -0.5593, -0.0102, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 7

Layer 7


Tokens embeddings:
tensor([-0.5361, -0.0421, -0.1713, -0.3200, -0.8156, -0.4200,  0.2305,  0.3607,
         0.1163, -0.5593, -0.0102, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0421, -0.1713, -0.3200, -0.8156, -0.4200,  0.2305,  0.3607,
         0.1163, -0.5593, -0.0102, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 7

Layer 8


Tokens embeddings:
tensor([-0.5361, -0.0421, -0.1713, -0.3200, -0.8156, -0.4200,  0.2305,  0.3607,
         0.1163, -0.5593, -0.0102, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0421, -0.1713, -0.3200, -0.8156, -0.4200,  0.2305,  0.3607,
         0.1163, -0.5593, -0.0102, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 7

Layer 9


Tokens embeddings:
tensor([-0.5361, -0.0421, -0.1713, -0.3200, -0.8156, -0.4200,  0.2305,  0.3607,
         0.1163, -0.5593, -0.0102, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0421, -0.1713, -0.3200, -0.8156, -0.4200,  0.2305,  0.3607,
         0.1163, -0.5593, -0.0102, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 7

Layer 10


Tokens embeddings:
tensor([-0.5361, -0.0421, -0.1713, -0.3200, -0.8156, -0.4200,  0.2305,  0.3607,
         0.1163, -0.5593, -0.0102, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0421, -0.1713, -0.3200, -0.8156, -0.4200,  0.2305,  0.3607,
         0.1163, -0.5593, -0.0102, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 7

Layer 11


Tokens embeddings:
tensor([-0.5361, -0.0421, -0.1713, -0.3200, -0.8156, -0.4200,  0.2305,  0.3607,
         0.1163, -0.5593, -0.0102, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0421, -0.1713, -0.3200, -0.8156, -0.4200,  0.2305,  0.3607,
         0.1163, -0.5593, -0.0102, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 7

Layer 12


Tokens embeddings:
tensor([-0.5361, -0.0421, -0.1713, -0.3200, -0.8156, -0.4200,  0.2305,  0.3607,
         0.1163, -0.5593, -0.0102, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0421, -0.1713, -0.3200, -0.8156, -0.4200,  0.2305,  0.3607,
         0.1163, -0.5593, -0.0102, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361, -0.4726, -0.1012,  0.9347,  0.0726, -0.1823, -0.1012,  0.1276,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.4726, -0.1012,  0.9347,  0.0726, -0.1823, -0.1012,  0.1276,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 2


Tokens embeddings:
tensor([-0.5361, -0.4726, -0.1012,  0.9347,  0.0726, -0.1823, -0.1012,  0.1276,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.4726, -0.1012,  0.9347,  0.0726, -0.1823, -0.1012,  0.1276,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 3


Tokens embeddings:
tensor([-0.5361, -0.4726, -0.1012,  0.9347,  0.0726, -0.1823, -0.1012,  0.1276,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.4726, -0.1012,  0.9347,  0.0726, -0.1823, -0.1012,  0.1276,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 4


Tokens embeddings:
tensor([-0.5361, -0.4726, -0.1012,  0.9347,  0.0726, -0.1823, -0.1012,  0.1276,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.4726, -0.1012,  0.9347,  0.0726, -0.1823, -0.1012,  0.1276,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 5


Tokens embeddings:
tensor([-0.5361, -0.4726, -0.1012,  0.9347,  0.0726, -0.1823, -0.1012,  0.1276,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.4726, -0.1012,  0.9347,  0.0726, -0.1823, -0.1012,  0.1276,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 6


Tokens embeddings:
tensor([-0.5361, -0.4726, -0.1012,  0.9347,  0.0726, -0.1823, -0.1012,  0.1276,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.4726, -0.1012,  0.9347,  0.0726, -0.1823, -0.1012,  0.1276,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 7


Tokens embeddings:
tensor([-0.5361, -0.4726, -0.1012,  0.9347,  0.0726, -0.1823, -0.1012,  0.1276,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.4726, -0.1012,  0.9347,  0.0726, -0.1823, -0.1012,  0.1276,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 8


Tokens embeddings:
tensor([-0.5361, -0.4726, -0.1012,  0.9347,  0.0726, -0.1823, -0.1012,  0.1276,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.4726, -0.1012,  0.9347,  0.0726, -0.1823, -0.1012,  0.1276,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 9


Tokens embeddings:
tensor([-0.5361, -0.4726, -0.1012,  0.9347,  0.0726, -0.1823, -0.1012,  0.1276,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.4726, -0.1012,  0.9347,  0.0726, -0.1823, -0.1012,  0.1276,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 10


Tokens embeddings:
tensor([-0.5361, -0.4726, -0.1012,  0.9347,  0.0726, -0.1823, -0.1012,  0.1276,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.4726, -0.1012,  0.9347,  0.0726, -0.1823, -0.1012,  0.1276,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 11


Tokens embeddings:
tensor([-0.5361, -0.4726, -0.1012,  0.9347,  0.0726, -0.1823, -0.1012,  0.1276,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.4726, -0.1012,  0.9347,  0.0726, -0.1823, -0.1012,  0.1276,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 12


Tokens embeddings:
tensor([-0.5361, -0.4726, -0.1012,  0.9347,  0.0726, -0.1823, -0.1012,  0.1276,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.4726, -0.1012,  0.9347,  0.0726, -0.1823, -0.1012,  0.1276,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.2622, -0.9558,  0.1819, -0.9064, -0.0541, -0.8429,  0.3548,
        -0.3993,  0.3607, -0.4610, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622, -0.9558,  0.1819, -0.9064, -0.0541, -0.8429,  0.3548,
        -0.3993,  0.3607, -0.4610, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.2622, -0.9558,  0.1819, -0.9064, -0.0541, -0.8429,  0.3548,
        -0.3993,  0.3607, -0.4610, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622, -0.9558,  0.1819, -0.9064, -0.0541, -0.8429,  0.3548,
        -0.3993,  0.3607, -0.4610, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.2622, -0.9558,  0.1819, -0.9064, -0.0541, -0.8429,  0.3548,
        -0.3993,  0.3607, -0.4610, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622, -0.9558,  0.1819, -0.9064, -0.0541, -0.8429,  0.3548,
        -0.3993,  0.3607, -0.4610, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.2622, -0.9558,  0.1819, -0.9064, -0.0541, -0.8429,  0.3548,
        -0.3993,  0.3607, -0.4610, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622, -0.9558,  0.1819, -0.9064, -0.0541, -0.8429,  0.3548,
        -0.3993,  0.3607, -0.4610, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.2622, -0.9558,  0.1819, -0.9064, -0.0541, -0.8429,  0.3548,
        -0.3993,  0.3607, -0.4610, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622, -0.9558,  0.1819, -0.9064, -0.0541, -0.8429,  0.3548,
        -0.3993,  0.3607, -0.4610, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.2622, -0.9558,  0.1819, -0.9064, -0.0541, -0.8429,  0.3548,
        -0.3993,  0.3607, -0.4610, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622, -0.9558,  0.1819, -0.9064, -0.0541, -0.8429,  0.3548,
        -0.3993,  0.3607, -0.4610, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.2622, -0.9558,  0.1819, -0.9064, -0.0541, -0.8429,  0.3548,
        -0.3993,  0.3607, -0.4610, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622, -0.9558,  0.1819, -0.9064, -0.0541, -0.8429,  0.3548,
        -0.3993,  0.3607, -0.4610, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.2622, -0.9558,  0.1819, -0.9064, -0.0541, -0.8429,  0.3548,
        -0.3993,  0.3607, -0.4610, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622, -0.9558,  0.1819, -0.9064, -0.0541, -0.8429,  0.3548,
        -0.3993,  0.3607, -0.4610, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.2622, -0.9558,  0.1819, -0.9064, -0.0541, -0.8429,  0.3548,
        -0.3993,  0.3607, -0.4610, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622, -0.9558,  0.1819, -0.9064, -0.0541, -0.8429,  0.3548,
        -0.3993,  0.3607, -0.4610, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.2622, -0.9558,  0.1819, -0.9064, -0.0541, -0.8429,  0.3548,
        -0.3993,  0.3607, -0.4610, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622, -0.9558,  0.1819, -0.9064, -0.0541, -0.8429,  0.3548,
        -0.3993,  0.3607, -0.4610, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.2622, -0.9558,  0.1819, -0.9064, -0.0541, -0.8429,  0.3548,
        -0.3993,  0.3607, -0.4610, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622, -0.9558,  0.1819, -0.9064, -0.0541, -0.8429,  0.3548,
        -0.3993,  0.3607, -0.4610, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.2622, -0.9558,  0.1819, -0.9064, -0.0541, -0.8429,  0.3548,
        -0.3993,  0.3607, -0.4610, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622, -0.9558,  0.1819, -0.9064, -0.0541, -0.8429,  0.3548,
        -0.3993,  0.3607, -0.4610, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 9
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.4726, -0.5987, -0.0104,  0.2209, -0.3229,  0.3607,
        -0.5500, -0.7279, -0.5987, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.4726, -0.5987, -0.0104,  0.2209, -0.3229,  0.3607,
        -0.5500, -0.7279, -0.5987, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.4726, -0.5987, -0.0104,  0.2209, -0.3229,  0.3607,
        -0.5500, -0.7279, -0.5987, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.4726, -0.5987, -0.0104,  0.2209, -0.3229,  0.3607,
        -0.5500, -0.7279, -0.5987, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.4726, -0.5987, -0.0104,  0.2209, -0.3229,  0.3607,
        -0.5500, -0.7279, -0.5987, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.4726, -0.5987, -0.0104,  0.2209, -0.3229,  0.3607,
        -0.5500, -0.7279, -0.5987, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.4726, -0.5987, -0.0104,  0.2209, -0.3229,  0.3607,
        -0.5500, -0.7279, -0.5987, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.4726, -0.5987, -0.0104,  0.2209, -0.3229,  0.3607,
        -0.5500, -0.7279, -0.5987, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.4726, -0.5987, -0.0104,  0.2209, -0.3229,  0.3607,
        -0.5500, -0.7279, -0.5987, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.4726, -0.5987, -0.0104,  0.2209, -0.3229,  0.3607,
        -0.5500, -0.7279, -0.5987, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.4726, -0.5987, -0.0104,  0.2209, -0.3229,  0.3607,
        -0.5500, -0.7279, -0.5987, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.4726, -0.5987, -0.0104,  0.2209, -0.3229,  0.3607,
        -0.5500, -0.7279, -0.5987, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.4726, -0.5987, -0.0104,  0.2209, -0.3229,  0.3607,
        -0.5500, -0.7279, -0.5987, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.4726, -0.5987, -0.0104,  0.2209, -0.3229,  0.3607,
        -0.5500, -0.7279, -0.5987, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.4726, -0.5987, -0.0104,  0.2209, -0.3229,  0.3607,
        -0.5500, -0.7279, -0.5987, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.4726, -0.5987, -0.0104,  0.2209, -0.3229,  0.3607,
        -0.5500, -0.7279, -0.5987, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.4726, -0.5987, -0.0104,  0.2209, -0.3229,  0.3607,
        -0.5500, -0.7279, -0.5987, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.4726, -0.5987, -0.0104,  0.2209, -0.3229,  0.3607,
        -0.5500, -0.7279, -0.5987, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.4726, -0.5987, -0.0104,  0.2209, -0.3229,  0.3607,
        -0.5500, -0.7279, -0.5987, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.4726, -0.5987, -0.0104,  0.2209, -0.3229,  0.3607,
        -0.5500, -0.7279, -0.5987, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.4726, -0.5987, -0.0104,  0.2209, -0.3229,  0.3607,
        -0.5500, -0.7279, -0.5987, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.4726, -0.5987, -0.0104,  0.2209, -0.3229,  0.3607,
        -0.5500, -0.7279, -0.5987, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.4726, -0.5987, -0.0104,  0.2209, -0.3229,  0.3607,
        -0.5500, -0.7279, -0.5987, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.4726, -0.5987, -0.0104,  0.2209, -0.3229,  0.3607,
        -0.5500, -0.7279, -0.5987, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1529, -0.6665, -0.3231, -0.3362,  0.1439,  0.4185,
         0.3607, -0.8200, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1529, -0.6665, -0.3231, -0.3362,  0.1439,  0.4185,
         0.3607, -0.8200, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 7

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1529, -0.6665, -0.3231, -0.3362,  0.1439,  0.4185,
         0.3607, -0.8200, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1529, -0.6665, -0.3231, -0.3362,  0.1439,  0.4185,
         0.3607, -0.8200, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 7

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1529, -0.6665, -0.3231, -0.3362,  0.1439,  0.4185,
         0.3607, -0.8200, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1529, -0.6665, -0.3231, -0.3362,  0.1439,  0.4185,
         0.3607, -0.8200, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 7

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1529, -0.6665, -0.3231, -0.3362,  0.1439,  0.4185,
         0.3607, -0.8200, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1529, -0.6665, -0.3231, -0.3362,  0.1439,  0.4185,
         0.3607, -0.8200, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 7

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1529, -0.6665, -0.3231, -0.3362,  0.1439,  0.4185,
         0.3607, -0.8200, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1529, -0.6665, -0.3231, -0.3362,  0.1439,  0.4185,
         0.3607, -0.8200, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 7

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1529, -0.6665, -0.3231, -0.3362,  0.1439,  0.4185,
         0.3607, -0.8200, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1529, -0.6665, -0.3231, -0.3362,  0.1439,  0.4185,
         0.3607, -0.8200, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 7

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1529, -0.6665, -0.3231, -0.3362,  0.1439,  0.4185,
         0.3607, -0.8200, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1529, -0.6665, -0.3231, -0.3362,  0.1439,  0.4185,
         0.3607, -0.8200, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 7

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1529, -0.6665, -0.3231, -0.3362,  0.1439,  0.4185,
         0.3607, -0.8200, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1529, -0.6665, -0.3231, -0.3362,  0.1439,  0.4185,
         0.3607, -0.8200, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 7

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1529, -0.6665, -0.3231, -0.3362,  0.1439,  0.4185,
         0.3607, -0.8200, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1529, -0.6665, -0.3231, -0.3362,  0.1439,  0.4185,
         0.3607, -0.8200, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 7

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1529, -0.6665, -0.3231, -0.3362,  0.1439,  0.4185,
         0.3607, -0.8200, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1529, -0.6665, -0.3231, -0.3362,  0.1439,  0.4185,
         0.3607, -0.8200, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 7

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1529, -0.6665, -0.3231, -0.3362,  0.1439,  0.4185,
         0.3607, -0.8200, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1529, -0.6665, -0.3231, -0.3362,  0.1439,  0.4185,
         0.3607, -0.8200, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 7

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1529, -0.6665, -0.3231, -0.3362,  0.1439,  0.4185,
         0.3607, -0.8200, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1529, -0.6665, -0.3231, -0.3362,  0.1439,  0.4185,
         0.3607, -0.8200, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.7528,  0.5879,  0.3607,  0.1172, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.7528,  0.5879,  0.3607,  0.1172, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.7528,  0.5879,  0.3607,  0.1172, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.7528,  0.5879,  0.3607,  0.1172, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.7528,  0.5879,  0.3607,  0.1172, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.7528,  0.5879,  0.3607,  0.1172, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.7528,  0.5879,  0.3607,  0.1172, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.7528,  0.5879,  0.3607,  0.1172, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.7528,  0.5879,  0.3607,  0.1172, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.7528,  0.5879,  0.3607,  0.1172, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.7528,  0.5879,  0.3607,  0.1172, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.7528,  0.5879,  0.3607,  0.1172, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.7528,  0.5879,  0.3607,  0.1172, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.7528,  0.5879,  0.3607,  0.1172, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.7528,  0.5879,  0.3607,  0.1172, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.7528,  0.5879,  0.3607,  0.1172, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.7528,  0.5879,  0.3607,  0.1172, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.7528,  0.5879,  0.3607,  0.1172, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.7528,  0.5879,  0.3607,  0.1172, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.7528,  0.5879,  0.3607,  0.1172, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.7528,  0.5879,  0.3607,  0.1172, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.7528,  0.5879,  0.3607,  0.1172, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.7528,  0.5879,  0.3607,  0.1172, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.7528,  0.5879,  0.3607,  0.1172, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1294,  0.3443, -0.1309, -0.7215, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1294,  0.3443, -0.1309, -0.7215, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1294,  0.3443, -0.1309, -0.7215, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1294,  0.3443, -0.1309, -0.7215, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1294,  0.3443, -0.1309, -0.7215, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1294,  0.3443, -0.1309, -0.7215, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1294,  0.3443, -0.1309, -0.7215, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1294,  0.3443, -0.1309, -0.7215, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1294,  0.3443, -0.1309, -0.7215, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1294,  0.3443, -0.1309, -0.7215, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1294,  0.3443, -0.1309, -0.7215, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1294,  0.3443, -0.1309, -0.7215, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1294,  0.3443, -0.1309, -0.7215, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1294,  0.3443, -0.1309, -0.7215, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1294,  0.3443, -0.1309, -0.7215, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1294,  0.3443, -0.1309, -0.7215, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1294,  0.3443, -0.1309, -0.7215, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1294,  0.3443, -0.1309, -0.7215, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1294,  0.3443, -0.1309, -0.7215, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1294,  0.3443, -0.1309, -0.7215, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1294,  0.3443, -0.1309, -0.7215, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1294,  0.3443, -0.1309, -0.7215, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1294,  0.3443, -0.1309, -0.7215, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1294,  0.3443, -0.1309, -0.7215, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.5881, -0.4200,  0.2035, -0.5636,  1.0524, -0.3852,  0.1314,
        -0.1220, -0.5840, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.5881, -0.4200,  0.2035, -0.5636,  1.0524, -0.3852,  0.1314,
        -0.1220, -0.5840, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.5881, -0.4200,  0.2035, -0.5636,  1.0524, -0.3852,  0.1314,
        -0.1220, -0.5840, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.5881, -0.4200,  0.2035, -0.5636,  1.0524, -0.3852,  0.1314,
        -0.1220, -0.5840, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.5881, -0.4200,  0.2035, -0.5636,  1.0524, -0.3852,  0.1314,
        -0.1220, -0.5840, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.5881, -0.4200,  0.2035, -0.5636,  1.0524, -0.3852,  0.1314,
        -0.1220, -0.5840, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.5881, -0.4200,  0.2035, -0.5636,  1.0524, -0.3852,  0.1314,
        -0.1220, -0.5840, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.5881, -0.4200,  0.2035, -0.5636,  1.0524, -0.3852,  0.1314,
        -0.1220, -0.5840, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.5881, -0.4200,  0.2035, -0.5636,  1.0524, -0.3852,  0.1314,
        -0.1220, -0.5840, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.5881, -0.4200,  0.2035, -0.5636,  1.0524, -0.3852,  0.1314,
        -0.1220, -0.5840, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.5881, -0.4200,  0.2035, -0.5636,  1.0524, -0.3852,  0.1314,
        -0.1220, -0.5840, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.5881, -0.4200,  0.2035, -0.5636,  1.0524, -0.3852,  0.1314,
        -0.1220, -0.5840, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.5881, -0.4200,  0.2035, -0.5636,  1.0524, -0.3852,  0.1314,
        -0.1220, -0.5840, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.5881, -0.4200,  0.2035, -0.5636,  1.0524, -0.3852,  0.1314,
        -0.1220, -0.5840, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.5881, -0.4200,  0.2035, -0.5636,  1.0524, -0.3852,  0.1314,
        -0.1220, -0.5840, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.5881, -0.4200,  0.2035, -0.5636,  1.0524, -0.3852,  0.1314,
        -0.1220, -0.5840, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.5881, -0.4200,  0.2035, -0.5636,  1.0524, -0.3852,  0.1314,
        -0.1220, -0.5840, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.5881, -0.4200,  0.2035, -0.5636,  1.0524, -0.3852,  0.1314,
        -0.1220, -0.5840, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.5881, -0.4200,  0.2035, -0.5636,  1.0524, -0.3852,  0.1314,
        -0.1220, -0.5840, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.5881, -0.4200,  0.2035, -0.5636,  1.0524, -0.3852,  0.1314,
        -0.1220, -0.5840, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.5881, -0.4200,  0.2035, -0.5636,  1.0524, -0.3852,  0.1314,
        -0.1220, -0.5840, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.5881, -0.4200,  0.2035, -0.5636,  1.0524, -0.3852,  0.1314,
        -0.1220, -0.5840, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.5881, -0.4200,  0.2035, -0.5636,  1.0524, -0.3852,  0.1314,
        -0.1220, -0.5840, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.5881, -0.4200,  0.2035, -0.5636,  1.0524, -0.3852,  0.1314,
        -0.1220, -0.5840, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.0380,  0.1437, -0.5636,  0.8363, -0.3852, -0.1710, -0.2427,
        -0.5636,  0.1507,  0.3607, -0.0479, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  0.1437, -0.5636,  0.8363, -0.3852, -0.1710, -0.2427,
        -0.5636,  0.1507,  0.3607, -0.0479, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 4

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.0380,  0.1437, -0.5636,  0.8363, -0.3852, -0.1710, -0.2427,
        -0.5636,  0.1507,  0.3607, -0.0479, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  0.1437, -0.5636,  0.8363, -0.3852, -0.1710, -0.2427,
        -0.5636,  0.1507,  0.3607, -0.0479, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 4

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.0380,  0.1437, -0.5636,  0.8363, -0.3852, -0.1710, -0.2427,
        -0.5636,  0.1507,  0.3607, -0.0479, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  0.1437, -0.5636,  0.8363, -0.3852, -0.1710, -0.2427,
        -0.5636,  0.1507,  0.3607, -0.0479, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 4

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.0380,  0.1437, -0.5636,  0.8363, -0.3852, -0.1710, -0.2427,
        -0.5636,  0.1507,  0.3607, -0.0479, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  0.1437, -0.5636,  0.8363, -0.3852, -0.1710, -0.2427,
        -0.5636,  0.1507,  0.3607, -0.0479, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 4

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.0380,  0.1437, -0.5636,  0.8363, -0.3852, -0.1710, -0.2427,
        -0.5636,  0.1507,  0.3607, -0.0479, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  0.1437, -0.5636,  0.8363, -0.3852, -0.1710, -0.2427,
        -0.5636,  0.1507,  0.3607, -0.0479, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 4

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.0380,  0.1437, -0.5636,  0.8363, -0.3852, -0.1710, -0.2427,
        -0.5636,  0.1507,  0.3607, -0.0479, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  0.1437, -0.5636,  0.8363, -0.3852, -0.1710, -0.2427,
        -0.5636,  0.1507,  0.3607, -0.0479, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 4

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.0380,  0.1437, -0.5636,  0.8363, -0.3852, -0.1710, -0.2427,
        -0.5636,  0.1507,  0.3607, -0.0479, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  0.1437, -0.5636,  0.8363, -0.3852, -0.1710, -0.2427,
        -0.5636,  0.1507,  0.3607, -0.0479, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 4

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.0380,  0.1437, -0.5636,  0.8363, -0.3852, -0.1710, -0.2427,
        -0.5636,  0.1507,  0.3607, -0.0479, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  0.1437, -0.5636,  0.8363, -0.3852, -0.1710, -0.2427,
        -0.5636,  0.1507,  0.3607, -0.0479, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 4

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.0380,  0.1437, -0.5636,  0.8363, -0.3852, -0.1710, -0.2427,
        -0.5636,  0.1507,  0.3607, -0.0479, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  0.1437, -0.5636,  0.8363, -0.3852, -0.1710, -0.2427,
        -0.5636,  0.1507,  0.3607, -0.0479, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 4

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.0380,  0.1437, -0.5636,  0.8363, -0.3852, -0.1710, -0.2427,
        -0.5636,  0.1507,  0.3607, -0.0479, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  0.1437, -0.5636,  0.8363, -0.3852, -0.1710, -0.2427,
        -0.5636,  0.1507,  0.3607, -0.0479, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 4

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.0380,  0.1437, -0.5636,  0.8363, -0.3852, -0.1710, -0.2427,
        -0.5636,  0.1507,  0.3607, -0.0479, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  0.1437, -0.5636,  0.8363, -0.3852, -0.1710, -0.2427,
        -0.5636,  0.1507,  0.3607, -0.0479, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 4

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.0380,  0.1437, -0.5636,  0.8363, -0.3852, -0.1710, -0.2427,
        -0.5636,  0.1507,  0.3607, -0.0479, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.0380,  0.1437, -0.5636,  0.8363, -0.3852, -0.1710, -0.2427,
        -0.5636,  0.1507,  0.3607, -0.0479, -0.5098, -0.3798, -0.2843, -0.2843,
        -0.2843]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361, -0.0811,  0.0178,  0.2622, -0.2269, -0.5636,  0.1439, -0.2304,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0811,  0.0178,  0.2622, -0.2269, -0.5636,  0.1439, -0.2304,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 2


Tokens embeddings:
tensor([-0.5361, -0.0811,  0.0178,  0.2622, -0.2269, -0.5636,  0.1439, -0.2304,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0811,  0.0178,  0.2622, -0.2269, -0.5636,  0.1439, -0.2304,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 3


Tokens embeddings:
tensor([-0.5361, -0.0811,  0.0178,  0.2622, -0.2269, -0.5636,  0.1439, -0.2304,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0811,  0.0178,  0.2622, -0.2269, -0.5636,  0.1439, -0.2304,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 4


Tokens embeddings:
tensor([-0.5361, -0.0811,  0.0178,  0.2622, -0.2269, -0.5636,  0.1439, -0.2304,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0811,  0.0178,  0.2622, -0.2269, -0.5636,  0.1439, -0.2304,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 5


Tokens embeddings:
tensor([-0.5361, -0.0811,  0.0178,  0.2622, -0.2269, -0.5636,  0.1439, -0.2304,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0811,  0.0178,  0.2622, -0.2269, -0.5636,  0.1439, -0.2304,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 6


Tokens embeddings:
tensor([-0.5361, -0.0811,  0.0178,  0.2622, -0.2269, -0.5636,  0.1439, -0.2304,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0811,  0.0178,  0.2622, -0.2269, -0.5636,  0.1439, -0.2304,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 7


Tokens embeddings:
tensor([-0.5361, -0.0811,  0.0178,  0.2622, -0.2269, -0.5636,  0.1439, -0.2304,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0811,  0.0178,  0.2622, -0.2269, -0.5636,  0.1439, -0.2304,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 8


Tokens embeddings:
tensor([-0.5361, -0.0811,  0.0178,  0.2622, -0.2269, -0.5636,  0.1439, -0.2304,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0811,  0.0178,  0.2622, -0.2269, -0.5636,  0.1439, -0.2304,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 9


Tokens embeddings:
tensor([-0.5361, -0.0811,  0.0178,  0.2622, -0.2269, -0.5636,  0.1439, -0.2304,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0811,  0.0178,  0.2622, -0.2269, -0.5636,  0.1439, -0.2304,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 10


Tokens embeddings:
tensor([-0.5361, -0.0811,  0.0178,  0.2622, -0.2269, -0.5636,  0.1439, -0.2304,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0811,  0.0178,  0.2622, -0.2269, -0.5636,  0.1439, -0.2304,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 11


Tokens embeddings:
tensor([-0.5361, -0.0811,  0.0178,  0.2622, -0.2269, -0.5636,  0.1439, -0.2304,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0811,  0.0178,  0.2622, -0.2269, -0.5636,  0.1439, -0.2304,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 12


Tokens embeddings:
tensor([-0.5361, -0.0811,  0.0178,  0.2622, -0.2269, -0.5636,  0.1439, -0.2304,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0811,  0.0178,  0.2622, -0.2269, -0.5636,  0.1439, -0.2304,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361, -0.2271, -0.7487,  0.3607, -0.2714,  0.4195, -0.2899, -0.0891,
         1.0061, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.2271, -0.7487,  0.3607, -0.2714,  0.4195, -0.2899, -0.0891,
         1.0061, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 8

Layer 2


Tokens embeddings:
tensor([-0.5361, -0.2271, -0.7487,  0.3607, -0.2714,  0.4195, -0.2899, -0.0891,
         1.0061, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.2271, -0.7487,  0.3607, -0.2714,  0.4195, -0.2899, -0.0891,
         1.0061, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 8

Layer 3


Tokens embeddings:
tensor([-0.5361, -0.2271, -0.7487,  0.3607, -0.2714,  0.4195, -0.2899, -0.0891,
         1.0061, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.2271, -0.7487,  0.3607, -0.2714,  0.4195, -0.2899, -0.0891,
         1.0061, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 8

Layer 4


Tokens embeddings:
tensor([-0.5361, -0.2271, -0.7487,  0.3607, -0.2714,  0.4195, -0.2899, -0.0891,
         1.0061, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.2271, -0.7487,  0.3607, -0.2714,  0.4195, -0.2899, -0.0891,
         1.0061, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 8

Layer 5


Tokens embeddings:
tensor([-0.5361, -0.2271, -0.7487,  0.3607, -0.2714,  0.4195, -0.2899, -0.0891,
         1.0061, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.2271, -0.7487,  0.3607, -0.2714,  0.4195, -0.2899, -0.0891,
         1.0061, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 8

Layer 6


Tokens embeddings:
tensor([-0.5361, -0.2271, -0.7487,  0.3607, -0.2714,  0.4195, -0.2899, -0.0891,
         1.0061, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.2271, -0.7487,  0.3607, -0.2714,  0.4195, -0.2899, -0.0891,
         1.0061, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 8

Layer 7


Tokens embeddings:
tensor([-0.5361, -0.2271, -0.7487,  0.3607, -0.2714,  0.4195, -0.2899, -0.0891,
         1.0061, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.2271, -0.7487,  0.3607, -0.2714,  0.4195, -0.2899, -0.0891,
         1.0061, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 8

Layer 8


Tokens embeddings:
tensor([-0.5361, -0.2271, -0.7487,  0.3607, -0.2714,  0.4195, -0.2899, -0.0891,
         1.0061, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.2271, -0.7487,  0.3607, -0.2714,  0.4195, -0.2899, -0.0891,
         1.0061, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 8

Layer 9


Tokens embeddings:
tensor([-0.5361, -0.2271, -0.7487,  0.3607, -0.2714,  0.4195, -0.2899, -0.0891,
         1.0061, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.2271, -0.7487,  0.3607, -0.2714,  0.4195, -0.2899, -0.0891,
         1.0061, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 8

Layer 10


Tokens embeddings:
tensor([-0.5361, -0.2271, -0.7487,  0.3607, -0.2714,  0.4195, -0.2899, -0.0891,
         1.0061, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.2271, -0.7487,  0.3607, -0.2714,  0.4195, -0.2899, -0.0891,
         1.0061, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 8

Layer 11


Tokens embeddings:
tensor([-0.5361, -0.2271, -0.7487,  0.3607, -0.2714,  0.4195, -0.2899, -0.0891,
         1.0061, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.2271, -0.7487,  0.3607, -0.2714,  0.4195, -0.2899, -0.0891,
         1.0061, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 8

Layer 12


Tokens embeddings:
tensor([-0.5361, -0.2271, -0.7487,  0.3607, -0.2714,  0.4195, -0.2899, -0.0891,
         1.0061, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.2271, -0.7487,  0.3607, -0.2714,  0.4195, -0.2899, -0.0891,
         1.0061, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 8
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361, -0.0445, -0.1406, -0.4818, -0.0974,  0.3607, -0.8862, -0.6006,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445, -0.1406, -0.4818, -0.0974,  0.3607, -0.8862, -0.6006,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 2


Tokens embeddings:
tensor([-0.5361, -0.0445, -0.1406, -0.4818, -0.0974,  0.3607, -0.8862, -0.6006,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445, -0.1406, -0.4818, -0.0974,  0.3607, -0.8862, -0.6006,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 3


Tokens embeddings:
tensor([-0.5361, -0.0445, -0.1406, -0.4818, -0.0974,  0.3607, -0.8862, -0.6006,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445, -0.1406, -0.4818, -0.0974,  0.3607, -0.8862, -0.6006,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 4


Tokens embeddings:
tensor([-0.5361, -0.0445, -0.1406, -0.4818, -0.0974,  0.3607, -0.8862, -0.6006,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445, -0.1406, -0.4818, -0.0974,  0.3607, -0.8862, -0.6006,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 5


Tokens embeddings:
tensor([-0.5361, -0.0445, -0.1406, -0.4818, -0.0974,  0.3607, -0.8862, -0.6006,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445, -0.1406, -0.4818, -0.0974,  0.3607, -0.8862, -0.6006,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 6


Tokens embeddings:
tensor([-0.5361, -0.0445, -0.1406, -0.4818, -0.0974,  0.3607, -0.8862, -0.6006,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445, -0.1406, -0.4818, -0.0974,  0.3607, -0.8862, -0.6006,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 7


Tokens embeddings:
tensor([-0.5361, -0.0445, -0.1406, -0.4818, -0.0974,  0.3607, -0.8862, -0.6006,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445, -0.1406, -0.4818, -0.0974,  0.3607, -0.8862, -0.6006,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 8


Tokens embeddings:
tensor([-0.5361, -0.0445, -0.1406, -0.4818, -0.0974,  0.3607, -0.8862, -0.6006,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445, -0.1406, -0.4818, -0.0974,  0.3607, -0.8862, -0.6006,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 9


Tokens embeddings:
tensor([-0.5361, -0.0445, -0.1406, -0.4818, -0.0974,  0.3607, -0.8862, -0.6006,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445, -0.1406, -0.4818, -0.0974,  0.3607, -0.8862, -0.6006,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 10


Tokens embeddings:
tensor([-0.5361, -0.0445, -0.1406, -0.4818, -0.0974,  0.3607, -0.8862, -0.6006,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445, -0.1406, -0.4818, -0.0974,  0.3607, -0.8862, -0.6006,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 11


Tokens embeddings:
tensor([-0.5361, -0.0445, -0.1406, -0.4818, -0.0974,  0.3607, -0.8862, -0.6006,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445, -0.1406, -0.4818, -0.0974,  0.3607, -0.8862, -0.6006,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5

Layer 12


Tokens embeddings:
tensor([-0.5361, -0.0445, -0.1406, -0.4818, -0.0974,  0.3607, -0.8862, -0.6006,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0445, -0.1406, -0.4818, -0.0974,  0.3607, -0.8862, -0.6006,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1630, -0.9971,  0.6340, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1630, -0.9971,  0.6340, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1630, -0.9971,  0.6340, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1630, -0.9971,  0.6340, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1630, -0.9971,  0.6340, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1630, -0.9971,  0.6340, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1630, -0.9971,  0.6340, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1630, -0.9971,  0.6340, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1630, -0.9971,  0.6340, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1630, -0.9971,  0.6340, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1630, -0.9971,  0.6340, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1630, -0.9971,  0.6340, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1630, -0.9971,  0.6340, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1630, -0.9971,  0.6340, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1630, -0.9971,  0.6340, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1630, -0.9971,  0.6340, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1630, -0.9971,  0.6340, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1630, -0.9971,  0.6340, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1630, -0.9971,  0.6340, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1630, -0.9971,  0.6340, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1630, -0.9971,  0.6340, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1630, -0.9971,  0.6340, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1630, -0.9971,  0.6340, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1630, -0.9971,  0.6340, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1963,  0.6618, -0.4200,  0.0335,  0.1988,  0.2009,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1963,  0.6618, -0.4200,  0.0335,  0.1988,  0.2009,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1963,  0.6618, -0.4200,  0.0335,  0.1988,  0.2009,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1963,  0.6618, -0.4200,  0.0335,  0.1988,  0.2009,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1963,  0.6618, -0.4200,  0.0335,  0.1988,  0.2009,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1963,  0.6618, -0.4200,  0.0335,  0.1988,  0.2009,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1963,  0.6618, -0.4200,  0.0335,  0.1988,  0.2009,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1963,  0.6618, -0.4200,  0.0335,  0.1988,  0.2009,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1963,  0.6618, -0.4200,  0.0335,  0.1988,  0.2009,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1963,  0.6618, -0.4200,  0.0335,  0.1988,  0.2009,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1963,  0.6618, -0.4200,  0.0335,  0.1988,  0.2009,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1963,  0.6618, -0.4200,  0.0335,  0.1988,  0.2009,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1963,  0.6618, -0.4200,  0.0335,  0.1988,  0.2009,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1963,  0.6618, -0.4200,  0.0335,  0.1988,  0.2009,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1963,  0.6618, -0.4200,  0.0335,  0.1988,  0.2009,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1963,  0.6618, -0.4200,  0.0335,  0.1988,  0.2009,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1963,  0.6618, -0.4200,  0.0335,  0.1988,  0.2009,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1963,  0.6618, -0.4200,  0.0335,  0.1988,  0.2009,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1963,  0.6618, -0.4200,  0.0335,  0.1988,  0.2009,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1963,  0.6618, -0.4200,  0.0335,  0.1988,  0.2009,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1963,  0.6618, -0.4200,  0.0335,  0.1988,  0.2009,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1963,  0.6618, -0.4200,  0.0335,  0.1988,  0.2009,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.1963,  0.6618, -0.4200,  0.0335,  0.1988,  0.2009,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.1963,  0.6618, -0.4200,  0.0335,  0.1988,  0.2009,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361, -0.1159,  0.3607, -0.1955,  0.4195, -1.0880, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1159,  0.3607, -0.1955,  0.4195, -1.0880, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 2


Tokens embeddings:
tensor([-0.5361, -0.1159,  0.3607, -0.1955,  0.4195, -1.0880, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1159,  0.3607, -0.1955,  0.4195, -1.0880, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 3


Tokens embeddings:
tensor([-0.5361, -0.1159,  0.3607, -0.1955,  0.4195, -1.0880, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1159,  0.3607, -0.1955,  0.4195, -1.0880, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 4


Tokens embeddings:
tensor([-0.5361, -0.1159,  0.3607, -0.1955,  0.4195, -1.0880, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1159,  0.3607, -0.1955,  0.4195, -1.0880, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 5


Tokens embeddings:
tensor([-0.5361, -0.1159,  0.3607, -0.1955,  0.4195, -1.0880, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1159,  0.3607, -0.1955,  0.4195, -1.0880, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 6


Tokens embeddings:
tensor([-0.5361, -0.1159,  0.3607, -0.1955,  0.4195, -1.0880, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1159,  0.3607, -0.1955,  0.4195, -1.0880, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 7


Tokens embeddings:
tensor([-0.5361, -0.1159,  0.3607, -0.1955,  0.4195, -1.0880, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1159,  0.3607, -0.1955,  0.4195, -1.0880, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 8


Tokens embeddings:
tensor([-0.5361, -0.1159,  0.3607, -0.1955,  0.4195, -1.0880, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1159,  0.3607, -0.1955,  0.4195, -1.0880, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 9


Tokens embeddings:
tensor([-0.5361, -0.1159,  0.3607, -0.1955,  0.4195, -1.0880, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1159,  0.3607, -0.1955,  0.4195, -1.0880, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 10


Tokens embeddings:
tensor([-0.5361, -0.1159,  0.3607, -0.1955,  0.4195, -1.0880, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1159,  0.3607, -0.1955,  0.4195, -1.0880, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 11


Tokens embeddings:
tensor([-0.5361, -0.1159,  0.3607, -0.1955,  0.4195, -1.0880, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1159,  0.3607, -0.1955,  0.4195, -1.0880, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 12


Tokens embeddings:
tensor([-0.5361, -0.1159,  0.3607, -0.1955,  0.4195, -1.0880, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.1159,  0.3607, -0.1955,  0.4195, -1.0880, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.2819, -0.4306,  0.2081, -0.2293, -0.5636, -0.3156,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.2819, -0.4306,  0.2081, -0.2293, -0.5636, -0.3156,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.2819, -0.4306,  0.2081, -0.2293, -0.5636, -0.3156,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.2819, -0.4306,  0.2081, -0.2293, -0.5636, -0.3156,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.2819, -0.4306,  0.2081, -0.2293, -0.5636, -0.3156,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.2819, -0.4306,  0.2081, -0.2293, -0.5636, -0.3156,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.2819, -0.4306,  0.2081, -0.2293, -0.5636, -0.3156,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.2819, -0.4306,  0.2081, -0.2293, -0.5636, -0.3156,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.2819, -0.4306,  0.2081, -0.2293, -0.5636, -0.3156,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.2819, -0.4306,  0.2081, -0.2293, -0.5636, -0.3156,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.2819, -0.4306,  0.2081, -0.2293, -0.5636, -0.3156,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.2819, -0.4306,  0.2081, -0.2293, -0.5636, -0.3156,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.2819, -0.4306,  0.2081, -0.2293, -0.5636, -0.3156,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.2819, -0.4306,  0.2081, -0.2293, -0.5636, -0.3156,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.2819, -0.4306,  0.2081, -0.2293, -0.5636, -0.3156,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.2819, -0.4306,  0.2081, -0.2293, -0.5636, -0.3156,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.2819, -0.4306,  0.2081, -0.2293, -0.5636, -0.3156,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.2819, -0.4306,  0.2081, -0.2293, -0.5636, -0.3156,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.2819, -0.4306,  0.2081, -0.2293, -0.5636, -0.3156,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.2819, -0.4306,  0.2081, -0.2293, -0.5636, -0.3156,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.2819, -0.4306,  0.2081, -0.2293, -0.5636, -0.3156,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.2819, -0.4306,  0.2081, -0.2293, -0.5636, -0.3156,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.3607,  0.2819, -0.4306,  0.2081, -0.2293, -0.5636, -0.3156,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607,  0.2819, -0.4306,  0.2081, -0.2293, -0.5636, -0.3156,
        -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.5291,  0.4729,  0.3510,  0.0832, -0.0507, -0.0819,
         0.6586, -1.0472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.5291,  0.4729,  0.3510,  0.0832, -0.0507, -0.0819,
         0.6586, -1.0472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 8

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.5291,  0.4729,  0.3510,  0.0832, -0.0507, -0.0819,
         0.6586, -1.0472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.5291,  0.4729,  0.3510,  0.0832, -0.0507, -0.0819,
         0.6586, -1.0472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 8

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.5291,  0.4729,  0.3510,  0.0832, -0.0507, -0.0819,
         0.6586, -1.0472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.5291,  0.4729,  0.3510,  0.0832, -0.0507, -0.0819,
         0.6586, -1.0472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 8

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.5291,  0.4729,  0.3510,  0.0832, -0.0507, -0.0819,
         0.6586, -1.0472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.5291,  0.4729,  0.3510,  0.0832, -0.0507, -0.0819,
         0.6586, -1.0472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 8

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.5291,  0.4729,  0.3510,  0.0832, -0.0507, -0.0819,
         0.6586, -1.0472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.5291,  0.4729,  0.3510,  0.0832, -0.0507, -0.0819,
         0.6586, -1.0472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 8

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.5291,  0.4729,  0.3510,  0.0832, -0.0507, -0.0819,
         0.6586, -1.0472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.5291,  0.4729,  0.3510,  0.0832, -0.0507, -0.0819,
         0.6586, -1.0472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 8

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.5291,  0.4729,  0.3510,  0.0832, -0.0507, -0.0819,
         0.6586, -1.0472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.5291,  0.4729,  0.3510,  0.0832, -0.0507, -0.0819,
         0.6586, -1.0472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 8

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.5291,  0.4729,  0.3510,  0.0832, -0.0507, -0.0819,
         0.6586, -1.0472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.5291,  0.4729,  0.3510,  0.0832, -0.0507, -0.0819,
         0.6586, -1.0472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 8

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.5291,  0.4729,  0.3510,  0.0832, -0.0507, -0.0819,
         0.6586, -1.0472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.5291,  0.4729,  0.3510,  0.0832, -0.0507, -0.0819,
         0.6586, -1.0472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 8

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.5291,  0.4729,  0.3510,  0.0832, -0.0507, -0.0819,
         0.6586, -1.0472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.5291,  0.4729,  0.3510,  0.0832, -0.0507, -0.0819,
         0.6586, -1.0472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 8

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.5291,  0.4729,  0.3510,  0.0832, -0.0507, -0.0819,
         0.6586, -1.0472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.5291,  0.4729,  0.3510,  0.0832, -0.0507, -0.0819,
         0.6586, -1.0472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 8

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.5291,  0.4729,  0.3510,  0.0832, -0.0507, -0.0819,
         0.6586, -1.0472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.5291,  0.4729,  0.3510,  0.0832, -0.0507, -0.0819,
         0.6586, -1.0472, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 8
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1274,  0.1437,  0.7524,  0.0335,  0.6490, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1274,  0.1437,  0.7524,  0.0335,  0.6490, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1274,  0.1437,  0.7524,  0.0335,  0.6490, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1274,  0.1437,  0.7524,  0.0335,  0.6490, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1274,  0.1437,  0.7524,  0.0335,  0.6490, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1274,  0.1437,  0.7524,  0.0335,  0.6490, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1274,  0.1437,  0.7524,  0.0335,  0.6490, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1274,  0.1437,  0.7524,  0.0335,  0.6490, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1274,  0.1437,  0.7524,  0.0335,  0.6490, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1274,  0.1437,  0.7524,  0.0335,  0.6490, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1274,  0.1437,  0.7524,  0.0335,  0.6490, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1274,  0.1437,  0.7524,  0.0335,  0.6490, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1274,  0.1437,  0.7524,  0.0335,  0.6490, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1274,  0.1437,  0.7524,  0.0335,  0.6490, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1274,  0.1437,  0.7524,  0.0335,  0.6490, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1274,  0.1437,  0.7524,  0.0335,  0.6490, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1274,  0.1437,  0.7524,  0.0335,  0.6490, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1274,  0.1437,  0.7524,  0.0335,  0.6490, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1274,  0.1437,  0.7524,  0.0335,  0.6490, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1274,  0.1437,  0.7524,  0.0335,  0.6490, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1274,  0.1437,  0.7524,  0.0335,  0.6490, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1274,  0.1437,  0.7524,  0.0335,  0.6490, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.1274,  0.1437,  0.7524,  0.0335,  0.6490, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.1274,  0.1437,  0.7524,  0.0335,  0.6490, -0.5098,
        -0.3798, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.3161,  0.9401,  0.1439,  0.3607,  0.0827, -0.5636,
         0.3607, -0.2293,  0.3548,  0.3607, -0.2585, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.3161,  0.9401,  0.1439,  0.3607,  0.0827, -0.5636,
         0.3607, -0.2293,  0.3548,  0.3607, -0.2585, -0.5098, -0.3798, -0.2843,
        -0.2843]): 3

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.3161,  0.9401,  0.1439,  0.3607,  0.0827, -0.5636,
         0.3607, -0.2293,  0.3548,  0.3607, -0.2585, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.3161,  0.9401,  0.1439,  0.3607,  0.0827, -0.5636,
         0.3607, -0.2293,  0.3548,  0.3607, -0.2585, -0.5098, -0.3798, -0.2843,
        -0.2843]): 3

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.3161,  0.9401,  0.1439,  0.3607,  0.0827, -0.5636,
         0.3607, -0.2293,  0.3548,  0.3607, -0.2585, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.3161,  0.9401,  0.1439,  0.3607,  0.0827, -0.5636,
         0.3607, -0.2293,  0.3548,  0.3607, -0.2585, -0.5098, -0.3798, -0.2843,
        -0.2843]): 3

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.3161,  0.9401,  0.1439,  0.3607,  0.0827, -0.5636,
         0.3607, -0.2293,  0.3548,  0.3607, -0.2585, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.3161,  0.9401,  0.1439,  0.3607,  0.0827, -0.5636,
         0.3607, -0.2293,  0.3548,  0.3607, -0.2585, -0.5098, -0.3798, -0.2843,
        -0.2843]): 3

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.3161,  0.9401,  0.1439,  0.3607,  0.0827, -0.5636,
         0.3607, -0.2293,  0.3548,  0.3607, -0.2585, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.3161,  0.9401,  0.1439,  0.3607,  0.0827, -0.5636,
         0.3607, -0.2293,  0.3548,  0.3607, -0.2585, -0.5098, -0.3798, -0.2843,
        -0.2843]): 3

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.3161,  0.9401,  0.1439,  0.3607,  0.0827, -0.5636,
         0.3607, -0.2293,  0.3548,  0.3607, -0.2585, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.3161,  0.9401,  0.1439,  0.3607,  0.0827, -0.5636,
         0.3607, -0.2293,  0.3548,  0.3607, -0.2585, -0.5098, -0.3798, -0.2843,
        -0.2843]): 3

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.3161,  0.9401,  0.1439,  0.3607,  0.0827, -0.5636,
         0.3607, -0.2293,  0.3548,  0.3607, -0.2585, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.3161,  0.9401,  0.1439,  0.3607,  0.0827, -0.5636,
         0.3607, -0.2293,  0.3548,  0.3607, -0.2585, -0.5098, -0.3798, -0.2843,
        -0.2843]): 3

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.3161,  0.9401,  0.1439,  0.3607,  0.0827, -0.5636,
         0.3607, -0.2293,  0.3548,  0.3607, -0.2585, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.3161,  0.9401,  0.1439,  0.3607,  0.0827, -0.5636,
         0.3607, -0.2293,  0.3548,  0.3607, -0.2585, -0.5098, -0.3798, -0.2843,
        -0.2843]): 3

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.3161,  0.9401,  0.1439,  0.3607,  0.0827, -0.5636,
         0.3607, -0.2293,  0.3548,  0.3607, -0.2585, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.3161,  0.9401,  0.1439,  0.3607,  0.0827, -0.5636,
         0.3607, -0.2293,  0.3548,  0.3607, -0.2585, -0.5098, -0.3798, -0.2843,
        -0.2843]): 3

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.3161,  0.9401,  0.1439,  0.3607,  0.0827, -0.5636,
         0.3607, -0.2293,  0.3548,  0.3607, -0.2585, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.3161,  0.9401,  0.1439,  0.3607,  0.0827, -0.5636,
         0.3607, -0.2293,  0.3548,  0.3607, -0.2585, -0.5098, -0.3798, -0.2843,
        -0.2843]): 3

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.3161,  0.9401,  0.1439,  0.3607,  0.0827, -0.5636,
         0.3607, -0.2293,  0.3548,  0.3607, -0.2585, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.3161,  0.9401,  0.1439,  0.3607,  0.0827, -0.5636,
         0.3607, -0.2293,  0.3548,  0.3607, -0.2585, -0.5098, -0.3798, -0.2843,
        -0.2843]): 3

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.3607, -0.3161,  0.9401,  0.1439,  0.3607,  0.0827, -0.5636,
         0.3607, -0.2293,  0.3548,  0.3607, -0.2585, -0.5098, -0.3798, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.3607, -0.3161,  0.9401,  0.1439,  0.3607,  0.0827, -0.5636,
         0.3607, -0.2293,  0.3548,  0.3607, -0.2585, -0.5098, -0.3798, -0.2843,
        -0.2843]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  1.4852,  0.9634,  0.2238, -0.0089, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  1.4852,  0.9634,  0.2238, -0.0089, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 2


Tokens embeddings:
tensor([-0.5361,  1.4852,  0.9634,  0.2238, -0.0089, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  1.4852,  0.9634,  0.2238, -0.0089, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 3


Tokens embeddings:
tensor([-0.5361,  1.4852,  0.9634,  0.2238, -0.0089, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  1.4852,  0.9634,  0.2238, -0.0089, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 4


Tokens embeddings:
tensor([-0.5361,  1.4852,  0.9634,  0.2238, -0.0089, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  1.4852,  0.9634,  0.2238, -0.0089, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 5


Tokens embeddings:
tensor([-0.5361,  1.4852,  0.9634,  0.2238, -0.0089, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  1.4852,  0.9634,  0.2238, -0.0089, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 6


Tokens embeddings:
tensor([-0.5361,  1.4852,  0.9634,  0.2238, -0.0089, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  1.4852,  0.9634,  0.2238, -0.0089, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 7


Tokens embeddings:
tensor([-0.5361,  1.4852,  0.9634,  0.2238, -0.0089, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  1.4852,  0.9634,  0.2238, -0.0089, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 8


Tokens embeddings:
tensor([-0.5361,  1.4852,  0.9634,  0.2238, -0.0089, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  1.4852,  0.9634,  0.2238, -0.0089, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 9


Tokens embeddings:
tensor([-0.5361,  1.4852,  0.9634,  0.2238, -0.0089, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  1.4852,  0.9634,  0.2238, -0.0089, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 10


Tokens embeddings:
tensor([-0.5361,  1.4852,  0.9634,  0.2238, -0.0089, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  1.4852,  0.9634,  0.2238, -0.0089, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 11


Tokens embeddings:
tensor([-0.5361,  1.4852,  0.9634,  0.2238, -0.0089, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  1.4852,  0.9634,  0.2238, -0.0089, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1

Layer 12


Tokens embeddings:
tensor([-0.5361,  1.4852,  0.9634,  0.2238, -0.0089, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  1.4852,  0.9634,  0.2238, -0.0089, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 1
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361, -0.0835,  0.1054,  0.3607, -0.1883,  0.3548,  0.1314,  0.2506,
        -0.3229,  0.1314, -0.2637, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0835,  0.1054,  0.3607, -0.1883,  0.3548,  0.1314,  0.2506,
        -0.3229,  0.1314, -0.2637, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 2


Tokens embeddings:
tensor([-0.5361, -0.0835,  0.1054,  0.3607, -0.1883,  0.3548,  0.1314,  0.2506,
        -0.3229,  0.1314, -0.2637, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0835,  0.1054,  0.3607, -0.1883,  0.3548,  0.1314,  0.2506,
        -0.3229,  0.1314, -0.2637, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 3


Tokens embeddings:
tensor([-0.5361, -0.0835,  0.1054,  0.3607, -0.1883,  0.3548,  0.1314,  0.2506,
        -0.3229,  0.1314, -0.2637, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0835,  0.1054,  0.3607, -0.1883,  0.3548,  0.1314,  0.2506,
        -0.3229,  0.1314, -0.2637, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 4


Tokens embeddings:
tensor([-0.5361, -0.0835,  0.1054,  0.3607, -0.1883,  0.3548,  0.1314,  0.2506,
        -0.3229,  0.1314, -0.2637, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0835,  0.1054,  0.3607, -0.1883,  0.3548,  0.1314,  0.2506,
        -0.3229,  0.1314, -0.2637, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 5


Tokens embeddings:
tensor([-0.5361, -0.0835,  0.1054,  0.3607, -0.1883,  0.3548,  0.1314,  0.2506,
        -0.3229,  0.1314, -0.2637, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0835,  0.1054,  0.3607, -0.1883,  0.3548,  0.1314,  0.2506,
        -0.3229,  0.1314, -0.2637, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 6


Tokens embeddings:
tensor([-0.5361, -0.0835,  0.1054,  0.3607, -0.1883,  0.3548,  0.1314,  0.2506,
        -0.3229,  0.1314, -0.2637, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0835,  0.1054,  0.3607, -0.1883,  0.3548,  0.1314,  0.2506,
        -0.3229,  0.1314, -0.2637, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 7


Tokens embeddings:
tensor([-0.5361, -0.0835,  0.1054,  0.3607, -0.1883,  0.3548,  0.1314,  0.2506,
        -0.3229,  0.1314, -0.2637, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0835,  0.1054,  0.3607, -0.1883,  0.3548,  0.1314,  0.2506,
        -0.3229,  0.1314, -0.2637, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 8


Tokens embeddings:
tensor([-0.5361, -0.0835,  0.1054,  0.3607, -0.1883,  0.3548,  0.1314,  0.2506,
        -0.3229,  0.1314, -0.2637, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0835,  0.1054,  0.3607, -0.1883,  0.3548,  0.1314,  0.2506,
        -0.3229,  0.1314, -0.2637, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 9


Tokens embeddings:
tensor([-0.5361, -0.0835,  0.1054,  0.3607, -0.1883,  0.3548,  0.1314,  0.2506,
        -0.3229,  0.1314, -0.2637, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0835,  0.1054,  0.3607, -0.1883,  0.3548,  0.1314,  0.2506,
        -0.3229,  0.1314, -0.2637, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 10


Tokens embeddings:
tensor([-0.5361, -0.0835,  0.1054,  0.3607, -0.1883,  0.3548,  0.1314,  0.2506,
        -0.3229,  0.1314, -0.2637, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0835,  0.1054,  0.3607, -0.1883,  0.3548,  0.1314,  0.2506,
        -0.3229,  0.1314, -0.2637, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 11


Tokens embeddings:
tensor([-0.5361, -0.0835,  0.1054,  0.3607, -0.1883,  0.3548,  0.1314,  0.2506,
        -0.3229,  0.1314, -0.2637, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0835,  0.1054,  0.3607, -0.1883,  0.3548,  0.1314,  0.2506,
        -0.3229,  0.1314, -0.2637, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3

Layer 12


Tokens embeddings:
tensor([-0.5361, -0.0835,  0.1054,  0.3607, -0.1883,  0.3548,  0.1314,  0.2506,
        -0.3229,  0.1314, -0.2637, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361, -0.0835,  0.1054,  0.3607, -0.1883,  0.3548,  0.1314,  0.2506,
        -0.3229,  0.1314, -0.2637, -0.5098, -0.3798, -0.2843, -0.2843, -0.2843,
        -0.2843]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.2622,  0.7268, -0.0996,  0.3913,  0.0396, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622,  0.7268, -0.0996,  0.3913,  0.0396, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.2622,  0.7268, -0.0996,  0.3913,  0.0396, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622,  0.7268, -0.0996,  0.3913,  0.0396, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.2622,  0.7268, -0.0996,  0.3913,  0.0396, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622,  0.7268, -0.0996,  0.3913,  0.0396, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.2622,  0.7268, -0.0996,  0.3913,  0.0396, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622,  0.7268, -0.0996,  0.3913,  0.0396, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.2622,  0.7268, -0.0996,  0.3913,  0.0396, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622,  0.7268, -0.0996,  0.3913,  0.0396, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.2622,  0.7268, -0.0996,  0.3913,  0.0396, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622,  0.7268, -0.0996,  0.3913,  0.0396, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.2622,  0.7268, -0.0996,  0.3913,  0.0396, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622,  0.7268, -0.0996,  0.3913,  0.0396, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.2622,  0.7268, -0.0996,  0.3913,  0.0396, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622,  0.7268, -0.0996,  0.3913,  0.0396, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.2622,  0.7268, -0.0996,  0.3913,  0.0396, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622,  0.7268, -0.0996,  0.3913,  0.0396, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.2622,  0.7268, -0.0996,  0.3913,  0.0396, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622,  0.7268, -0.0996,  0.3913,  0.0396, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.2622,  0.7268, -0.0996,  0.3913,  0.0396, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622,  0.7268, -0.0996,  0.3913,  0.0396, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.2622,  0.7268, -0.0996,  0.3913,  0.0396, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622,  0.7268, -0.0996,  0.3913,  0.0396, -0.5098, -0.3798,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.5361,  0.2622,  1.1668, -0.0241, -0.1626, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622,  1.1668, -0.0241, -0.1626, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 2


Tokens embeddings:
tensor([-0.5361,  0.2622,  1.1668, -0.0241, -0.1626, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622,  1.1668, -0.0241, -0.1626, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 3


Tokens embeddings:
tensor([-0.5361,  0.2622,  1.1668, -0.0241, -0.1626, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622,  1.1668, -0.0241, -0.1626, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 4


Tokens embeddings:
tensor([-0.5361,  0.2622,  1.1668, -0.0241, -0.1626, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622,  1.1668, -0.0241, -0.1626, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 5


Tokens embeddings:
tensor([-0.5361,  0.2622,  1.1668, -0.0241, -0.1626, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622,  1.1668, -0.0241, -0.1626, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 6


Tokens embeddings:
tensor([-0.5361,  0.2622,  1.1668, -0.0241, -0.1626, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622,  1.1668, -0.0241, -0.1626, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 7


Tokens embeddings:
tensor([-0.5361,  0.2622,  1.1668, -0.0241, -0.1626, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622,  1.1668, -0.0241, -0.1626, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 8


Tokens embeddings:
tensor([-0.5361,  0.2622,  1.1668, -0.0241, -0.1626, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622,  1.1668, -0.0241, -0.1626, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 9


Tokens embeddings:
tensor([-0.5361,  0.2622,  1.1668, -0.0241, -0.1626, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622,  1.1668, -0.0241, -0.1626, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 10


Tokens embeddings:
tensor([-0.5361,  0.2622,  1.1668, -0.0241, -0.1626, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622,  1.1668, -0.0241, -0.1626, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 11


Tokens embeddings:
tensor([-0.5361,  0.2622,  1.1668, -0.0241, -0.1626, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622,  1.1668, -0.0241, -0.1626, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2

Layer 12


Tokens embeddings:
tensor([-0.5361,  0.2622,  1.1668, -0.0241, -0.1626, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843])
arg max of tensor([-0.5361,  0.2622,  1.1668, -0.0241, -0.1626, -0.5098, -0.3798, -0.2843,
        -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843, -0.2843,
        -0.2843]): 2


#### Visualize attentions for specific example sentences

In [None]:
# Probe sentences fed to the model in the cells that follow.
# NOTE(review): these were previously labelled "Lack of Subject-Verb Agreement"
# and "Pronoun Disagreement", but both sentences are grammatical -- the labels
# look like leftovers from another notebook; confirm the intended categories.
sentence1, sentence2 = (
    "The stars gravitate towards each other.",
    "A hot soup will revive me.",
)

In [None]:
# Encode the first sentence as a batch of one
encoded_sentence1 = bert_tokenizer.batch_encode_plus([sentence1], padding=True)

# Build the input tensors on the same device as the model
inputs = torch.tensor(encoded_sentence1["input_ids"]).to(device)
att = torch.tensor(encoded_sentence1["attention_mask"]).to(device)

# Inference only: run the forward pass without recording an autograd graph,
# so no gradient buffers are kept alive for the logits/attentions.
# NOTE(review): assumes model_e was already put in eval mode earlier -- confirm.
with torch.no_grad():
  outputs = model_e(inputs, attention_mask=att)

In [None]:
# Convert the logits to per-class probabilities, then pick the most likely
# class for each row (there is a single row: the first sentence).
sentence1_logits = outputs.logits.detach().cpu().numpy()
output_probs = softmax(sentence1_logits, axis=1)
predictions = output_probs.argmax(axis=1)
print(sentence1, ":", predictions[0])

The stars gravitate towards each other. : 1


In [None]:
# Visualize the attention heatmaps for the CLS token
# Map the ids of the single encoded sentence back to their token strings
tokens = bert_tokenizer.convert_ids_to_tokens(inputs.detach().cpu().numpy()[0])
# Walk over every layer the model actually returned instead of assuming 12
for l, layer_attention in enumerate(outputs.attentions):
  print("\nLayer", l+1)
  # Drop the batch axis (batch of one) so that iterating yields one matrix per head
  attention = np.squeeze(layer_attention.detach().cpu().numpy(), axis=0)
  for h, head in enumerate(attention):
    print("Head", h+1)
    # Get the attention for the cls token (row 0 of this head's matrix)
    cls_attentions = head[0]
    display(HTML(colorize(tokens, cls_attentions)))


Layer 1
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 2
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 3
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 4
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 5
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 6
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 7
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 8
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 9
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 10
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 11
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 12
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12


In [None]:
# Encode the second sentence as a batch of one
encoded_sentence2 = bert_tokenizer.batch_encode_plus([sentence2], padding=True)

# Build the input tensors on the same device as the model
inputs = torch.tensor(encoded_sentence2["input_ids"]).to(device)
att = torch.tensor(encoded_sentence2["attention_mask"]).to(device)

# Inference only: run the forward pass without recording an autograd graph,
# so no gradient buffers are kept alive for the logits/attentions.
# NOTE(review): assumes model_e was already put in eval mode earlier -- confirm.
with torch.no_grad():
  outputs = model_e(inputs, attention_mask=att)

# Get the predictions: softmax over the class axis, then the arg max per row
output_probs = softmax(outputs.logits.detach().cpu().numpy(), axis=1)
predictions = (np.argmax(output_probs, axis=1))
print(sentence2, ":", predictions[0])

A hot soup will revive me. : 0


In [None]:
# Visualize the attention heatmaps for the CLS token
# Map the ids of the single encoded sentence back to their token strings
tokens = bert_tokenizer.convert_ids_to_tokens(inputs.detach().cpu().numpy()[0])
# Walk over every layer the model actually returned instead of assuming 12
for l, layer_attention in enumerate(outputs.attentions):
  print("\nLayer", l+1)
  # Drop the batch axis (batch of one) so that iterating yields one matrix per head
  attention = np.squeeze(layer_attention.detach().cpu().numpy(), axis=0)
  for h, head in enumerate(attention):
    print("Head", h+1)
    # Get the attention for the cls token (row 0 of this head's matrix)
    cls_attentions = head[0]
    display(HTML(colorize(tokens, cls_attentions)))


Layer 1
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 2
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 3
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 4
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 5
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 6
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 7
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 8
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 9
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 10
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 11
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 12
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12
