In [1]:
# Mount Google Drive at /content/drive/ (Colab only); later cells change into a folder under it
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
!pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/ac/aa/1437691b0c7c83086ebb79ce2da16e00bef024f24fec2a5161c35476f499/sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2MB)
[K     |▎                               | 10kB 16.9MB/s eta 0:00:01[K     |▌                               | 20kB 23.1MB/s eta 0:00:01[K     |▉                               | 30kB 27.2MB/s eta 0:00:01[K     |█                               | 40kB 19.3MB/s eta 0:00:01[K     |█▍                              | 51kB 17.1MB/s eta 0:00:01[K     |█▋                              | 61kB 15.8MB/s eta 0:00:01[K     |██                              | 71kB 14.6MB/s eta 0:00:01[K     |██▏                             | 81kB 15.6MB/s eta 0:00:01[K     |██▍                             | 92kB 14.9MB/s eta 0:00:01[K     |██▊                             | 102kB 14.3MB/s eta 0:00:01[K     |███                             | 112kB 14.3MB/s eta 0:00:01

In [3]:
%%capture
!pip install transformers

In [4]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn

import transformers
from sklearn.metrics import *
from transformers import AdamW
from tqdm.notebook import tqdm
from scipy.special import softmax
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split as tts
from transformers import BertTokenizerFast, BertConfig, BertForSequenceClassification, AutoModel
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [5]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#### Download

In [6]:
cd drive/My Drive/Colab Notebooks/experiments

/content/drive/My Drive/Colab Notebooks/experiments


In [7]:
# Load the dataset from data/moh-x.csv
# NOTE(review): the comments originally here described humor detection
# (Paper: https://arxiv.org/abs/2004.12765; labels 1 --> humorous, 0 --> not humorous),
# but the file is moh-x.csv with verb/sentence columns -- presumably the MOH-X metaphor
# data, where 1 would mean "metaphorical". Confirm and update the reference.
data = pd.read_csv("data/moh-x.csv")
print(f"\nThere are {len(data)} sentences")

# Only the "sentence" and "label" columns are used further down;
# make sure every label is a plain int
#data = data.dropna()
data["label"] = data["label"].map(int)
data.head()


There are 647 sentences


Unnamed: 0,arg1,arg2,verb,sentence,verb_idx,label
0,knowledge,,absorb,He absorbed the knowledge or beliefs of his t...,1,1
1,cost,,absorb,He absorbed the costs for the accident .,1,1
2,tax,,absorb,The sales tax is absorbed into the state inco...,4,1
3,immigrant,,absorb,The immigrants were quickly absorbed into soc...,4,1
4,interest,,absorb,Her interest in butterflies absorbs her compl...,4,1


#### Split to training, validation and test



In [8]:
# Use a subset for quick experiments
#data = data[:10000]

# Hold out 10% of the rows for testing, then take a validation set of the same
# size out of what remains (fixed seed, so the split is reproducible)
sentence_label = data[["sentence", "label"]]
train, test = tts(sentence_label, test_size=0.1, random_state=42)
train, val = tts(train, test_size=len(test), random_state=42)

#### Tokenize and encode with the XLM-RoBERTa tokenizer

In [9]:
from transformers import XLMRobertaForSequenceClassification

# Directory (relative to the working directory) that the tokenizer and model are loaded from
output_dir = 'stockholm/xlm_code/mohx_xlmroberta/xlm-roberta_model_save'

print(output_dir)

stockholm/xlm_code/mohx_xlmroberta/xlm-roberta_model_save


In [10]:
from transformers import XLMRobertaTokenizer
import torch
# Restore the tokenizer from the saved directory.
# (Despite its name, bert_tokenizer is an XLMRobertaTokenizer.)
print('Loading XLMRobertaTokenizer...')
bert_tokenizer = XLMRobertaTokenizer.from_pretrained(output_dir)

# Two-class sequence classifier from the same directory; it is asked to also return
# the attention weights and hidden states of every layer.
model_e = XLMRobertaForSequenceClassification.from_pretrained(
    output_dir,
    num_labels=2,
    output_attentions=True,
    output_hidden_states=True,
)

Loading XLMRobertaTokenizer...


Some weights of the model checkpoint at stockholm/xlm_code/mohx_xlmroberta/xlm-roberta_model_save were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
MAX_LEN = 21 # This value could be set as 256, 512 etc.

# Encode the first training sentence to inspect what the tokenizer produces
encoded_instance = bert_tokenizer.encode_plus(
    train.iloc[0].sentence,
    add_special_tokens=True,
    truncation=True,               # cut inputs longer than MAX_LEN
    max_length=MAX_LEN,
    padding='max_length',          # replaces the deprecated pad_to_max_length=True
    return_attention_mask=True,
    return_tensors='pt',           # return pytorch tensors
)


encoded_instance



{'input_ids': tensor([[    0,    87,   377,   653,    25,    18, 22113,   903, 13765,     6,
             5,     2,     1,     1,     1,     1,     1,     1,     1,     1,
             1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [12]:
# Show the raw sentence next to the pieces its encoded ids map back to
first_sentence = train.iloc[0].sentence
first_pieces = bert_tokenizer.convert_ids_to_tokens(encoded_instance["input_ids"][0])
print("Original text:", first_sentence)
print("BERT BPEs:", first_pieces)

Original text:  I ca n't buy this story .
BERT BPEs: ['<s>', '▁I', '▁ca', '▁n', "'", 't', '▁buy', '▁this', '▁story', '▁', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [13]:
# Longest training sentence, measured in encoded token ids
max_len = max(len(bert_tokenizer.encode(sentence)) for sentence in train.sentence.to_list())
print("The maximum sentence length in training based on BERT BPEs is", max_len)

The maximum sentence length in training based on BERT BPEs is 25


In [14]:
def encode_split(frame):
    """Tokenize every sentence of one split, with padding and truncation at max_len."""
    return bert_tokenizer.batch_encode_plus(
        frame.sentence.tolist(),
        max_length=max_len,
        padding=True,
        truncation=True,
    )


# Tokenize and encode sentences in each set
x_train = encode_split(train)
x_val = encode_split(val)
x_test = encode_split(test)

In [15]:
def as_tensors(encoded, frame):
    """Return (input ids, attention mask, labels) of one split as PyTorch tensors."""
    return (
        torch.tensor(encoded['input_ids']),
        torch.tensor(encoded['attention_mask']),
        torch.tensor(frame.label.tolist()),
    )


# Convert lists to tensors in order to feed them to our PyTorch model
train_seq, train_mask, train_y = as_tensors(x_train, train)
val_seq, val_mask, val_y = as_tensors(x_val, val)
test_seq, test_mask, test_y = as_tensors(x_test, test)

In [16]:
batch_size = 32

# One (dataset, sampler, loader) triple per split.

# Training: batches are drawn in random order
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation: fixed order
val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

# Test: fixed order, one sentence per batch (the inference loop reads element [0] of each batch)
test_data = TensorDataset(test_seq, test_mask, test_y)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, batch_size=1, sampler=test_sampler)

## Inference

#### Move the loaded checkpoint to the device

In [17]:
# Move the model to the selected device (GPU when available, otherwise CPU)
model_e = model_e.to(device)

#### Get predictions for test

In [18]:
# Predict for the test set and save the results
model_e.eval()
test_predictions = []   # predicted label per test sentence
test_targets = []       # gold label per test sentence
test_attentions = []    # per sentence: tuple with one attention tensor per layer
test_inputs = []        # per sentence: list of token strings (padding included)

# The test loader yields one sentence per batch, so index [0] below is that sentence
for batch in test_dataloader:
  batch = [t.to(device) for t in batch]
  sent_id, mask, labels = batch
  # Get gold labels
  test_targets.extend(labels.cpu().numpy())
  # Get input words
  test_inputs.append(bert_tokenizer.convert_ids_to_tokens(sent_id.cpu().numpy()[0]))
  with torch.no_grad():
    # Get predictions
    outputs = model_e(sent_id, attention_mask=mask)
    # Apply softmax to the outputs
    output_probs = softmax(outputs.logits.cpu().numpy(), axis=1)
    # Get the class with the highest probability as the predicted label
    test_predictions.extend(np.argmax(output_probs, axis=1))
    # Get attention weights
    # Attention weights from all layers are returned in a tuple
    # The weights from each layer are in a tensor with shape (batch_size, attention_heads, max_len, max_len)
    # Store them on the CPU: keeping every sentence's attentions on the GPU
    # accumulates device memory for the whole test set.
    test_attentions.append(tuple(layer.cpu() for layer in outputs.attentions))

#### Evaluate

In [19]:
# Score the hard 0/1 predictions against the gold labels.
# NOTE(review): average_precision_score and roc_auc_score are given the argmax labels,
# not class probabilities; these metrics are usually computed from scores, so AUPR/AUC
# here reflect a single operating point -- confirm this is intended.
metric_rows = [
    ("F1:", f1_score(test_targets, test_predictions, average="binary")),
    ("ACC:", accuracy_score(test_targets, test_predictions)),
    ("AUPR:", average_precision_score(test_targets, test_predictions)),
    ("PRECISION:", precision_score(test_targets, test_predictions)),
    ("RECALL:", recall_score(test_targets, test_predictions)),
    ("AUC:", roc_auc_score(test_targets, test_predictions)),
]
for metric_name, metric_value in metric_rows:
    print(metric_name, metric_value)

F1: 0.8732394366197184
ACC: 0.8615384615384616
AUPR: 0.8396214896214896
PRECISION: 0.8857142857142857
RECALL: 0.8611111111111112
AUC: 0.8615900383141761


## Attention analysis


In [None]:
# Get attention heatmaps
import matplotlib
from IPython.core.display import display, HTML
def colorize(words, color_array):
    """Build an HTML string that shows each word on a red-shaded background.

    Args:
        words: iterable of strings (e.g. tokens) to display, in order.
        color_array: one value per word; each is passed to matplotlib's ``Reds``
            colormap to pick the background shade (presumably in [0, 1] -- confirm
            against the caller).

    Returns:
        str: the concatenated ``<span>`` elements, one per (word, value) pair;
        pairs beyond the shorter of the two inputs are dropped by ``zip``.
    """
    import html
    # Import the submodules explicitly instead of relying on a bare
    # `import matplotlib` having loaded matplotlib.cm / matplotlib.colors.
    from matplotlib import cm, colors

    cmap = cm.Reds
    template = '<span class="barcode" style="color: black; background-color: {}">{}</span>'
    spans = []
    for word, color in zip(words, color_array):
        background = colors.rgb2hex(cmap(color)[:3])
        # Escape the text: tokens such as <s>, </s> or <pad> would otherwise be
        # parsed as HTML tags and disappear from (or restyle) the rendering.
        spans.append(template.format(background, '&nbsp;' + html.escape(word) + '&nbsp;'))
    return ''.join(spans)

In [None]:
#max pooling to generate a fixed sized sentence embedding


#Max Pooling - Take the max value over time for every dimension
def max_pooling(model_output, attention_mask):
    """Max-pool the first element of ``model_output`` over dimension 1.

    Args:
        model_output: indexable whose element 0 is the tensor to pool.
            When it is 3-D it is treated as token embeddings
            ``(batch, seq_len, hidden)`` and padded positions are ignored.
        attention_mask: ``(batch, seq_len)`` tensor, 0 at padded positions.
            It is only read, never modified.

    Returns:
        Tensor with dimension 1 reduced by max: ``(batch, hidden)`` for 3-D input,
        ``(batch,)`` for 2-D input.

    NOTE(review): the notebook calls this with the sequence-classification output,
    whose element 0 looks 2-D (the printed result has one value per input), i.e.
    logits rather than token embeddings. To pool real token embeddings pass
    ``(model_output.hidden_states[-1],)`` instead.
    """
    token_embeddings = model_output[0]
    if token_embeddings.dim() < 3:
        # No token axis to mask. The previous in-place attention_mask.resize_()
        # reinterpreted the mask's storage here and masked arbitrary entries.
        return torch.max(token_embeddings, 1)[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size())
    # masked_fill returns a new tensor, so the model output is left untouched;
    # padded positions get a large negative value so they never win the max.
    masked_embeddings = token_embeddings.masked_fill(input_mask_expanded == 0, -1e9)
    max_over_time = torch.max(masked_embeddings, 1)[0]
    return max_over_time

def avg_pooling(model_output, attention_mask):
    """Average the first element of ``model_output`` over dimension 1.

    Args:
        model_output: indexable whose element 0 is the tensor to pool.
            When it is 3-D it is treated as token embeddings
            ``(batch, seq_len, hidden)`` and padded positions are excluded
            from the average.
        attention_mask: ``(batch, seq_len)`` tensor, 0 at padded positions.
            It is only read, never modified.

    Returns:
        Tensor with dimension 1 reduced by mean: ``(batch, hidden)`` for 3-D input,
        ``(batch,)`` for 2-D input.

    Fixes relative to the earlier version: the mask is no longer reshaped in place
    with ``resize_``; padding no longer enters the mean as -1e9; and the stray ``[0]``
    after ``torch.mean`` (which kept only the first batch row) is gone.
    """
    token_embeddings = model_output[0]
    if token_embeddings.dim() < 3:
        # No token axis to mask: plain mean over dimension 1
        return torch.mean(token_embeddings, 1)
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).to(token_embeddings.dtype)
    )
    # Sum only the real tokens and divide by how many there are
    summed = (token_embeddings * input_mask_expanded).sum(1)
    token_counts = input_mask_expanded.sum(1).clamp(min=1e-9)  # avoid 0/0 for all-padding rows
    avg_over_time = summed / token_counts
    return avg_over_time


#Inputs to embed
#NOTE(review): these are the individual words of one sentence, so the tokenizer
#treats every word as its own one-word input -- confirm this is intended
sentences = ['The', 'stars', 'gravitate', 'towards', 'each', 'other.']

#Tokenize the inputs and move the encoded batch to the model's device
encoded_input = bert_tokenizer(
    sentences, padding=True, truncation=True, max_length=128, return_tensors='pt'
).to(device)

#Forward pass without tracking gradients
with torch.no_grad():
    model_output = model_e(**encoded_input)

#Perform pooling. In this case, max pooling
sentence_embeddings = max_pooling(model_output, encoded_input['attention_mask'])
#avg_sentence_embeddings = avg_pooling(model_output, encoded_input['attention_mask'])


print("Sentence embeddings:", sentence_embeddings, sep="\n")

Sentence embeddings:
tensor([0.2211, 0.3184, 0.0814, 0.2717, 0.8786, 1.0564], device='cuda:0')


In [None]:
# numpy implementation of argmax
from numpy import argmax

# Bring the pooled values back to the CPU before handing them to numpy's argmax
sentence_embeddings = sentence_embeddings.cpu()

# Position of the largest pooled value
result = argmax(sentence_embeddings)
print('arg max of %s: %d' % (sentence_embeddings, result))

arg max of tensor([0.2211, 0.3184, 0.0814, 0.2717, 0.8786, 1.0564]): 5


In [None]:
#CLS token of each input represents the sentence embedding


#Inputs to embed
#NOTE(review): these are the individual words of one sentence, so the tokenizer
#treats every word as its own one-word input -- confirm this is intended
sentences = ['The', 'stars', 'gravitate', 'towards', 'each', 'other']


#Tokenize sentences
encoded_input = bert_tokenizer(sentences, padding=True, truncation=True, max_length=128, return_tensors='pt')
encoded_input = encoded_input.to(device)

#Forward pass without tracking gradients
with torch.no_grad():
    model_output = model_e(**encoded_input)
    #model_output = model_output.to(device)

# model_output[0] is the classifier's logits (one row per input), so the earlier
# model_output[0][:,0] picked the class-0 logit, not a token vector.
# The model was loaded with output_hidden_states=True, so the last layer's token
# vectors are in hidden_states[-1]; position 0 is the leading <s> (CLS) token.
sentence_embeddings = model_output.hidden_states[-1][:, 0]

print("Sentence embeddings:")
print(sentence_embeddings)

Sentence embeddings:
tensor([-0.2066,  0.3184, -0.0368,  0.2717,  0.8786, -0.4190], device='cuda:0')


In [None]:
# numpy implementation of argmax
from numpy import argmax

# Bring the values back to the CPU before handing them to numpy's argmax
sentence_embeddings = sentence_embeddings.cpu()

# Position of the largest value
result = argmax(sentence_embeddings)
print('arg max of %s: %d' % (sentence_embeddings, result))

arg max of tensor([-0.2066,  0.3184, -0.0368,  0.2717,  0.8786, -0.4190]): 4


#### What does the CLS token attend to?




In [None]:
# Original Loop

# Test sentences to visualise (a fixed selection: the first three, not a random draw)
sent_index = [0, 1, 2]

for s in sent_index:
  print("*" * 100)
  # Get the sentence's words
  tokens = test_inputs[s]
  # For each layer... (count taken from the stored attentions instead of assuming 12)
  for l in range(len(test_attentions[s])):
    print("\nLayer", l+1)
    # Drop the batch axis of size 1 -> (attention_heads, max_len, max_len)
    attention = np.squeeze(test_attentions[s][l].detach().cpu().numpy(), axis=0)
    # and for each head
    for h, head in enumerate(attention):
      print("Head", h+1)
      # Row 0 belongs to the first token (<s>, the CLS position):
      # its attention weight towards every token of the sentence
      cls_attentions = head[0]
      display(HTML(colorize(tokens, cls_attentions)))
    

****************************************************************************************************

Layer 1
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 2
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 3
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 4
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 5
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 6
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 7
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 8
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 9
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 10
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 11
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 12
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12


****************************************************************************************************

Layer 1
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 2
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 3
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 4
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 5
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 6
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 7
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 8
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 9
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 10
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 11
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 12
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12


****************************************************************************************************

Layer 1
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 2
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 3
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 4
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 5
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 6
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 7
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 8
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 9
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 10
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 11
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 12
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12


In [None]:
# Max Pooling for all tokens in sentences and argmax

# Test sentences to analyse (a fixed selection: the first eleven, not a random draw)
sent_index = [0,1,2,3,4,5,6,7,8,9,10]

for s in sent_index:
  print("*" * 100)
  # Get the sentence's words
  tokens = test_inputs[s]

  # The forward pass and pooling do not depend on the layer, so they run once per
  # sentence (they used to be recomputed and reprinted, unchanged, for all 12 layers).
  # NOTE(review): every token string (including <s>/<pad>) is encoded as its own input
  # here -- confirm that this is the intended granularity.
  encoded_tokens = bert_tokenizer(tokens, truncation=True, padding=True, max_length=128, return_tensors='pt')
  encoded_tokens = encoded_tokens.to(device)
  with torch.no_grad():
    model_output1 = model_e(**encoded_tokens)
    tokens_embeddings = max_pooling(model_output1, encoded_tokens['attention_mask'])
  tokens_embeddings = tokens_embeddings.cpu()

  # For each layer... (count taken from the stored attentions instead of assuming 12)
  for l in range(len(test_attentions[s])):
    print("\nLayer", l+1)
    # Drop the batch axis of size 1 -> (attention_heads, max_len, max_len)
    attention = np.squeeze(test_attentions[s][l].detach().cpu().numpy(), axis=0)
    # Attention of the first (<s>/CLS) position, taken from THIS sentence and layer.
    # The cell used to read `head`, a variable left over from the previous cell's loop,
    # i.e. one fixed head of another sentence.
    # NOTE(review): max over heads is an assumption based on this cell's "Max Pooling"
    # title; use .mean(axis=0) or a single head if that was the intent.
    cls_attentions = attention[:, 0, :].max(axis=0)
    display(HTML(colorize(tokens, cls_attentions)))

  print("Tokens embeddings:")
  print(tokens_embeddings)
  # torch.argmax without a dim returns the index into the flattened tensor
  arg = int(torch.argmax(tokens_embeddings))
  print('arg max of %s: %d' % (tokens_embeddings, arg))

****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 0.0697, -0.4032,  0.5557, -0.0694,  0.5033, -0.3930,  0.2778, -0.0768,
         0.2889, -0.3930,  0.0308, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4032,  0.5557, -0.0694,  0.5033, -0.3930,  0.2778, -0.0768,
         0.2889, -0.3930,  0.0308, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 2


Tokens embeddings:
tensor([ 0.0697, -0.4032,  0.5557, -0.0694,  0.5033, -0.3930,  0.2778, -0.0768,
         0.2889, -0.3930,  0.0308, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4032,  0.5557, -0.0694,  0.5033, -0.3930,  0.2778, -0.0768,
         0.2889, -0.3930,  0.0308, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 3


Tokens embeddings:
tensor([ 0.0697, -0.4032,  0.5557, -0.0694,  0.5033, -0.3930,  0.2778, -0.0768,
         0.2889, -0.3930,  0.0308, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4032,  0.5557, -0.0694,  0.5033, -0.3930,  0.2778, -0.0768,
         0.2889, -0.3930,  0.0308, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 4


Tokens embeddings:
tensor([ 0.0697, -0.4032,  0.5557, -0.0694,  0.5033, -0.3930,  0.2778, -0.0768,
         0.2889, -0.3930,  0.0308, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4032,  0.5557, -0.0694,  0.5033, -0.3930,  0.2778, -0.0768,
         0.2889, -0.3930,  0.0308, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 5


Tokens embeddings:
tensor([ 0.0697, -0.4032,  0.5557, -0.0694,  0.5033, -0.3930,  0.2778, -0.0768,
         0.2889, -0.3930,  0.0308, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4032,  0.5557, -0.0694,  0.5033, -0.3930,  0.2778, -0.0768,
         0.2889, -0.3930,  0.0308, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 6


Tokens embeddings:
tensor([ 0.0697, -0.4032,  0.5557, -0.0694,  0.5033, -0.3930,  0.2778, -0.0768,
         0.2889, -0.3930,  0.0308, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4032,  0.5557, -0.0694,  0.5033, -0.3930,  0.2778, -0.0768,
         0.2889, -0.3930,  0.0308, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 7


Tokens embeddings:
tensor([ 0.0697, -0.4032,  0.5557, -0.0694,  0.5033, -0.3930,  0.2778, -0.0768,
         0.2889, -0.3930,  0.0308, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4032,  0.5557, -0.0694,  0.5033, -0.3930,  0.2778, -0.0768,
         0.2889, -0.3930,  0.0308, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 8


Tokens embeddings:
tensor([ 0.0697, -0.4032,  0.5557, -0.0694,  0.5033, -0.3930,  0.2778, -0.0768,
         0.2889, -0.3930,  0.0308, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4032,  0.5557, -0.0694,  0.5033, -0.3930,  0.2778, -0.0768,
         0.2889, -0.3930,  0.0308, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 9


Tokens embeddings:
tensor([ 0.0697, -0.4032,  0.5557, -0.0694,  0.5033, -0.3930,  0.2778, -0.0768,
         0.2889, -0.3930,  0.0308, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4032,  0.5557, -0.0694,  0.5033, -0.3930,  0.2778, -0.0768,
         0.2889, -0.3930,  0.0308, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 10


Tokens embeddings:
tensor([ 0.0697, -0.4032,  0.5557, -0.0694,  0.5033, -0.3930,  0.2778, -0.0768,
         0.2889, -0.3930,  0.0308, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4032,  0.5557, -0.0694,  0.5033, -0.3930,  0.2778, -0.0768,
         0.2889, -0.3930,  0.0308, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 11


Tokens embeddings:
tensor([ 0.0697, -0.4032,  0.5557, -0.0694,  0.5033, -0.3930,  0.2778, -0.0768,
         0.2889, -0.3930,  0.0308, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4032,  0.5557, -0.0694,  0.5033, -0.3930,  0.2778, -0.0768,
         0.2889, -0.3930,  0.0308, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 12


Tokens embeddings:
tensor([ 0.0697, -0.4032,  0.5557, -0.0694,  0.5033, -0.3930,  0.2778, -0.0768,
         0.2889, -0.3930,  0.0308, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4032,  0.5557, -0.0694,  0.5033, -0.3930,  0.2778, -0.0768,
         0.2889, -0.3930,  0.0308, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  7.7492e-01, -1.5160e-02,  1.7457e-01,
        -2.0252e-01,  2.7171e-01,  8.7857e-01,  4.7195e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  7.7492e-01, -1.5160e-02,  1.7457e-01,
        -2.0252e-01,  2.7171e-01,  8.7857e-01,  4.7195e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 7

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  7.7492e-01, -1.5160e-02,  1.7457e-01,
        -2.0252e-01,  2.7171e-01,  8.7857e-01,  4.7195e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  7.7492e-01, -1.5160e-02,  1.7457e-01,
        -2.0252e-01,  2.7171e-01,  8.7857e-01,  4.7195e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 7

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  7.7492e-01, -1.5160e-02,  1.7457e-01,
        -2.0252e-01,  2.7171e-01,  8.7857e-01,  4.7195e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  7.7492e-01, -1.5160e-02,  1.7457e-01,
        -2.0252e-01,  2.7171e-01,  8.7857e-01,  4.7195e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 7

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  7.7492e-01, -1.5160e-02,  1.7457e-01,
        -2.0252e-01,  2.7171e-01,  8.7857e-01,  4.7195e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  7.7492e-01, -1.5160e-02,  1.7457e-01,
        -2.0252e-01,  2.7171e-01,  8.7857e-01,  4.7195e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 7

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  7.7492e-01, -1.5160e-02,  1.7457e-01,
        -2.0252e-01,  2.7171e-01,  8.7857e-01,  4.7195e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  7.7492e-01, -1.5160e-02,  1.7457e-01,
        -2.0252e-01,  2.7171e-01,  8.7857e-01,  4.7195e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 7

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  7.7492e-01, -1.5160e-02,  1.7457e-01,
        -2.0252e-01,  2.7171e-01,  8.7857e-01,  4.7195e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  7.7492e-01, -1.5160e-02,  1.7457e-01,
        -2.0252e-01,  2.7171e-01,  8.7857e-01,  4.7195e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 7

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  7.7492e-01, -1.5160e-02,  1.7457e-01,
        -2.0252e-01,  2.7171e-01,  8.7857e-01,  4.7195e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  7.7492e-01, -1.5160e-02,  1.7457e-01,
        -2.0252e-01,  2.7171e-01,  8.7857e-01,  4.7195e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 7

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  7.7492e-01, -1.5160e-02,  1.7457e-01,
        -2.0252e-01,  2.7171e-01,  8.7857e-01,  4.7195e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  7.7492e-01, -1.5160e-02,  1.7457e-01,
        -2.0252e-01,  2.7171e-01,  8.7857e-01,  4.7195e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 7

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  7.7492e-01, -1.5160e-02,  1.7457e-01,
        -2.0252e-01,  2.7171e-01,  8.7857e-01,  4.7195e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  7.7492e-01, -1.5160e-02,  1.7457e-01,
        -2.0252e-01,  2.7171e-01,  8.7857e-01,  4.7195e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 7

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  7.7492e-01, -1.5160e-02,  1.7457e-01,
        -2.0252e-01,  2.7171e-01,  8.7857e-01,  4.7195e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  7.7492e-01, -1.5160e-02,  1.7457e-01,
        -2.0252e-01,  2.7171e-01,  8.7857e-01,  4.7195e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 7

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  7.7492e-01, -1.5160e-02,  1.7457e-01,
        -2.0252e-01,  2.7171e-01,  8.7857e-01,  4.7195e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  7.7492e-01, -1.5160e-02,  1.7457e-01,
        -2.0252e-01,  2.7171e-01,  8.7857e-01,  4.7195e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 7

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  7.7492e-01, -1.5160e-02,  1.7457e-01,
        -2.0252e-01,  2.7171e-01,  8.7857e-01,  4.7195e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  7.7492e-01, -1.5160e-02,  1.7457e-01,
        -2.0252e-01,  2.7171e-01,  8.7857e-01,  4.7195e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02,  4.2754e-02,  1.3595e-01,  4.9948e-01,  1.4980e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.2754e-02,  1.3595e-01,  4.9948e-01,  1.4980e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02,  4.2754e-02,  1.3595e-01,  4.9948e-01,  1.4980e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.2754e-02,  1.3595e-01,  4.9948e-01,  1.4980e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02,  4.2754e-02,  1.3595e-01,  4.9948e-01,  1.4980e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.2754e-02,  1.3595e-01,  4.9948e-01,  1.4980e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02,  4.2754e-02,  1.3595e-01,  4.9948e-01,  1.4980e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.2754e-02,  1.3595e-01,  4.9948e-01,  1.4980e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02,  4.2754e-02,  1.3595e-01,  4.9948e-01,  1.4980e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.2754e-02,  1.3595e-01,  4.9948e-01,  1.4980e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02,  4.2754e-02,  1.3595e-01,  4.9948e-01,  1.4980e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.2754e-02,  1.3595e-01,  4.9948e-01,  1.4980e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02,  4.2754e-02,  1.3595e-01,  4.9948e-01,  1.4980e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.2754e-02,  1.3595e-01,  4.9948e-01,  1.4980e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02,  4.2754e-02,  1.3595e-01,  4.9948e-01,  1.4980e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.2754e-02,  1.3595e-01,  4.9948e-01,  1.4980e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02,  4.2754e-02,  1.3595e-01,  4.9948e-01,  1.4980e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.2754e-02,  1.3595e-01,  4.9948e-01,  1.4980e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02,  4.2754e-02,  1.3595e-01,  4.9948e-01,  1.4980e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.2754e-02,  1.3595e-01,  4.9948e-01,  1.4980e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02,  4.2754e-02,  1.3595e-01,  4.9948e-01,  1.4980e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.2754e-02,  1.3595e-01,  4.9948e-01,  1.4980e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02,  4.2754e-02,  1.3595e-01,  4.9948e-01,  1.4980e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.2754e-02,  1.3595e-01,  4.9948e-01,  1.4980e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  5.1466e-01,  2.6677e-01,  8.6337e-02,
         1.2286e-01,  2.8882e-01, -1.2687e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  5.1466e-01,  2.6677e-01,  8.6337e-02,
         1.2286e-01,  2.8882e-01, -1.2687e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  5.1466e-01,  2.6677e-01,  8.6337e-02,
         1.2286e-01,  2.8882e-01, -1.2687e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  5.1466e-01,  2.6677e-01,  8.6337e-02,
         1.2286e-01,  2.8882e-01, -1.2687e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  5.1466e-01,  2.6677e-01,  8.6337e-02,
         1.2286e-01,  2.8882e-01, -1.2687e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  5.1466e-01,  2.6677e-01,  8.6337e-02,
         1.2286e-01,  2.8882e-01, -1.2687e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  5.1466e-01,  2.6677e-01,  8.6337e-02,
         1.2286e-01,  2.8882e-01, -1.2687e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  5.1466e-01,  2.6677e-01,  8.6337e-02,
         1.2286e-01,  2.8882e-01, -1.2687e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  5.1466e-01,  2.6677e-01,  8.6337e-02,
         1.2286e-01,  2.8882e-01, -1.2687e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  5.1466e-01,  2.6677e-01,  8.6337e-02,
         1.2286e-01,  2.8882e-01, -1.2687e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  5.1466e-01,  2.6677e-01,  8.6337e-02,
         1.2286e-01,  2.8882e-01, -1.2687e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  5.1466e-01,  2.6677e-01,  8.6337e-02,
         1.2286e-01,  2.8882e-01, -1.2687e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  5.1466e-01,  2.6677e-01,  8.6337e-02,
         1.2286e-01,  2.8882e-01, -1.2687e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  5.1466e-01,  2.6677e-01,  8.6337e-02,
         1.2286e-01,  2.8882e-01, -1.2687e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  5.1466e-01,  2.6677e-01,  8.6337e-02,
         1.2286e-01,  2.8882e-01, -1.2687e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  5.1466e-01,  2.6677e-01,  8.6337e-02,
         1.2286e-01,  2.8882e-01, -1.2687e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  5.1466e-01,  2.6677e-01,  8.6337e-02,
         1.2286e-01,  2.8882e-01, -1.2687e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  5.1466e-01,  2.6677e-01,  8.6337e-02,
         1.2286e-01,  2.8882e-01, -1.2687e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  5.1466e-01,  2.6677e-01,  8.6337e-02,
         1.2286e-01,  2.8882e-01, -1.2687e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  5.1466e-01,  2.6677e-01,  8.6337e-02,
         1.2286e-01,  2.8882e-01, -1.2687e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  5.1466e-01,  2.6677e-01,  8.6337e-02,
         1.2286e-01,  2.8882e-01, -1.2687e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  5.1466e-01,  2.6677e-01,  8.6337e-02,
         1.2286e-01,  2.8882e-01, -1.2687e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  5.1466e-01,  2.6677e-01,  8.6337e-02,
         1.2286e-01,  2.8882e-01, -1.2687e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  5.1466e-01,  2.6677e-01,  8.6337e-02,
         1.2286e-01,  2.8882e-01, -1.2687e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 0.0697,  0.0745,  0.1536,  0.8910,  0.1622, -0.0010,  0.0887,  0.3219,
         0.1270, -0.2035,  0.3219, -0.0124,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0745,  0.1536,  0.8910,  0.1622, -0.0010,  0.0887,  0.3219,
         0.1270, -0.2035,  0.3219, -0.0124,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 2


Tokens embeddings:
tensor([ 0.0697,  0.0745,  0.1536,  0.8910,  0.1622, -0.0010,  0.0887,  0.3219,
         0.1270, -0.2035,  0.3219, -0.0124,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0745,  0.1536,  0.8910,  0.1622, -0.0010,  0.0887,  0.3219,
         0.1270, -0.2035,  0.3219, -0.0124,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 3


Tokens embeddings:
tensor([ 0.0697,  0.0745,  0.1536,  0.8910,  0.1622, -0.0010,  0.0887,  0.3219,
         0.1270, -0.2035,  0.3219, -0.0124,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0745,  0.1536,  0.8910,  0.1622, -0.0010,  0.0887,  0.3219,
         0.1270, -0.2035,  0.3219, -0.0124,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 4


Tokens embeddings:
tensor([ 0.0697,  0.0745,  0.1536,  0.8910,  0.1622, -0.0010,  0.0887,  0.3219,
         0.1270, -0.2035,  0.3219, -0.0124,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0745,  0.1536,  0.8910,  0.1622, -0.0010,  0.0887,  0.3219,
         0.1270, -0.2035,  0.3219, -0.0124,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 5


Tokens embeddings:
tensor([ 0.0697,  0.0745,  0.1536,  0.8910,  0.1622, -0.0010,  0.0887,  0.3219,
         0.1270, -0.2035,  0.3219, -0.0124,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0745,  0.1536,  0.8910,  0.1622, -0.0010,  0.0887,  0.3219,
         0.1270, -0.2035,  0.3219, -0.0124,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 6


Tokens embeddings:
tensor([ 0.0697,  0.0745,  0.1536,  0.8910,  0.1622, -0.0010,  0.0887,  0.3219,
         0.1270, -0.2035,  0.3219, -0.0124,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0745,  0.1536,  0.8910,  0.1622, -0.0010,  0.0887,  0.3219,
         0.1270, -0.2035,  0.3219, -0.0124,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 7


Tokens embeddings:
tensor([ 0.0697,  0.0745,  0.1536,  0.8910,  0.1622, -0.0010,  0.0887,  0.3219,
         0.1270, -0.2035,  0.3219, -0.0124,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0745,  0.1536,  0.8910,  0.1622, -0.0010,  0.0887,  0.3219,
         0.1270, -0.2035,  0.3219, -0.0124,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 8


Tokens embeddings:
tensor([ 0.0697,  0.0745,  0.1536,  0.8910,  0.1622, -0.0010,  0.0887,  0.3219,
         0.1270, -0.2035,  0.3219, -0.0124,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0745,  0.1536,  0.8910,  0.1622, -0.0010,  0.0887,  0.3219,
         0.1270, -0.2035,  0.3219, -0.0124,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 9


Tokens embeddings:
tensor([ 0.0697,  0.0745,  0.1536,  0.8910,  0.1622, -0.0010,  0.0887,  0.3219,
         0.1270, -0.2035,  0.3219, -0.0124,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0745,  0.1536,  0.8910,  0.1622, -0.0010,  0.0887,  0.3219,
         0.1270, -0.2035,  0.3219, -0.0124,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 10


Tokens embeddings:
tensor([ 0.0697,  0.0745,  0.1536,  0.8910,  0.1622, -0.0010,  0.0887,  0.3219,
         0.1270, -0.2035,  0.3219, -0.0124,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0745,  0.1536,  0.8910,  0.1622, -0.0010,  0.0887,  0.3219,
         0.1270, -0.2035,  0.3219, -0.0124,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 11


Tokens embeddings:
tensor([ 0.0697,  0.0745,  0.1536,  0.8910,  0.1622, -0.0010,  0.0887,  0.3219,
         0.1270, -0.2035,  0.3219, -0.0124,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0745,  0.1536,  0.8910,  0.1622, -0.0010,  0.0887,  0.3219,
         0.1270, -0.2035,  0.3219, -0.0124,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 12


Tokens embeddings:
tensor([ 0.0697,  0.0745,  0.1536,  0.8910,  0.1622, -0.0010,  0.0887,  0.3219,
         0.1270, -0.2035,  0.3219, -0.0124,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0745,  0.1536,  0.8910,  0.1622, -0.0010,  0.0887,  0.3219,
         0.1270, -0.2035,  0.3219, -0.0124,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.4568, -0.2025,  0.3219,  0.6279,  0.2150,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797,  0.3219,  0.3316,  0.2888, -0.3930,
         0.6885, -0.1123,  0.0688, -0.0010,  0.0701])
arg max of tensor([ 0.0697,  0.2021,  0.4568, -0.2025,  0.3219,  0.6279,  0.2150,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797,  0.3219,  0.3316,  0.2888, -0.3930,
         0.6885, -0.1123,  0.0688, -0.0010,  0.0701]): 7

Layer 2


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.4568, -0.2025,  0.3219,  0.6279,  0.2150,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797,  0.3219,  0.3316,  0.2888, -0.3930,
         0.6885, -0.1123,  0.0688, -0.0010,  0.0701])
arg max of tensor([ 0.0697,  0.2021,  0.4568, -0.2025,  0.3219,  0.6279,  0.2150,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797,  0.3219,  0.3316,  0.2888, -0.3930,
         0.6885, -0.1123,  0.0688, -0.0010,  0.0701]): 7

Layer 3


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.4568, -0.2025,  0.3219,  0.6279,  0.2150,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797,  0.3219,  0.3316,  0.2888, -0.3930,
         0.6885, -0.1123,  0.0688, -0.0010,  0.0701])
arg max of tensor([ 0.0697,  0.2021,  0.4568, -0.2025,  0.3219,  0.6279,  0.2150,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797,  0.3219,  0.3316,  0.2888, -0.3930,
         0.6885, -0.1123,  0.0688, -0.0010,  0.0701]): 7

Layer 4


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.4568, -0.2025,  0.3219,  0.6279,  0.2150,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797,  0.3219,  0.3316,  0.2888, -0.3930,
         0.6885, -0.1123,  0.0688, -0.0010,  0.0701])
arg max of tensor([ 0.0697,  0.2021,  0.4568, -0.2025,  0.3219,  0.6279,  0.2150,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797,  0.3219,  0.3316,  0.2888, -0.3930,
         0.6885, -0.1123,  0.0688, -0.0010,  0.0701]): 7

Layer 5


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.4568, -0.2025,  0.3219,  0.6279,  0.2150,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797,  0.3219,  0.3316,  0.2888, -0.3930,
         0.6885, -0.1123,  0.0688, -0.0010,  0.0701])
arg max of tensor([ 0.0697,  0.2021,  0.4568, -0.2025,  0.3219,  0.6279,  0.2150,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797,  0.3219,  0.3316,  0.2888, -0.3930,
         0.6885, -0.1123,  0.0688, -0.0010,  0.0701]): 7

Layer 6


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.4568, -0.2025,  0.3219,  0.6279,  0.2150,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797,  0.3219,  0.3316,  0.2888, -0.3930,
         0.6885, -0.1123,  0.0688, -0.0010,  0.0701])
arg max of tensor([ 0.0697,  0.2021,  0.4568, -0.2025,  0.3219,  0.6279,  0.2150,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797,  0.3219,  0.3316,  0.2888, -0.3930,
         0.6885, -0.1123,  0.0688, -0.0010,  0.0701]): 7

Layer 7


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.4568, -0.2025,  0.3219,  0.6279,  0.2150,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797,  0.3219,  0.3316,  0.2888, -0.3930,
         0.6885, -0.1123,  0.0688, -0.0010,  0.0701])
arg max of tensor([ 0.0697,  0.2021,  0.4568, -0.2025,  0.3219,  0.6279,  0.2150,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797,  0.3219,  0.3316,  0.2888, -0.3930,
         0.6885, -0.1123,  0.0688, -0.0010,  0.0701]): 7

Layer 8


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.4568, -0.2025,  0.3219,  0.6279,  0.2150,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797,  0.3219,  0.3316,  0.2888, -0.3930,
         0.6885, -0.1123,  0.0688, -0.0010,  0.0701])
arg max of tensor([ 0.0697,  0.2021,  0.4568, -0.2025,  0.3219,  0.6279,  0.2150,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797,  0.3219,  0.3316,  0.2888, -0.3930,
         0.6885, -0.1123,  0.0688, -0.0010,  0.0701]): 7

Layer 9


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.4568, -0.2025,  0.3219,  0.6279,  0.2150,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797,  0.3219,  0.3316,  0.2888, -0.3930,
         0.6885, -0.1123,  0.0688, -0.0010,  0.0701])
arg max of tensor([ 0.0697,  0.2021,  0.4568, -0.2025,  0.3219,  0.6279,  0.2150,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797,  0.3219,  0.3316,  0.2888, -0.3930,
         0.6885, -0.1123,  0.0688, -0.0010,  0.0701]): 7

Layer 10


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.4568, -0.2025,  0.3219,  0.6279,  0.2150,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797,  0.3219,  0.3316,  0.2888, -0.3930,
         0.6885, -0.1123,  0.0688, -0.0010,  0.0701])
arg max of tensor([ 0.0697,  0.2021,  0.4568, -0.2025,  0.3219,  0.6279,  0.2150,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797,  0.3219,  0.3316,  0.2888, -0.3930,
         0.6885, -0.1123,  0.0688, -0.0010,  0.0701]): 7

Layer 11


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.4568, -0.2025,  0.3219,  0.6279,  0.2150,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797,  0.3219,  0.3316,  0.2888, -0.3930,
         0.6885, -0.1123,  0.0688, -0.0010,  0.0701])
arg max of tensor([ 0.0697,  0.2021,  0.4568, -0.2025,  0.3219,  0.6279,  0.2150,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797,  0.3219,  0.3316,  0.2888, -0.3930,
         0.6885, -0.1123,  0.0688, -0.0010,  0.0701]): 7

Layer 12


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.4568, -0.2025,  0.3219,  0.6279,  0.2150,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797,  0.3219,  0.3316,  0.2888, -0.3930,
         0.6885, -0.1123,  0.0688, -0.0010,  0.0701])
arg max of tensor([ 0.0697,  0.2021,  0.4568, -0.2025,  0.3219,  0.6279,  0.2150,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797,  0.3219,  0.3316,  0.2888, -0.3930,
         0.6885, -0.1123,  0.0688, -0.0010,  0.0701]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.0957e-01, -1.5160e-02,  7.3250e-01,
         9.9242e-02,  5.2017e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.0957e-01, -1.5160e-02,  7.3250e-01,
         9.9242e-02,  5.2017e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.0957e-01, -1.5160e-02,  7.3250e-01,
         9.9242e-02,  5.2017e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.0957e-01, -1.5160e-02,  7.3250e-01,
         9.9242e-02,  5.2017e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.0957e-01, -1.5160e-02,  7.3250e-01,
         9.9242e-02,  5.2017e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.0957e-01, -1.5160e-02,  7.3250e-01,
         9.9242e-02,  5.2017e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.0957e-01, -1.5160e-02,  7.3250e-01,
         9.9242e-02,  5.2017e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.0957e-01, -1.5160e-02,  7.3250e-01,
         9.9242e-02,  5.2017e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.0957e-01, -1.5160e-02,  7.3250e-01,
         9.9242e-02,  5.2017e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.0957e-01, -1.5160e-02,  7.3250e-01,
         9.9242e-02,  5.2017e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.0957e-01, -1.5160e-02,  7.3250e-01,
         9.9242e-02,  5.2017e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.0957e-01, -1.5160e-02,  7.3250e-01,
         9.9242e-02,  5.2017e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.0957e-01, -1.5160e-02,  7.3250e-01,
         9.9242e-02,  5.2017e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.0957e-01, -1.5160e-02,  7.3250e-01,
         9.9242e-02,  5.2017e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.0957e-01, -1.5160e-02,  7.3250e-01,
         9.9242e-02,  5.2017e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.0957e-01, -1.5160e-02,  7.3250e-01,
         9.9242e-02,  5.2017e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.0957e-01, -1.5160e-02,  7.3250e-01,
         9.9242e-02,  5.2017e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.0957e-01, -1.5160e-02,  7.3250e-01,
         9.9242e-02,  5.2017e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.0957e-01, -1.5160e-02,  7.3250e-01,
         9.9242e-02,  5.2017e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.0957e-01, -1.5160e-02,  7.3250e-01,
         9.9242e-02,  5.2017e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.0957e-01, -1.5160e-02,  7.3250e-01,
         9.9242e-02,  5.2017e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.0957e-01, -1.5160e-02,  7.3250e-01,
         9.9242e-02,  5.2017e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.0957e-01, -1.5160e-02,  7.3250e-01,
         9.9242e-02,  5.2017e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.0957e-01, -1.5160e-02,  7.3250e-01,
         9.9242e-02,  5.2017e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  1.5445e-01, -6.3929e-01,  6.0254e-01,
        -1.6008e-01,  9.8355e-01, -4.8771e-01,  7.7242e-01,  5.2017e-01,
         9.5357e-01,  1.6374e-01,  3.6629e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  1.5445e-01, -6.3929e-01,  6.0254e-01,
        -1.6008e-01,  9.8355e-01, -4.8771e-01,  7.7242e-01,  5.2017e-01,
         9.5357e-01,  1.6374e-01,  3.6629e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  1.5445e-01, -6.3929e-01,  6.0254e-01,
        -1.6008e-01,  9.8355e-01, -4.8771e-01,  7.7242e-01,  5.2017e-01,
         9.5357e-01,  1.6374e-01,  3.6629e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  1.5445e-01, -6.3929e-01,  6.0254e-01,
        -1.6008e-01,  9.8355e-01, -4.8771e-01,  7.7242e-01,  5.2017e-01,
         9.5357e-01,  1.6374e-01,  3.6629e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  1.5445e-01, -6.3929e-01,  6.0254e-01,
        -1.6008e-01,  9.8355e-01, -4.8771e-01,  7.7242e-01,  5.2017e-01,
         9.5357e-01,  1.6374e-01,  3.6629e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  1.5445e-01, -6.3929e-01,  6.0254e-01,
        -1.6008e-01,  9.8355e-01, -4.8771e-01,  7.7242e-01,  5.2017e-01,
         9.5357e-01,  1.6374e-01,  3.6629e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  1.5445e-01, -6.3929e-01,  6.0254e-01,
        -1.6008e-01,  9.8355e-01, -4.8771e-01,  7.7242e-01,  5.2017e-01,
         9.5357e-01,  1.6374e-01,  3.6629e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  1.5445e-01, -6.3929e-01,  6.0254e-01,
        -1.6008e-01,  9.8355e-01, -4.8771e-01,  7.7242e-01,  5.2017e-01,
         9.5357e-01,  1.6374e-01,  3.6629e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  1.5445e-01, -6.3929e-01,  6.0254e-01,
        -1.6008e-01,  9.8355e-01, -4.8771e-01,  7.7242e-01,  5.2017e-01,
         9.5357e-01,  1.6374e-01,  3.6629e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  1.5445e-01, -6.3929e-01,  6.0254e-01,
        -1.6008e-01,  9.8355e-01, -4.8771e-01,  7.7242e-01,  5.2017e-01,
         9.5357e-01,  1.6374e-01,  3.6629e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  1.5445e-01, -6.3929e-01,  6.0254e-01,
        -1.6008e-01,  9.8355e-01, -4.8771e-01,  7.7242e-01,  5.2017e-01,
         9.5357e-01,  1.6374e-01,  3.6629e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  1.5445e-01, -6.3929e-01,  6.0254e-01,
        -1.6008e-01,  9.8355e-01, -4.8771e-01,  7.7242e-01,  5.2017e-01,
         9.5357e-01,  1.6374e-01,  3.6629e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  1.5445e-01, -6.3929e-01,  6.0254e-01,
        -1.6008e-01,  9.8355e-01, -4.8771e-01,  7.7242e-01,  5.2017e-01,
         9.5357e-01,  1.6374e-01,  3.6629e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  1.5445e-01, -6.3929e-01,  6.0254e-01,
        -1.6008e-01,  9.8355e-01, -4.8771e-01,  7.7242e-01,  5.2017e-01,
         9.5357e-01,  1.6374e-01,  3.6629e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  1.5445e-01, -6.3929e-01,  6.0254e-01,
        -1.6008e-01,  9.8355e-01, -4.8771e-01,  7.7242e-01,  5.2017e-01,
         9.5357e-01,  1.6374e-01,  3.6629e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  1.5445e-01, -6.3929e-01,  6.0254e-01,
        -1.6008e-01,  9.8355e-01, -4.8771e-01,  7.7242e-01,  5.2017e-01,
         9.5357e-01,  1.6374e-01,  3.6629e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  1.5445e-01, -6.3929e-01,  6.0254e-01,
        -1.6008e-01,  9.8355e-01, -4.8771e-01,  7.7242e-01,  5.2017e-01,
         9.5357e-01,  1.6374e-01,  3.6629e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  1.5445e-01, -6.3929e-01,  6.0254e-01,
        -1.6008e-01,  9.8355e-01, -4.8771e-01,  7.7242e-01,  5.2017e-01,
         9.5357e-01,  1.6374e-01,  3.6629e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  1.5445e-01, -6.3929e-01,  6.0254e-01,
        -1.6008e-01,  9.8355e-01, -4.8771e-01,  7.7242e-01,  5.2017e-01,
         9.5357e-01,  1.6374e-01,  3.6629e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  1.5445e-01, -6.3929e-01,  6.0254e-01,
        -1.6008e-01,  9.8355e-01, -4.8771e-01,  7.7242e-01,  5.2017e-01,
         9.5357e-01,  1.6374e-01,  3.6629e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  1.5445e-01, -6.3929e-01,  6.0254e-01,
        -1.6008e-01,  9.8355e-01, -4.8771e-01,  7.7242e-01,  5.2017e-01,
         9.5357e-01,  1.6374e-01,  3.6629e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  1.5445e-01, -6.3929e-01,  6.0254e-01,
        -1.6008e-01,  9.8355e-01, -4.8771e-01,  7.7242e-01,  5.2017e-01,
         9.5357e-01,  1.6374e-01,  3.6629e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  1.5445e-01, -6.3929e-01,  6.0254e-01,
        -1.6008e-01,  9.8355e-01, -4.8771e-01,  7.7242e-01,  5.2017e-01,
         9.5357e-01,  1.6374e-01,  3.6629e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  1.5445e-01, -6.3929e-01,  6.0254e-01,
        -1.6008e-01,  9.8355e-01, -4.8771e-01,  7.7242e-01,  5.2017e-01,
         9.5357e-01,  1.6374e-01,  3.6629e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  3.8789e-02,  2.5350e-01,  1.8744e-01,
        -1.4357e-01,  2.2172e-01,  4.8292e-01,  1.5211e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  3.8789e-02,  2.5350e-01,  1.8744e-01,
        -1.4357e-01,  2.2172e-01,  4.8292e-01,  1.5211e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 7

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  3.8789e-02,  2.5350e-01,  1.8744e-01,
        -1.4357e-01,  2.2172e-01,  4.8292e-01,  1.5211e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  3.8789e-02,  2.5350e-01,  1.8744e-01,
        -1.4357e-01,  2.2172e-01,  4.8292e-01,  1.5211e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 7

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  3.8789e-02,  2.5350e-01,  1.8744e-01,
        -1.4357e-01,  2.2172e-01,  4.8292e-01,  1.5211e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  3.8789e-02,  2.5350e-01,  1.8744e-01,
        -1.4357e-01,  2.2172e-01,  4.8292e-01,  1.5211e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 7

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  3.8789e-02,  2.5350e-01,  1.8744e-01,
        -1.4357e-01,  2.2172e-01,  4.8292e-01,  1.5211e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  3.8789e-02,  2.5350e-01,  1.8744e-01,
        -1.4357e-01,  2.2172e-01,  4.8292e-01,  1.5211e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 7

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  3.8789e-02,  2.5350e-01,  1.8744e-01,
        -1.4357e-01,  2.2172e-01,  4.8292e-01,  1.5211e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  3.8789e-02,  2.5350e-01,  1.8744e-01,
        -1.4357e-01,  2.2172e-01,  4.8292e-01,  1.5211e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 7

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  3.8789e-02,  2.5350e-01,  1.8744e-01,
        -1.4357e-01,  2.2172e-01,  4.8292e-01,  1.5211e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  3.8789e-02,  2.5350e-01,  1.8744e-01,
        -1.4357e-01,  2.2172e-01,  4.8292e-01,  1.5211e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 7

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  3.8789e-02,  2.5350e-01,  1.8744e-01,
        -1.4357e-01,  2.2172e-01,  4.8292e-01,  1.5211e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  3.8789e-02,  2.5350e-01,  1.8744e-01,
        -1.4357e-01,  2.2172e-01,  4.8292e-01,  1.5211e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 7

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  3.8789e-02,  2.5350e-01,  1.8744e-01,
        -1.4357e-01,  2.2172e-01,  4.8292e-01,  1.5211e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  3.8789e-02,  2.5350e-01,  1.8744e-01,
        -1.4357e-01,  2.2172e-01,  4.8292e-01,  1.5211e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 7

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  3.8789e-02,  2.5350e-01,  1.8744e-01,
        -1.4357e-01,  2.2172e-01,  4.8292e-01,  1.5211e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  3.8789e-02,  2.5350e-01,  1.8744e-01,
        -1.4357e-01,  2.2172e-01,  4.8292e-01,  1.5211e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 7

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  3.8789e-02,  2.5350e-01,  1.8744e-01,
        -1.4357e-01,  2.2172e-01,  4.8292e-01,  1.5211e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  3.8789e-02,  2.5350e-01,  1.8744e-01,
        -1.4357e-01,  2.2172e-01,  4.8292e-01,  1.5211e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 7

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  3.8789e-02,  2.5350e-01,  1.8744e-01,
        -1.4357e-01,  2.2172e-01,  4.8292e-01,  1.5211e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  3.8789e-02,  2.5350e-01,  1.8744e-01,
        -1.4357e-01,  2.2172e-01,  4.8292e-01,  1.5211e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 7

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  3.8789e-02,  2.5350e-01,  1.8744e-01,
        -1.4357e-01,  2.2172e-01,  4.8292e-01,  1.5211e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  3.8789e-02,  2.5350e-01,  1.8744e-01,
        -1.4357e-01,  2.2172e-01,  4.8292e-01,  1.5211e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 0.0697, -0.3930,  0.1637, -0.3290,  0.0887,  0.0604,  0.3219,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.3930,  0.1637, -0.3290,  0.0887,  0.0604,  0.3219,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 7

Layer 2


Tokens embeddings:
tensor([ 0.0697, -0.3930,  0.1637, -0.3290,  0.0887,  0.0604,  0.3219,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.3930,  0.1637, -0.3290,  0.0887,  0.0604,  0.3219,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 7

Layer 3


Tokens embeddings:
tensor([ 0.0697, -0.3930,  0.1637, -0.3290,  0.0887,  0.0604,  0.3219,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.3930,  0.1637, -0.3290,  0.0887,  0.0604,  0.3219,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 7

Layer 4


Tokens embeddings:
tensor([ 0.0697, -0.3930,  0.1637, -0.3290,  0.0887,  0.0604,  0.3219,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.3930,  0.1637, -0.3290,  0.0887,  0.0604,  0.3219,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 7

Layer 5


Tokens embeddings:
tensor([ 0.0697, -0.3930,  0.1637, -0.3290,  0.0887,  0.0604,  0.3219,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.3930,  0.1637, -0.3290,  0.0887,  0.0604,  0.3219,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 7

Layer 6


Tokens embeddings:
tensor([ 0.0697, -0.3930,  0.1637, -0.3290,  0.0887,  0.0604,  0.3219,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.3930,  0.1637, -0.3290,  0.0887,  0.0604,  0.3219,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 7

Layer 7


Tokens embeddings:
tensor([ 0.0697, -0.3930,  0.1637, -0.3290,  0.0887,  0.0604,  0.3219,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.3930,  0.1637, -0.3290,  0.0887,  0.0604,  0.3219,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 7

Layer 8


Tokens embeddings:
tensor([ 0.0697, -0.3930,  0.1637, -0.3290,  0.0887,  0.0604,  0.3219,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.3930,  0.1637, -0.3290,  0.0887,  0.0604,  0.3219,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 7

Layer 9


Tokens embeddings:
tensor([ 0.0697, -0.3930,  0.1637, -0.3290,  0.0887,  0.0604,  0.3219,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.3930,  0.1637, -0.3290,  0.0887,  0.0604,  0.3219,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 7

Layer 10


Tokens embeddings:
tensor([ 0.0697, -0.3930,  0.1637, -0.3290,  0.0887,  0.0604,  0.3219,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.3930,  0.1637, -0.3290,  0.0887,  0.0604,  0.3219,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 7

Layer 11


Tokens embeddings:
tensor([ 0.0697, -0.3930,  0.1637, -0.3290,  0.0887,  0.0604,  0.3219,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.3930,  0.1637, -0.3290,  0.0887,  0.0604,  0.3219,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 7

Layer 12


Tokens embeddings:
tensor([ 0.0697, -0.3930,  0.1637, -0.3290,  0.0887,  0.0604,  0.3219,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.3930,  0.1637, -0.3290,  0.0887,  0.0604,  0.3219,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 8

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 8

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 8

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 8

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 8

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 8

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 8

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 8

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 8

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 8

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 8

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 8


In [None]:
# Inspect a fixed sample of test sentences (indices 11..20). The selection is
# deterministic, not random, so the cell reproduces on every run.
sent_index = list(range(11, 21))
# Number of per-layer attention entries read from test_attentions[s].
NUM_LAYERS = 12

for s in sent_index:
  print("*" * 100)
  # Get the sentence's words
  tokens = test_inputs[s]

  # Encoding, the forward pass and the pooling depend only on the sentence,
  # not on the layer, so they run once per sentence. Previously they ran
  # inside the layer loop and printed the same tensor NUM_LAYERS times.
  encoded_tokens = bert_tokenizer(tokens, truncation=True, padding=True, max_length=128, return_tensors='pt')
  encoded_tokens = encoded_tokens.to(device)
  with torch.no_grad():
    model_output1 = model_e(**encoded_tokens)
    tokens_embeddings = max_pooling(model_output1, encoded_tokens['attention_mask'])
  tokens_embeddings = tokens_embeddings.cpu()
  print("Tokens embeddings:")
  print(tokens_embeddings)
  arg = argmax(tokens_embeddings)
  print('arg max of %s: %d' % (tokens_embeddings, arg))

  # For each layer, show the attention paid by the first ([CLS]) position.
  for layer in range(NUM_LAYERS):
    print("\nLayer", layer + 1)
    # Drop the leading size-1 axis of this layer's stored attention.
    attention = np.squeeze(test_attentions[s][layer].detach().cpu().numpy(), axis=0)
    # The original read `head[0]`, but `head` is not defined in this cell, so
    # every layer displayed the same stale row left over from an earlier cell.
    # Use this layer's own attention instead.
    # NOTE(review): assumes `attention` is (heads, seq, seq) after the squeeze
    # and that averaging over heads is the intended summary -- confirm, or
    # restore an explicit per-head loop.
    cls_attentions = attention.mean(axis=0)[0]
    display(HTML(colorize(tokens, cls_attentions)))

****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.4695e-01,  6.0369e-02,  1.5846e-01,
         1.2314e-01,  5.7259e-01,  6.0280e-01,  1.1332e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.4695e-01,  6.0369e-02,  1.5846e-01,
         1.2314e-01,  5.7259e-01,  6.0280e-01,  1.1332e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 2

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.4695e-01,  6.0369e-02,  1.5846e-01,
         1.2314e-01,  5.7259e-01,  6.0280e-01,  1.1332e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.4695e-01,  6.0369e-02,  1.5846e-01,
         1.2314e-01,  5.7259e-01,  6.0280e-01,  1.1332e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 2

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.4695e-01,  6.0369e-02,  1.5846e-01,
         1.2314e-01,  5.7259e-01,  6.0280e-01,  1.1332e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.4695e-01,  6.0369e-02,  1.5846e-01,
         1.2314e-01,  5.7259e-01,  6.0280e-01,  1.1332e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 2

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.4695e-01,  6.0369e-02,  1.5846e-01,
         1.2314e-01,  5.7259e-01,  6.0280e-01,  1.1332e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.4695e-01,  6.0369e-02,  1.5846e-01,
         1.2314e-01,  5.7259e-01,  6.0280e-01,  1.1332e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 2

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.4695e-01,  6.0369e-02,  1.5846e-01,
         1.2314e-01,  5.7259e-01,  6.0280e-01,  1.1332e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.4695e-01,  6.0369e-02,  1.5846e-01,
         1.2314e-01,  5.7259e-01,  6.0280e-01,  1.1332e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 2

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.4695e-01,  6.0369e-02,  1.5846e-01,
         1.2314e-01,  5.7259e-01,  6.0280e-01,  1.1332e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.4695e-01,  6.0369e-02,  1.5846e-01,
         1.2314e-01,  5.7259e-01,  6.0280e-01,  1.1332e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 2

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.4695e-01,  6.0369e-02,  1.5846e-01,
         1.2314e-01,  5.7259e-01,  6.0280e-01,  1.1332e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.4695e-01,  6.0369e-02,  1.5846e-01,
         1.2314e-01,  5.7259e-01,  6.0280e-01,  1.1332e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 2

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.4695e-01,  6.0369e-02,  1.5846e-01,
         1.2314e-01,  5.7259e-01,  6.0280e-01,  1.1332e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.4695e-01,  6.0369e-02,  1.5846e-01,
         1.2314e-01,  5.7259e-01,  6.0280e-01,  1.1332e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 2

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.4695e-01,  6.0369e-02,  1.5846e-01,
         1.2314e-01,  5.7259e-01,  6.0280e-01,  1.1332e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.4695e-01,  6.0369e-02,  1.5846e-01,
         1.2314e-01,  5.7259e-01,  6.0280e-01,  1.1332e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 2

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.4695e-01,  6.0369e-02,  1.5846e-01,
         1.2314e-01,  5.7259e-01,  6.0280e-01,  1.1332e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.4695e-01,  6.0369e-02,  1.5846e-01,
         1.2314e-01,  5.7259e-01,  6.0280e-01,  1.1332e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 2

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.4695e-01,  6.0369e-02,  1.5846e-01,
         1.2314e-01,  5.7259e-01,  6.0280e-01,  1.1332e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.4695e-01,  6.0369e-02,  1.5846e-01,
         1.2314e-01,  5.7259e-01,  6.0280e-01,  1.1332e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 2

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.4695e-01,  6.0369e-02,  1.5846e-01,
         1.2314e-01,  5.7259e-01,  6.0280e-01,  1.1332e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.4695e-01,  6.0369e-02,  1.5846e-01,
         1.2314e-01,  5.7259e-01,  6.0280e-01,  1.1332e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01,  1.5846e-01,
        -8.3683e-02,  2.0148e-01,  9.6988e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01,  1.5846e-01,
        -8.3683e-02,  2.0148e-01,  9.6988e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 7

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01,  1.5846e-01,
        -8.3683e-02,  2.0148e-01,  9.6988e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01,  1.5846e-01,
        -8.3683e-02,  2.0148e-01,  9.6988e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 7

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01,  1.5846e-01,
        -8.3683e-02,  2.0148e-01,  9.6988e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01,  1.5846e-01,
        -8.3683e-02,  2.0148e-01,  9.6988e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 7

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01,  1.5846e-01,
        -8.3683e-02,  2.0148e-01,  9.6988e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01,  1.5846e-01,
        -8.3683e-02,  2.0148e-01,  9.6988e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 7

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01,  1.5846e-01,
        -8.3683e-02,  2.0148e-01,  9.6988e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01,  1.5846e-01,
        -8.3683e-02,  2.0148e-01,  9.6988e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 7

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01,  1.5846e-01,
        -8.3683e-02,  2.0148e-01,  9.6988e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01,  1.5846e-01,
        -8.3683e-02,  2.0148e-01,  9.6988e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 7

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01,  1.5846e-01,
        -8.3683e-02,  2.0148e-01,  9.6988e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01,  1.5846e-01,
        -8.3683e-02,  2.0148e-01,  9.6988e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 7

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01,  1.5846e-01,
        -8.3683e-02,  2.0148e-01,  9.6988e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01,  1.5846e-01,
        -8.3683e-02,  2.0148e-01,  9.6988e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 7

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01,  1.5846e-01,
        -8.3683e-02,  2.0148e-01,  9.6988e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01,  1.5846e-01,
        -8.3683e-02,  2.0148e-01,  9.6988e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 7

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01,  1.5846e-01,
        -8.3683e-02,  2.0148e-01,  9.6988e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01,  1.5846e-01,
        -8.3683e-02,  2.0148e-01,  9.6988e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 7

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01,  1.5846e-01,
        -8.3683e-02,  2.0148e-01,  9.6988e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01,  1.5846e-01,
        -8.3683e-02,  2.0148e-01,  9.6988e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 7

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01,  1.5846e-01,
        -8.3683e-02,  2.0148e-01,  9.6988e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01,  1.5846e-01,
        -8.3683e-02,  2.0148e-01,  9.6988e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2471,  0.1030,  0.7213, -0.1170,  0.3545, -0.0837,
         0.2716,  0.1232, -0.0152, -0.0643, -0.0010,  0.0701,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2471,  0.1030,  0.7213, -0.1170,  0.3545, -0.0837,
         0.2716,  0.1232, -0.0152, -0.0643, -0.0010,  0.0701,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 2


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2471,  0.1030,  0.7213, -0.1170,  0.3545, -0.0837,
         0.2716,  0.1232, -0.0152, -0.0643, -0.0010,  0.0701,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2471,  0.1030,  0.7213, -0.1170,  0.3545, -0.0837,
         0.2716,  0.1232, -0.0152, -0.0643, -0.0010,  0.0701,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 3


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2471,  0.1030,  0.7213, -0.1170,  0.3545, -0.0837,
         0.2716,  0.1232, -0.0152, -0.0643, -0.0010,  0.0701,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2471,  0.1030,  0.7213, -0.1170,  0.3545, -0.0837,
         0.2716,  0.1232, -0.0152, -0.0643, -0.0010,  0.0701,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 4


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2471,  0.1030,  0.7213, -0.1170,  0.3545, -0.0837,
         0.2716,  0.1232, -0.0152, -0.0643, -0.0010,  0.0701,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2471,  0.1030,  0.7213, -0.1170,  0.3545, -0.0837,
         0.2716,  0.1232, -0.0152, -0.0643, -0.0010,  0.0701,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 5


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2471,  0.1030,  0.7213, -0.1170,  0.3545, -0.0837,
         0.2716,  0.1232, -0.0152, -0.0643, -0.0010,  0.0701,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2471,  0.1030,  0.7213, -0.1170,  0.3545, -0.0837,
         0.2716,  0.1232, -0.0152, -0.0643, -0.0010,  0.0701,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 6


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2471,  0.1030,  0.7213, -0.1170,  0.3545, -0.0837,
         0.2716,  0.1232, -0.0152, -0.0643, -0.0010,  0.0701,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2471,  0.1030,  0.7213, -0.1170,  0.3545, -0.0837,
         0.2716,  0.1232, -0.0152, -0.0643, -0.0010,  0.0701,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 7


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2471,  0.1030,  0.7213, -0.1170,  0.3545, -0.0837,
         0.2716,  0.1232, -0.0152, -0.0643, -0.0010,  0.0701,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2471,  0.1030,  0.7213, -0.1170,  0.3545, -0.0837,
         0.2716,  0.1232, -0.0152, -0.0643, -0.0010,  0.0701,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 8


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2471,  0.1030,  0.7213, -0.1170,  0.3545, -0.0837,
         0.2716,  0.1232, -0.0152, -0.0643, -0.0010,  0.0701,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2471,  0.1030,  0.7213, -0.1170,  0.3545, -0.0837,
         0.2716,  0.1232, -0.0152, -0.0643, -0.0010,  0.0701,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 9


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2471,  0.1030,  0.7213, -0.1170,  0.3545, -0.0837,
         0.2716,  0.1232, -0.0152, -0.0643, -0.0010,  0.0701,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2471,  0.1030,  0.7213, -0.1170,  0.3545, -0.0837,
         0.2716,  0.1232, -0.0152, -0.0643, -0.0010,  0.0701,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 10


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2471,  0.1030,  0.7213, -0.1170,  0.3545, -0.0837,
         0.2716,  0.1232, -0.0152, -0.0643, -0.0010,  0.0701,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2471,  0.1030,  0.7213, -0.1170,  0.3545, -0.0837,
         0.2716,  0.1232, -0.0152, -0.0643, -0.0010,  0.0701,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 11


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2471,  0.1030,  0.7213, -0.1170,  0.3545, -0.0837,
         0.2716,  0.1232, -0.0152, -0.0643, -0.0010,  0.0701,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2471,  0.1030,  0.7213, -0.1170,  0.3545, -0.0837,
         0.2716,  0.1232, -0.0152, -0.0643, -0.0010,  0.0701,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 12


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2471,  0.1030,  0.7213, -0.1170,  0.3545, -0.0837,
         0.2716,  0.1232, -0.0152, -0.0643, -0.0010,  0.0701,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2471,  0.1030,  0.7213, -0.1170,  0.3545, -0.0837,
         0.2716,  0.1232, -0.0152, -0.0643, -0.0010,  0.0701,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02,  2.2267e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01,  2.4143e-01, -4.0497e-01,
         4.9930e-01,  2.8882e-01,  1.2176e+00,  5.8110e-01,  6.8802e-02,
        -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02,  2.2267e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01,  2.4143e-01, -4.0497e-01,
         4.9930e-01,  2.8882e-01,  1.2176e+00,  5.8110e-01,  6.8802e-02,
        -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 12

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02,  2.2267e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01,  2.4143e-01, -4.0497e-01,
         4.9930e-01,  2.8882e-01,  1.2176e+00,  5.8110e-01,  6.8802e-02,
        -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02,  2.2267e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01,  2.4143e-01, -4.0497e-01,
         4.9930e-01,  2.8882e-01,  1.2176e+00,  5.8110e-01,  6.8802e-02,
        -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 12

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02,  2.2267e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01,  2.4143e-01, -4.0497e-01,
         4.9930e-01,  2.8882e-01,  1.2176e+00,  5.8110e-01,  6.8802e-02,
        -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02,  2.2267e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01,  2.4143e-01, -4.0497e-01,
         4.9930e-01,  2.8882e-01,  1.2176e+00,  5.8110e-01,  6.8802e-02,
        -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 12

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02,  2.2267e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01,  2.4143e-01, -4.0497e-01,
         4.9930e-01,  2.8882e-01,  1.2176e+00,  5.8110e-01,  6.8802e-02,
        -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02,  2.2267e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01,  2.4143e-01, -4.0497e-01,
         4.9930e-01,  2.8882e-01,  1.2176e+00,  5.8110e-01,  6.8802e-02,
        -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 12

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02,  2.2267e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01,  2.4143e-01, -4.0497e-01,
         4.9930e-01,  2.8882e-01,  1.2176e+00,  5.8110e-01,  6.8802e-02,
        -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02,  2.2267e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01,  2.4143e-01, -4.0497e-01,
         4.9930e-01,  2.8882e-01,  1.2176e+00,  5.8110e-01,  6.8802e-02,
        -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 12

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02,  2.2267e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01,  2.4143e-01, -4.0497e-01,
         4.9930e-01,  2.8882e-01,  1.2176e+00,  5.8110e-01,  6.8802e-02,
        -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02,  2.2267e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01,  2.4143e-01, -4.0497e-01,
         4.9930e-01,  2.8882e-01,  1.2176e+00,  5.8110e-01,  6.8802e-02,
        -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 12

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02,  2.2267e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01,  2.4143e-01, -4.0497e-01,
         4.9930e-01,  2.8882e-01,  1.2176e+00,  5.8110e-01,  6.8802e-02,
        -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02,  2.2267e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01,  2.4143e-01, -4.0497e-01,
         4.9930e-01,  2.8882e-01,  1.2176e+00,  5.8110e-01,  6.8802e-02,
        -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 12

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02,  2.2267e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01,  2.4143e-01, -4.0497e-01,
         4.9930e-01,  2.8882e-01,  1.2176e+00,  5.8110e-01,  6.8802e-02,
        -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02,  2.2267e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01,  2.4143e-01, -4.0497e-01,
         4.9930e-01,  2.8882e-01,  1.2176e+00,  5.8110e-01,  6.8802e-02,
        -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 12

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02,  2.2267e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01,  2.4143e-01, -4.0497e-01,
         4.9930e-01,  2.8882e-01,  1.2176e+00,  5.8110e-01,  6.8802e-02,
        -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02,  2.2267e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01,  2.4143e-01, -4.0497e-01,
         4.9930e-01,  2.8882e-01,  1.2176e+00,  5.8110e-01,  6.8802e-02,
        -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 12

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02,  2.2267e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01,  2.4143e-01, -4.0497e-01,
         4.9930e-01,  2.8882e-01,  1.2176e+00,  5.8110e-01,  6.8802e-02,
        -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02,  2.2267e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01,  2.4143e-01, -4.0497e-01,
         4.9930e-01,  2.8882e-01,  1.2176e+00,  5.8110e-01,  6.8802e-02,
        -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 12

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02,  2.2267e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01,  2.4143e-01, -4.0497e-01,
         4.9930e-01,  2.8882e-01,  1.2176e+00,  5.8110e-01,  6.8802e-02,
        -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02,  2.2267e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01,  2.4143e-01, -4.0497e-01,
         4.9930e-01,  2.8882e-01,  1.2176e+00,  5.8110e-01,  6.8802e-02,
        -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 12

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02,  2.2267e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01,  2.4143e-01, -4.0497e-01,
         4.9930e-01,  2.8882e-01,  1.2176e+00,  5.8110e-01,  6.8802e-02,
        -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02,  2.2267e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01,  2.4143e-01, -4.0497e-01,
         4.9930e-01,  2.8882e-01,  1.2176e+00,  5.8110e-01,  6.8802e-02,
        -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 12
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.4284e-01, -2.9707e-01,  3.2112e-01,
         5.5261e-01,  7.4403e-01,  8.8741e-02,  1.0250e+00, -7.4404e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.4284e-01, -2.9707e-01,  3.2112e-01,
         5.5261e-01,  7.4403e-01,  8.8741e-02,  1.0250e+00, -7.4404e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 8

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.4284e-01, -2.9707e-01,  3.2112e-01,
         5.5261e-01,  7.4403e-01,  8.8741e-02,  1.0250e+00, -7.4404e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.4284e-01, -2.9707e-01,  3.2112e-01,
         5.5261e-01,  7.4403e-01,  8.8741e-02,  1.0250e+00, -7.4404e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 8

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.4284e-01, -2.9707e-01,  3.2112e-01,
         5.5261e-01,  7.4403e-01,  8.8741e-02,  1.0250e+00, -7.4404e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.4284e-01, -2.9707e-01,  3.2112e-01,
         5.5261e-01,  7.4403e-01,  8.8741e-02,  1.0250e+00, -7.4404e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 8

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.4284e-01, -2.9707e-01,  3.2112e-01,
         5.5261e-01,  7.4403e-01,  8.8741e-02,  1.0250e+00, -7.4404e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.4284e-01, -2.9707e-01,  3.2112e-01,
         5.5261e-01,  7.4403e-01,  8.8741e-02,  1.0250e+00, -7.4404e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 8

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.4284e-01, -2.9707e-01,  3.2112e-01,
         5.5261e-01,  7.4403e-01,  8.8741e-02,  1.0250e+00, -7.4404e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.4284e-01, -2.9707e-01,  3.2112e-01,
         5.5261e-01,  7.4403e-01,  8.8741e-02,  1.0250e+00, -7.4404e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 8

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.4284e-01, -2.9707e-01,  3.2112e-01,
         5.5261e-01,  7.4403e-01,  8.8741e-02,  1.0250e+00, -7.4404e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.4284e-01, -2.9707e-01,  3.2112e-01,
         5.5261e-01,  7.4403e-01,  8.8741e-02,  1.0250e+00, -7.4404e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 8

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.4284e-01, -2.9707e-01,  3.2112e-01,
         5.5261e-01,  7.4403e-01,  8.8741e-02,  1.0250e+00, -7.4404e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.4284e-01, -2.9707e-01,  3.2112e-01,
         5.5261e-01,  7.4403e-01,  8.8741e-02,  1.0250e+00, -7.4404e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 8

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.4284e-01, -2.9707e-01,  3.2112e-01,
         5.5261e-01,  7.4403e-01,  8.8741e-02,  1.0250e+00, -7.4404e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.4284e-01, -2.9707e-01,  3.2112e-01,
         5.5261e-01,  7.4403e-01,  8.8741e-02,  1.0250e+00, -7.4404e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 8

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.4284e-01, -2.9707e-01,  3.2112e-01,
         5.5261e-01,  7.4403e-01,  8.8741e-02,  1.0250e+00, -7.4404e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.4284e-01, -2.9707e-01,  3.2112e-01,
         5.5261e-01,  7.4403e-01,  8.8741e-02,  1.0250e+00, -7.4404e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 8

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.4284e-01, -2.9707e-01,  3.2112e-01,
         5.5261e-01,  7.4403e-01,  8.8741e-02,  1.0250e+00, -7.4404e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.4284e-01, -2.9707e-01,  3.2112e-01,
         5.5261e-01,  7.4403e-01,  8.8741e-02,  1.0250e+00, -7.4404e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 8

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.4284e-01, -2.9707e-01,  3.2112e-01,
         5.5261e-01,  7.4403e-01,  8.8741e-02,  1.0250e+00, -7.4404e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.4284e-01, -2.9707e-01,  3.2112e-01,
         5.5261e-01,  7.4403e-01,  8.8741e-02,  1.0250e+00, -7.4404e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 8

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.4284e-01, -2.9707e-01,  3.2112e-01,
         5.5261e-01,  7.4403e-01,  8.8741e-02,  1.0250e+00, -7.4404e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.4284e-01, -2.9707e-01,  3.2112e-01,
         5.5261e-01,  7.4403e-01,  8.8741e-02,  1.0250e+00, -7.4404e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 8
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01,  1.5846e-01,
        -9.7911e-01,  1.8699e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01,  1.5846e-01,
        -9.7911e-01,  1.8699e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 10

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01,  1.5846e-01,
        -9.7911e-01,  1.8699e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01,  1.5846e-01,
        -9.7911e-01,  1.8699e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 10

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01,  1.5846e-01,
        -9.7911e-01,  1.8699e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01,  1.5846e-01,
        -9.7911e-01,  1.8699e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 10

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01,  1.5846e-01,
        -9.7911e-01,  1.8699e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01,  1.5846e-01,
        -9.7911e-01,  1.8699e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 10

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01,  1.5846e-01,
        -9.7911e-01,  1.8699e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01,  1.5846e-01,
        -9.7911e-01,  1.8699e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 10

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01,  1.5846e-01,
        -9.7911e-01,  1.8699e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01,  1.5846e-01,
        -9.7911e-01,  1.8699e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 10

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01,  1.5846e-01,
        -9.7911e-01,  1.8699e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01,  1.5846e-01,
        -9.7911e-01,  1.8699e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 10

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01,  1.5846e-01,
        -9.7911e-01,  1.8699e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01,  1.5846e-01,
        -9.7911e-01,  1.8699e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 10

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01,  1.5846e-01,
        -9.7911e-01,  1.8699e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01,  1.5846e-01,
        -9.7911e-01,  1.8699e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 10

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01,  1.5846e-01,
        -9.7911e-01,  1.8699e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01,  1.5846e-01,
        -9.7911e-01,  1.8699e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 10

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01,  1.5846e-01,
        -9.7911e-01,  1.8699e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01,  1.5846e-01,
        -9.7911e-01,  1.8699e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 10

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01,  1.5846e-01,
        -9.7911e-01,  1.8699e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01,  1.5846e-01,
        -9.7911e-01,  1.8699e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 10
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  8.3441e-01, -1.3910e-01,  2.5338e-01,
        -6.9084e-02,  8.4685e-02, -2.4081e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  8.3441e-01, -1.3910e-01,  2.5338e-01,
        -6.9084e-02,  8.4685e-02, -2.4081e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  8.3441e-01, -1.3910e-01,  2.5338e-01,
        -6.9084e-02,  8.4685e-02, -2.4081e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  8.3441e-01, -1.3910e-01,  2.5338e-01,
        -6.9084e-02,  8.4685e-02, -2.4081e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  8.3441e-01, -1.3910e-01,  2.5338e-01,
        -6.9084e-02,  8.4685e-02, -2.4081e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  8.3441e-01, -1.3910e-01,  2.5338e-01,
        -6.9084e-02,  8.4685e-02, -2.4081e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  8.3441e-01, -1.3910e-01,  2.5338e-01,
        -6.9084e-02,  8.4685e-02, -2.4081e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  8.3441e-01, -1.3910e-01,  2.5338e-01,
        -6.9084e-02,  8.4685e-02, -2.4081e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  8.3441e-01, -1.3910e-01,  2.5338e-01,
        -6.9084e-02,  8.4685e-02, -2.4081e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  8.3441e-01, -1.3910e-01,  2.5338e-01,
        -6.9084e-02,  8.4685e-02, -2.4081e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  8.3441e-01, -1.3910e-01,  2.5338e-01,
        -6.9084e-02,  8.4685e-02, -2.4081e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  8.3441e-01, -1.3910e-01,  2.5338e-01,
        -6.9084e-02,  8.4685e-02, -2.4081e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  8.3441e-01, -1.3910e-01,  2.5338e-01,
        -6.9084e-02,  8.4685e-02, -2.4081e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  8.3441e-01, -1.3910e-01,  2.5338e-01,
        -6.9084e-02,  8.4685e-02, -2.4081e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  8.3441e-01, -1.3910e-01,  2.5338e-01,
        -6.9084e-02,  8.4685e-02, -2.4081e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  8.3441e-01, -1.3910e-01,  2.5338e-01,
        -6.9084e-02,  8.4685e-02, -2.4081e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  8.3441e-01, -1.3910e-01,  2.5338e-01,
        -6.9084e-02,  8.4685e-02, -2.4081e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  8.3441e-01, -1.3910e-01,  2.5338e-01,
        -6.9084e-02,  8.4685e-02, -2.4081e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  8.3441e-01, -1.3910e-01,  2.5338e-01,
        -6.9084e-02,  8.4685e-02, -2.4081e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  8.3441e-01, -1.3910e-01,  2.5338e-01,
        -6.9084e-02,  8.4685e-02, -2.4081e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  8.3441e-01, -1.3910e-01,  2.5338e-01,
        -6.9084e-02,  8.4685e-02, -2.4081e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  8.3441e-01, -1.3910e-01,  2.5338e-01,
        -6.9084e-02,  8.4685e-02, -2.4081e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  8.3441e-01, -1.3910e-01,  2.5338e-01,
        -6.9084e-02,  8.4685e-02, -2.4081e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  8.3441e-01, -1.3910e-01,  2.5338e-01,
        -6.9084e-02,  8.4685e-02, -2.4081e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -6.7545e-02,  3.2186e-01, -3.5389e-01,  5.2750e-01,
        -2.0351e-01,  3.2360e-01, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -6.7545e-02,  3.2186e-01, -3.5389e-01,  5.2750e-01,
        -2.0351e-01,  3.2360e-01, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -6.7545e-02,  3.2186e-01, -3.5389e-01,  5.2750e-01,
        -2.0351e-01,  3.2360e-01, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -6.7545e-02,  3.2186e-01, -3.5389e-01,  5.2750e-01,
        -2.0351e-01,  3.2360e-01, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -6.7545e-02,  3.2186e-01, -3.5389e-01,  5.2750e-01,
        -2.0351e-01,  3.2360e-01, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -6.7545e-02,  3.2186e-01, -3.5389e-01,  5.2750e-01,
        -2.0351e-01,  3.2360e-01, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -6.7545e-02,  3.2186e-01, -3.5389e-01,  5.2750e-01,
        -2.0351e-01,  3.2360e-01, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -6.7545e-02,  3.2186e-01, -3.5389e-01,  5.2750e-01,
        -2.0351e-01,  3.2360e-01, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -6.7545e-02,  3.2186e-01, -3.5389e-01,  5.2750e-01,
        -2.0351e-01,  3.2360e-01, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -6.7545e-02,  3.2186e-01, -3.5389e-01,  5.2750e-01,
        -2.0351e-01,  3.2360e-01, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -6.7545e-02,  3.2186e-01, -3.5389e-01,  5.2750e-01,
        -2.0351e-01,  3.2360e-01, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -6.7545e-02,  3.2186e-01, -3.5389e-01,  5.2750e-01,
        -2.0351e-01,  3.2360e-01, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -6.7545e-02,  3.2186e-01, -3.5389e-01,  5.2750e-01,
        -2.0351e-01,  3.2360e-01, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -6.7545e-02,  3.2186e-01, -3.5389e-01,  5.2750e-01,
        -2.0351e-01,  3.2360e-01, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -6.7545e-02,  3.2186e-01, -3.5389e-01,  5.2750e-01,
        -2.0351e-01,  3.2360e-01, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -6.7545e-02,  3.2186e-01, -3.5389e-01,  5.2750e-01,
        -2.0351e-01,  3.2360e-01, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -6.7545e-02,  3.2186e-01, -3.5389e-01,  5.2750e-01,
        -2.0351e-01,  3.2360e-01, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -6.7545e-02,  3.2186e-01, -3.5389e-01,  5.2750e-01,
        -2.0351e-01,  3.2360e-01, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -6.7545e-02,  3.2186e-01, -3.5389e-01,  5.2750e-01,
        -2.0351e-01,  3.2360e-01, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -6.7545e-02,  3.2186e-01, -3.5389e-01,  5.2750e-01,
        -2.0351e-01,  3.2360e-01, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -6.7545e-02,  3.2186e-01, -3.5389e-01,  5.2750e-01,
        -2.0351e-01,  3.2360e-01, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -6.7545e-02,  3.2186e-01, -3.5389e-01,  5.2750e-01,
        -2.0351e-01,  3.2360e-01, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -6.7545e-02,  3.2186e-01, -3.5389e-01,  5.2750e-01,
        -2.0351e-01,  3.2360e-01, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -6.7545e-02,  3.2186e-01, -3.5389e-01,  5.2750e-01,
        -2.0351e-01,  3.2360e-01, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.0475, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
         0.3219, -0.5077,  0.5049,  0.0950,  0.3219,  0.8606,  0.1562,  0.6591,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.0475, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
         0.3219, -0.5077,  0.5049,  0.0950,  0.3219,  0.8606,  0.1562,  0.6591,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977]): 4

Layer 2


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.0475, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
         0.3219, -0.5077,  0.5049,  0.0950,  0.3219,  0.8606,  0.1562,  0.6591,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.0475, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
         0.3219, -0.5077,  0.5049,  0.0950,  0.3219,  0.8606,  0.1562,  0.6591,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977]): 4

Layer 3


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.0475, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
         0.3219, -0.5077,  0.5049,  0.0950,  0.3219,  0.8606,  0.1562,  0.6591,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.0475, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
         0.3219, -0.5077,  0.5049,  0.0950,  0.3219,  0.8606,  0.1562,  0.6591,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977]): 4

Layer 4


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.0475, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
         0.3219, -0.5077,  0.5049,  0.0950,  0.3219,  0.8606,  0.1562,  0.6591,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.0475, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
         0.3219, -0.5077,  0.5049,  0.0950,  0.3219,  0.8606,  0.1562,  0.6591,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977]): 4

Layer 5


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.0475, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
         0.3219, -0.5077,  0.5049,  0.0950,  0.3219,  0.8606,  0.1562,  0.6591,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.0475, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
         0.3219, -0.5077,  0.5049,  0.0950,  0.3219,  0.8606,  0.1562,  0.6591,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977]): 4

Layer 6


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.0475, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
         0.3219, -0.5077,  0.5049,  0.0950,  0.3219,  0.8606,  0.1562,  0.6591,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.0475, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
         0.3219, -0.5077,  0.5049,  0.0950,  0.3219,  0.8606,  0.1562,  0.6591,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977]): 4

Layer 7


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.0475, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
         0.3219, -0.5077,  0.5049,  0.0950,  0.3219,  0.8606,  0.1562,  0.6591,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.0475, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
         0.3219, -0.5077,  0.5049,  0.0950,  0.3219,  0.8606,  0.1562,  0.6591,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977]): 4

Layer 8


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.0475, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
         0.3219, -0.5077,  0.5049,  0.0950,  0.3219,  0.8606,  0.1562,  0.6591,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.0475, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
         0.3219, -0.5077,  0.5049,  0.0950,  0.3219,  0.8606,  0.1562,  0.6591,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977]): 4

Layer 9


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.0475, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
         0.3219, -0.5077,  0.5049,  0.0950,  0.3219,  0.8606,  0.1562,  0.6591,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.0475, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
         0.3219, -0.5077,  0.5049,  0.0950,  0.3219,  0.8606,  0.1562,  0.6591,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977]): 4

Layer 10


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.0475, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
         0.3219, -0.5077,  0.5049,  0.0950,  0.3219,  0.8606,  0.1562,  0.6591,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.0475, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
         0.3219, -0.5077,  0.5049,  0.0950,  0.3219,  0.8606,  0.1562,  0.6591,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977]): 4

Layer 11


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.0475, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
         0.3219, -0.5077,  0.5049,  0.0950,  0.3219,  0.8606,  0.1562,  0.6591,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.0475, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
         0.3219, -0.5077,  0.5049,  0.0950,  0.3219,  0.8606,  0.1562,  0.6591,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977]): 4

Layer 12


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.0475, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
         0.3219, -0.5077,  0.5049,  0.0950,  0.3219,  0.8606,  0.1562,  0.6591,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.0475, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
         0.3219, -0.5077,  0.5049,  0.0950,  0.3219,  0.8606,  0.1562,  0.6591,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01,  1.1437e+00,
        -1.2175e-01,  2.5263e-01,  6.8802e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01,  1.1437e+00,
        -1.2175e-01,  2.5263e-01,  6.8802e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01,  1.1437e+00,
        -1.2175e-01,  2.5263e-01,  6.8802e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01,  1.1437e+00,
        -1.2175e-01,  2.5263e-01,  6.8802e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01,  1.1437e+00,
        -1.2175e-01,  2.5263e-01,  6.8802e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01,  1.1437e+00,
        -1.2175e-01,  2.5263e-01,  6.8802e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01,  1.1437e+00,
        -1.2175e-01,  2.5263e-01,  6.8802e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01,  1.1437e+00,
        -1.2175e-01,  2.5263e-01,  6.8802e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01,  1.1437e+00,
        -1.2175e-01,  2.5263e-01,  6.8802e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01,  1.1437e+00,
        -1.2175e-01,  2.5263e-01,  6.8802e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01,  1.1437e+00,
        -1.2175e-01,  2.5263e-01,  6.8802e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01,  1.1437e+00,
        -1.2175e-01,  2.5263e-01,  6.8802e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01,  1.1437e+00,
        -1.2175e-01,  2.5263e-01,  6.8802e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01,  1.1437e+00,
        -1.2175e-01,  2.5263e-01,  6.8802e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01,  1.1437e+00,
        -1.2175e-01,  2.5263e-01,  6.8802e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01,  1.1437e+00,
        -1.2175e-01,  2.5263e-01,  6.8802e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01,  1.1437e+00,
        -1.2175e-01,  2.5263e-01,  6.8802e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01,  1.1437e+00,
        -1.2175e-01,  2.5263e-01,  6.8802e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01,  1.1437e+00,
        -1.2175e-01,  2.5263e-01,  6.8802e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01,  1.1437e+00,
        -1.2175e-01,  2.5263e-01,  6.8802e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01,  1.1437e+00,
        -1.2175e-01,  2.5263e-01,  6.8802e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01,  1.1437e+00,
        -1.2175e-01,  2.5263e-01,  6.8802e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01,  1.1437e+00,
        -1.2175e-01,  2.5263e-01,  6.8802e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01,  1.1437e+00,
        -1.2175e-01,  2.5263e-01,  6.8802e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4


In [None]:
# Max pooling over the tokens of each selected sentence, followed by argmax.
#
# For every selected test sentence this cell:
#   1. tokenises the sentence and runs it through `model_e` (no gradients),
#   2. pools the model output with `max_pooling` using the attention mask,
#   3. prints, under a banner for each of the 12 layers, the coloured tokens,
#      the pooled tensor and the index of its largest entry.

# Sentences to inspect: a fixed, contiguous run of test-set indices (21..64 inclusive).
sent_index = list(range(21, 65))

for s in sent_index:
  print("*" * 100)
  # Get the sentence's words
  tokens = test_inputs[s]

  # Tokenisation, the forward pass, the pooling and the argmax depend only on the
  # sentence, never on the layer index, so they are computed once per sentence
  # instead of being repeated identically for each of the 12 layers.
  encoded_tokens = bert_tokenizer(tokens, truncation=True, padding=True, max_length=128, return_tensors='pt')
  encoded_tokens = encoded_tokens.to(device)
  with torch.no_grad():
    model_output1 = model_e(**encoded_tokens)
    tokens_embeddings = max_pooling(model_output1, encoded_tokens['attention_mask'])
  tokens_embeddings = tokens_embeddings.cpu()
  arg = argmax(tokens_embeddings)

  # For each layer...
  for l in range(12):
    print("\nLayer", l+1)
    # NOTE(review): `attention` is assigned here but never read in this cell;
    # kept in case a later cell relies on it -- confirm and drop if not.
    attention = np.squeeze(test_attentions[s][l].detach().cpu().numpy(), axis=0)
    # NOTE(review): `head` is not assigned anywhere in this cell, so this reads
    # whatever value an earlier cell left behind (it does not vary with `s` or `l`
    # here). Presumably it was meant to come from `attention` -- TODO confirm.
    cls_attentions = head[0]
    display(HTML(colorize(tokens, cls_attentions)))
    print("Tokens embeddings:")
    print(tokens_embeddings)
    print('arg max of %s: %d' % (tokens_embeddings, arg))

****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  3.7319e-01,  8.8741e-02,  7.4797e-02,
        -1.0577e+00,  1.1827e+00,  9.4951e-02,  1.2176e+00, -2.0302e-01,
        -8.6322e-02,  8.8741e-02,  4.4967e-01, -7.6757e-02,  1.1437e+00,
         7.1513e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  3.7319e-01,  8.8741e-02,  7.4797e-02,
        -1.0577e+00,  1.1827e+00,  9.4951e-02,  1.2176e+00, -2.0302e-01,
        -8.6322e-02,  8.8741e-02,  4.4967e-01, -7.6757e-02,  1.1437e+00,
         7.1513e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01]): 8

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  3.7319e-01,  8.8741e-02,  7.4797e-02,
        -1.0577e+00,  1.1827e+00,  9.4951e-02,  1.2176e+00, -2.0302e-01,
        -8.6322e-02,  8.8741e-02,  4.4967e-01, -7.6757e-02,  1.1437e+00,
         7.1513e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  3.7319e-01,  8.8741e-02,  7.4797e-02,
        -1.0577e+00,  1.1827e+00,  9.4951e-02,  1.2176e+00, -2.0302e-01,
        -8.6322e-02,  8.8741e-02,  4.4967e-01, -7.6757e-02,  1.1437e+00,
         7.1513e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01]): 8

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  3.7319e-01,  8.8741e-02,  7.4797e-02,
        -1.0577e+00,  1.1827e+00,  9.4951e-02,  1.2176e+00, -2.0302e-01,
        -8.6322e-02,  8.8741e-02,  4.4967e-01, -7.6757e-02,  1.1437e+00,
         7.1513e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  3.7319e-01,  8.8741e-02,  7.4797e-02,
        -1.0577e+00,  1.1827e+00,  9.4951e-02,  1.2176e+00, -2.0302e-01,
        -8.6322e-02,  8.8741e-02,  4.4967e-01, -7.6757e-02,  1.1437e+00,
         7.1513e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01]): 8

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  3.7319e-01,  8.8741e-02,  7.4797e-02,
        -1.0577e+00,  1.1827e+00,  9.4951e-02,  1.2176e+00, -2.0302e-01,
        -8.6322e-02,  8.8741e-02,  4.4967e-01, -7.6757e-02,  1.1437e+00,
         7.1513e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  3.7319e-01,  8.8741e-02,  7.4797e-02,
        -1.0577e+00,  1.1827e+00,  9.4951e-02,  1.2176e+00, -2.0302e-01,
        -8.6322e-02,  8.8741e-02,  4.4967e-01, -7.6757e-02,  1.1437e+00,
         7.1513e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01]): 8

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  3.7319e-01,  8.8741e-02,  7.4797e-02,
        -1.0577e+00,  1.1827e+00,  9.4951e-02,  1.2176e+00, -2.0302e-01,
        -8.6322e-02,  8.8741e-02,  4.4967e-01, -7.6757e-02,  1.1437e+00,
         7.1513e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  3.7319e-01,  8.8741e-02,  7.4797e-02,
        -1.0577e+00,  1.1827e+00,  9.4951e-02,  1.2176e+00, -2.0302e-01,
        -8.6322e-02,  8.8741e-02,  4.4967e-01, -7.6757e-02,  1.1437e+00,
         7.1513e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01]): 8

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  3.7319e-01,  8.8741e-02,  7.4797e-02,
        -1.0577e+00,  1.1827e+00,  9.4951e-02,  1.2176e+00, -2.0302e-01,
        -8.6322e-02,  8.8741e-02,  4.4967e-01, -7.6757e-02,  1.1437e+00,
         7.1513e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  3.7319e-01,  8.8741e-02,  7.4797e-02,
        -1.0577e+00,  1.1827e+00,  9.4951e-02,  1.2176e+00, -2.0302e-01,
        -8.6322e-02,  8.8741e-02,  4.4967e-01, -7.6757e-02,  1.1437e+00,
         7.1513e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01]): 8

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  3.7319e-01,  8.8741e-02,  7.4797e-02,
        -1.0577e+00,  1.1827e+00,  9.4951e-02,  1.2176e+00, -2.0302e-01,
        -8.6322e-02,  8.8741e-02,  4.4967e-01, -7.6757e-02,  1.1437e+00,
         7.1513e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  3.7319e-01,  8.8741e-02,  7.4797e-02,
        -1.0577e+00,  1.1827e+00,  9.4951e-02,  1.2176e+00, -2.0302e-01,
        -8.6322e-02,  8.8741e-02,  4.4967e-01, -7.6757e-02,  1.1437e+00,
         7.1513e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01]): 8

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  3.7319e-01,  8.8741e-02,  7.4797e-02,
        -1.0577e+00,  1.1827e+00,  9.4951e-02,  1.2176e+00, -2.0302e-01,
        -8.6322e-02,  8.8741e-02,  4.4967e-01, -7.6757e-02,  1.1437e+00,
         7.1513e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  3.7319e-01,  8.8741e-02,  7.4797e-02,
        -1.0577e+00,  1.1827e+00,  9.4951e-02,  1.2176e+00, -2.0302e-01,
        -8.6322e-02,  8.8741e-02,  4.4967e-01, -7.6757e-02,  1.1437e+00,
         7.1513e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01]): 8

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  3.7319e-01,  8.8741e-02,  7.4797e-02,
        -1.0577e+00,  1.1827e+00,  9.4951e-02,  1.2176e+00, -2.0302e-01,
        -8.6322e-02,  8.8741e-02,  4.4967e-01, -7.6757e-02,  1.1437e+00,
         7.1513e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  3.7319e-01,  8.8741e-02,  7.4797e-02,
        -1.0577e+00,  1.1827e+00,  9.4951e-02,  1.2176e+00, -2.0302e-01,
        -8.6322e-02,  8.8741e-02,  4.4967e-01, -7.6757e-02,  1.1437e+00,
         7.1513e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01]): 8

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  3.7319e-01,  8.8741e-02,  7.4797e-02,
        -1.0577e+00,  1.1827e+00,  9.4951e-02,  1.2176e+00, -2.0302e-01,
        -8.6322e-02,  8.8741e-02,  4.4967e-01, -7.6757e-02,  1.1437e+00,
         7.1513e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  3.7319e-01,  8.8741e-02,  7.4797e-02,
        -1.0577e+00,  1.1827e+00,  9.4951e-02,  1.2176e+00, -2.0302e-01,
        -8.6322e-02,  8.8741e-02,  4.4967e-01, -7.6757e-02,  1.1437e+00,
         7.1513e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01]): 8

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  3.7319e-01,  8.8741e-02,  7.4797e-02,
        -1.0577e+00,  1.1827e+00,  9.4951e-02,  1.2176e+00, -2.0302e-01,
        -8.6322e-02,  8.8741e-02,  4.4967e-01, -7.6757e-02,  1.1437e+00,
         7.1513e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  3.7319e-01,  8.8741e-02,  7.4797e-02,
        -1.0577e+00,  1.1827e+00,  9.4951e-02,  1.2176e+00, -2.0302e-01,
        -8.6322e-02,  8.8741e-02,  4.4967e-01, -7.6757e-02,  1.1437e+00,
         7.1513e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01]): 8

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -2.9882e-01,  3.7319e-01,  8.8741e-02,  7.4797e-02,
        -1.0577e+00,  1.1827e+00,  9.4951e-02,  1.2176e+00, -2.0302e-01,
        -8.6322e-02,  8.8741e-02,  4.4967e-01, -7.6757e-02,  1.1437e+00,
         7.1513e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.9882e-01,  3.7319e-01,  8.8741e-02,  7.4797e-02,
        -1.0577e+00,  1.1827e+00,  9.4951e-02,  1.2176e+00, -2.0302e-01,
        -8.6322e-02,  8.8741e-02,  4.4967e-01, -7.6757e-02,  1.1437e+00,
         7.1513e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01]): 8
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2786,  0.2052,  0.3938, -0.1941,  0.3219, -0.8033,
         0.0364,  0.3219,  0.2233, -0.3822,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2786,  0.2052,  0.3938, -0.1941,  0.3219, -0.8033,
         0.0364,  0.3219,  0.2233, -0.3822,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 2


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2786,  0.2052,  0.3938, -0.1941,  0.3219, -0.8033,
         0.0364,  0.3219,  0.2233, -0.3822,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2786,  0.2052,  0.3938, -0.1941,  0.3219, -0.8033,
         0.0364,  0.3219,  0.2233, -0.3822,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 3


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2786,  0.2052,  0.3938, -0.1941,  0.3219, -0.8033,
         0.0364,  0.3219,  0.2233, -0.3822,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2786,  0.2052,  0.3938, -0.1941,  0.3219, -0.8033,
         0.0364,  0.3219,  0.2233, -0.3822,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 4


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2786,  0.2052,  0.3938, -0.1941,  0.3219, -0.8033,
         0.0364,  0.3219,  0.2233, -0.3822,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2786,  0.2052,  0.3938, -0.1941,  0.3219, -0.8033,
         0.0364,  0.3219,  0.2233, -0.3822,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 5


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2786,  0.2052,  0.3938, -0.1941,  0.3219, -0.8033,
         0.0364,  0.3219,  0.2233, -0.3822,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2786,  0.2052,  0.3938, -0.1941,  0.3219, -0.8033,
         0.0364,  0.3219,  0.2233, -0.3822,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 6


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2786,  0.2052,  0.3938, -0.1941,  0.3219, -0.8033,
         0.0364,  0.3219,  0.2233, -0.3822,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2786,  0.2052,  0.3938, -0.1941,  0.3219, -0.8033,
         0.0364,  0.3219,  0.2233, -0.3822,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 7


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2786,  0.2052,  0.3938, -0.1941,  0.3219, -0.8033,
         0.0364,  0.3219,  0.2233, -0.3822,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2786,  0.2052,  0.3938, -0.1941,  0.3219, -0.8033,
         0.0364,  0.3219,  0.2233, -0.3822,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 8


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2786,  0.2052,  0.3938, -0.1941,  0.3219, -0.8033,
         0.0364,  0.3219,  0.2233, -0.3822,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2786,  0.2052,  0.3938, -0.1941,  0.3219, -0.8033,
         0.0364,  0.3219,  0.2233, -0.3822,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 9


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2786,  0.2052,  0.3938, -0.1941,  0.3219, -0.8033,
         0.0364,  0.3219,  0.2233, -0.3822,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2786,  0.2052,  0.3938, -0.1941,  0.3219, -0.8033,
         0.0364,  0.3219,  0.2233, -0.3822,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 10


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2786,  0.2052,  0.3938, -0.1941,  0.3219, -0.8033,
         0.0364,  0.3219,  0.2233, -0.3822,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2786,  0.2052,  0.3938, -0.1941,  0.3219, -0.8033,
         0.0364,  0.3219,  0.2233, -0.3822,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 11


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2786,  0.2052,  0.3938, -0.1941,  0.3219, -0.8033,
         0.0364,  0.3219,  0.2233, -0.3822,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2786,  0.2052,  0.3938, -0.1941,  0.3219, -0.8033,
         0.0364,  0.3219,  0.2233, -0.3822,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 12


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2786,  0.2052,  0.3938, -0.1941,  0.3219, -0.8033,
         0.0364,  0.3219,  0.2233, -0.3822,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2786,  0.2052,  0.3938, -0.1941,  0.3219, -0.8033,
         0.0364,  0.3219,  0.2233, -0.3822,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01,  1.1437e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01,  1.1437e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 10

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01,  1.1437e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01,  1.1437e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 10

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01,  1.1437e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01,  1.1437e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 10

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01,  1.1437e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01,  1.1437e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 10

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01,  1.1437e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01,  1.1437e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 10

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01,  1.1437e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01,  1.1437e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 10

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01,  1.1437e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01,  1.1437e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 10

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01,  1.1437e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01,  1.1437e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 10

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01,  1.1437e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01,  1.1437e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 10

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01,  1.1437e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01,  1.1437e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 10

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01,  1.1437e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01,  1.1437e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 10

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01,  1.1437e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01,  1.1437e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 10
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -1.3389e-01,  4.9263e-02, -2.2303e-01,  6.5727e-01,
        -3.9304e-01,  4.7623e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -1.3389e-01,  4.9263e-02, -2.2303e-01,  6.5727e-01,
        -3.9304e-01,  4.7623e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -1.3389e-01,  4.9263e-02, -2.2303e-01,  6.5727e-01,
        -3.9304e-01,  4.7623e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -1.3389e-01,  4.9263e-02, -2.2303e-01,  6.5727e-01,
        -3.9304e-01,  4.7623e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -1.3389e-01,  4.9263e-02, -2.2303e-01,  6.5727e-01,
        -3.9304e-01,  4.7623e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -1.3389e-01,  4.9263e-02, -2.2303e-01,  6.5727e-01,
        -3.9304e-01,  4.7623e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -1.3389e-01,  4.9263e-02, -2.2303e-01,  6.5727e-01,
        -3.9304e-01,  4.7623e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -1.3389e-01,  4.9263e-02, -2.2303e-01,  6.5727e-01,
        -3.9304e-01,  4.7623e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -1.3389e-01,  4.9263e-02, -2.2303e-01,  6.5727e-01,
        -3.9304e-01,  4.7623e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -1.3389e-01,  4.9263e-02, -2.2303e-01,  6.5727e-01,
        -3.9304e-01,  4.7623e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -1.3389e-01,  4.9263e-02, -2.2303e-01,  6.5727e-01,
        -3.9304e-01,  4.7623e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -1.3389e-01,  4.9263e-02, -2.2303e-01,  6.5727e-01,
        -3.9304e-01,  4.7623e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -1.3389e-01,  4.9263e-02, -2.2303e-01,  6.5727e-01,
        -3.9304e-01,  4.7623e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -1.3389e-01,  4.9263e-02, -2.2303e-01,  6.5727e-01,
        -3.9304e-01,  4.7623e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -1.3389e-01,  4.9263e-02, -2.2303e-01,  6.5727e-01,
        -3.9304e-01,  4.7623e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -1.3389e-01,  4.9263e-02, -2.2303e-01,  6.5727e-01,
        -3.9304e-01,  4.7623e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -1.3389e-01,  4.9263e-02, -2.2303e-01,  6.5727e-01,
        -3.9304e-01,  4.7623e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -1.3389e-01,  4.9263e-02, -2.2303e-01,  6.5727e-01,
        -3.9304e-01,  4.7623e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -1.3389e-01,  4.9263e-02, -2.2303e-01,  6.5727e-01,
        -3.9304e-01,  4.7623e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -1.3389e-01,  4.9263e-02, -2.2303e-01,  6.5727e-01,
        -3.9304e-01,  4.7623e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -1.3389e-01,  4.9263e-02, -2.2303e-01,  6.5727e-01,
        -3.9304e-01,  4.7623e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -1.3389e-01,  4.9263e-02, -2.2303e-01,  6.5727e-01,
        -3.9304e-01,  4.7623e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -1.3389e-01,  4.9263e-02, -2.2303e-01,  6.5727e-01,
        -3.9304e-01,  4.7623e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -1.3389e-01,  4.9263e-02, -2.2303e-01,  6.5727e-01,
        -3.9304e-01,  4.7623e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 0.0697,  0.0615,  0.1960, -0.3584, -0.0152, -0.1529,  0.3398, -0.0694,
         0.3219, -0.1749,  0.2718, -0.1436, -0.0152,  0.7535,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0615,  0.1960, -0.3584, -0.0152, -0.1529,  0.3398, -0.0694,
         0.3219, -0.1749,  0.2718, -0.1436, -0.0152,  0.7535,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977]): 13

Layer 2


Tokens embeddings:
tensor([ 0.0697,  0.0615,  0.1960, -0.3584, -0.0152, -0.1529,  0.3398, -0.0694,
         0.3219, -0.1749,  0.2718, -0.1436, -0.0152,  0.7535,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0615,  0.1960, -0.3584, -0.0152, -0.1529,  0.3398, -0.0694,
         0.3219, -0.1749,  0.2718, -0.1436, -0.0152,  0.7535,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977]): 13

Layer 3


Tokens embeddings:
tensor([ 0.0697,  0.0615,  0.1960, -0.3584, -0.0152, -0.1529,  0.3398, -0.0694,
         0.3219, -0.1749,  0.2718, -0.1436, -0.0152,  0.7535,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0615,  0.1960, -0.3584, -0.0152, -0.1529,  0.3398, -0.0694,
         0.3219, -0.1749,  0.2718, -0.1436, -0.0152,  0.7535,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977]): 13

Layer 4


Tokens embeddings:
tensor([ 0.0697,  0.0615,  0.1960, -0.3584, -0.0152, -0.1529,  0.3398, -0.0694,
         0.3219, -0.1749,  0.2718, -0.1436, -0.0152,  0.7535,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0615,  0.1960, -0.3584, -0.0152, -0.1529,  0.3398, -0.0694,
         0.3219, -0.1749,  0.2718, -0.1436, -0.0152,  0.7535,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977]): 13

Layer 5


Tokens embeddings:
tensor([ 0.0697,  0.0615,  0.1960, -0.3584, -0.0152, -0.1529,  0.3398, -0.0694,
         0.3219, -0.1749,  0.2718, -0.1436, -0.0152,  0.7535,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0615,  0.1960, -0.3584, -0.0152, -0.1529,  0.3398, -0.0694,
         0.3219, -0.1749,  0.2718, -0.1436, -0.0152,  0.7535,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977]): 13

Layer 6


Tokens embeddings:
tensor([ 0.0697,  0.0615,  0.1960, -0.3584, -0.0152, -0.1529,  0.3398, -0.0694,
         0.3219, -0.1749,  0.2718, -0.1436, -0.0152,  0.7535,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0615,  0.1960, -0.3584, -0.0152, -0.1529,  0.3398, -0.0694,
         0.3219, -0.1749,  0.2718, -0.1436, -0.0152,  0.7535,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977]): 13

Layer 7


Tokens embeddings:
tensor([ 0.0697,  0.0615,  0.1960, -0.3584, -0.0152, -0.1529,  0.3398, -0.0694,
         0.3219, -0.1749,  0.2718, -0.1436, -0.0152,  0.7535,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0615,  0.1960, -0.3584, -0.0152, -0.1529,  0.3398, -0.0694,
         0.3219, -0.1749,  0.2718, -0.1436, -0.0152,  0.7535,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977]): 13

Layer 8


Tokens embeddings:
tensor([ 0.0697,  0.0615,  0.1960, -0.3584, -0.0152, -0.1529,  0.3398, -0.0694,
         0.3219, -0.1749,  0.2718, -0.1436, -0.0152,  0.7535,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0615,  0.1960, -0.3584, -0.0152, -0.1529,  0.3398, -0.0694,
         0.3219, -0.1749,  0.2718, -0.1436, -0.0152,  0.7535,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977]): 13

Layer 9


Tokens embeddings:
tensor([ 0.0697,  0.0615,  0.1960, -0.3584, -0.0152, -0.1529,  0.3398, -0.0694,
         0.3219, -0.1749,  0.2718, -0.1436, -0.0152,  0.7535,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0615,  0.1960, -0.3584, -0.0152, -0.1529,  0.3398, -0.0694,
         0.3219, -0.1749,  0.2718, -0.1436, -0.0152,  0.7535,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977]): 13

Layer 10


Tokens embeddings:
tensor([ 0.0697,  0.0615,  0.1960, -0.3584, -0.0152, -0.1529,  0.3398, -0.0694,
         0.3219, -0.1749,  0.2718, -0.1436, -0.0152,  0.7535,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0615,  0.1960, -0.3584, -0.0152, -0.1529,  0.3398, -0.0694,
         0.3219, -0.1749,  0.2718, -0.1436, -0.0152,  0.7535,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977]): 13

Layer 11


Tokens embeddings:
tensor([ 0.0697,  0.0615,  0.1960, -0.3584, -0.0152, -0.1529,  0.3398, -0.0694,
         0.3219, -0.1749,  0.2718, -0.1436, -0.0152,  0.7535,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0615,  0.1960, -0.3584, -0.0152, -0.1529,  0.3398, -0.0694,
         0.3219, -0.1749,  0.2718, -0.1436, -0.0152,  0.7535,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977]): 13

Layer 12


Tokens embeddings:
tensor([ 0.0697,  0.0615,  0.1960, -0.3584, -0.0152, -0.1529,  0.3398, -0.0694,
         0.3219, -0.1749,  0.2718, -0.1436, -0.0152,  0.7535,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0615,  0.1960, -0.3584, -0.0152, -0.1529,  0.3398, -0.0694,
         0.3219, -0.1749,  0.2718, -0.1436, -0.0152,  0.7535,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977]): 13
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.3365e-01,  2.4954e-01,  5.3940e-02,
         9.4951e-02,  8.3178e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.3365e-01,  2.4954e-01,  5.3940e-02,
         9.4951e-02,  8.3178e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.3365e-01,  2.4954e-01,  5.3940e-02,
         9.4951e-02,  8.3178e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.3365e-01,  2.4954e-01,  5.3940e-02,
         9.4951e-02,  8.3178e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.3365e-01,  2.4954e-01,  5.3940e-02,
         9.4951e-02,  8.3178e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.3365e-01,  2.4954e-01,  5.3940e-02,
         9.4951e-02,  8.3178e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.3365e-01,  2.4954e-01,  5.3940e-02,
         9.4951e-02,  8.3178e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.3365e-01,  2.4954e-01,  5.3940e-02,
         9.4951e-02,  8.3178e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.3365e-01,  2.4954e-01,  5.3940e-02,
         9.4951e-02,  8.3178e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.3365e-01,  2.4954e-01,  5.3940e-02,
         9.4951e-02,  8.3178e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.3365e-01,  2.4954e-01,  5.3940e-02,
         9.4951e-02,  8.3178e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.3365e-01,  2.4954e-01,  5.3940e-02,
         9.4951e-02,  8.3178e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.3365e-01,  2.4954e-01,  5.3940e-02,
         9.4951e-02,  8.3178e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.3365e-01,  2.4954e-01,  5.3940e-02,
         9.4951e-02,  8.3178e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.3365e-01,  2.4954e-01,  5.3940e-02,
         9.4951e-02,  8.3178e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.3365e-01,  2.4954e-01,  5.3940e-02,
         9.4951e-02,  8.3178e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.3365e-01,  2.4954e-01,  5.3940e-02,
         9.4951e-02,  8.3178e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.3365e-01,  2.4954e-01,  5.3940e-02,
         9.4951e-02,  8.3178e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.3365e-01,  2.4954e-01,  5.3940e-02,
         9.4951e-02,  8.3178e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.3365e-01,  2.4954e-01,  5.3940e-02,
         9.4951e-02,  8.3178e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.3365e-01,  2.4954e-01,  5.3940e-02,
         9.4951e-02,  8.3178e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.3365e-01,  2.4954e-01,  5.3940e-02,
         9.4951e-02,  8.3178e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.3365e-01,  2.4954e-01,  5.3940e-02,
         9.4951e-02,  8.3178e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.3365e-01,  2.4954e-01,  5.3940e-02,
         9.4951e-02,  8.3178e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02,  4.3533e-02,  2.7131e-01, -8.9444e-02,  3.2186e-01,
         4.3948e-01,  7.7041e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.3533e-02,  2.7131e-01, -8.9444e-02,  3.2186e-01,
         4.3948e-01,  7.7041e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02,  4.3533e-02,  2.7131e-01, -8.9444e-02,  3.2186e-01,
         4.3948e-01,  7.7041e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.3533e-02,  2.7131e-01, -8.9444e-02,  3.2186e-01,
         4.3948e-01,  7.7041e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02,  4.3533e-02,  2.7131e-01, -8.9444e-02,  3.2186e-01,
         4.3948e-01,  7.7041e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.3533e-02,  2.7131e-01, -8.9444e-02,  3.2186e-01,
         4.3948e-01,  7.7041e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02,  4.3533e-02,  2.7131e-01, -8.9444e-02,  3.2186e-01,
         4.3948e-01,  7.7041e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.3533e-02,  2.7131e-01, -8.9444e-02,  3.2186e-01,
         4.3948e-01,  7.7041e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02,  4.3533e-02,  2.7131e-01, -8.9444e-02,  3.2186e-01,
         4.3948e-01,  7.7041e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.3533e-02,  2.7131e-01, -8.9444e-02,  3.2186e-01,
         4.3948e-01,  7.7041e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02,  4.3533e-02,  2.7131e-01, -8.9444e-02,  3.2186e-01,
         4.3948e-01,  7.7041e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.3533e-02,  2.7131e-01, -8.9444e-02,  3.2186e-01,
         4.3948e-01,  7.7041e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02,  4.3533e-02,  2.7131e-01, -8.9444e-02,  3.2186e-01,
         4.3948e-01,  7.7041e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.3533e-02,  2.7131e-01, -8.9444e-02,  3.2186e-01,
         4.3948e-01,  7.7041e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02,  4.3533e-02,  2.7131e-01, -8.9444e-02,  3.2186e-01,
         4.3948e-01,  7.7041e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.3533e-02,  2.7131e-01, -8.9444e-02,  3.2186e-01,
         4.3948e-01,  7.7041e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02,  4.3533e-02,  2.7131e-01, -8.9444e-02,  3.2186e-01,
         4.3948e-01,  7.7041e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.3533e-02,  2.7131e-01, -8.9444e-02,  3.2186e-01,
         4.3948e-01,  7.7041e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02,  4.3533e-02,  2.7131e-01, -8.9444e-02,  3.2186e-01,
         4.3948e-01,  7.7041e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.3533e-02,  2.7131e-01, -8.9444e-02,  3.2186e-01,
         4.3948e-01,  7.7041e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02,  4.3533e-02,  2.7131e-01, -8.9444e-02,  3.2186e-01,
         4.3948e-01,  7.7041e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.3533e-02,  2.7131e-01, -8.9444e-02,  3.2186e-01,
         4.3948e-01,  7.7041e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02,  4.3533e-02,  2.7131e-01, -8.9444e-02,  3.2186e-01,
         4.3948e-01,  7.7041e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.3533e-02,  2.7131e-01, -8.9444e-02,  3.2186e-01,
         4.3948e-01,  7.7041e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02,  4.7534e-01,  3.8479e-01, -1.1768e-01,  6.7170e-01,
        -1.1254e-01,  3.2186e-01,  3.9400e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.7534e-01,  3.8479e-01, -1.1768e-01,  6.7170e-01,
        -1.1254e-01,  3.2186e-01,  3.9400e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02,  4.7534e-01,  3.8479e-01, -1.1768e-01,  6.7170e-01,
        -1.1254e-01,  3.2186e-01,  3.9400e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.7534e-01,  3.8479e-01, -1.1768e-01,  6.7170e-01,
        -1.1254e-01,  3.2186e-01,  3.9400e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02,  4.7534e-01,  3.8479e-01, -1.1768e-01,  6.7170e-01,
        -1.1254e-01,  3.2186e-01,  3.9400e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.7534e-01,  3.8479e-01, -1.1768e-01,  6.7170e-01,
        -1.1254e-01,  3.2186e-01,  3.9400e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02,  4.7534e-01,  3.8479e-01, -1.1768e-01,  6.7170e-01,
        -1.1254e-01,  3.2186e-01,  3.9400e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.7534e-01,  3.8479e-01, -1.1768e-01,  6.7170e-01,
        -1.1254e-01,  3.2186e-01,  3.9400e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02,  4.7534e-01,  3.8479e-01, -1.1768e-01,  6.7170e-01,
        -1.1254e-01,  3.2186e-01,  3.9400e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.7534e-01,  3.8479e-01, -1.1768e-01,  6.7170e-01,
        -1.1254e-01,  3.2186e-01,  3.9400e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02,  4.7534e-01,  3.8479e-01, -1.1768e-01,  6.7170e-01,
        -1.1254e-01,  3.2186e-01,  3.9400e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.7534e-01,  3.8479e-01, -1.1768e-01,  6.7170e-01,
        -1.1254e-01,  3.2186e-01,  3.9400e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02,  4.7534e-01,  3.8479e-01, -1.1768e-01,  6.7170e-01,
        -1.1254e-01,  3.2186e-01,  3.9400e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.7534e-01,  3.8479e-01, -1.1768e-01,  6.7170e-01,
        -1.1254e-01,  3.2186e-01,  3.9400e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02,  4.7534e-01,  3.8479e-01, -1.1768e-01,  6.7170e-01,
        -1.1254e-01,  3.2186e-01,  3.9400e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.7534e-01,  3.8479e-01, -1.1768e-01,  6.7170e-01,
        -1.1254e-01,  3.2186e-01,  3.9400e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02,  4.7534e-01,  3.8479e-01, -1.1768e-01,  6.7170e-01,
        -1.1254e-01,  3.2186e-01,  3.9400e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.7534e-01,  3.8479e-01, -1.1768e-01,  6.7170e-01,
        -1.1254e-01,  3.2186e-01,  3.9400e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02,  4.7534e-01,  3.8479e-01, -1.1768e-01,  6.7170e-01,
        -1.1254e-01,  3.2186e-01,  3.9400e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.7534e-01,  3.8479e-01, -1.1768e-01,  6.7170e-01,
        -1.1254e-01,  3.2186e-01,  3.9400e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02,  4.7534e-01,  3.8479e-01, -1.1768e-01,  6.7170e-01,
        -1.1254e-01,  3.2186e-01,  3.9400e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.7534e-01,  3.8479e-01, -1.1768e-01,  6.7170e-01,
        -1.1254e-01,  3.2186e-01,  3.9400e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02,  4.7534e-01,  3.8479e-01, -1.1768e-01,  6.7170e-01,
        -1.1254e-01,  3.2186e-01,  3.9400e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.7534e-01,  3.8479e-01, -1.1768e-01,  6.7170e-01,
        -1.1254e-01,  3.2186e-01,  3.9400e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 0.0697, -0.2988,  0.0371,  0.3652,  0.1888,  0.1585,  0.2750,  0.6885,
         0.4481,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716,  0.1552,  0.1585,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2988,  0.0371,  0.3652,  0.1888,  0.1585,  0.2750,  0.6885,
         0.4481,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716,  0.1552,  0.1585,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977]): 7

Layer 2


Tokens embeddings:
tensor([ 0.0697, -0.2988,  0.0371,  0.3652,  0.1888,  0.1585,  0.2750,  0.6885,
         0.4481,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716,  0.1552,  0.1585,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2988,  0.0371,  0.3652,  0.1888,  0.1585,  0.2750,  0.6885,
         0.4481,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716,  0.1552,  0.1585,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977]): 7

Layer 3


Tokens embeddings:
tensor([ 0.0697, -0.2988,  0.0371,  0.3652,  0.1888,  0.1585,  0.2750,  0.6885,
         0.4481,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716,  0.1552,  0.1585,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2988,  0.0371,  0.3652,  0.1888,  0.1585,  0.2750,  0.6885,
         0.4481,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716,  0.1552,  0.1585,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977]): 7

Layer 4


Tokens embeddings:
tensor([ 0.0697, -0.2988,  0.0371,  0.3652,  0.1888,  0.1585,  0.2750,  0.6885,
         0.4481,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716,  0.1552,  0.1585,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2988,  0.0371,  0.3652,  0.1888,  0.1585,  0.2750,  0.6885,
         0.4481,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716,  0.1552,  0.1585,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977]): 7

Layer 5


Tokens embeddings:
tensor([ 0.0697, -0.2988,  0.0371,  0.3652,  0.1888,  0.1585,  0.2750,  0.6885,
         0.4481,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716,  0.1552,  0.1585,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2988,  0.0371,  0.3652,  0.1888,  0.1585,  0.2750,  0.6885,
         0.4481,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716,  0.1552,  0.1585,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977]): 7

Layer 6


Tokens embeddings:
tensor([ 0.0697, -0.2988,  0.0371,  0.3652,  0.1888,  0.1585,  0.2750,  0.6885,
         0.4481,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716,  0.1552,  0.1585,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2988,  0.0371,  0.3652,  0.1888,  0.1585,  0.2750,  0.6885,
         0.4481,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716,  0.1552,  0.1585,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977]): 7

Layer 7


Tokens embeddings:
tensor([ 0.0697, -0.2988,  0.0371,  0.3652,  0.1888,  0.1585,  0.2750,  0.6885,
         0.4481,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716,  0.1552,  0.1585,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2988,  0.0371,  0.3652,  0.1888,  0.1585,  0.2750,  0.6885,
         0.4481,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716,  0.1552,  0.1585,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977]): 7

Layer 8


Tokens embeddings:
tensor([ 0.0697, -0.2988,  0.0371,  0.3652,  0.1888,  0.1585,  0.2750,  0.6885,
         0.4481,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716,  0.1552,  0.1585,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2988,  0.0371,  0.3652,  0.1888,  0.1585,  0.2750,  0.6885,
         0.4481,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716,  0.1552,  0.1585,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977]): 7

Layer 9


Tokens embeddings:
tensor([ 0.0697, -0.2988,  0.0371,  0.3652,  0.1888,  0.1585,  0.2750,  0.6885,
         0.4481,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716,  0.1552,  0.1585,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2988,  0.0371,  0.3652,  0.1888,  0.1585,  0.2750,  0.6885,
         0.4481,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716,  0.1552,  0.1585,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977]): 7

Layer 10


Tokens embeddings:
tensor([ 0.0697, -0.2988,  0.0371,  0.3652,  0.1888,  0.1585,  0.2750,  0.6885,
         0.4481,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716,  0.1552,  0.1585,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2988,  0.0371,  0.3652,  0.1888,  0.1585,  0.2750,  0.6885,
         0.4481,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716,  0.1552,  0.1585,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977]): 7

Layer 11


Tokens embeddings:
tensor([ 0.0697, -0.2988,  0.0371,  0.3652,  0.1888,  0.1585,  0.2750,  0.6885,
         0.4481,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716,  0.1552,  0.1585,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2988,  0.0371,  0.3652,  0.1888,  0.1585,  0.2750,  0.6885,
         0.4481,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716,  0.1552,  0.1585,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977]): 7

Layer 12


Tokens embeddings:
tensor([ 0.0697, -0.2988,  0.0371,  0.3652,  0.1888,  0.1585,  0.2750,  0.6885,
         0.4481,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716,  0.1552,  0.1585,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2988,  0.0371,  0.3652,  0.1888,  0.1585,  0.2750,  0.6885,
         0.4481,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716,  0.1552,  0.1585,
         0.0688, -0.0010,  0.0701, -0.1437,  0.1977]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00,  2.7768e-01,  1.0034e-01,  1.1112e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00,  2.7768e-01,  1.0034e-01,  1.1112e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 5

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00,  2.7768e-01,  1.0034e-01,  1.1112e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00,  2.7768e-01,  1.0034e-01,  1.1112e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 5

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00,  2.7768e-01,  1.0034e-01,  1.1112e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00,  2.7768e-01,  1.0034e-01,  1.1112e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 5

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00,  2.7768e-01,  1.0034e-01,  1.1112e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00,  2.7768e-01,  1.0034e-01,  1.1112e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 5

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00,  2.7768e-01,  1.0034e-01,  1.1112e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00,  2.7768e-01,  1.0034e-01,  1.1112e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 5

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00,  2.7768e-01,  1.0034e-01,  1.1112e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00,  2.7768e-01,  1.0034e-01,  1.1112e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 5

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00,  2.7768e-01,  1.0034e-01,  1.1112e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00,  2.7768e-01,  1.0034e-01,  1.1112e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 5

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00,  2.7768e-01,  1.0034e-01,  1.1112e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00,  2.7768e-01,  1.0034e-01,  1.1112e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 5

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00,  2.7768e-01,  1.0034e-01,  1.1112e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00,  2.7768e-01,  1.0034e-01,  1.1112e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 5

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00,  2.7768e-01,  1.0034e-01,  1.1112e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00,  2.7768e-01,  1.0034e-01,  1.1112e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 5

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00,  2.7768e-01,  1.0034e-01,  1.1112e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00,  2.7768e-01,  1.0034e-01,  1.1112e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 5

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00,  2.7768e-01,  1.0034e-01,  1.1112e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00,  2.7768e-01,  1.0034e-01,  1.1112e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
         3.2081e-01,  2.0957e-01, -1.1327e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
         3.2081e-01,  2.0957e-01, -1.1327e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
         3.2081e-01,  2.0957e-01, -1.1327e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
         3.2081e-01,  2.0957e-01, -1.1327e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
         3.2081e-01,  2.0957e-01, -1.1327e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
         3.2081e-01,  2.0957e-01, -1.1327e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
         3.2081e-01,  2.0957e-01, -1.1327e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
         3.2081e-01,  2.0957e-01, -1.1327e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
         3.2081e-01,  2.0957e-01, -1.1327e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
         3.2081e-01,  2.0957e-01, -1.1327e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
         3.2081e-01,  2.0957e-01, -1.1327e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
         3.2081e-01,  2.0957e-01, -1.1327e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
         3.2081e-01,  2.0957e-01, -1.1327e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
         3.2081e-01,  2.0957e-01, -1.1327e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
         3.2081e-01,  2.0957e-01, -1.1327e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
         3.2081e-01,  2.0957e-01, -1.1327e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
         3.2081e-01,  2.0957e-01, -1.1327e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
         3.2081e-01,  2.0957e-01, -1.1327e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
         3.2081e-01,  2.0957e-01, -1.1327e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
         3.2081e-01,  2.0957e-01, -1.1327e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
         3.2081e-01,  2.0957e-01, -1.1327e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
         3.2081e-01,  2.0957e-01, -1.1327e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
         3.2081e-01,  2.0957e-01, -1.1327e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  8.2651e-01, -1.1768e-01,  3.2186e-01,
         3.2081e-01,  2.0957e-01, -1.1327e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 0.0697,  0.0745,  0.2372,  0.2939,  0.1585, -0.4987,  0.3093, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0745,  0.2372,  0.2939,  0.1585, -0.4987,  0.3093, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 2


Tokens embeddings:
tensor([ 0.0697,  0.0745,  0.2372,  0.2939,  0.1585, -0.4987,  0.3093, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0745,  0.2372,  0.2939,  0.1585, -0.4987,  0.3093, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 3


Tokens embeddings:
tensor([ 0.0697,  0.0745,  0.2372,  0.2939,  0.1585, -0.4987,  0.3093, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0745,  0.2372,  0.2939,  0.1585, -0.4987,  0.3093, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 4


Tokens embeddings:
tensor([ 0.0697,  0.0745,  0.2372,  0.2939,  0.1585, -0.4987,  0.3093, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0745,  0.2372,  0.2939,  0.1585, -0.4987,  0.3093, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 5


Tokens embeddings:
tensor([ 0.0697,  0.0745,  0.2372,  0.2939,  0.1585, -0.4987,  0.3093, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0745,  0.2372,  0.2939,  0.1585, -0.4987,  0.3093, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 6


Tokens embeddings:
tensor([ 0.0697,  0.0745,  0.2372,  0.2939,  0.1585, -0.4987,  0.3093, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0745,  0.2372,  0.2939,  0.1585, -0.4987,  0.3093, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 7


Tokens embeddings:
tensor([ 0.0697,  0.0745,  0.2372,  0.2939,  0.1585, -0.4987,  0.3093, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0745,  0.2372,  0.2939,  0.1585, -0.4987,  0.3093, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 8


Tokens embeddings:
tensor([ 0.0697,  0.0745,  0.2372,  0.2939,  0.1585, -0.4987,  0.3093, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0745,  0.2372,  0.2939,  0.1585, -0.4987,  0.3093, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 9


Tokens embeddings:
tensor([ 0.0697,  0.0745,  0.2372,  0.2939,  0.1585, -0.4987,  0.3093, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0745,  0.2372,  0.2939,  0.1585, -0.4987,  0.3093, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 10


Tokens embeddings:
tensor([ 0.0697,  0.0745,  0.2372,  0.2939,  0.1585, -0.4987,  0.3093, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0745,  0.2372,  0.2939,  0.1585, -0.4987,  0.3093, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 11


Tokens embeddings:
tensor([ 0.0697,  0.0745,  0.2372,  0.2939,  0.1585, -0.4987,  0.3093, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0745,  0.2372,  0.2939,  0.1585, -0.4987,  0.3093, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 12


Tokens embeddings:
tensor([ 0.0697,  0.0745,  0.2372,  0.2939,  0.1585, -0.4987,  0.3093, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.0745,  0.2372,  0.2939,  0.1585, -0.4987,  0.3093, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01,  1.2176e+00,
         9.4951e-02,  2.6842e-01, -1.5160e-02,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01,  1.2176e+00,
         9.4951e-02,  2.6842e-01, -1.5160e-02,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01,  1.2176e+00,
         9.4951e-02,  2.6842e-01, -1.5160e-02,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01,  1.2176e+00,
         9.4951e-02,  2.6842e-01, -1.5160e-02,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01,  1.2176e+00,
         9.4951e-02,  2.6842e-01, -1.5160e-02,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01,  1.2176e+00,
         9.4951e-02,  2.6842e-01, -1.5160e-02,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01,  1.2176e+00,
         9.4951e-02,  2.6842e-01, -1.5160e-02,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01,  1.2176e+00,
         9.4951e-02,  2.6842e-01, -1.5160e-02,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01,  1.2176e+00,
         9.4951e-02,  2.6842e-01, -1.5160e-02,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01,  1.2176e+00,
         9.4951e-02,  2.6842e-01, -1.5160e-02,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01,  1.2176e+00,
         9.4951e-02,  2.6842e-01, -1.5160e-02,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01,  1.2176e+00,
         9.4951e-02,  2.6842e-01, -1.5160e-02,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01,  1.2176e+00,
         9.4951e-02,  2.6842e-01, -1.5160e-02,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01,  1.2176e+00,
         9.4951e-02,  2.6842e-01, -1.5160e-02,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01,  1.2176e+00,
         9.4951e-02,  2.6842e-01, -1.5160e-02,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01,  1.2176e+00,
         9.4951e-02,  2.6842e-01, -1.5160e-02,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01,  1.2176e+00,
         9.4951e-02,  2.6842e-01, -1.5160e-02,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01,  1.2176e+00,
         9.4951e-02,  2.6842e-01, -1.5160e-02,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01,  1.2176e+00,
         9.4951e-02,  2.6842e-01, -1.5160e-02,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01,  1.2176e+00,
         9.4951e-02,  2.6842e-01, -1.5160e-02,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01,  1.2176e+00,
         9.4951e-02,  2.6842e-01, -1.5160e-02,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01,  1.2176e+00,
         9.4951e-02,  2.6842e-01, -1.5160e-02,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01,  1.2176e+00,
         9.4951e-02,  2.6842e-01, -1.5160e-02,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01,  1.2176e+00,
         9.4951e-02,  2.6842e-01, -1.5160e-02,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 0.0697, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
         1.3130, -0.4650,  0.0701, -0.1437,  0.1977,  0.1977,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
         1.3130, -0.4650,  0.0701, -0.1437,  0.1977,  0.1977,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 8

Layer 2


Tokens embeddings:
tensor([ 0.0697, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
         1.3130, -0.4650,  0.0701, -0.1437,  0.1977,  0.1977,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
         1.3130, -0.4650,  0.0701, -0.1437,  0.1977,  0.1977,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 8

Layer 3


Tokens embeddings:
tensor([ 0.0697, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
         1.3130, -0.4650,  0.0701, -0.1437,  0.1977,  0.1977,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
         1.3130, -0.4650,  0.0701, -0.1437,  0.1977,  0.1977,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 8

Layer 4


Tokens embeddings:
tensor([ 0.0697, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
         1.3130, -0.4650,  0.0701, -0.1437,  0.1977,  0.1977,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
         1.3130, -0.4650,  0.0701, -0.1437,  0.1977,  0.1977,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 8

Layer 5


Tokens embeddings:
tensor([ 0.0697, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
         1.3130, -0.4650,  0.0701, -0.1437,  0.1977,  0.1977,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
         1.3130, -0.4650,  0.0701, -0.1437,  0.1977,  0.1977,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 8

Layer 6


Tokens embeddings:
tensor([ 0.0697, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
         1.3130, -0.4650,  0.0701, -0.1437,  0.1977,  0.1977,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
         1.3130, -0.4650,  0.0701, -0.1437,  0.1977,  0.1977,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 8

Layer 7


Tokens embeddings:
tensor([ 0.0697, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
         1.3130, -0.4650,  0.0701, -0.1437,  0.1977,  0.1977,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
         1.3130, -0.4650,  0.0701, -0.1437,  0.1977,  0.1977,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 8

Layer 8


Tokens embeddings:
tensor([ 0.0697, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
         1.3130, -0.4650,  0.0701, -0.1437,  0.1977,  0.1977,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
         1.3130, -0.4650,  0.0701, -0.1437,  0.1977,  0.1977,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 8

Layer 9


Tokens embeddings:
tensor([ 0.0697, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
         1.3130, -0.4650,  0.0701, -0.1437,  0.1977,  0.1977,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
         1.3130, -0.4650,  0.0701, -0.1437,  0.1977,  0.1977,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 8

Layer 10


Tokens embeddings:
tensor([ 0.0697, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
         1.3130, -0.4650,  0.0701, -0.1437,  0.1977,  0.1977,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
         1.3130, -0.4650,  0.0701, -0.1437,  0.1977,  0.1977,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 8

Layer 11


Tokens embeddings:
tensor([ 0.0697, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
         1.3130, -0.4650,  0.0701, -0.1437,  0.1977,  0.1977,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
         1.3130, -0.4650,  0.0701, -0.1437,  0.1977,  0.1977,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 8

Layer 12


Tokens embeddings:
tensor([ 0.0697, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
         1.3130, -0.4650,  0.0701, -0.1437,  0.1977,  0.1977,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
         1.3130, -0.4650,  0.0701, -0.1437,  0.1977,  0.1977,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 8
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.8802e-02,  8.3658e-01,  3.1421e-02,
        -1.0000e+09,  1.9830e-01,  1.8564e-01,  1.5566e-01, -1.9737e-01,
         3.2186e-01,  3.1362e-01,  2.4542e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.8802e-02,  8.3658e-01,  3.1421e-02,
        -1.0000e+09,  1.9830e-01,  1.8564e-01,  1.5566e-01, -1.9737e-01,
         3.2186e-01,  3.1362e-01,  2.4542e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 14

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.8802e-02,  8.3658e-01,  3.1421e-02,
        -1.0000e+09,  1.9830e-01,  1.8564e-01,  1.5566e-01, -1.9737e-01,
         3.2186e-01,  3.1362e-01,  2.4542e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.8802e-02,  8.3658e-01,  3.1421e-02,
        -1.0000e+09,  1.9830e-01,  1.8564e-01,  1.5566e-01, -1.9737e-01,
         3.2186e-01,  3.1362e-01,  2.4542e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 14

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.8802e-02,  8.3658e-01,  3.1421e-02,
        -1.0000e+09,  1.9830e-01,  1.8564e-01,  1.5566e-01, -1.9737e-01,
         3.2186e-01,  3.1362e-01,  2.4542e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.8802e-02,  8.3658e-01,  3.1421e-02,
        -1.0000e+09,  1.9830e-01,  1.8564e-01,  1.5566e-01, -1.9737e-01,
         3.2186e-01,  3.1362e-01,  2.4542e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 14

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.8802e-02,  8.3658e-01,  3.1421e-02,
        -1.0000e+09,  1.9830e-01,  1.8564e-01,  1.5566e-01, -1.9737e-01,
         3.2186e-01,  3.1362e-01,  2.4542e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.8802e-02,  8.3658e-01,  3.1421e-02,
        -1.0000e+09,  1.9830e-01,  1.8564e-01,  1.5566e-01, -1.9737e-01,
         3.2186e-01,  3.1362e-01,  2.4542e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 14

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.8802e-02,  8.3658e-01,  3.1421e-02,
        -1.0000e+09,  1.9830e-01,  1.8564e-01,  1.5566e-01, -1.9737e-01,
         3.2186e-01,  3.1362e-01,  2.4542e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.8802e-02,  8.3658e-01,  3.1421e-02,
        -1.0000e+09,  1.9830e-01,  1.8564e-01,  1.5566e-01, -1.9737e-01,
         3.2186e-01,  3.1362e-01,  2.4542e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 14

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.8802e-02,  8.3658e-01,  3.1421e-02,
        -1.0000e+09,  1.9830e-01,  1.8564e-01,  1.5566e-01, -1.9737e-01,
         3.2186e-01,  3.1362e-01,  2.4542e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.8802e-02,  8.3658e-01,  3.1421e-02,
        -1.0000e+09,  1.9830e-01,  1.8564e-01,  1.5566e-01, -1.9737e-01,
         3.2186e-01,  3.1362e-01,  2.4542e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 14

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.8802e-02,  8.3658e-01,  3.1421e-02,
        -1.0000e+09,  1.9830e-01,  1.8564e-01,  1.5566e-01, -1.9737e-01,
         3.2186e-01,  3.1362e-01,  2.4542e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.8802e-02,  8.3658e-01,  3.1421e-02,
        -1.0000e+09,  1.9830e-01,  1.8564e-01,  1.5566e-01, -1.9737e-01,
         3.2186e-01,  3.1362e-01,  2.4542e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 14

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.8802e-02,  8.3658e-01,  3.1421e-02,
        -1.0000e+09,  1.9830e-01,  1.8564e-01,  1.5566e-01, -1.9737e-01,
         3.2186e-01,  3.1362e-01,  2.4542e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.8802e-02,  8.3658e-01,  3.1421e-02,
        -1.0000e+09,  1.9830e-01,  1.8564e-01,  1.5566e-01, -1.9737e-01,
         3.2186e-01,  3.1362e-01,  2.4542e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 14

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.8802e-02,  8.3658e-01,  3.1421e-02,
        -1.0000e+09,  1.9830e-01,  1.8564e-01,  1.5566e-01, -1.9737e-01,
         3.2186e-01,  3.1362e-01,  2.4542e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.8802e-02,  8.3658e-01,  3.1421e-02,
        -1.0000e+09,  1.9830e-01,  1.8564e-01,  1.5566e-01, -1.9737e-01,
         3.2186e-01,  3.1362e-01,  2.4542e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 14

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.8802e-02,  8.3658e-01,  3.1421e-02,
        -1.0000e+09,  1.9830e-01,  1.8564e-01,  1.5566e-01, -1.9737e-01,
         3.2186e-01,  3.1362e-01,  2.4542e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.8802e-02,  8.3658e-01,  3.1421e-02,
        -1.0000e+09,  1.9830e-01,  1.8564e-01,  1.5566e-01, -1.9737e-01,
         3.2186e-01,  3.1362e-01,  2.4542e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 14

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.8802e-02,  8.3658e-01,  3.1421e-02,
        -1.0000e+09,  1.9830e-01,  1.8564e-01,  1.5566e-01, -1.9737e-01,
         3.2186e-01,  3.1362e-01,  2.4542e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.8802e-02,  8.3658e-01,  3.1421e-02,
        -1.0000e+09,  1.9830e-01,  1.8564e-01,  1.5566e-01, -1.9737e-01,
         3.2186e-01,  3.1362e-01,  2.4542e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 14

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.8802e-02,  8.3658e-01,  3.1421e-02,
        -1.0000e+09,  1.9830e-01,  1.8564e-01,  1.5566e-01, -1.9737e-01,
         3.2186e-01,  3.1362e-01,  2.4542e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.8802e-02,  8.3658e-01,  3.1421e-02,
        -1.0000e+09,  1.9830e-01,  1.8564e-01,  1.5566e-01, -1.9737e-01,
         3.2186e-01,  3.1362e-01,  2.4542e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 14
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 0.0697, -0.4739,  0.1736, -0.1410,  0.1585, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4739,  0.1736, -0.1410,  0.1585, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 2


Tokens embeddings:
tensor([ 0.0697, -0.4739,  0.1736, -0.1410,  0.1585, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4739,  0.1736, -0.1410,  0.1585, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 3


Tokens embeddings:
tensor([ 0.0697, -0.4739,  0.1736, -0.1410,  0.1585, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4739,  0.1736, -0.1410,  0.1585, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 4


Tokens embeddings:
tensor([ 0.0697, -0.4739,  0.1736, -0.1410,  0.1585, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4739,  0.1736, -0.1410,  0.1585, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 5


Tokens embeddings:
tensor([ 0.0697, -0.4739,  0.1736, -0.1410,  0.1585, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4739,  0.1736, -0.1410,  0.1585, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 6


Tokens embeddings:
tensor([ 0.0697, -0.4739,  0.1736, -0.1410,  0.1585, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4739,  0.1736, -0.1410,  0.1585, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 7


Tokens embeddings:
tensor([ 0.0697, -0.4739,  0.1736, -0.1410,  0.1585, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4739,  0.1736, -0.1410,  0.1585, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 8


Tokens embeddings:
tensor([ 0.0697, -0.4739,  0.1736, -0.1410,  0.1585, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4739,  0.1736, -0.1410,  0.1585, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 9


Tokens embeddings:
tensor([ 0.0697, -0.4739,  0.1736, -0.1410,  0.1585, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4739,  0.1736, -0.1410,  0.1585, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 10


Tokens embeddings:
tensor([ 0.0697, -0.4739,  0.1736, -0.1410,  0.1585, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4739,  0.1736, -0.1410,  0.1585, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 11


Tokens embeddings:
tensor([ 0.0697, -0.4739,  0.1736, -0.1410,  0.1585, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4739,  0.1736, -0.1410,  0.1585, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 12


Tokens embeddings:
tensor([ 0.0697, -0.4739,  0.1736, -0.1410,  0.1585, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4739,  0.1736, -0.1410,  0.1585, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.0998, -0.1177,  0.3457, -0.3240,  0.3219, -0.8604,
         0.2888, -0.3730,  0.0920, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.2021,  0.0998, -0.1177,  0.3457, -0.3240,  0.3219, -0.8604,
         0.2888, -0.3730,  0.0920, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 2


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.0998, -0.1177,  0.3457, -0.3240,  0.3219, -0.8604,
         0.2888, -0.3730,  0.0920, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.2021,  0.0998, -0.1177,  0.3457, -0.3240,  0.3219, -0.8604,
         0.2888, -0.3730,  0.0920, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 3


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.0998, -0.1177,  0.3457, -0.3240,  0.3219, -0.8604,
         0.2888, -0.3730,  0.0920, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.2021,  0.0998, -0.1177,  0.3457, -0.3240,  0.3219, -0.8604,
         0.2888, -0.3730,  0.0920, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 4


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.0998, -0.1177,  0.3457, -0.3240,  0.3219, -0.8604,
         0.2888, -0.3730,  0.0920, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.2021,  0.0998, -0.1177,  0.3457, -0.3240,  0.3219, -0.8604,
         0.2888, -0.3730,  0.0920, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 5


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.0998, -0.1177,  0.3457, -0.3240,  0.3219, -0.8604,
         0.2888, -0.3730,  0.0920, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.2021,  0.0998, -0.1177,  0.3457, -0.3240,  0.3219, -0.8604,
         0.2888, -0.3730,  0.0920, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 6


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.0998, -0.1177,  0.3457, -0.3240,  0.3219, -0.8604,
         0.2888, -0.3730,  0.0920, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.2021,  0.0998, -0.1177,  0.3457, -0.3240,  0.3219, -0.8604,
         0.2888, -0.3730,  0.0920, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 7


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.0998, -0.1177,  0.3457, -0.3240,  0.3219, -0.8604,
         0.2888, -0.3730,  0.0920, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.2021,  0.0998, -0.1177,  0.3457, -0.3240,  0.3219, -0.8604,
         0.2888, -0.3730,  0.0920, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 8


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.0998, -0.1177,  0.3457, -0.3240,  0.3219, -0.8604,
         0.2888, -0.3730,  0.0920, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.2021,  0.0998, -0.1177,  0.3457, -0.3240,  0.3219, -0.8604,
         0.2888, -0.3730,  0.0920, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 9


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.0998, -0.1177,  0.3457, -0.3240,  0.3219, -0.8604,
         0.2888, -0.3730,  0.0920, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.2021,  0.0998, -0.1177,  0.3457, -0.3240,  0.3219, -0.8604,
         0.2888, -0.3730,  0.0920, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 10


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.0998, -0.1177,  0.3457, -0.3240,  0.3219, -0.8604,
         0.2888, -0.3730,  0.0920, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.2021,  0.0998, -0.1177,  0.3457, -0.3240,  0.3219, -0.8604,
         0.2888, -0.3730,  0.0920, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 11


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.0998, -0.1177,  0.3457, -0.3240,  0.3219, -0.8604,
         0.2888, -0.3730,  0.0920, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.2021,  0.0998, -0.1177,  0.3457, -0.3240,  0.3219, -0.8604,
         0.2888, -0.3730,  0.0920, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4

Layer 12


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.0998, -0.1177,  0.3457, -0.3240,  0.3219, -0.8604,
         0.2888, -0.3730,  0.0920, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.2021,  0.0998, -0.1177,  0.3457, -0.3240,  0.3219, -0.8604,
         0.2888, -0.3730,  0.0920, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.6295e-01,  3.1421e-02,  1.6643e-01,
        -3.3954e-02, -8.6322e-02, -5.8743e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.6295e-01,  3.1421e-02,  1.6643e-01,
        -3.3954e-02, -8.6322e-02, -5.8743e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 12

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.6295e-01,  3.1421e-02,  1.6643e-01,
        -3.3954e-02, -8.6322e-02, -5.8743e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.6295e-01,  3.1421e-02,  1.6643e-01,
        -3.3954e-02, -8.6322e-02, -5.8743e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 12

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.6295e-01,  3.1421e-02,  1.6643e-01,
        -3.3954e-02, -8.6322e-02, -5.8743e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.6295e-01,  3.1421e-02,  1.6643e-01,
        -3.3954e-02, -8.6322e-02, -5.8743e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 12

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.6295e-01,  3.1421e-02,  1.6643e-01,
        -3.3954e-02, -8.6322e-02, -5.8743e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.6295e-01,  3.1421e-02,  1.6643e-01,
        -3.3954e-02, -8.6322e-02, -5.8743e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 12

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.6295e-01,  3.1421e-02,  1.6643e-01,
        -3.3954e-02, -8.6322e-02, -5.8743e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.6295e-01,  3.1421e-02,  1.6643e-01,
        -3.3954e-02, -8.6322e-02, -5.8743e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 12

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.6295e-01,  3.1421e-02,  1.6643e-01,
        -3.3954e-02, -8.6322e-02, -5.8743e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.6295e-01,  3.1421e-02,  1.6643e-01,
        -3.3954e-02, -8.6322e-02, -5.8743e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 12

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.6295e-01,  3.1421e-02,  1.6643e-01,
        -3.3954e-02, -8.6322e-02, -5.8743e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.6295e-01,  3.1421e-02,  1.6643e-01,
        -3.3954e-02, -8.6322e-02, -5.8743e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 12

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.6295e-01,  3.1421e-02,  1.6643e-01,
        -3.3954e-02, -8.6322e-02, -5.8743e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.6295e-01,  3.1421e-02,  1.6643e-01,
        -3.3954e-02, -8.6322e-02, -5.8743e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 12

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.6295e-01,  3.1421e-02,  1.6643e-01,
        -3.3954e-02, -8.6322e-02, -5.8743e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.6295e-01,  3.1421e-02,  1.6643e-01,
        -3.3954e-02, -8.6322e-02, -5.8743e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 12

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.6295e-01,  3.1421e-02,  1.6643e-01,
        -3.3954e-02, -8.6322e-02, -5.8743e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.6295e-01,  3.1421e-02,  1.6643e-01,
        -3.3954e-02, -8.6322e-02, -5.8743e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 12

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.6295e-01,  3.1421e-02,  1.6643e-01,
        -3.3954e-02, -8.6322e-02, -5.8743e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.6295e-01,  3.1421e-02,  1.6643e-01,
        -3.3954e-02, -8.6322e-02, -5.8743e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 12

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.6295e-01,  3.1421e-02,  1.6643e-01,
        -3.3954e-02, -8.6322e-02, -5.8743e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.6295e-01,  3.1421e-02,  1.6643e-01,
        -3.3954e-02, -8.6322e-02, -5.8743e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 12
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930,  0.2598, -0.0848,
         0.2972,  0.1231,  0.3219, -0.0018,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930,  0.2598, -0.0848,
         0.2972,  0.1231,  0.3219, -0.0018,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 2


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930,  0.2598, -0.0848,
         0.2972,  0.1231,  0.3219, -0.0018,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930,  0.2598, -0.0848,
         0.2972,  0.1231,  0.3219, -0.0018,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 3


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930,  0.2598, -0.0848,
         0.2972,  0.1231,  0.3219, -0.0018,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930,  0.2598, -0.0848,
         0.2972,  0.1231,  0.3219, -0.0018,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 4


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930,  0.2598, -0.0848,
         0.2972,  0.1231,  0.3219, -0.0018,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930,  0.2598, -0.0848,
         0.2972,  0.1231,  0.3219, -0.0018,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 5


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930,  0.2598, -0.0848,
         0.2972,  0.1231,  0.3219, -0.0018,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930,  0.2598, -0.0848,
         0.2972,  0.1231,  0.3219, -0.0018,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 6


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930,  0.2598, -0.0848,
         0.2972,  0.1231,  0.3219, -0.0018,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930,  0.2598, -0.0848,
         0.2972,  0.1231,  0.3219, -0.0018,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 7


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930,  0.2598, -0.0848,
         0.2972,  0.1231,  0.3219, -0.0018,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930,  0.2598, -0.0848,
         0.2972,  0.1231,  0.3219, -0.0018,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 8


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930,  0.2598, -0.0848,
         0.2972,  0.1231,  0.3219, -0.0018,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930,  0.2598, -0.0848,
         0.2972,  0.1231,  0.3219, -0.0018,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 9


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930,  0.2598, -0.0848,
         0.2972,  0.1231,  0.3219, -0.0018,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930,  0.2598, -0.0848,
         0.2972,  0.1231,  0.3219, -0.0018,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 10


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930,  0.2598, -0.0848,
         0.2972,  0.1231,  0.3219, -0.0018,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930,  0.2598, -0.0848,
         0.2972,  0.1231,  0.3219, -0.0018,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 11


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930,  0.2598, -0.0848,
         0.2972,  0.1231,  0.3219, -0.0018,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930,  0.2598, -0.0848,
         0.2972,  0.1231,  0.3219, -0.0018,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 12


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930,  0.2598, -0.0848,
         0.2972,  0.1231,  0.3219, -0.0018,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930,  0.2598, -0.0848,
         0.2972,  0.1231,  0.3219, -0.0018,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -4.7071e-01,  4.0065e-01, -6.6095e-01,  1.1389e-01,
         5.0980e-01,  1.2314e-01, -3.9304e-01,  5.0861e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.7071e-01,  4.0065e-01, -6.6095e-01,  1.1389e-01,
         5.0980e-01,  1.2314e-01, -3.9304e-01,  5.0861e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 5

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -4.7071e-01,  4.0065e-01, -6.6095e-01,  1.1389e-01,
         5.0980e-01,  1.2314e-01, -3.9304e-01,  5.0861e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.7071e-01,  4.0065e-01, -6.6095e-01,  1.1389e-01,
         5.0980e-01,  1.2314e-01, -3.9304e-01,  5.0861e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 5

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -4.7071e-01,  4.0065e-01, -6.6095e-01,  1.1389e-01,
         5.0980e-01,  1.2314e-01, -3.9304e-01,  5.0861e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.7071e-01,  4.0065e-01, -6.6095e-01,  1.1389e-01,
         5.0980e-01,  1.2314e-01, -3.9304e-01,  5.0861e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 5

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -4.7071e-01,  4.0065e-01, -6.6095e-01,  1.1389e-01,
         5.0980e-01,  1.2314e-01, -3.9304e-01,  5.0861e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.7071e-01,  4.0065e-01, -6.6095e-01,  1.1389e-01,
         5.0980e-01,  1.2314e-01, -3.9304e-01,  5.0861e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 5

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -4.7071e-01,  4.0065e-01, -6.6095e-01,  1.1389e-01,
         5.0980e-01,  1.2314e-01, -3.9304e-01,  5.0861e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.7071e-01,  4.0065e-01, -6.6095e-01,  1.1389e-01,
         5.0980e-01,  1.2314e-01, -3.9304e-01,  5.0861e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 5

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -4.7071e-01,  4.0065e-01, -6.6095e-01,  1.1389e-01,
         5.0980e-01,  1.2314e-01, -3.9304e-01,  5.0861e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.7071e-01,  4.0065e-01, -6.6095e-01,  1.1389e-01,
         5.0980e-01,  1.2314e-01, -3.9304e-01,  5.0861e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 5

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -4.7071e-01,  4.0065e-01, -6.6095e-01,  1.1389e-01,
         5.0980e-01,  1.2314e-01, -3.9304e-01,  5.0861e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.7071e-01,  4.0065e-01, -6.6095e-01,  1.1389e-01,
         5.0980e-01,  1.2314e-01, -3.9304e-01,  5.0861e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 5

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -4.7071e-01,  4.0065e-01, -6.6095e-01,  1.1389e-01,
         5.0980e-01,  1.2314e-01, -3.9304e-01,  5.0861e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.7071e-01,  4.0065e-01, -6.6095e-01,  1.1389e-01,
         5.0980e-01,  1.2314e-01, -3.9304e-01,  5.0861e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 5

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -4.7071e-01,  4.0065e-01, -6.6095e-01,  1.1389e-01,
         5.0980e-01,  1.2314e-01, -3.9304e-01,  5.0861e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.7071e-01,  4.0065e-01, -6.6095e-01,  1.1389e-01,
         5.0980e-01,  1.2314e-01, -3.9304e-01,  5.0861e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 5

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -4.7071e-01,  4.0065e-01, -6.6095e-01,  1.1389e-01,
         5.0980e-01,  1.2314e-01, -3.9304e-01,  5.0861e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.7071e-01,  4.0065e-01, -6.6095e-01,  1.1389e-01,
         5.0980e-01,  1.2314e-01, -3.9304e-01,  5.0861e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 5

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -4.7071e-01,  4.0065e-01, -6.6095e-01,  1.1389e-01,
         5.0980e-01,  1.2314e-01, -3.9304e-01,  5.0861e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.7071e-01,  4.0065e-01, -6.6095e-01,  1.1389e-01,
         5.0980e-01,  1.2314e-01, -3.9304e-01,  5.0861e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 5

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -4.7071e-01,  4.0065e-01, -6.6095e-01,  1.1389e-01,
         5.0980e-01,  1.2314e-01, -3.9304e-01,  5.0861e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -4.7071e-01,  4.0065e-01, -6.6095e-01,  1.1389e-01,
         5.0980e-01,  1.2314e-01, -3.9304e-01,  5.0861e-01, -6.4283e-02,
        -9.5306e-04,  7.0138e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02,  1.7266e-01, -8.6322e-02, -4.8042e-01,  2.4542e-01,
        -1.5160e-02,  1.0754e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  1.7266e-01, -8.6322e-02, -4.8042e-01,  2.4542e-01,
        -1.5160e-02,  1.0754e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02,  1.7266e-01, -8.6322e-02, -4.8042e-01,  2.4542e-01,
        -1.5160e-02,  1.0754e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  1.7266e-01, -8.6322e-02, -4.8042e-01,  2.4542e-01,
        -1.5160e-02,  1.0754e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02,  1.7266e-01, -8.6322e-02, -4.8042e-01,  2.4542e-01,
        -1.5160e-02,  1.0754e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  1.7266e-01, -8.6322e-02, -4.8042e-01,  2.4542e-01,
        -1.5160e-02,  1.0754e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02,  1.7266e-01, -8.6322e-02, -4.8042e-01,  2.4542e-01,
        -1.5160e-02,  1.0754e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  1.7266e-01, -8.6322e-02, -4.8042e-01,  2.4542e-01,
        -1.5160e-02,  1.0754e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02,  1.7266e-01, -8.6322e-02, -4.8042e-01,  2.4542e-01,
        -1.5160e-02,  1.0754e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  1.7266e-01, -8.6322e-02, -4.8042e-01,  2.4542e-01,
        -1.5160e-02,  1.0754e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02,  1.7266e-01, -8.6322e-02, -4.8042e-01,  2.4542e-01,
        -1.5160e-02,  1.0754e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  1.7266e-01, -8.6322e-02, -4.8042e-01,  2.4542e-01,
        -1.5160e-02,  1.0754e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02,  1.7266e-01, -8.6322e-02, -4.8042e-01,  2.4542e-01,
        -1.5160e-02,  1.0754e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  1.7266e-01, -8.6322e-02, -4.8042e-01,  2.4542e-01,
        -1.5160e-02,  1.0754e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02,  1.7266e-01, -8.6322e-02, -4.8042e-01,  2.4542e-01,
        -1.5160e-02,  1.0754e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  1.7266e-01, -8.6322e-02, -4.8042e-01,  2.4542e-01,
        -1.5160e-02,  1.0754e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02,  1.7266e-01, -8.6322e-02, -4.8042e-01,  2.4542e-01,
        -1.5160e-02,  1.0754e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  1.7266e-01, -8.6322e-02, -4.8042e-01,  2.4542e-01,
        -1.5160e-02,  1.0754e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02,  1.7266e-01, -8.6322e-02, -4.8042e-01,  2.4542e-01,
        -1.5160e-02,  1.0754e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  1.7266e-01, -8.6322e-02, -4.8042e-01,  2.4542e-01,
        -1.5160e-02,  1.0754e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02,  1.7266e-01, -8.6322e-02, -4.8042e-01,  2.4542e-01,
        -1.5160e-02,  1.0754e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  1.7266e-01, -8.6322e-02, -4.8042e-01,  2.4542e-01,
        -1.5160e-02,  1.0754e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02,  1.7266e-01, -8.6322e-02, -4.8042e-01,  2.4542e-01,
        -1.5160e-02,  1.0754e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  1.7266e-01, -8.6322e-02, -4.8042e-01,  2.4542e-01,
        -1.5160e-02,  1.0754e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -2.6456e-02,  1.5625e-01, -1.0938e-01,  1.2314e-01,
        -3.9304e-01,  6.1218e-02,  4.0942e-01,  6.5911e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.6456e-02,  1.5625e-01, -1.0938e-01,  1.2314e-01,
        -3.9304e-01,  6.1218e-02,  4.0942e-01,  6.5911e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 8

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -2.6456e-02,  1.5625e-01, -1.0938e-01,  1.2314e-01,
        -3.9304e-01,  6.1218e-02,  4.0942e-01,  6.5911e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.6456e-02,  1.5625e-01, -1.0938e-01,  1.2314e-01,
        -3.9304e-01,  6.1218e-02,  4.0942e-01,  6.5911e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 8

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -2.6456e-02,  1.5625e-01, -1.0938e-01,  1.2314e-01,
        -3.9304e-01,  6.1218e-02,  4.0942e-01,  6.5911e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.6456e-02,  1.5625e-01, -1.0938e-01,  1.2314e-01,
        -3.9304e-01,  6.1218e-02,  4.0942e-01,  6.5911e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 8

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -2.6456e-02,  1.5625e-01, -1.0938e-01,  1.2314e-01,
        -3.9304e-01,  6.1218e-02,  4.0942e-01,  6.5911e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.6456e-02,  1.5625e-01, -1.0938e-01,  1.2314e-01,
        -3.9304e-01,  6.1218e-02,  4.0942e-01,  6.5911e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 8

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -2.6456e-02,  1.5625e-01, -1.0938e-01,  1.2314e-01,
        -3.9304e-01,  6.1218e-02,  4.0942e-01,  6.5911e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.6456e-02,  1.5625e-01, -1.0938e-01,  1.2314e-01,
        -3.9304e-01,  6.1218e-02,  4.0942e-01,  6.5911e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 8

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -2.6456e-02,  1.5625e-01, -1.0938e-01,  1.2314e-01,
        -3.9304e-01,  6.1218e-02,  4.0942e-01,  6.5911e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.6456e-02,  1.5625e-01, -1.0938e-01,  1.2314e-01,
        -3.9304e-01,  6.1218e-02,  4.0942e-01,  6.5911e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 8

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -2.6456e-02,  1.5625e-01, -1.0938e-01,  1.2314e-01,
        -3.9304e-01,  6.1218e-02,  4.0942e-01,  6.5911e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.6456e-02,  1.5625e-01, -1.0938e-01,  1.2314e-01,
        -3.9304e-01,  6.1218e-02,  4.0942e-01,  6.5911e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 8

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -2.6456e-02,  1.5625e-01, -1.0938e-01,  1.2314e-01,
        -3.9304e-01,  6.1218e-02,  4.0942e-01,  6.5911e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.6456e-02,  1.5625e-01, -1.0938e-01,  1.2314e-01,
        -3.9304e-01,  6.1218e-02,  4.0942e-01,  6.5911e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 8

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -2.6456e-02,  1.5625e-01, -1.0938e-01,  1.2314e-01,
        -3.9304e-01,  6.1218e-02,  4.0942e-01,  6.5911e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.6456e-02,  1.5625e-01, -1.0938e-01,  1.2314e-01,
        -3.9304e-01,  6.1218e-02,  4.0942e-01,  6.5911e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 8

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -2.6456e-02,  1.5625e-01, -1.0938e-01,  1.2314e-01,
        -3.9304e-01,  6.1218e-02,  4.0942e-01,  6.5911e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.6456e-02,  1.5625e-01, -1.0938e-01,  1.2314e-01,
        -3.9304e-01,  6.1218e-02,  4.0942e-01,  6.5911e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 8

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -2.6456e-02,  1.5625e-01, -1.0938e-01,  1.2314e-01,
        -3.9304e-01,  6.1218e-02,  4.0942e-01,  6.5911e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.6456e-02,  1.5625e-01, -1.0938e-01,  1.2314e-01,
        -3.9304e-01,  6.1218e-02,  4.0942e-01,  6.5911e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 8

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -2.6456e-02,  1.5625e-01, -1.0938e-01,  1.2314e-01,
        -3.9304e-01,  6.1218e-02,  4.0942e-01,  6.5911e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.6456e-02,  1.5625e-01, -1.0938e-01,  1.2314e-01,
        -3.9304e-01,  6.1218e-02,  4.0942e-01,  6.5911e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 8
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 0.0697, -0.0094,  0.7325,  0.7810,  0.5202, -0.2430,  0.1072, -0.6133,
         0.2029, -0.1177,  0.0688, -0.0010,  0.0701, -0.1437,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.0094,  0.7325,  0.7810,  0.5202, -0.2430,  0.1072, -0.6133,
         0.2029, -0.1177,  0.0688, -0.0010,  0.0701, -0.1437,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 2


Tokens embeddings:
tensor([ 0.0697, -0.0094,  0.7325,  0.7810,  0.5202, -0.2430,  0.1072, -0.6133,
         0.2029, -0.1177,  0.0688, -0.0010,  0.0701, -0.1437,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.0094,  0.7325,  0.7810,  0.5202, -0.2430,  0.1072, -0.6133,
         0.2029, -0.1177,  0.0688, -0.0010,  0.0701, -0.1437,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 3


Tokens embeddings:
tensor([ 0.0697, -0.0094,  0.7325,  0.7810,  0.5202, -0.2430,  0.1072, -0.6133,
         0.2029, -0.1177,  0.0688, -0.0010,  0.0701, -0.1437,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.0094,  0.7325,  0.7810,  0.5202, -0.2430,  0.1072, -0.6133,
         0.2029, -0.1177,  0.0688, -0.0010,  0.0701, -0.1437,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 4


Tokens embeddings:
tensor([ 0.0697, -0.0094,  0.7325,  0.7810,  0.5202, -0.2430,  0.1072, -0.6133,
         0.2029, -0.1177,  0.0688, -0.0010,  0.0701, -0.1437,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.0094,  0.7325,  0.7810,  0.5202, -0.2430,  0.1072, -0.6133,
         0.2029, -0.1177,  0.0688, -0.0010,  0.0701, -0.1437,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 5


Tokens embeddings:
tensor([ 0.0697, -0.0094,  0.7325,  0.7810,  0.5202, -0.2430,  0.1072, -0.6133,
         0.2029, -0.1177,  0.0688, -0.0010,  0.0701, -0.1437,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.0094,  0.7325,  0.7810,  0.5202, -0.2430,  0.1072, -0.6133,
         0.2029, -0.1177,  0.0688, -0.0010,  0.0701, -0.1437,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 6


Tokens embeddings:
tensor([ 0.0697, -0.0094,  0.7325,  0.7810,  0.5202, -0.2430,  0.1072, -0.6133,
         0.2029, -0.1177,  0.0688, -0.0010,  0.0701, -0.1437,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.0094,  0.7325,  0.7810,  0.5202, -0.2430,  0.1072, -0.6133,
         0.2029, -0.1177,  0.0688, -0.0010,  0.0701, -0.1437,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 7


Tokens embeddings:
tensor([ 0.0697, -0.0094,  0.7325,  0.7810,  0.5202, -0.2430,  0.1072, -0.6133,
         0.2029, -0.1177,  0.0688, -0.0010,  0.0701, -0.1437,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.0094,  0.7325,  0.7810,  0.5202, -0.2430,  0.1072, -0.6133,
         0.2029, -0.1177,  0.0688, -0.0010,  0.0701, -0.1437,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 8


Tokens embeddings:
tensor([ 0.0697, -0.0094,  0.7325,  0.7810,  0.5202, -0.2430,  0.1072, -0.6133,
         0.2029, -0.1177,  0.0688, -0.0010,  0.0701, -0.1437,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.0094,  0.7325,  0.7810,  0.5202, -0.2430,  0.1072, -0.6133,
         0.2029, -0.1177,  0.0688, -0.0010,  0.0701, -0.1437,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 9


Tokens embeddings:
tensor([ 0.0697, -0.0094,  0.7325,  0.7810,  0.5202, -0.2430,  0.1072, -0.6133,
         0.2029, -0.1177,  0.0688, -0.0010,  0.0701, -0.1437,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.0094,  0.7325,  0.7810,  0.5202, -0.2430,  0.1072, -0.6133,
         0.2029, -0.1177,  0.0688, -0.0010,  0.0701, -0.1437,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 10


Tokens embeddings:
tensor([ 0.0697, -0.0094,  0.7325,  0.7810,  0.5202, -0.2430,  0.1072, -0.6133,
         0.2029, -0.1177,  0.0688, -0.0010,  0.0701, -0.1437,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.0094,  0.7325,  0.7810,  0.5202, -0.2430,  0.1072, -0.6133,
         0.2029, -0.1177,  0.0688, -0.0010,  0.0701, -0.1437,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 11


Tokens embeddings:
tensor([ 0.0697, -0.0094,  0.7325,  0.7810,  0.5202, -0.2430,  0.1072, -0.6133,
         0.2029, -0.1177,  0.0688, -0.0010,  0.0701, -0.1437,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.0094,  0.7325,  0.7810,  0.5202, -0.2430,  0.1072, -0.6133,
         0.2029, -0.1177,  0.0688, -0.0010,  0.0701, -0.1437,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 12


Tokens embeddings:
tensor([ 0.0697, -0.0094,  0.7325,  0.7810,  0.5202, -0.2430,  0.1072, -0.6133,
         0.2029, -0.1177,  0.0688, -0.0010,  0.0701, -0.1437,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.0094,  0.7325,  0.7810,  0.5202, -0.2430,  0.1072, -0.6133,
         0.2029, -0.1177,  0.0688, -0.0010,  0.0701, -0.1437,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  8.8999e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01,  5.1000e-01, -2.0351e-01,  6.8802e-02, -1.1824e+00,
         3.2186e-01, -4.7894e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  8.8999e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01,  5.1000e-01, -2.0351e-01,  6.8802e-02, -1.1824e+00,
         3.2186e-01, -4.7894e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  8.8999e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01,  5.1000e-01, -2.0351e-01,  6.8802e-02, -1.1824e+00,
         3.2186e-01, -4.7894e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  8.8999e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01,  5.1000e-01, -2.0351e-01,  6.8802e-02, -1.1824e+00,
         3.2186e-01, -4.7894e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  8.8999e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01,  5.1000e-01, -2.0351e-01,  6.8802e-02, -1.1824e+00,
         3.2186e-01, -4.7894e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  8.8999e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01,  5.1000e-01, -2.0351e-01,  6.8802e-02, -1.1824e+00,
         3.2186e-01, -4.7894e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  8.8999e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01,  5.1000e-01, -2.0351e-01,  6.8802e-02, -1.1824e+00,
         3.2186e-01, -4.7894e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  8.8999e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01,  5.1000e-01, -2.0351e-01,  6.8802e-02, -1.1824e+00,
         3.2186e-01, -4.7894e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  8.8999e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01,  5.1000e-01, -2.0351e-01,  6.8802e-02, -1.1824e+00,
         3.2186e-01, -4.7894e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  8.8999e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01,  5.1000e-01, -2.0351e-01,  6.8802e-02, -1.1824e+00,
         3.2186e-01, -4.7894e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  8.8999e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01,  5.1000e-01, -2.0351e-01,  6.8802e-02, -1.1824e+00,
         3.2186e-01, -4.7894e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  8.8999e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01,  5.1000e-01, -2.0351e-01,  6.8802e-02, -1.1824e+00,
         3.2186e-01, -4.7894e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  8.8999e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01,  5.1000e-01, -2.0351e-01,  6.8802e-02, -1.1824e+00,
         3.2186e-01, -4.7894e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  8.8999e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01,  5.1000e-01, -2.0351e-01,  6.8802e-02, -1.1824e+00,
         3.2186e-01, -4.7894e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  8.8999e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01,  5.1000e-01, -2.0351e-01,  6.8802e-02, -1.1824e+00,
         3.2186e-01, -4.7894e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  8.8999e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01,  5.1000e-01, -2.0351e-01,  6.8802e-02, -1.1824e+00,
         3.2186e-01, -4.7894e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  8.8999e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01,  5.1000e-01, -2.0351e-01,  6.8802e-02, -1.1824e+00,
         3.2186e-01, -4.7894e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  8.8999e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01,  5.1000e-01, -2.0351e-01,  6.8802e-02, -1.1824e+00,
         3.2186e-01, -4.7894e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  8.8999e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01,  5.1000e-01, -2.0351e-01,  6.8802e-02, -1.1824e+00,
         3.2186e-01, -4.7894e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  8.8999e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01,  5.1000e-01, -2.0351e-01,  6.8802e-02, -1.1824e+00,
         3.2186e-01, -4.7894e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  8.8999e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01,  5.1000e-01, -2.0351e-01,  6.8802e-02, -1.1824e+00,
         3.2186e-01, -4.7894e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  8.8999e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01,  5.1000e-01, -2.0351e-01,  6.8802e-02, -1.1824e+00,
         3.2186e-01, -4.7894e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  8.8999e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01,  5.1000e-01, -2.0351e-01,  6.8802e-02, -1.1824e+00,
         3.2186e-01, -4.7894e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  8.8999e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01,  5.1000e-01, -2.0351e-01,  6.8802e-02, -1.1824e+00,
         3.2186e-01, -4.7894e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2758, -0.2805,  0.1030,  0.4687,  0.1585, -0.3240,
         0.3219, -0.4676,  0.6038, -0.2805,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2758, -0.2805,  0.1030,  0.4687,  0.1585, -0.3240,
         0.3219, -0.4676,  0.6038, -0.2805,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 2


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2758, -0.2805,  0.1030,  0.4687,  0.1585, -0.3240,
         0.3219, -0.4676,  0.6038, -0.2805,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2758, -0.2805,  0.1030,  0.4687,  0.1585, -0.3240,
         0.3219, -0.4676,  0.6038, -0.2805,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 3


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2758, -0.2805,  0.1030,  0.4687,  0.1585, -0.3240,
         0.3219, -0.4676,  0.6038, -0.2805,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2758, -0.2805,  0.1030,  0.4687,  0.1585, -0.3240,
         0.3219, -0.4676,  0.6038, -0.2805,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 4


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2758, -0.2805,  0.1030,  0.4687,  0.1585, -0.3240,
         0.3219, -0.4676,  0.6038, -0.2805,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2758, -0.2805,  0.1030,  0.4687,  0.1585, -0.3240,
         0.3219, -0.4676,  0.6038, -0.2805,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 5


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2758, -0.2805,  0.1030,  0.4687,  0.1585, -0.3240,
         0.3219, -0.4676,  0.6038, -0.2805,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2758, -0.2805,  0.1030,  0.4687,  0.1585, -0.3240,
         0.3219, -0.4676,  0.6038, -0.2805,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 6


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2758, -0.2805,  0.1030,  0.4687,  0.1585, -0.3240,
         0.3219, -0.4676,  0.6038, -0.2805,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2758, -0.2805,  0.1030,  0.4687,  0.1585, -0.3240,
         0.3219, -0.4676,  0.6038, -0.2805,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 7


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2758, -0.2805,  0.1030,  0.4687,  0.1585, -0.3240,
         0.3219, -0.4676,  0.6038, -0.2805,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2758, -0.2805,  0.1030,  0.4687,  0.1585, -0.3240,
         0.3219, -0.4676,  0.6038, -0.2805,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 8


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2758, -0.2805,  0.1030,  0.4687,  0.1585, -0.3240,
         0.3219, -0.4676,  0.6038, -0.2805,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2758, -0.2805,  0.1030,  0.4687,  0.1585, -0.3240,
         0.3219, -0.4676,  0.6038, -0.2805,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 9


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2758, -0.2805,  0.1030,  0.4687,  0.1585, -0.3240,
         0.3219, -0.4676,  0.6038, -0.2805,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2758, -0.2805,  0.1030,  0.4687,  0.1585, -0.3240,
         0.3219, -0.4676,  0.6038, -0.2805,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 10


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2758, -0.2805,  0.1030,  0.4687,  0.1585, -0.3240,
         0.3219, -0.4676,  0.6038, -0.2805,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2758, -0.2805,  0.1030,  0.4687,  0.1585, -0.3240,
         0.3219, -0.4676,  0.6038, -0.2805,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 11


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2758, -0.2805,  0.1030,  0.4687,  0.1585, -0.3240,
         0.3219, -0.4676,  0.6038, -0.2805,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2758, -0.2805,  0.1030,  0.4687,  0.1585, -0.3240,
         0.3219, -0.4676,  0.6038, -0.2805,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 12


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2758, -0.2805,  0.1030,  0.4687,  0.1585, -0.3240,
         0.3219, -0.4676,  0.6038, -0.2805,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2758, -0.2805,  0.1030,  0.4687,  0.1585, -0.3240,
         0.3219, -0.4676,  0.6038, -0.2805,  0.0688, -0.0010,  0.0701, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2552, -0.5237,  0.3506,  0.1622,  0.3691, -0.4961,
         0.2211, -0.3930,  0.5086, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2552, -0.5237,  0.3506,  0.1622,  0.3691, -0.4961,
         0.2211, -0.3930,  0.5086, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 2


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2552, -0.5237,  0.3506,  0.1622,  0.3691, -0.4961,
         0.2211, -0.3930,  0.5086, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2552, -0.5237,  0.3506,  0.1622,  0.3691, -0.4961,
         0.2211, -0.3930,  0.5086, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 3


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2552, -0.5237,  0.3506,  0.1622,  0.3691, -0.4961,
         0.2211, -0.3930,  0.5086, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2552, -0.5237,  0.3506,  0.1622,  0.3691, -0.4961,
         0.2211, -0.3930,  0.5086, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 4


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2552, -0.5237,  0.3506,  0.1622,  0.3691, -0.4961,
         0.2211, -0.3930,  0.5086, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2552, -0.5237,  0.3506,  0.1622,  0.3691, -0.4961,
         0.2211, -0.3930,  0.5086, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 5


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2552, -0.5237,  0.3506,  0.1622,  0.3691, -0.4961,
         0.2211, -0.3930,  0.5086, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2552, -0.5237,  0.3506,  0.1622,  0.3691, -0.4961,
         0.2211, -0.3930,  0.5086, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 6


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2552, -0.5237,  0.3506,  0.1622,  0.3691, -0.4961,
         0.2211, -0.3930,  0.5086, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2552, -0.5237,  0.3506,  0.1622,  0.3691, -0.4961,
         0.2211, -0.3930,  0.5086, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 7


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2552, -0.5237,  0.3506,  0.1622,  0.3691, -0.4961,
         0.2211, -0.3930,  0.5086, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2552, -0.5237,  0.3506,  0.1622,  0.3691, -0.4961,
         0.2211, -0.3930,  0.5086, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 8


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2552, -0.5237,  0.3506,  0.1622,  0.3691, -0.4961,
         0.2211, -0.3930,  0.5086, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2552, -0.5237,  0.3506,  0.1622,  0.3691, -0.4961,
         0.2211, -0.3930,  0.5086, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 9


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2552, -0.5237,  0.3506,  0.1622,  0.3691, -0.4961,
         0.2211, -0.3930,  0.5086, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2552, -0.5237,  0.3506,  0.1622,  0.3691, -0.4961,
         0.2211, -0.3930,  0.5086, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 10


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2552, -0.5237,  0.3506,  0.1622,  0.3691, -0.4961,
         0.2211, -0.3930,  0.5086, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2552, -0.5237,  0.3506,  0.1622,  0.3691, -0.4961,
         0.2211, -0.3930,  0.5086, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 11


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2552, -0.5237,  0.3506,  0.1622,  0.3691, -0.4961,
         0.2211, -0.3930,  0.5086, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2552, -0.5237,  0.3506,  0.1622,  0.3691, -0.4961,
         0.2211, -0.3930,  0.5086, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10

Layer 12


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.2552, -0.5237,  0.3506,  0.1622,  0.3691, -0.4961,
         0.2211, -0.3930,  0.5086, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.2552, -0.5237,  0.3506,  0.1622,  0.3691, -0.4961,
         0.2211, -0.3930,  0.5086, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 10
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.6956e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.6956e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.6956e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.6956e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.6956e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.6956e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.6956e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.6956e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.6956e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.6956e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.6956e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.6956e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.6956e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.6956e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.6956e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.6956e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.6956e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.6956e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.6956e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.6956e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.6956e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.6956e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  2.6956e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  2.6956e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04,  7.0138e-02,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.1668e-01,  3.7102e-02, -4.3724e-02,
        -2.5309e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.1668e-01,  3.7102e-02, -4.3724e-02,
        -2.5309e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.1668e-01,  3.7102e-02, -4.3724e-02,
        -2.5309e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.1668e-01,  3.7102e-02, -4.3724e-02,
        -2.5309e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.1668e-01,  3.7102e-02, -4.3724e-02,
        -2.5309e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.1668e-01,  3.7102e-02, -4.3724e-02,
        -2.5309e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.1668e-01,  3.7102e-02, -4.3724e-02,
        -2.5309e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.1668e-01,  3.7102e-02, -4.3724e-02,
        -2.5309e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.1668e-01,  3.7102e-02, -4.3724e-02,
        -2.5309e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.1668e-01,  3.7102e-02, -4.3724e-02,
        -2.5309e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.1668e-01,  3.7102e-02, -4.3724e-02,
        -2.5309e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.1668e-01,  3.7102e-02, -4.3724e-02,
        -2.5309e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.1668e-01,  3.7102e-02, -4.3724e-02,
        -2.5309e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.1668e-01,  3.7102e-02, -4.3724e-02,
        -2.5309e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.1668e-01,  3.7102e-02, -4.3724e-02,
        -2.5309e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.1668e-01,  3.7102e-02, -4.3724e-02,
        -2.5309e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.1668e-01,  3.7102e-02, -4.3724e-02,
        -2.5309e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.1668e-01,  3.7102e-02, -4.3724e-02,
        -2.5309e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.1668e-01,  3.7102e-02, -4.3724e-02,
        -2.5309e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.1668e-01,  3.7102e-02, -4.3724e-02,
        -2.5309e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.1668e-01,  3.7102e-02, -4.3724e-02,
        -2.5309e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.1668e-01,  3.7102e-02, -4.3724e-02,
        -2.5309e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.1668e-01,  3.7102e-02, -4.3724e-02,
        -2.5309e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.1668e-01,  3.7102e-02, -4.3724e-02,
        -2.5309e-02,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 0.0697, -0.4489,  0.5232,  0.1841,  0.0887,  0.4568,  0.1740, -0.8463,
         0.5293,  0.4093,  0.5055, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4489,  0.5232,  0.1841,  0.0887,  0.4568,  0.1740, -0.8463,
         0.5293,  0.4093,  0.5055, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 8

Layer 2


Tokens embeddings:
tensor([ 0.0697, -0.4489,  0.5232,  0.1841,  0.0887,  0.4568,  0.1740, -0.8463,
         0.5293,  0.4093,  0.5055, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4489,  0.5232,  0.1841,  0.0887,  0.4568,  0.1740, -0.8463,
         0.5293,  0.4093,  0.5055, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 8

Layer 3


Tokens embeddings:
tensor([ 0.0697, -0.4489,  0.5232,  0.1841,  0.0887,  0.4568,  0.1740, -0.8463,
         0.5293,  0.4093,  0.5055, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4489,  0.5232,  0.1841,  0.0887,  0.4568,  0.1740, -0.8463,
         0.5293,  0.4093,  0.5055, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 8

Layer 4


Tokens embeddings:
tensor([ 0.0697, -0.4489,  0.5232,  0.1841,  0.0887,  0.4568,  0.1740, -0.8463,
         0.5293,  0.4093,  0.5055, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4489,  0.5232,  0.1841,  0.0887,  0.4568,  0.1740, -0.8463,
         0.5293,  0.4093,  0.5055, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 8

Layer 5


Tokens embeddings:
tensor([ 0.0697, -0.4489,  0.5232,  0.1841,  0.0887,  0.4568,  0.1740, -0.8463,
         0.5293,  0.4093,  0.5055, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4489,  0.5232,  0.1841,  0.0887,  0.4568,  0.1740, -0.8463,
         0.5293,  0.4093,  0.5055, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 8

Layer 6


Tokens embeddings:
tensor([ 0.0697, -0.4489,  0.5232,  0.1841,  0.0887,  0.4568,  0.1740, -0.8463,
         0.5293,  0.4093,  0.5055, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4489,  0.5232,  0.1841,  0.0887,  0.4568,  0.1740, -0.8463,
         0.5293,  0.4093,  0.5055, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 8

Layer 7


Tokens embeddings:
tensor([ 0.0697, -0.4489,  0.5232,  0.1841,  0.0887,  0.4568,  0.1740, -0.8463,
         0.5293,  0.4093,  0.5055, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4489,  0.5232,  0.1841,  0.0887,  0.4568,  0.1740, -0.8463,
         0.5293,  0.4093,  0.5055, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 8

Layer 8


Tokens embeddings:
tensor([ 0.0697, -0.4489,  0.5232,  0.1841,  0.0887,  0.4568,  0.1740, -0.8463,
         0.5293,  0.4093,  0.5055, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4489,  0.5232,  0.1841,  0.0887,  0.4568,  0.1740, -0.8463,
         0.5293,  0.4093,  0.5055, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 8

Layer 9


Tokens embeddings:
tensor([ 0.0697, -0.4489,  0.5232,  0.1841,  0.0887,  0.4568,  0.1740, -0.8463,
         0.5293,  0.4093,  0.5055, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4489,  0.5232,  0.1841,  0.0887,  0.4568,  0.1740, -0.8463,
         0.5293,  0.4093,  0.5055, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 8

Layer 10


Tokens embeddings:
tensor([ 0.0697, -0.4489,  0.5232,  0.1841,  0.0887,  0.4568,  0.1740, -0.8463,
         0.5293,  0.4093,  0.5055, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4489,  0.5232,  0.1841,  0.0887,  0.4568,  0.1740, -0.8463,
         0.5293,  0.4093,  0.5055, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 8

Layer 11


Tokens embeddings:
tensor([ 0.0697, -0.4489,  0.5232,  0.1841,  0.0887,  0.4568,  0.1740, -0.8463,
         0.5293,  0.4093,  0.5055, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4489,  0.5232,  0.1841,  0.0887,  0.4568,  0.1740, -0.8463,
         0.5293,  0.4093,  0.5055, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 8

Layer 12


Tokens embeddings:
tensor([ 0.0697, -0.4489,  0.5232,  0.1841,  0.0887,  0.4568,  0.1740, -0.8463,
         0.5293,  0.4093,  0.5055, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.4489,  0.5232,  0.1841,  0.0887,  0.4568,  0.1740, -0.8463,
         0.5293,  0.4093,  0.5055, -0.0643, -0.0010, -0.0650,  0.1977, -0.1437,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 8
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463,  0.2675, -0.0121,
         0.0887,  0.3295,  0.3219, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463,  0.2675, -0.0121,
         0.0887,  0.3295,  0.3219, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 2


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463,  0.2675, -0.0121,
         0.0887,  0.3295,  0.3219, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463,  0.2675, -0.0121,
         0.0887,  0.3295,  0.3219, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 3


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463,  0.2675, -0.0121,
         0.0887,  0.3295,  0.3219, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463,  0.2675, -0.0121,
         0.0887,  0.3295,  0.3219, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 4


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463,  0.2675, -0.0121,
         0.0887,  0.3295,  0.3219, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463,  0.2675, -0.0121,
         0.0887,  0.3295,  0.3219, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 5


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463,  0.2675, -0.0121,
         0.0887,  0.3295,  0.3219, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463,  0.2675, -0.0121,
         0.0887,  0.3295,  0.3219, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 6


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463,  0.2675, -0.0121,
         0.0887,  0.3295,  0.3219, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463,  0.2675, -0.0121,
         0.0887,  0.3295,  0.3219, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 7


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463,  0.2675, -0.0121,
         0.0887,  0.3295,  0.3219, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463,  0.2675, -0.0121,
         0.0887,  0.3295,  0.3219, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 8


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463,  0.2675, -0.0121,
         0.0887,  0.3295,  0.3219, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463,  0.2675, -0.0121,
         0.0887,  0.3295,  0.3219, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 9


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463,  0.2675, -0.0121,
         0.0887,  0.3295,  0.3219, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463,  0.2675, -0.0121,
         0.0887,  0.3295,  0.3219, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 10


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463,  0.2675, -0.0121,
         0.0887,  0.3295,  0.3219, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463,  0.2675, -0.0121,
         0.0887,  0.3295,  0.3219, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 11


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463,  0.2675, -0.0121,
         0.0887,  0.3295,  0.3219, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463,  0.2675, -0.0121,
         0.0887,  0.3295,  0.3219, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2

Layer 12


Tokens embeddings:
tensor([ 0.0697,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463,  0.2675, -0.0121,
         0.0887,  0.3295,  0.3219, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463,  0.2675, -0.0121,
         0.0887,  0.3295,  0.3219, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02,  8.0882e-02,  6.8802e-02, -6.2366e-01,  1.7395e-01,
        -1.0000e+09,  1.2756e+00,  8.8741e-02,  6.7170e-01, -5.6287e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  8.0882e-02,  6.8802e-02, -6.2366e-01,  1.7395e-01,
        -1.0000e+09,  1.2756e+00,  8.8741e-02,  6.7170e-01, -5.6287e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02,  8.0882e-02,  6.8802e-02, -6.2366e-01,  1.7395e-01,
        -1.0000e+09,  1.2756e+00,  8.8741e-02,  6.7170e-01, -5.6287e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  8.0882e-02,  6.8802e-02, -6.2366e-01,  1.7395e-01,
        -1.0000e+09,  1.2756e+00,  8.8741e-02,  6.7170e-01, -5.6287e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02,  8.0882e-02,  6.8802e-02, -6.2366e-01,  1.7395e-01,
        -1.0000e+09,  1.2756e+00,  8.8741e-02,  6.7170e-01, -5.6287e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  8.0882e-02,  6.8802e-02, -6.2366e-01,  1.7395e-01,
        -1.0000e+09,  1.2756e+00,  8.8741e-02,  6.7170e-01, -5.6287e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02,  8.0882e-02,  6.8802e-02, -6.2366e-01,  1.7395e-01,
        -1.0000e+09,  1.2756e+00,  8.8741e-02,  6.7170e-01, -5.6287e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  8.0882e-02,  6.8802e-02, -6.2366e-01,  1.7395e-01,
        -1.0000e+09,  1.2756e+00,  8.8741e-02,  6.7170e-01, -5.6287e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02,  8.0882e-02,  6.8802e-02, -6.2366e-01,  1.7395e-01,
        -1.0000e+09,  1.2756e+00,  8.8741e-02,  6.7170e-01, -5.6287e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  8.0882e-02,  6.8802e-02, -6.2366e-01,  1.7395e-01,
        -1.0000e+09,  1.2756e+00,  8.8741e-02,  6.7170e-01, -5.6287e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02,  8.0882e-02,  6.8802e-02, -6.2366e-01,  1.7395e-01,
        -1.0000e+09,  1.2756e+00,  8.8741e-02,  6.7170e-01, -5.6287e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  8.0882e-02,  6.8802e-02, -6.2366e-01,  1.7395e-01,
        -1.0000e+09,  1.2756e+00,  8.8741e-02,  6.7170e-01, -5.6287e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02,  8.0882e-02,  6.8802e-02, -6.2366e-01,  1.7395e-01,
        -1.0000e+09,  1.2756e+00,  8.8741e-02,  6.7170e-01, -5.6287e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  8.0882e-02,  6.8802e-02, -6.2366e-01,  1.7395e-01,
        -1.0000e+09,  1.2756e+00,  8.8741e-02,  6.7170e-01, -5.6287e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02,  8.0882e-02,  6.8802e-02, -6.2366e-01,  1.7395e-01,
        -1.0000e+09,  1.2756e+00,  8.8741e-02,  6.7170e-01, -5.6287e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  8.0882e-02,  6.8802e-02, -6.2366e-01,  1.7395e-01,
        -1.0000e+09,  1.2756e+00,  8.8741e-02,  6.7170e-01, -5.6287e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02,  8.0882e-02,  6.8802e-02, -6.2366e-01,  1.7395e-01,
        -1.0000e+09,  1.2756e+00,  8.8741e-02,  6.7170e-01, -5.6287e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  8.0882e-02,  6.8802e-02, -6.2366e-01,  1.7395e-01,
        -1.0000e+09,  1.2756e+00,  8.8741e-02,  6.7170e-01, -5.6287e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02,  8.0882e-02,  6.8802e-02, -6.2366e-01,  1.7395e-01,
        -1.0000e+09,  1.2756e+00,  8.8741e-02,  6.7170e-01, -5.6287e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  8.0882e-02,  6.8802e-02, -6.2366e-01,  1.7395e-01,
        -1.0000e+09,  1.2756e+00,  8.8741e-02,  6.7170e-01, -5.6287e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02,  8.0882e-02,  6.8802e-02, -6.2366e-01,  1.7395e-01,
        -1.0000e+09,  1.2756e+00,  8.8741e-02,  6.7170e-01, -5.6287e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  8.0882e-02,  6.8802e-02, -6.2366e-01,  1.7395e-01,
        -1.0000e+09,  1.2756e+00,  8.8741e-02,  6.7170e-01, -5.6287e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02,  8.0882e-02,  6.8802e-02, -6.2366e-01,  1.7395e-01,
        -1.0000e+09,  1.2756e+00,  8.8741e-02,  6.7170e-01, -5.6287e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  8.0882e-02,  6.8802e-02, -6.2366e-01,  1.7395e-01,
        -1.0000e+09,  1.2756e+00,  8.8741e-02,  6.7170e-01, -5.6287e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 6
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 0.0697,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950,  0.1630,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650,  0.1977,  0.1977,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950,  0.1630,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650,  0.1977,  0.1977,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 7

Layer 2


Tokens embeddings:
tensor([ 0.0697,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950,  0.1630,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650,  0.1977,  0.1977,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950,  0.1630,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650,  0.1977,  0.1977,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 7

Layer 3


Tokens embeddings:
tensor([ 0.0697,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950,  0.1630,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650,  0.1977,  0.1977,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950,  0.1630,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650,  0.1977,  0.1977,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 7

Layer 4


Tokens embeddings:
tensor([ 0.0697,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950,  0.1630,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650,  0.1977,  0.1977,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950,  0.1630,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650,  0.1977,  0.1977,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 7

Layer 5


Tokens embeddings:
tensor([ 0.0697,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950,  0.1630,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650,  0.1977,  0.1977,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950,  0.1630,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650,  0.1977,  0.1977,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 7

Layer 6


Tokens embeddings:
tensor([ 0.0697,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950,  0.1630,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650,  0.1977,  0.1977,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950,  0.1630,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650,  0.1977,  0.1977,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 7

Layer 7


Tokens embeddings:
tensor([ 0.0697,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950,  0.1630,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650,  0.1977,  0.1977,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950,  0.1630,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650,  0.1977,  0.1977,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 7

Layer 8


Tokens embeddings:
tensor([ 0.0697,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950,  0.1630,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650,  0.1977,  0.1977,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950,  0.1630,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650,  0.1977,  0.1977,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 7

Layer 9


Tokens embeddings:
tensor([ 0.0697,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950,  0.1630,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650,  0.1977,  0.1977,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950,  0.1630,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650,  0.1977,  0.1977,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 7

Layer 10


Tokens embeddings:
tensor([ 0.0697,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950,  0.1630,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650,  0.1977,  0.1977,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950,  0.1630,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650,  0.1977,  0.1977,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 7

Layer 11


Tokens embeddings:
tensor([ 0.0697,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950,  0.1630,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650,  0.1977,  0.1977,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950,  0.1630,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650,  0.1977,  0.1977,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 7

Layer 12


Tokens embeddings:
tensor([ 0.0697,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950,  0.1630,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650,  0.1977,  0.1977,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950,  0.1630,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650,  0.1977,  0.1977,
         0.1977, -0.1437,  0.1977, -0.1437,  0.1977]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02,  7.4549e-02,  2.9705e-01,  1.0309e+00,  1.5846e-01,
        -6.8482e-02,  3.2186e-01,  1.4499e-01,  1.0974e+00, -2.7025e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.4549e-02,  2.9705e-01,  1.0309e+00,  1.5846e-01,
        -6.8482e-02,  3.2186e-01,  1.4499e-01,  1.0974e+00, -2.7025e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 8

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02,  7.4549e-02,  2.9705e-01,  1.0309e+00,  1.5846e-01,
        -6.8482e-02,  3.2186e-01,  1.4499e-01,  1.0974e+00, -2.7025e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.4549e-02,  2.9705e-01,  1.0309e+00,  1.5846e-01,
        -6.8482e-02,  3.2186e-01,  1.4499e-01,  1.0974e+00, -2.7025e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 8

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02,  7.4549e-02,  2.9705e-01,  1.0309e+00,  1.5846e-01,
        -6.8482e-02,  3.2186e-01,  1.4499e-01,  1.0974e+00, -2.7025e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.4549e-02,  2.9705e-01,  1.0309e+00,  1.5846e-01,
        -6.8482e-02,  3.2186e-01,  1.4499e-01,  1.0974e+00, -2.7025e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 8

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02,  7.4549e-02,  2.9705e-01,  1.0309e+00,  1.5846e-01,
        -6.8482e-02,  3.2186e-01,  1.4499e-01,  1.0974e+00, -2.7025e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.4549e-02,  2.9705e-01,  1.0309e+00,  1.5846e-01,
        -6.8482e-02,  3.2186e-01,  1.4499e-01,  1.0974e+00, -2.7025e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 8

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02,  7.4549e-02,  2.9705e-01,  1.0309e+00,  1.5846e-01,
        -6.8482e-02,  3.2186e-01,  1.4499e-01,  1.0974e+00, -2.7025e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.4549e-02,  2.9705e-01,  1.0309e+00,  1.5846e-01,
        -6.8482e-02,  3.2186e-01,  1.4499e-01,  1.0974e+00, -2.7025e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 8

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02,  7.4549e-02,  2.9705e-01,  1.0309e+00,  1.5846e-01,
        -6.8482e-02,  3.2186e-01,  1.4499e-01,  1.0974e+00, -2.7025e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.4549e-02,  2.9705e-01,  1.0309e+00,  1.5846e-01,
        -6.8482e-02,  3.2186e-01,  1.4499e-01,  1.0974e+00, -2.7025e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 8

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02,  7.4549e-02,  2.9705e-01,  1.0309e+00,  1.5846e-01,
        -6.8482e-02,  3.2186e-01,  1.4499e-01,  1.0974e+00, -2.7025e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.4549e-02,  2.9705e-01,  1.0309e+00,  1.5846e-01,
        -6.8482e-02,  3.2186e-01,  1.4499e-01,  1.0974e+00, -2.7025e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 8

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02,  7.4549e-02,  2.9705e-01,  1.0309e+00,  1.5846e-01,
        -6.8482e-02,  3.2186e-01,  1.4499e-01,  1.0974e+00, -2.7025e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.4549e-02,  2.9705e-01,  1.0309e+00,  1.5846e-01,
        -6.8482e-02,  3.2186e-01,  1.4499e-01,  1.0974e+00, -2.7025e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 8

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02,  7.4549e-02,  2.9705e-01,  1.0309e+00,  1.5846e-01,
        -6.8482e-02,  3.2186e-01,  1.4499e-01,  1.0974e+00, -2.7025e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.4549e-02,  2.9705e-01,  1.0309e+00,  1.5846e-01,
        -6.8482e-02,  3.2186e-01,  1.4499e-01,  1.0974e+00, -2.7025e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 8

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02,  7.4549e-02,  2.9705e-01,  1.0309e+00,  1.5846e-01,
        -6.8482e-02,  3.2186e-01,  1.4499e-01,  1.0974e+00, -2.7025e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.4549e-02,  2.9705e-01,  1.0309e+00,  1.5846e-01,
        -6.8482e-02,  3.2186e-01,  1.4499e-01,  1.0974e+00, -2.7025e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 8

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02,  7.4549e-02,  2.9705e-01,  1.0309e+00,  1.5846e-01,
        -6.8482e-02,  3.2186e-01,  1.4499e-01,  1.0974e+00, -2.7025e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.4549e-02,  2.9705e-01,  1.0309e+00,  1.5846e-01,
        -6.8482e-02,  3.2186e-01,  1.4499e-01,  1.0974e+00, -2.7025e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 8

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02,  7.4549e-02,  2.9705e-01,  1.0309e+00,  1.5846e-01,
        -6.8482e-02,  3.2186e-01,  1.4499e-01,  1.0974e+00, -2.7025e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.4549e-02,  2.9705e-01,  1.0309e+00,  1.5846e-01,
        -6.8482e-02,  3.2186e-01,  1.4499e-01,  1.0974e+00, -2.7025e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 8
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  5.7751e-01, -1.5160e-02,  3.6517e-01,
        -2.2062e-01,  1.5847e-01, -3.7936e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  5.7751e-01, -1.5160e-02,  3.6517e-01,
        -2.2062e-01,  1.5847e-01, -3.7936e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  5.7751e-01, -1.5160e-02,  3.6517e-01,
        -2.2062e-01,  1.5847e-01, -3.7936e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  5.7751e-01, -1.5160e-02,  3.6517e-01,
        -2.2062e-01,  1.5847e-01, -3.7936e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  5.7751e-01, -1.5160e-02,  3.6517e-01,
        -2.2062e-01,  1.5847e-01, -3.7936e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  5.7751e-01, -1.5160e-02,  3.6517e-01,
        -2.2062e-01,  1.5847e-01, -3.7936e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  5.7751e-01, -1.5160e-02,  3.6517e-01,
        -2.2062e-01,  1.5847e-01, -3.7936e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  5.7751e-01, -1.5160e-02,  3.6517e-01,
        -2.2062e-01,  1.5847e-01, -3.7936e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  5.7751e-01, -1.5160e-02,  3.6517e-01,
        -2.2062e-01,  1.5847e-01, -3.7936e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  5.7751e-01, -1.5160e-02,  3.6517e-01,
        -2.2062e-01,  1.5847e-01, -3.7936e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  5.7751e-01, -1.5160e-02,  3.6517e-01,
        -2.2062e-01,  1.5847e-01, -3.7936e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  5.7751e-01, -1.5160e-02,  3.6517e-01,
        -2.2062e-01,  1.5847e-01, -3.7936e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  5.7751e-01, -1.5160e-02,  3.6517e-01,
        -2.2062e-01,  1.5847e-01, -3.7936e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  5.7751e-01, -1.5160e-02,  3.6517e-01,
        -2.2062e-01,  1.5847e-01, -3.7936e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  5.7751e-01, -1.5160e-02,  3.6517e-01,
        -2.2062e-01,  1.5847e-01, -3.7936e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  5.7751e-01, -1.5160e-02,  3.6517e-01,
        -2.2062e-01,  1.5847e-01, -3.7936e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  5.7751e-01, -1.5160e-02,  3.6517e-01,
        -2.2062e-01,  1.5847e-01, -3.7936e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  5.7751e-01, -1.5160e-02,  3.6517e-01,
        -2.2062e-01,  1.5847e-01, -3.7936e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  5.7751e-01, -1.5160e-02,  3.6517e-01,
        -2.2062e-01,  1.5847e-01, -3.7936e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  5.7751e-01, -1.5160e-02,  3.6517e-01,
        -2.2062e-01,  1.5847e-01, -3.7936e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  5.7751e-01, -1.5160e-02,  3.6517e-01,
        -2.2062e-01,  1.5847e-01, -3.7936e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  5.7751e-01, -1.5160e-02,  3.6517e-01,
        -2.2062e-01,  1.5847e-01, -3.7936e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  5.7751e-01, -1.5160e-02,  3.6517e-01,
        -2.2062e-01,  1.5847e-01, -3.7936e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  5.7751e-01, -1.5160e-02,  3.6517e-01,
        -2.2062e-01,  1.5847e-01, -3.7936e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 3

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 3

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 3

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 3

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 3

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 3

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 3

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 3

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 3

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 3

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 3

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02,  7.7733e-01,  3.2186e-01, -8.0278e-02,  9.4951e-02,
        -1.1703e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.7733e-01,  3.2186e-01, -8.0278e-02,  9.4951e-02,
        -1.1703e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 1

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02,  7.7733e-01,  3.2186e-01, -8.0278e-02,  9.4951e-02,
        -1.1703e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.7733e-01,  3.2186e-01, -8.0278e-02,  9.4951e-02,
        -1.1703e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 1

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02,  7.7733e-01,  3.2186e-01, -8.0278e-02,  9.4951e-02,
        -1.1703e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.7733e-01,  3.2186e-01, -8.0278e-02,  9.4951e-02,
        -1.1703e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 1

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02,  7.7733e-01,  3.2186e-01, -8.0278e-02,  9.4951e-02,
        -1.1703e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.7733e-01,  3.2186e-01, -8.0278e-02,  9.4951e-02,
        -1.1703e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 1

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02,  7.7733e-01,  3.2186e-01, -8.0278e-02,  9.4951e-02,
        -1.1703e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.7733e-01,  3.2186e-01, -8.0278e-02,  9.4951e-02,
        -1.1703e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 1

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02,  7.7733e-01,  3.2186e-01, -8.0278e-02,  9.4951e-02,
        -1.1703e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.7733e-01,  3.2186e-01, -8.0278e-02,  9.4951e-02,
        -1.1703e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 1

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02,  7.7733e-01,  3.2186e-01, -8.0278e-02,  9.4951e-02,
        -1.1703e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.7733e-01,  3.2186e-01, -8.0278e-02,  9.4951e-02,
        -1.1703e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 1

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02,  7.7733e-01,  3.2186e-01, -8.0278e-02,  9.4951e-02,
        -1.1703e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.7733e-01,  3.2186e-01, -8.0278e-02,  9.4951e-02,
        -1.1703e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 1

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02,  7.7733e-01,  3.2186e-01, -8.0278e-02,  9.4951e-02,
        -1.1703e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.7733e-01,  3.2186e-01, -8.0278e-02,  9.4951e-02,
        -1.1703e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 1

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02,  7.7733e-01,  3.2186e-01, -8.0278e-02,  9.4951e-02,
        -1.1703e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.7733e-01,  3.2186e-01, -8.0278e-02,  9.4951e-02,
        -1.1703e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 1

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02,  7.7733e-01,  3.2186e-01, -8.0278e-02,  9.4951e-02,
        -1.1703e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.7733e-01,  3.2186e-01, -8.0278e-02,  9.4951e-02,
        -1.1703e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 1

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02,  7.7733e-01,  3.2186e-01, -8.0278e-02,  9.4951e-02,
        -1.1703e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  7.7733e-01,  3.2186e-01, -8.0278e-02,  9.4951e-02,
        -1.1703e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 1
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.5056e-01,  1.2314e-01,  1.8355e-01,
         1.2215e+00,  3.3454e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.5056e-01,  1.2314e-01,  1.8355e-01,
         1.2215e+00,  3.3454e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 5

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.5056e-01,  1.2314e-01,  1.8355e-01,
         1.2215e+00,  3.3454e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.5056e-01,  1.2314e-01,  1.8355e-01,
         1.2215e+00,  3.3454e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 5

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.5056e-01,  1.2314e-01,  1.8355e-01,
         1.2215e+00,  3.3454e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.5056e-01,  1.2314e-01,  1.8355e-01,
         1.2215e+00,  3.3454e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 5

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.5056e-01,  1.2314e-01,  1.8355e-01,
         1.2215e+00,  3.3454e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.5056e-01,  1.2314e-01,  1.8355e-01,
         1.2215e+00,  3.3454e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 5

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.5056e-01,  1.2314e-01,  1.8355e-01,
         1.2215e+00,  3.3454e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.5056e-01,  1.2314e-01,  1.8355e-01,
         1.2215e+00,  3.3454e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 5

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.5056e-01,  1.2314e-01,  1.8355e-01,
         1.2215e+00,  3.3454e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.5056e-01,  1.2314e-01,  1.8355e-01,
         1.2215e+00,  3.3454e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 5

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.5056e-01,  1.2314e-01,  1.8355e-01,
         1.2215e+00,  3.3454e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.5056e-01,  1.2314e-01,  1.8355e-01,
         1.2215e+00,  3.3454e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 5

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.5056e-01,  1.2314e-01,  1.8355e-01,
         1.2215e+00,  3.3454e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.5056e-01,  1.2314e-01,  1.8355e-01,
         1.2215e+00,  3.3454e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 5

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.5056e-01,  1.2314e-01,  1.8355e-01,
         1.2215e+00,  3.3454e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.5056e-01,  1.2314e-01,  1.8355e-01,
         1.2215e+00,  3.3454e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 5

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.5056e-01,  1.2314e-01,  1.8355e-01,
         1.2215e+00,  3.3454e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.5056e-01,  1.2314e-01,  1.8355e-01,
         1.2215e+00,  3.3454e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 5

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.5056e-01,  1.2314e-01,  1.8355e-01,
         1.2215e+00,  3.3454e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.5056e-01,  1.2314e-01,  1.8355e-01,
         1.2215e+00,  3.3454e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 5

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  1.5056e-01,  1.2314e-01,  1.8355e-01,
         1.2215e+00,  3.3454e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  1.5056e-01,  1.2314e-01,  1.8355e-01,
         1.2215e+00,  3.3454e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
         6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.3879e-01,  1.1239e+00,  8.7470e-02,
         5.0980e-01,  2.4143e-01,  1.2055e+00,  6.6516e-01, -1.1768e-01,
         1.1437e+00,  3.6414e-01,  1.4931e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.3879e-01,  1.1239e+00,  8.7470e-02,
         5.0980e-01,  2.4143e-01,  1.2055e+00,  6.6516e-01, -1.1768e-01,
         1.1437e+00,  3.6414e-01,  1.4931e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 7

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.3879e-01,  1.1239e+00,  8.7470e-02,
         5.0980e-01,  2.4143e-01,  1.2055e+00,  6.6516e-01, -1.1768e-01,
         1.1437e+00,  3.6414e-01,  1.4931e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.3879e-01,  1.1239e+00,  8.7470e-02,
         5.0980e-01,  2.4143e-01,  1.2055e+00,  6.6516e-01, -1.1768e-01,
         1.1437e+00,  3.6414e-01,  1.4931e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 7

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.3879e-01,  1.1239e+00,  8.7470e-02,
         5.0980e-01,  2.4143e-01,  1.2055e+00,  6.6516e-01, -1.1768e-01,
         1.1437e+00,  3.6414e-01,  1.4931e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.3879e-01,  1.1239e+00,  8.7470e-02,
         5.0980e-01,  2.4143e-01,  1.2055e+00,  6.6516e-01, -1.1768e-01,
         1.1437e+00,  3.6414e-01,  1.4931e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 7

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.3879e-01,  1.1239e+00,  8.7470e-02,
         5.0980e-01,  2.4143e-01,  1.2055e+00,  6.6516e-01, -1.1768e-01,
         1.1437e+00,  3.6414e-01,  1.4931e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.3879e-01,  1.1239e+00,  8.7470e-02,
         5.0980e-01,  2.4143e-01,  1.2055e+00,  6.6516e-01, -1.1768e-01,
         1.1437e+00,  3.6414e-01,  1.4931e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 7

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.3879e-01,  1.1239e+00,  8.7470e-02,
         5.0980e-01,  2.4143e-01,  1.2055e+00,  6.6516e-01, -1.1768e-01,
         1.1437e+00,  3.6414e-01,  1.4931e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.3879e-01,  1.1239e+00,  8.7470e-02,
         5.0980e-01,  2.4143e-01,  1.2055e+00,  6.6516e-01, -1.1768e-01,
         1.1437e+00,  3.6414e-01,  1.4931e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 7

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.3879e-01,  1.1239e+00,  8.7470e-02,
         5.0980e-01,  2.4143e-01,  1.2055e+00,  6.6516e-01, -1.1768e-01,
         1.1437e+00,  3.6414e-01,  1.4931e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.3879e-01,  1.1239e+00,  8.7470e-02,
         5.0980e-01,  2.4143e-01,  1.2055e+00,  6.6516e-01, -1.1768e-01,
         1.1437e+00,  3.6414e-01,  1.4931e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 7

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.3879e-01,  1.1239e+00,  8.7470e-02,
         5.0980e-01,  2.4143e-01,  1.2055e+00,  6.6516e-01, -1.1768e-01,
         1.1437e+00,  3.6414e-01,  1.4931e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.3879e-01,  1.1239e+00,  8.7470e-02,
         5.0980e-01,  2.4143e-01,  1.2055e+00,  6.6516e-01, -1.1768e-01,
         1.1437e+00,  3.6414e-01,  1.4931e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 7

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.3879e-01,  1.1239e+00,  8.7470e-02,
         5.0980e-01,  2.4143e-01,  1.2055e+00,  6.6516e-01, -1.1768e-01,
         1.1437e+00,  3.6414e-01,  1.4931e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.3879e-01,  1.1239e+00,  8.7470e-02,
         5.0980e-01,  2.4143e-01,  1.2055e+00,  6.6516e-01, -1.1768e-01,
         1.1437e+00,  3.6414e-01,  1.4931e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 7

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.3879e-01,  1.1239e+00,  8.7470e-02,
         5.0980e-01,  2.4143e-01,  1.2055e+00,  6.6516e-01, -1.1768e-01,
         1.1437e+00,  3.6414e-01,  1.4931e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.3879e-01,  1.1239e+00,  8.7470e-02,
         5.0980e-01,  2.4143e-01,  1.2055e+00,  6.6516e-01, -1.1768e-01,
         1.1437e+00,  3.6414e-01,  1.4931e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 7

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.3879e-01,  1.1239e+00,  8.7470e-02,
         5.0980e-01,  2.4143e-01,  1.2055e+00,  6.6516e-01, -1.1768e-01,
         1.1437e+00,  3.6414e-01,  1.4931e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.3879e-01,  1.1239e+00,  8.7470e-02,
         5.0980e-01,  2.4143e-01,  1.2055e+00,  6.6516e-01, -1.1768e-01,
         1.1437e+00,  3.6414e-01,  1.4931e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 7

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.3879e-01,  1.1239e+00,  8.7470e-02,
         5.0980e-01,  2.4143e-01,  1.2055e+00,  6.6516e-01, -1.1768e-01,
         1.1437e+00,  3.6414e-01,  1.4931e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.3879e-01,  1.1239e+00,  8.7470e-02,
         5.0980e-01,  2.4143e-01,  1.2055e+00,  6.6516e-01, -1.1768e-01,
         1.1437e+00,  3.6414e-01,  1.4931e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 7

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  4.3879e-01,  1.1239e+00,  8.7470e-02,
         5.0980e-01,  2.4143e-01,  1.2055e+00,  6.6516e-01, -1.1768e-01,
         1.1437e+00,  3.6414e-01,  1.4931e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  4.3879e-01,  1.1239e+00,  8.7470e-02,
         5.0980e-01,  2.4143e-01,  1.2055e+00,  6.6516e-01, -1.1768e-01,
         1.1437e+00,  3.6414e-01,  1.4931e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.2346e-01,  9.4017e-01,  2.5516e-01,
        -6.9396e-02,  1.2176e+00, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.2346e-01,  9.4017e-01,  2.5516e-01,
        -6.9396e-02,  1.2176e+00, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 6

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.2346e-01,  9.4017e-01,  2.5516e-01,
        -6.9396e-02,  1.2176e+00, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.2346e-01,  9.4017e-01,  2.5516e-01,
        -6.9396e-02,  1.2176e+00, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 6

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.2346e-01,  9.4017e-01,  2.5516e-01,
        -6.9396e-02,  1.2176e+00, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.2346e-01,  9.4017e-01,  2.5516e-01,
        -6.9396e-02,  1.2176e+00, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 6

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.2346e-01,  9.4017e-01,  2.5516e-01,
        -6.9396e-02,  1.2176e+00, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.2346e-01,  9.4017e-01,  2.5516e-01,
        -6.9396e-02,  1.2176e+00, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 6

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.2346e-01,  9.4017e-01,  2.5516e-01,
        -6.9396e-02,  1.2176e+00, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.2346e-01,  9.4017e-01,  2.5516e-01,
        -6.9396e-02,  1.2176e+00, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 6

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.2346e-01,  9.4017e-01,  2.5516e-01,
        -6.9396e-02,  1.2176e+00, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.2346e-01,  9.4017e-01,  2.5516e-01,
        -6.9396e-02,  1.2176e+00, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 6

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.2346e-01,  9.4017e-01,  2.5516e-01,
        -6.9396e-02,  1.2176e+00, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.2346e-01,  9.4017e-01,  2.5516e-01,
        -6.9396e-02,  1.2176e+00, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 6

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.2346e-01,  9.4017e-01,  2.5516e-01,
        -6.9396e-02,  1.2176e+00, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.2346e-01,  9.4017e-01,  2.5516e-01,
        -6.9396e-02,  1.2176e+00, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 6

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.2346e-01,  9.4017e-01,  2.5516e-01,
        -6.9396e-02,  1.2176e+00, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.2346e-01,  9.4017e-01,  2.5516e-01,
        -6.9396e-02,  1.2176e+00, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 6

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.2346e-01,  9.4017e-01,  2.5516e-01,
        -6.9396e-02,  1.2176e+00, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.2346e-01,  9.4017e-01,  2.5516e-01,
        -6.9396e-02,  1.2176e+00, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 6

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.2346e-01,  9.4017e-01,  2.5516e-01,
        -6.9396e-02,  1.2176e+00, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.2346e-01,  9.4017e-01,  2.5516e-01,
        -6.9396e-02,  1.2176e+00, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 6

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02, -2.0659e-01,  6.2346e-01,  9.4017e-01,  2.5516e-01,
        -6.9396e-02,  1.2176e+00, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01])
arg max of tensor([ 6.9720e-02, -2.0659e-01,  6.2346e-01,  9.4017e-01,  2.5516e-01,
        -6.9396e-02,  1.2176e+00, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,
         1.9772e-01]): 6
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.0195,  0.7810,  0.1585, -0.4961,  0.3219, -0.0010,
         0.0887, -0.3930,  0.1270, -0.2035,  0.3219,  0.4737,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.0195,  0.7810,  0.1585, -0.4961,  0.3219, -0.0010,
         0.0887, -0.3930,  0.1270, -0.2035,  0.3219,  0.4737,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 2


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.0195,  0.7810,  0.1585, -0.4961,  0.3219, -0.0010,
         0.0887, -0.3930,  0.1270, -0.2035,  0.3219,  0.4737,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.0195,  0.7810,  0.1585, -0.4961,  0.3219, -0.0010,
         0.0887, -0.3930,  0.1270, -0.2035,  0.3219,  0.4737,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 3


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.0195,  0.7810,  0.1585, -0.4961,  0.3219, -0.0010,
         0.0887, -0.3930,  0.1270, -0.2035,  0.3219,  0.4737,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.0195,  0.7810,  0.1585, -0.4961,  0.3219, -0.0010,
         0.0887, -0.3930,  0.1270, -0.2035,  0.3219,  0.4737,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 4


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.0195,  0.7810,  0.1585, -0.4961,  0.3219, -0.0010,
         0.0887, -0.3930,  0.1270, -0.2035,  0.3219,  0.4737,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.0195,  0.7810,  0.1585, -0.4961,  0.3219, -0.0010,
         0.0887, -0.3930,  0.1270, -0.2035,  0.3219,  0.4737,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 5


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.0195,  0.7810,  0.1585, -0.4961,  0.3219, -0.0010,
         0.0887, -0.3930,  0.1270, -0.2035,  0.3219,  0.4737,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.0195,  0.7810,  0.1585, -0.4961,  0.3219, -0.0010,
         0.0887, -0.3930,  0.1270, -0.2035,  0.3219,  0.4737,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 6


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.0195,  0.7810,  0.1585, -0.4961,  0.3219, -0.0010,
         0.0887, -0.3930,  0.1270, -0.2035,  0.3219,  0.4737,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.0195,  0.7810,  0.1585, -0.4961,  0.3219, -0.0010,
         0.0887, -0.3930,  0.1270, -0.2035,  0.3219,  0.4737,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 7


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.0195,  0.7810,  0.1585, -0.4961,  0.3219, -0.0010,
         0.0887, -0.3930,  0.1270, -0.2035,  0.3219,  0.4737,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.0195,  0.7810,  0.1585, -0.4961,  0.3219, -0.0010,
         0.0887, -0.3930,  0.1270, -0.2035,  0.3219,  0.4737,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 8


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.0195,  0.7810,  0.1585, -0.4961,  0.3219, -0.0010,
         0.0887, -0.3930,  0.1270, -0.2035,  0.3219,  0.4737,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.0195,  0.7810,  0.1585, -0.4961,  0.3219, -0.0010,
         0.0887, -0.3930,  0.1270, -0.2035,  0.3219,  0.4737,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 9


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.0195,  0.7810,  0.1585, -0.4961,  0.3219, -0.0010,
         0.0887, -0.3930,  0.1270, -0.2035,  0.3219,  0.4737,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.0195,  0.7810,  0.1585, -0.4961,  0.3219, -0.0010,
         0.0887, -0.3930,  0.1270, -0.2035,  0.3219,  0.4737,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 10


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.0195,  0.7810,  0.1585, -0.4961,  0.3219, -0.0010,
         0.0887, -0.3930,  0.1270, -0.2035,  0.3219,  0.4737,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.0195,  0.7810,  0.1585, -0.4961,  0.3219, -0.0010,
         0.0887, -0.3930,  0.1270, -0.2035,  0.3219,  0.4737,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 11


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.0195,  0.7810,  0.1585, -0.4961,  0.3219, -0.0010,
         0.0887, -0.3930,  0.1270, -0.2035,  0.3219,  0.4737,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.0195,  0.7810,  0.1585, -0.4961,  0.3219, -0.0010,
         0.0887, -0.3930,  0.1270, -0.2035,  0.3219,  0.4737,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977]): 3

Layer 12


Tokens embeddings:
tensor([ 0.0697, -0.2066,  0.0195,  0.7810,  0.1585, -0.4961,  0.3219, -0.0010,
         0.0887, -0.3930,  0.1270, -0.2035,  0.3219,  0.4737,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977])
arg max of tensor([ 0.0697, -0.2066,  0.0195,  0.7810,  0.1585, -0.4961,  0.3219, -0.0010,
         0.0887, -0.3930,  0.1270, -0.2035,  0.3219,  0.4737,  0.0688, -0.0010,
         0.0701, -0.1437,  0.1977, -0.1437,  0.1977]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02,  3.8479e-01,  8.3178e-01, -1.3025e-01,  6.5674e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  3.8479e-01,  8.3178e-01, -1.3025e-01,  6.5674e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02,  3.8479e-01,  8.3178e-01, -1.3025e-01,  6.5674e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  3.8479e-01,  8.3178e-01, -1.3025e-01,  6.5674e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02,  3.8479e-01,  8.3178e-01, -1.3025e-01,  6.5674e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  3.8479e-01,  8.3178e-01, -1.3025e-01,  6.5674e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02,  3.8479e-01,  8.3178e-01, -1.3025e-01,  6.5674e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  3.8479e-01,  8.3178e-01, -1.3025e-01,  6.5674e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02,  3.8479e-01,  8.3178e-01, -1.3025e-01,  6.5674e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  3.8479e-01,  8.3178e-01, -1.3025e-01,  6.5674e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02,  3.8479e-01,  8.3178e-01, -1.3025e-01,  6.5674e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  3.8479e-01,  8.3178e-01, -1.3025e-01,  6.5674e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02,  3.8479e-01,  8.3178e-01, -1.3025e-01,  6.5674e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  3.8479e-01,  8.3178e-01, -1.3025e-01,  6.5674e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02,  3.8479e-01,  8.3178e-01, -1.3025e-01,  6.5674e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  3.8479e-01,  8.3178e-01, -1.3025e-01,  6.5674e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02,  3.8479e-01,  8.3178e-01, -1.3025e-01,  6.5674e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  3.8479e-01,  8.3178e-01, -1.3025e-01,  6.5674e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02,  3.8479e-01,  8.3178e-01, -1.3025e-01,  6.5674e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  3.8479e-01,  8.3178e-01, -1.3025e-01,  6.5674e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02,  3.8479e-01,  8.3178e-01, -1.3025e-01,  6.5674e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  3.8479e-01,  8.3178e-01, -1.3025e-01,  6.5674e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02,  3.8479e-01,  8.3178e-01, -1.3025e-01,  6.5674e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  3.8479e-01,  8.3178e-01, -1.3025e-01,  6.5674e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02,  1.9772e-01, -1.4367e-01,
         1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01,  1.0721e+00,
        -1.5160e-02,  2.8882e-01, -6.3929e-01,  8.6076e-01, -3.2398e-01,
         5.2930e-01, -2.5877e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01,  1.0721e+00,
        -1.5160e-02,  2.8882e-01, -6.3929e-01,  8.6076e-01, -3.2398e-01,
         5.2930e-01, -2.5877e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01,  1.0721e+00,
        -1.5160e-02,  2.8882e-01, -6.3929e-01,  8.6076e-01, -3.2398e-01,
         5.2930e-01, -2.5877e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01,  1.0721e+00,
        -1.5160e-02,  2.8882e-01, -6.3929e-01,  8.6076e-01, -3.2398e-01,
         5.2930e-01, -2.5877e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01,  1.0721e+00,
        -1.5160e-02,  2.8882e-01, -6.3929e-01,  8.6076e-01, -3.2398e-01,
         5.2930e-01, -2.5877e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01,  1.0721e+00,
        -1.5160e-02,  2.8882e-01, -6.3929e-01,  8.6076e-01, -3.2398e-01,
         5.2930e-01, -2.5877e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01,  1.0721e+00,
        -1.5160e-02,  2.8882e-01, -6.3929e-01,  8.6076e-01, -3.2398e-01,
         5.2930e-01, -2.5877e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01,  1.0721e+00,
        -1.5160e-02,  2.8882e-01, -6.3929e-01,  8.6076e-01, -3.2398e-01,
         5.2930e-01, -2.5877e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01,  1.0721e+00,
        -1.5160e-02,  2.8882e-01, -6.3929e-01,  8.6076e-01, -3.2398e-01,
         5.2930e-01, -2.5877e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01,  1.0721e+00,
        -1.5160e-02,  2.8882e-01, -6.3929e-01,  8.6076e-01, -3.2398e-01,
         5.2930e-01, -2.5877e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01,  1.0721e+00,
        -1.5160e-02,  2.8882e-01, -6.3929e-01,  8.6076e-01, -3.2398e-01,
         5.2930e-01, -2.5877e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01,  1.0721e+00,
        -1.5160e-02,  2.8882e-01, -6.3929e-01,  8.6076e-01, -3.2398e-01,
         5.2930e-01, -2.5877e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01,  1.0721e+00,
        -1.5160e-02,  2.8882e-01, -6.3929e-01,  8.6076e-01, -3.2398e-01,
         5.2930e-01, -2.5877e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01,  1.0721e+00,
        -1.5160e-02,  2.8882e-01, -6.3929e-01,  8.6076e-01, -3.2398e-01,
         5.2930e-01, -2.5877e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01,  1.0721e+00,
        -1.5160e-02,  2.8882e-01, -6.3929e-01,  8.6076e-01, -3.2398e-01,
         5.2930e-01, -2.5877e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01,  1.0721e+00,
        -1.5160e-02,  2.8882e-01, -6.3929e-01,  8.6076e-01, -3.2398e-01,
         5.2930e-01, -2.5877e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01,  1.0721e+00,
        -1.5160e-02,  2.8882e-01, -6.3929e-01,  8.6076e-01, -3.2398e-01,
         5.2930e-01, -2.5877e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01,  1.0721e+00,
        -1.5160e-02,  2.8882e-01, -6.3929e-01,  8.6076e-01, -3.2398e-01,
         5.2930e-01, -2.5877e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01,  1.0721e+00,
        -1.5160e-02,  2.8882e-01, -6.3929e-01,  8.6076e-01, -3.2398e-01,
         5.2930e-01, -2.5877e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01,  1.0721e+00,
        -1.5160e-02,  2.8882e-01, -6.3929e-01,  8.6076e-01, -3.2398e-01,
         5.2930e-01, -2.5877e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01,  1.0721e+00,
        -1.5160e-02,  2.8882e-01, -6.3929e-01,  8.6076e-01, -3.2398e-01,
         5.2930e-01, -2.5877e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01,  1.0721e+00,
        -1.5160e-02,  2.8882e-01, -6.3929e-01,  8.6076e-01, -3.2398e-01,
         5.2930e-01, -2.5877e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01,  1.0721e+00,
        -1.5160e-02,  2.8882e-01, -6.3929e-01,  8.6076e-01, -3.2398e-01,
         5.2930e-01, -2.5877e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01,  1.0721e+00,
        -1.5160e-02,  2.8882e-01, -6.3929e-01,  8.6076e-01, -3.2398e-01,
         5.2930e-01, -2.5877e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02,
        -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  8.4299e-02,  4.4187e-01,  8.8429e-01,
        -5.1081e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  8.4299e-02,  4.4187e-01,  8.8429e-01,
        -5.1081e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  8.4299e-02,  4.4187e-01,  8.8429e-01,
        -5.1081e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  8.4299e-02,  4.4187e-01,  8.8429e-01,
        -5.1081e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  8.4299e-02,  4.4187e-01,  8.8429e-01,
        -5.1081e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  8.4299e-02,  4.4187e-01,  8.8429e-01,
        -5.1081e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  8.4299e-02,  4.4187e-01,  8.8429e-01,
        -5.1081e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  8.4299e-02,  4.4187e-01,  8.8429e-01,
        -5.1081e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  8.4299e-02,  4.4187e-01,  8.8429e-01,
        -5.1081e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  8.4299e-02,  4.4187e-01,  8.8429e-01,
        -5.1081e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  8.4299e-02,  4.4187e-01,  8.8429e-01,
        -5.1081e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  8.4299e-02,  4.4187e-01,  8.8429e-01,
        -5.1081e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  8.4299e-02,  4.4187e-01,  8.8429e-01,
        -5.1081e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  8.4299e-02,  4.4187e-01,  8.8429e-01,
        -5.1081e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  8.4299e-02,  4.4187e-01,  8.8429e-01,
        -5.1081e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  8.4299e-02,  4.4187e-01,  8.8429e-01,
        -5.1081e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  8.4299e-02,  4.4187e-01,  8.8429e-01,
        -5.1081e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  8.4299e-02,  4.4187e-01,  8.8429e-01,
        -5.1081e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  8.4299e-02,  4.4187e-01,  8.8429e-01,
        -5.1081e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  8.4299e-02,  4.4187e-01,  8.8429e-01,
        -5.1081e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  8.4299e-02,  4.4187e-01,  8.8429e-01,
        -5.1081e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  8.4299e-02,  4.4187e-01,  8.8429e-01,
        -5.1081e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  8.4299e-02,  4.4187e-01,  8.8429e-01,
        -5.1081e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  8.4299e-02,  4.4187e-01,  8.8429e-01,
        -5.1081e-01,  6.8802e-02, -9.5306e-04,  7.0138e-02, -1.4367e-01,
         1.9772e-01, -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,
         1.9772e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01, -1.4367e-01,
         1.9772e-01]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  7.9953e-01,  7.0165e-01, -1.5160e-02,
         2.3438e-01,  1.7266e-01, -1.3025e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  7.9953e-01,  7.0165e-01, -1.5160e-02,
         2.3438e-01,  1.7266e-01, -1.3025e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 2


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  7.9953e-01,  7.0165e-01, -1.5160e-02,
         2.3438e-01,  1.7266e-01, -1.3025e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  7.9953e-01,  7.0165e-01, -1.5160e-02,
         2.3438e-01,  1.7266e-01, -1.3025e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 3


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  7.9953e-01,  7.0165e-01, -1.5160e-02,
         2.3438e-01,  1.7266e-01, -1.3025e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  7.9953e-01,  7.0165e-01, -1.5160e-02,
         2.3438e-01,  1.7266e-01, -1.3025e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 4


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  7.9953e-01,  7.0165e-01, -1.5160e-02,
         2.3438e-01,  1.7266e-01, -1.3025e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  7.9953e-01,  7.0165e-01, -1.5160e-02,
         2.3438e-01,  1.7266e-01, -1.3025e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 5


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  7.9953e-01,  7.0165e-01, -1.5160e-02,
         2.3438e-01,  1.7266e-01, -1.3025e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  7.9953e-01,  7.0165e-01, -1.5160e-02,
         2.3438e-01,  1.7266e-01, -1.3025e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 6


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  7.9953e-01,  7.0165e-01, -1.5160e-02,
         2.3438e-01,  1.7266e-01, -1.3025e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  7.9953e-01,  7.0165e-01, -1.5160e-02,
         2.3438e-01,  1.7266e-01, -1.3025e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 7


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  7.9953e-01,  7.0165e-01, -1.5160e-02,
         2.3438e-01,  1.7266e-01, -1.3025e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  7.9953e-01,  7.0165e-01, -1.5160e-02,
         2.3438e-01,  1.7266e-01, -1.3025e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 8


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  7.9953e-01,  7.0165e-01, -1.5160e-02,
         2.3438e-01,  1.7266e-01, -1.3025e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  7.9953e-01,  7.0165e-01, -1.5160e-02,
         2.3438e-01,  1.7266e-01, -1.3025e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 9


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  7.9953e-01,  7.0165e-01, -1.5160e-02,
         2.3438e-01,  1.7266e-01, -1.3025e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  7.9953e-01,  7.0165e-01, -1.5160e-02,
         2.3438e-01,  1.7266e-01, -1.3025e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 10


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  7.9953e-01,  7.0165e-01, -1.5160e-02,
         2.3438e-01,  1.7266e-01, -1.3025e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  7.9953e-01,  7.0165e-01, -1.5160e-02,
         2.3438e-01,  1.7266e-01, -1.3025e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 11


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  7.9953e-01,  7.0165e-01, -1.5160e-02,
         2.3438e-01,  1.7266e-01, -1.3025e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  7.9953e-01,  7.0165e-01, -1.5160e-02,
         2.3438e-01,  1.7266e-01, -1.3025e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2

Layer 12


Tokens embeddings:
tensor([ 6.9720e-02,  5.3506e-03,  7.9953e-01,  7.0165e-01, -1.5160e-02,
         2.3438e-01,  1.7266e-01, -1.3025e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01])
arg max of tensor([ 6.9720e-02,  5.3506e-03,  7.9953e-01,  7.0165e-01, -1.5160e-02,
         2.3438e-01,  1.7266e-01, -1.3025e-01,  6.8802e-02, -9.5306e-04,
         7.0138e-02, -1.4367e-01,  1.9772e-01, -1.4367e-01,  1.9772e-01,
        -1.4367e-01,  1.9772e-01, -1.0000e+09,  1.9772e-01,  1.9772e-01,
         1.9772e-01]): 2


In [None]:
# Pooling First token [CLS] for each sentence - argmax
#
# For every selected test sentence this cell shows, per layer and per head,
# the attention row of the first position rendered over the tokens, followed
# by the model output pooled at index 0 and the index of its largest value.

# Sentences to inspect: the first 11 test sentences (a fixed list, not a random sample)
sent_index = list(range(11))

for s in sent_index:
  print("*" * 100)
  # Get the sentence's words
  tokens = test_inputs[s]

  # The model output depends only on the sentence, not on the layer or head
  # being displayed, so tokenize and run the model once per sentence instead
  # of once per (layer, head) pair.
  encoded_tokens = bert_tokenizer(tokens, padding=True, truncation=True, max_length=128, return_tensors='pt')
  encoded_tokens = encoded_tokens.to(device)
  with torch.no_grad():
    model_output1 = model_e(**encoded_tokens)
  # First output of the model, sliced at index 0 of its second axis, moved to CPU for printing
  tokens_embeddings = model_output1[0][:,0].cpu()
  arg = argmax(tokens_embeddings)

  # For each layer...
  for l in range(12):
    print("\nLayer", l+1)
    # Drop the leading axis of size 1; iterating the result yields one matrix per head
    # (assumes test_attentions[s][l] is (1, heads, seq, seq) - TODO confirm)
    attention = np.squeeze(test_attentions[s][l].detach().cpu().numpy(), axis=0)
    # and for each head
    for h, head in enumerate(attention):
      print("Head", h+1)
      # Get the attention for the cls token (row 0 of this head's matrix)
      cls_attentions = head[0]
      display(HTML(colorize(tokens, cls_attentions)))
      print("Tokens embeddings:")
      print(tokens_embeddings)
      print('arg max of %s: %d' % (tokens_embeddings, arg))

In [None]:
# Pooling First token [CLS] for each sentence - argmax
#
# Per-layer variant: for every selected test sentence and every layer, render
# the head-averaged attention row of the first position over the tokens, then
# print the model output pooled at index 0 and the index of its largest value.

# Sentences to inspect: the first 11 test sentences (a fixed list, not a random sample)
sent_index = list(range(11))

for s in sent_index:
  print("*" * 100)
  # Get the sentence's words
  tokens = test_inputs[s]

  # The model output depends only on the sentence, so tokenize and run the
  # model once here rather than repeating the same forward pass for each layer.
  encoded_tokens = bert_tokenizer(tokens, padding=True, truncation=True, max_length=128, return_tensors='pt')
  encoded_tokens = encoded_tokens.to(device)
  with torch.no_grad():
    model_output1 = model_e(**encoded_tokens)
  # First output of the model, sliced at index 0 of its second axis, moved to CPU for printing
  tokens_embeddings = model_output1[0][:,0].cpu()
  arg = argmax(tokens_embeddings)

  # For each layer...
  for l in range(12):
    print("\nLayer", l+1)
    # Drop the leading axis of size 1
    # (assumes test_attentions[s][l] is (1, heads, seq, seq) - TODO confirm)
    attention = np.squeeze(test_attentions[s][l].detach().cpu().numpy(), axis=0)
    # Get the attention for the cls token: average over the heads of this
    # layer and take row 0. Computing it from `attention` keeps the cell
    # self-contained; it must not rely on a `head` variable left behind by
    # a different cell's loop.
    cls_attentions = attention.mean(axis=0)[0]
    display(HTML(colorize(tokens, cls_attentions)))
    print("Tokens embeddings:")
    print(tokens_embeddings)
    print('arg max of %s: %d' % (tokens_embeddings, arg))

****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.4032,  0.5557, -0.0694, -0.3240, -0.3930,  0.2778, -0.0768,
        -0.2023, -0.3930, -0.0597, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032,  0.5557, -0.0694, -0.3240, -0.3930,  0.2778, -0.0768,
        -0.2023, -0.3930, -0.0597, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.4032,  0.5557, -0.0694, -0.3240, -0.3930,  0.2778, -0.0768,
        -0.2023, -0.3930, -0.0597, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032,  0.5557, -0.0694, -0.3240, -0.3930,  0.2778, -0.0768,
        -0.2023, -0.3930, -0.0597, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.4032,  0.5557, -0.0694, -0.3240, -0.3930,  0.2778, -0.0768,
        -0.2023, -0.3930, -0.0597, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032,  0.5557, -0.0694, -0.3240, -0.3930,  0.2778, -0.0768,
        -0.2023, -0.3930, -0.0597, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.4032,  0.5557, -0.0694, -0.3240, -0.3930,  0.2778, -0.0768,
        -0.2023, -0.3930, -0.0597, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032,  0.5557, -0.0694, -0.3240, -0.3930,  0.2778, -0.0768,
        -0.2023, -0.3930, -0.0597, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.4032,  0.5557, -0.0694, -0.3240, -0.3930,  0.2778, -0.0768,
        -0.2023, -0.3930, -0.0597, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032,  0.5557, -0.0694, -0.3240, -0.3930,  0.2778, -0.0768,
        -0.2023, -0.3930, -0.0597, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.4032,  0.5557, -0.0694, -0.3240, -0.3930,  0.2778, -0.0768,
        -0.2023, -0.3930, -0.0597, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032,  0.5557, -0.0694, -0.3240, -0.3930,  0.2778, -0.0768,
        -0.2023, -0.3930, -0.0597, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.4032,  0.5557, -0.0694, -0.3240, -0.3930,  0.2778, -0.0768,
        -0.2023, -0.3930, -0.0597, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032,  0.5557, -0.0694, -0.3240, -0.3930,  0.2778, -0.0768,
        -0.2023, -0.3930, -0.0597, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.4032,  0.5557, -0.0694, -0.3240, -0.3930,  0.2778, -0.0768,
        -0.2023, -0.3930, -0.0597, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032,  0.5557, -0.0694, -0.3240, -0.3930,  0.2778, -0.0768,
        -0.2023, -0.3930, -0.0597, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.4032,  0.5557, -0.0694, -0.3240, -0.3930,  0.2778, -0.0768,
        -0.2023, -0.3930, -0.0597, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032,  0.5557, -0.0694, -0.3240, -0.3930,  0.2778, -0.0768,
        -0.2023, -0.3930, -0.0597, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.4032,  0.5557, -0.0694, -0.3240, -0.3930,  0.2778, -0.0768,
        -0.2023, -0.3930, -0.0597, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032,  0.5557, -0.0694, -0.3240, -0.3930,  0.2778, -0.0768,
        -0.2023, -0.3930, -0.0597, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.4032,  0.5557, -0.0694, -0.3240, -0.3930,  0.2778, -0.0768,
        -0.2023, -0.3930, -0.0597, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032,  0.5557, -0.0694, -0.3240, -0.3930,  0.2778, -0.0768,
        -0.2023, -0.3930, -0.0597, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.4032,  0.5557, -0.0694, -0.3240, -0.3930,  0.2778, -0.0768,
        -0.2023, -0.3930, -0.0597, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032,  0.5557, -0.0694, -0.3240, -0.3930,  0.2778, -0.0768,
        -0.2023, -0.3930, -0.0597, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.7749, -0.0152,  0.1746, -0.2025,  0.2717,  0.8786,
        -0.4190, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.7749, -0.0152,  0.1746, -0.2025,  0.2717,  0.8786,
        -0.4190, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.7749, -0.0152,  0.1746, -0.2025,  0.2717,  0.8786,
        -0.4190, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.7749, -0.0152,  0.1746, -0.2025,  0.2717,  0.8786,
        -0.4190, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.7749, -0.0152,  0.1746, -0.2025,  0.2717,  0.8786,
        -0.4190, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.7749, -0.0152,  0.1746, -0.2025,  0.2717,  0.8786,
        -0.4190, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.7749, -0.0152,  0.1746, -0.2025,  0.2717,  0.8786,
        -0.4190, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.7749, -0.0152,  0.1746, -0.2025,  0.2717,  0.8786,
        -0.4190, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.7749, -0.0152,  0.1746, -0.2025,  0.2717,  0.8786,
        -0.4190, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.7749, -0.0152,  0.1746, -0.2025,  0.2717,  0.8786,
        -0.4190, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.7749, -0.0152,  0.1746, -0.2025,  0.2717,  0.8786,
        -0.4190, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.7749, -0.0152,  0.1746, -0.2025,  0.2717,  0.8786,
        -0.4190, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.7749, -0.0152,  0.1746, -0.2025,  0.2717,  0.8786,
        -0.4190, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.7749, -0.0152,  0.1746, -0.2025,  0.2717,  0.8786,
        -0.4190, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.7749, -0.0152,  0.1746, -0.2025,  0.2717,  0.8786,
        -0.4190, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.7749, -0.0152,  0.1746, -0.2025,  0.2717,  0.8786,
        -0.4190, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.7749, -0.0152,  0.1746, -0.2025,  0.2717,  0.8786,
        -0.4190, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.7749, -0.0152,  0.1746, -0.2025,  0.2717,  0.8786,
        -0.4190, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.7749, -0.0152,  0.1746, -0.2025,  0.2717,  0.8786,
        -0.4190, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.7749, -0.0152,  0.1746, -0.2025,  0.2717,  0.8786,
        -0.4190, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.7749, -0.0152,  0.1746, -0.2025,  0.2717,  0.8786,
        -0.4190, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.7749, -0.0152,  0.1746, -0.2025,  0.2717,  0.8786,
        -0.4190, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.7749, -0.0152,  0.1746, -0.2025,  0.2717,  0.8786,
        -0.4190, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.7749, -0.0152,  0.1746, -0.2025,  0.2717,  0.8786,
        -0.4190, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-6.4571e-02,  4.2754e-02, -9.3398e-02,  4.9948e-01, -1.2336e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  4.2754e-02, -9.3398e-02,  4.9948e-01, -1.2336e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 2


Tokens embeddings:
tensor([-6.4571e-02,  4.2754e-02, -9.3398e-02,  4.9948e-01, -1.2336e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  4.2754e-02, -9.3398e-02,  4.9948e-01, -1.2336e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 3


Tokens embeddings:
tensor([-6.4571e-02,  4.2754e-02, -9.3398e-02,  4.9948e-01, -1.2336e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  4.2754e-02, -9.3398e-02,  4.9948e-01, -1.2336e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 4


Tokens embeddings:
tensor([-6.4571e-02,  4.2754e-02, -9.3398e-02,  4.9948e-01, -1.2336e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  4.2754e-02, -9.3398e-02,  4.9948e-01, -1.2336e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 5


Tokens embeddings:
tensor([-6.4571e-02,  4.2754e-02, -9.3398e-02,  4.9948e-01, -1.2336e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  4.2754e-02, -9.3398e-02,  4.9948e-01, -1.2336e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 6


Tokens embeddings:
tensor([-6.4571e-02,  4.2754e-02, -9.3398e-02,  4.9948e-01, -1.2336e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  4.2754e-02, -9.3398e-02,  4.9948e-01, -1.2336e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 7


Tokens embeddings:
tensor([-6.4571e-02,  4.2754e-02, -9.3398e-02,  4.9948e-01, -1.2336e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  4.2754e-02, -9.3398e-02,  4.9948e-01, -1.2336e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 8


Tokens embeddings:
tensor([-6.4571e-02,  4.2754e-02, -9.3398e-02,  4.9948e-01, -1.2336e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  4.2754e-02, -9.3398e-02,  4.9948e-01, -1.2336e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 9


Tokens embeddings:
tensor([-6.4571e-02,  4.2754e-02, -9.3398e-02,  4.9948e-01, -1.2336e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  4.2754e-02, -9.3398e-02,  4.9948e-01, -1.2336e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 10


Tokens embeddings:
tensor([-6.4571e-02,  4.2754e-02, -9.3398e-02,  4.9948e-01, -1.2336e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  4.2754e-02, -9.3398e-02,  4.9948e-01, -1.2336e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 11


Tokens embeddings:
tensor([-6.4571e-02,  4.2754e-02, -9.3398e-02,  4.9948e-01, -1.2336e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  4.2754e-02, -9.3398e-02,  4.9948e-01, -1.2336e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 12


Tokens embeddings:
tensor([-6.4571e-02,  4.2754e-02, -9.3398e-02,  4.9948e-01, -1.2336e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  4.2754e-02, -9.3398e-02,  4.9948e-01, -1.2336e+00,
        -3.9954e-01,  1.9862e-01, -4.1288e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.4032, -0.3995,  0.2668, -0.1151,  0.1229, -0.2035, -0.1269,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032, -0.3995,  0.2668, -0.1151,  0.1229, -0.2035, -0.1269,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.4032, -0.3995,  0.2668, -0.1151,  0.1229, -0.2035, -0.1269,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032, -0.3995,  0.2668, -0.1151,  0.1229, -0.2035, -0.1269,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.4032, -0.3995,  0.2668, -0.1151,  0.1229, -0.2035, -0.1269,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032, -0.3995,  0.2668, -0.1151,  0.1229, -0.2035, -0.1269,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.4032, -0.3995,  0.2668, -0.1151,  0.1229, -0.2035, -0.1269,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032, -0.3995,  0.2668, -0.1151,  0.1229, -0.2035, -0.1269,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.4032, -0.3995,  0.2668, -0.1151,  0.1229, -0.2035, -0.1269,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032, -0.3995,  0.2668, -0.1151,  0.1229, -0.2035, -0.1269,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.4032, -0.3995,  0.2668, -0.1151,  0.1229, -0.2035, -0.1269,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032, -0.3995,  0.2668, -0.1151,  0.1229, -0.2035, -0.1269,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.4032, -0.3995,  0.2668, -0.1151,  0.1229, -0.2035, -0.1269,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032, -0.3995,  0.2668, -0.1151,  0.1229, -0.2035, -0.1269,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.4032, -0.3995,  0.2668, -0.1151,  0.1229, -0.2035, -0.1269,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032, -0.3995,  0.2668, -0.1151,  0.1229, -0.2035, -0.1269,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.4032, -0.3995,  0.2668, -0.1151,  0.1229, -0.2035, -0.1269,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032, -0.3995,  0.2668, -0.1151,  0.1229, -0.2035, -0.1269,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.4032, -0.3995,  0.2668, -0.1151,  0.1229, -0.2035, -0.1269,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032, -0.3995,  0.2668, -0.1151,  0.1229, -0.2035, -0.1269,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.4032, -0.3995,  0.2668, -0.1151,  0.1229, -0.2035, -0.1269,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032, -0.3995,  0.2668, -0.1151,  0.1229, -0.2035, -0.1269,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.4032, -0.3995,  0.2668, -0.1151,  0.1229, -0.2035, -0.1269,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032, -0.3995,  0.2668, -0.1151,  0.1229, -0.2035, -0.1269,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646,  0.0745, -0.1144,  0.8910,  0.1622, -0.0010,  0.0887, -0.3930,
        -0.0765, -0.2035, -0.3930, -0.0124, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0745, -0.1144,  0.8910,  0.1622, -0.0010,  0.0887, -0.3930,
        -0.0765, -0.2035, -0.3930, -0.0124, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 2


Tokens embeddings:
tensor([-0.0646,  0.0745, -0.1144,  0.8910,  0.1622, -0.0010,  0.0887, -0.3930,
        -0.0765, -0.2035, -0.3930, -0.0124, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0745, -0.1144,  0.8910,  0.1622, -0.0010,  0.0887, -0.3930,
        -0.0765, -0.2035, -0.3930, -0.0124, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 3


Tokens embeddings:
tensor([-0.0646,  0.0745, -0.1144,  0.8910,  0.1622, -0.0010,  0.0887, -0.3930,
        -0.0765, -0.2035, -0.3930, -0.0124, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0745, -0.1144,  0.8910,  0.1622, -0.0010,  0.0887, -0.3930,
        -0.0765, -0.2035, -0.3930, -0.0124, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 4


Tokens embeddings:
tensor([-0.0646,  0.0745, -0.1144,  0.8910,  0.1622, -0.0010,  0.0887, -0.3930,
        -0.0765, -0.2035, -0.3930, -0.0124, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0745, -0.1144,  0.8910,  0.1622, -0.0010,  0.0887, -0.3930,
        -0.0765, -0.2035, -0.3930, -0.0124, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 5


Tokens embeddings:
tensor([-0.0646,  0.0745, -0.1144,  0.8910,  0.1622, -0.0010,  0.0887, -0.3930,
        -0.0765, -0.2035, -0.3930, -0.0124, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0745, -0.1144,  0.8910,  0.1622, -0.0010,  0.0887, -0.3930,
        -0.0765, -0.2035, -0.3930, -0.0124, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 6


Tokens embeddings:
tensor([-0.0646,  0.0745, -0.1144,  0.8910,  0.1622, -0.0010,  0.0887, -0.3930,
        -0.0765, -0.2035, -0.3930, -0.0124, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0745, -0.1144,  0.8910,  0.1622, -0.0010,  0.0887, -0.3930,
        -0.0765, -0.2035, -0.3930, -0.0124, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 7


Tokens embeddings:
tensor([-0.0646,  0.0745, -0.1144,  0.8910,  0.1622, -0.0010,  0.0887, -0.3930,
        -0.0765, -0.2035, -0.3930, -0.0124, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0745, -0.1144,  0.8910,  0.1622, -0.0010,  0.0887, -0.3930,
        -0.0765, -0.2035, -0.3930, -0.0124, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 8


Tokens embeddings:
tensor([-0.0646,  0.0745, -0.1144,  0.8910,  0.1622, -0.0010,  0.0887, -0.3930,
        -0.0765, -0.2035, -0.3930, -0.0124, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0745, -0.1144,  0.8910,  0.1622, -0.0010,  0.0887, -0.3930,
        -0.0765, -0.2035, -0.3930, -0.0124, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 9


Tokens embeddings:
tensor([-0.0646,  0.0745, -0.1144,  0.8910,  0.1622, -0.0010,  0.0887, -0.3930,
        -0.0765, -0.2035, -0.3930, -0.0124, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0745, -0.1144,  0.8910,  0.1622, -0.0010,  0.0887, -0.3930,
        -0.0765, -0.2035, -0.3930, -0.0124, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 10


Tokens embeddings:
tensor([-0.0646,  0.0745, -0.1144,  0.8910,  0.1622, -0.0010,  0.0887, -0.3930,
        -0.0765, -0.2035, -0.3930, -0.0124, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0745, -0.1144,  0.8910,  0.1622, -0.0010,  0.0887, -0.3930,
        -0.0765, -0.2035, -0.3930, -0.0124, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 11


Tokens embeddings:
tensor([-0.0646,  0.0745, -0.1144,  0.8910,  0.1622, -0.0010,  0.0887, -0.3930,
        -0.0765, -0.2035, -0.3930, -0.0124, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0745, -0.1144,  0.8910,  0.1622, -0.0010,  0.0887, -0.3930,
        -0.0765, -0.2035, -0.3930, -0.0124, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 12


Tokens embeddings:
tensor([-0.0646,  0.0745, -0.1144,  0.8910,  0.1622, -0.0010,  0.0887, -0.3930,
        -0.0765, -0.2035, -0.3930, -0.0124, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0745, -0.1144,  0.8910,  0.1622, -0.0010,  0.0887, -0.3930,
        -0.0765, -0.2035, -0.3930, -0.0124, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646,  0.2021, -0.3043, -0.2025, -0.3930,  0.6279, -0.1632,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797, -0.3930,  0.3316, -0.2035, -0.3930,
         0.6885, -0.1123, -0.0643, -0.0010, -0.0650])
arg max of tensor([-0.0646,  0.2021, -0.3043, -0.2025, -0.3930,  0.6279, -0.1632,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797, -0.3930,  0.3316, -0.2035, -0.3930,
         0.6885, -0.1123, -0.0643, -0.0010, -0.0650]): 7

Layer 2


Tokens embeddings:
tensor([-0.0646,  0.2021, -0.3043, -0.2025, -0.3930,  0.6279, -0.1632,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797, -0.3930,  0.3316, -0.2035, -0.3930,
         0.6885, -0.1123, -0.0643, -0.0010, -0.0650])
arg max of tensor([-0.0646,  0.2021, -0.3043, -0.2025, -0.3930,  0.6279, -0.1632,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797, -0.3930,  0.3316, -0.2035, -0.3930,
         0.6885, -0.1123, -0.0643, -0.0010, -0.0650]): 7

Layer 3


Tokens embeddings:
tensor([-0.0646,  0.2021, -0.3043, -0.2025, -0.3930,  0.6279, -0.1632,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797, -0.3930,  0.3316, -0.2035, -0.3930,
         0.6885, -0.1123, -0.0643, -0.0010, -0.0650])
arg max of tensor([-0.0646,  0.2021, -0.3043, -0.2025, -0.3930,  0.6279, -0.1632,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797, -0.3930,  0.3316, -0.2035, -0.3930,
         0.6885, -0.1123, -0.0643, -0.0010, -0.0650]): 7

Layer 4


Tokens embeddings:
tensor([-0.0646,  0.2021, -0.3043, -0.2025, -0.3930,  0.6279, -0.1632,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797, -0.3930,  0.3316, -0.2035, -0.3930,
         0.6885, -0.1123, -0.0643, -0.0010, -0.0650])
arg max of tensor([-0.0646,  0.2021, -0.3043, -0.2025, -0.3930,  0.6279, -0.1632,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797, -0.3930,  0.3316, -0.2035, -0.3930,
         0.6885, -0.1123, -0.0643, -0.0010, -0.0650]): 7

Layer 5


Tokens embeddings:
tensor([-0.0646,  0.2021, -0.3043, -0.2025, -0.3930,  0.6279, -0.1632,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797, -0.3930,  0.3316, -0.2035, -0.3930,
         0.6885, -0.1123, -0.0643, -0.0010, -0.0650])
arg max of tensor([-0.0646,  0.2021, -0.3043, -0.2025, -0.3930,  0.6279, -0.1632,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797, -0.3930,  0.3316, -0.2035, -0.3930,
         0.6885, -0.1123, -0.0643, -0.0010, -0.0650]): 7

Layer 6


Tokens embeddings:
tensor([-0.0646,  0.2021, -0.3043, -0.2025, -0.3930,  0.6279, -0.1632,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797, -0.3930,  0.3316, -0.2035, -0.3930,
         0.6885, -0.1123, -0.0643, -0.0010, -0.0650])
arg max of tensor([-0.0646,  0.2021, -0.3043, -0.2025, -0.3930,  0.6279, -0.1632,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797, -0.3930,  0.3316, -0.2035, -0.3930,
         0.6885, -0.1123, -0.0643, -0.0010, -0.0650]): 7

Layer 7


Tokens embeddings:
tensor([-0.0646,  0.2021, -0.3043, -0.2025, -0.3930,  0.6279, -0.1632,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797, -0.3930,  0.3316, -0.2035, -0.3930,
         0.6885, -0.1123, -0.0643, -0.0010, -0.0650])
arg max of tensor([-0.0646,  0.2021, -0.3043, -0.2025, -0.3930,  0.6279, -0.1632,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797, -0.3930,  0.3316, -0.2035, -0.3930,
         0.6885, -0.1123, -0.0643, -0.0010, -0.0650]): 7

Layer 8


Tokens embeddings:
tensor([-0.0646,  0.2021, -0.3043, -0.2025, -0.3930,  0.6279, -0.1632,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797, -0.3930,  0.3316, -0.2035, -0.3930,
         0.6885, -0.1123, -0.0643, -0.0010, -0.0650])
arg max of tensor([-0.0646,  0.2021, -0.3043, -0.2025, -0.3930,  0.6279, -0.1632,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797, -0.3930,  0.3316, -0.2035, -0.3930,
         0.6885, -0.1123, -0.0643, -0.0010, -0.0650]): 7

Layer 9


Tokens embeddings:
tensor([-0.0646,  0.2021, -0.3043, -0.2025, -0.3930,  0.6279, -0.1632,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797, -0.3930,  0.3316, -0.2035, -0.3930,
         0.6885, -0.1123, -0.0643, -0.0010, -0.0650])
arg max of tensor([-0.0646,  0.2021, -0.3043, -0.2025, -0.3930,  0.6279, -0.1632,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797, -0.3930,  0.3316, -0.2035, -0.3930,
         0.6885, -0.1123, -0.0643, -0.0010, -0.0650]): 7

Layer 10


Tokens embeddings:
tensor([-0.0646,  0.2021, -0.3043, -0.2025, -0.3930,  0.6279, -0.1632,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797, -0.3930,  0.3316, -0.2035, -0.3930,
         0.6885, -0.1123, -0.0643, -0.0010, -0.0650])
arg max of tensor([-0.0646,  0.2021, -0.3043, -0.2025, -0.3930,  0.6279, -0.1632,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797, -0.3930,  0.3316, -0.2035, -0.3930,
         0.6885, -0.1123, -0.0643, -0.0010, -0.0650]): 7

Layer 11


Tokens embeddings:
tensor([-0.0646,  0.2021, -0.3043, -0.2025, -0.3930,  0.6279, -0.1632,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797, -0.3930,  0.3316, -0.2035, -0.3930,
         0.6885, -0.1123, -0.0643, -0.0010, -0.0650])
arg max of tensor([-0.0646,  0.2021, -0.3043, -0.2025, -0.3930,  0.6279, -0.1632,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797, -0.3930,  0.3316, -0.2035, -0.3930,
         0.6885, -0.1123, -0.0643, -0.0010, -0.0650]): 7

Layer 12


Tokens embeddings:
tensor([-0.0646,  0.2021, -0.3043, -0.2025, -0.3930,  0.6279, -0.1632,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797, -0.3930,  0.3316, -0.2035, -0.3930,
         0.6885, -0.1123, -0.0643, -0.0010, -0.0650])
arg max of tensor([-0.0646,  0.2021, -0.3043, -0.2025, -0.3930,  0.6279, -0.1632,  0.6885,
         0.3226,  0.0887,  0.0944,  0.6797, -0.3930,  0.3316, -0.2035, -0.3930,
         0.6885, -0.1123, -0.0643, -0.0010, -0.0650]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.2096, -0.0152, -0.6133,  0.0992,  0.5202, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.2096, -0.0152, -0.6133,  0.0992,  0.5202, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.2096, -0.0152, -0.6133,  0.0992,  0.5202, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.2096, -0.0152, -0.6133,  0.0992,  0.5202, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.2096, -0.0152, -0.6133,  0.0992,  0.5202, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.2096, -0.0152, -0.6133,  0.0992,  0.5202, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.2096, -0.0152, -0.6133,  0.0992,  0.5202, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.2096, -0.0152, -0.6133,  0.0992,  0.5202, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.2096, -0.0152, -0.6133,  0.0992,  0.5202, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.2096, -0.0152, -0.6133,  0.0992,  0.5202, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.2096, -0.0152, -0.6133,  0.0992,  0.5202, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.2096, -0.0152, -0.6133,  0.0992,  0.5202, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.2096, -0.0152, -0.6133,  0.0992,  0.5202, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.2096, -0.0152, -0.6133,  0.0992,  0.5202, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.2096, -0.0152, -0.6133,  0.0992,  0.5202, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.2096, -0.0152, -0.6133,  0.0992,  0.5202, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.2096, -0.0152, -0.6133,  0.0992,  0.5202, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.2096, -0.0152, -0.6133,  0.0992,  0.5202, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.2096, -0.0152, -0.6133,  0.0992,  0.5202, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.2096, -0.0152, -0.6133,  0.0992,  0.5202, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.2096, -0.0152, -0.6133,  0.0992,  0.5202, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.2096, -0.0152, -0.6133,  0.0992,  0.5202, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.2096, -0.0152, -0.6133,  0.0992,  0.5202, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.2096, -0.0152, -0.6133,  0.0992,  0.5202, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.2988,  0.1545, -0.6393, -0.5203, -0.1601, -0.8463, -0.4877,
        -0.5642,  0.5202, -0.7420,  0.1637, -0.2046, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2988,  0.1545, -0.6393, -0.5203, -0.1601, -0.8463, -0.4877,
        -0.5642,  0.5202, -0.7420,  0.1637, -0.2046, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 9

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.2988,  0.1545, -0.6393, -0.5203, -0.1601, -0.8463, -0.4877,
        -0.5642,  0.5202, -0.7420,  0.1637, -0.2046, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2988,  0.1545, -0.6393, -0.5203, -0.1601, -0.8463, -0.4877,
        -0.5642,  0.5202, -0.7420,  0.1637, -0.2046, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 9

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.2988,  0.1545, -0.6393, -0.5203, -0.1601, -0.8463, -0.4877,
        -0.5642,  0.5202, -0.7420,  0.1637, -0.2046, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2988,  0.1545, -0.6393, -0.5203, -0.1601, -0.8463, -0.4877,
        -0.5642,  0.5202, -0.7420,  0.1637, -0.2046, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 9

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.2988,  0.1545, -0.6393, -0.5203, -0.1601, -0.8463, -0.4877,
        -0.5642,  0.5202, -0.7420,  0.1637, -0.2046, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2988,  0.1545, -0.6393, -0.5203, -0.1601, -0.8463, -0.4877,
        -0.5642,  0.5202, -0.7420,  0.1637, -0.2046, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 9

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.2988,  0.1545, -0.6393, -0.5203, -0.1601, -0.8463, -0.4877,
        -0.5642,  0.5202, -0.7420,  0.1637, -0.2046, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2988,  0.1545, -0.6393, -0.5203, -0.1601, -0.8463, -0.4877,
        -0.5642,  0.5202, -0.7420,  0.1637, -0.2046, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 9

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.2988,  0.1545, -0.6393, -0.5203, -0.1601, -0.8463, -0.4877,
        -0.5642,  0.5202, -0.7420,  0.1637, -0.2046, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2988,  0.1545, -0.6393, -0.5203, -0.1601, -0.8463, -0.4877,
        -0.5642,  0.5202, -0.7420,  0.1637, -0.2046, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 9

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.2988,  0.1545, -0.6393, -0.5203, -0.1601, -0.8463, -0.4877,
        -0.5642,  0.5202, -0.7420,  0.1637, -0.2046, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2988,  0.1545, -0.6393, -0.5203, -0.1601, -0.8463, -0.4877,
        -0.5642,  0.5202, -0.7420,  0.1637, -0.2046, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 9

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.2988,  0.1545, -0.6393, -0.5203, -0.1601, -0.8463, -0.4877,
        -0.5642,  0.5202, -0.7420,  0.1637, -0.2046, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2988,  0.1545, -0.6393, -0.5203, -0.1601, -0.8463, -0.4877,
        -0.5642,  0.5202, -0.7420,  0.1637, -0.2046, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 9

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.2988,  0.1545, -0.6393, -0.5203, -0.1601, -0.8463, -0.4877,
        -0.5642,  0.5202, -0.7420,  0.1637, -0.2046, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2988,  0.1545, -0.6393, -0.5203, -0.1601, -0.8463, -0.4877,
        -0.5642,  0.5202, -0.7420,  0.1637, -0.2046, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 9

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.2988,  0.1545, -0.6393, -0.5203, -0.1601, -0.8463, -0.4877,
        -0.5642,  0.5202, -0.7420,  0.1637, -0.2046, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2988,  0.1545, -0.6393, -0.5203, -0.1601, -0.8463, -0.4877,
        -0.5642,  0.5202, -0.7420,  0.1637, -0.2046, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 9

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.2988,  0.1545, -0.6393, -0.5203, -0.1601, -0.8463, -0.4877,
        -0.5642,  0.5202, -0.7420,  0.1637, -0.2046, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2988,  0.1545, -0.6393, -0.5203, -0.1601, -0.8463, -0.4877,
        -0.5642,  0.5202, -0.7420,  0.1637, -0.2046, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 9

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.2988,  0.1545, -0.6393, -0.5203, -0.1601, -0.8463, -0.4877,
        -0.5642,  0.5202, -0.7420,  0.1637, -0.2046, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2988,  0.1545, -0.6393, -0.5203, -0.1601, -0.8463, -0.4877,
        -0.5642,  0.5202, -0.7420,  0.1637, -0.2046, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 9
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.0388,  0.2535, -0.1413, -0.1436, -0.1670,  0.4829,
         0.0064, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.0388,  0.2535, -0.1413, -0.1436, -0.1670,  0.4829,
         0.0064, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.0388,  0.2535, -0.1413, -0.1436, -0.1670,  0.4829,
         0.0064, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.0388,  0.2535, -0.1413, -0.1436, -0.1670,  0.4829,
         0.0064, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.0388,  0.2535, -0.1413, -0.1436, -0.1670,  0.4829,
         0.0064, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.0388,  0.2535, -0.1413, -0.1436, -0.1670,  0.4829,
         0.0064, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.0388,  0.2535, -0.1413, -0.1436, -0.1670,  0.4829,
         0.0064, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.0388,  0.2535, -0.1413, -0.1436, -0.1670,  0.4829,
         0.0064, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.0388,  0.2535, -0.1413, -0.1436, -0.1670,  0.4829,
         0.0064, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.0388,  0.2535, -0.1413, -0.1436, -0.1670,  0.4829,
         0.0064, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.0388,  0.2535, -0.1413, -0.1436, -0.1670,  0.4829,
         0.0064, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.0388,  0.2535, -0.1413, -0.1436, -0.1670,  0.4829,
         0.0064, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.0388,  0.2535, -0.1413, -0.1436, -0.1670,  0.4829,
         0.0064, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.0388,  0.2535, -0.1413, -0.1436, -0.1670,  0.4829,
         0.0064, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.0388,  0.2535, -0.1413, -0.1436, -0.1670,  0.4829,
         0.0064, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.0388,  0.2535, -0.1413, -0.1436, -0.1670,  0.4829,
         0.0064, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.0388,  0.2535, -0.1413, -0.1436, -0.1670,  0.4829,
         0.0064, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.0388,  0.2535, -0.1413, -0.1436, -0.1670,  0.4829,
         0.0064, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.0388,  0.2535, -0.1413, -0.1436, -0.1670,  0.4829,
         0.0064, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.0388,  0.2535, -0.1413, -0.1436, -0.1670,  0.4829,
         0.0064, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.0388,  0.2535, -0.1413, -0.1436, -0.1670,  0.4829,
         0.0064, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.0388,  0.2535, -0.1413, -0.1436, -0.1670,  0.4829,
         0.0064, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.0388,  0.2535, -0.1413, -0.1436, -0.1670,  0.4829,
         0.0064, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.0388,  0.2535, -0.1413, -0.1436, -0.1670,  0.4829,
         0.0064, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.3930, -0.1226, -0.3290,  0.0887,  0.0604, -0.3930,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.3930, -0.1226, -0.3290,  0.0887,  0.0604, -0.3930,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.3930, -0.1226, -0.3290,  0.0887,  0.0604, -0.3930,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.3930, -0.1226, -0.3290,  0.0887,  0.0604, -0.3930,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.3930, -0.1226, -0.3290,  0.0887,  0.0604, -0.3930,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.3930, -0.1226, -0.3290,  0.0887,  0.0604, -0.3930,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.3930, -0.1226, -0.3290,  0.0887,  0.0604, -0.3930,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.3930, -0.1226, -0.3290,  0.0887,  0.0604, -0.3930,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.3930, -0.1226, -0.3290,  0.0887,  0.0604, -0.3930,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.3930, -0.1226, -0.3290,  0.0887,  0.0604, -0.3930,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.3930, -0.1226, -0.3290,  0.0887,  0.0604, -0.3930,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.3930, -0.1226, -0.3290,  0.0887,  0.0604, -0.3930,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.3930, -0.1226, -0.3290,  0.0887,  0.0604, -0.3930,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.3930, -0.1226, -0.3290,  0.0887,  0.0604, -0.3930,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.3930, -0.1226, -0.3290,  0.0887,  0.0604, -0.3930,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.3930, -0.1226, -0.3290,  0.0887,  0.0604, -0.3930,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.3930, -0.1226, -0.3290,  0.0887,  0.0604, -0.3930,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.3930, -0.1226, -0.3290,  0.0887,  0.0604, -0.3930,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.3930, -0.1226, -0.3290,  0.0887,  0.0604, -0.3930,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.3930, -0.1226, -0.3290,  0.0887,  0.0604, -0.3930,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.3930, -0.1226, -0.3290,  0.0887,  0.0604, -0.3930,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.3930, -0.1226, -0.3290,  0.0887,  0.0604, -0.3930,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.3930, -0.1226, -0.3290,  0.0887,  0.0604, -0.3930,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.3930, -0.1226, -0.3290,  0.0887,  0.0604, -0.3930,  0.5473,
         0.0375, -0.2206,  0.3674, -0.1133, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-6.4571e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 8

Layer 2


Tokens embeddings:
tensor([-6.4571e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 8

Layer 3


Tokens embeddings:
tensor([-6.4571e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 8

Layer 4


Tokens embeddings:
tensor([-6.4571e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 8

Layer 5


Tokens embeddings:
tensor([-6.4571e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 8

Layer 6


Tokens embeddings:
tensor([-6.4571e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 8

Layer 7


Tokens embeddings:
tensor([-6.4571e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 8

Layer 8


Tokens embeddings:
tensor([-6.4571e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 8

Layer 9


Tokens embeddings:
tensor([-6.4571e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 8

Layer 10


Tokens embeddings:
tensor([-6.4571e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 8

Layer 11


Tokens embeddings:
tensor([-6.4571e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 8

Layer 12


Tokens embeddings:
tensor([-6.4571e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -6.3919e-02, -1.5160e-02, -6.1332e-01,  5.8607e-01,
         9.4284e-01,  1.2314e-01, -3.9304e-01,  1.3893e+00, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 8


In [None]:
# Pooling First token [CLS] for each sentence - argmax
#
# For a handful of test sentences this cell shows, layer by layer:
#   * a colourised view of the attention row of the first ([CLS]) position,
#   * the per-input values `model_e(...)[0][:, 0]` and the index of their maximum.
#
# Relies on names created by earlier cells: test_inputs, test_attentions,
# bert_tokenizer, model_e, device, colorize, display, HTML, argmax.

# Sentence indices to inspect (fixed list, so the output is reproducible)
sent_index = [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

for s in sent_index:
  print("*" * 100)
  # Get the sentence's words
  tokens = test_inputs[s]

  # Tokenisation and the forward pass do not depend on the layer index, so they
  # are done once per sentence instead of once per layer (the values printed
  # for every layer of a sentence are identical anyway).
  encoded_tokens = bert_tokenizer(tokens, padding=True, truncation=True, max_length=128, return_tensors='pt')
  encoded_tokens = encoded_tokens.to(device)
  with torch.no_grad():
    model_output1 = model_e(**encoded_tokens)
  # First column of the model's first output, moved to CPU for printing.
  # NOTE(review): presumably one value per entry of `tokens` -- confirm what
  # model_e returns at index 0 (logits vs. hidden states).
  tokens_embeddings = model_output1[0][:, 0].cpu()
  arg = argmax(tokens_embeddings)

  # For each layer...
  for l in range(12):
    print("\nLayer", l+1)
    # Drop the leading singleton axis; the remaining first axis is iterated as
    # "heads" by the per-head loop this cell used to contain.
    # NOTE(review): assumes the result is (heads, seq, seq) -- TODO confirm.
    attention = np.squeeze(test_attentions[s][l].detach().cpu().numpy(), axis=0)
    # Get the attention for the cls token.
    # The per-head loop is disabled, so `head` was never assigned here: on a
    # fresh kernel that is a NameError, otherwise a stale row from another
    # cell. Use this layer's attention instead, averaged over heads so there
    # is a single visualisation per layer.
    cls_attentions = attention.mean(axis=0)[0]
    display(HTML(colorize(tokens, cls_attentions)))
    print("Tokens embeddings:")
    print(tokens_embeddings)
    print('arg max of %s: %d' % (tokens_embeddings, arg))

****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.6469,  0.0604, -0.1177,  0.1231,  0.5726,  0.6028,
        -0.0811, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.6469,  0.0604, -0.1177,  0.1231,  0.5726,  0.6028,
        -0.0811, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.6469,  0.0604, -0.1177,  0.1231,  0.5726,  0.6028,
        -0.0811, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.6469,  0.0604, -0.1177,  0.1231,  0.5726,  0.6028,
        -0.0811, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.6469,  0.0604, -0.1177,  0.1231,  0.5726,  0.6028,
        -0.0811, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.6469,  0.0604, -0.1177,  0.1231,  0.5726,  0.6028,
        -0.0811, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.6469,  0.0604, -0.1177,  0.1231,  0.5726,  0.6028,
        -0.0811, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.6469,  0.0604, -0.1177,  0.1231,  0.5726,  0.6028,
        -0.0811, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.6469,  0.0604, -0.1177,  0.1231,  0.5726,  0.6028,
        -0.0811, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.6469,  0.0604, -0.1177,  0.1231,  0.5726,  0.6028,
        -0.0811, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.6469,  0.0604, -0.1177,  0.1231,  0.5726,  0.6028,
        -0.0811, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.6469,  0.0604, -0.1177,  0.1231,  0.5726,  0.6028,
        -0.0811, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.6469,  0.0604, -0.1177,  0.1231,  0.5726,  0.6028,
        -0.0811, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.6469,  0.0604, -0.1177,  0.1231,  0.5726,  0.6028,
        -0.0811, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.6469,  0.0604, -0.1177,  0.1231,  0.5726,  0.6028,
        -0.0811, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.6469,  0.0604, -0.1177,  0.1231,  0.5726,  0.6028,
        -0.0811, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.6469,  0.0604, -0.1177,  0.1231,  0.5726,  0.6028,
        -0.0811, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.6469,  0.0604, -0.1177,  0.1231,  0.5726,  0.6028,
        -0.0811, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.6469,  0.0604, -0.1177,  0.1231,  0.5726,  0.6028,
        -0.0811, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.6469,  0.0604, -0.1177,  0.1231,  0.5726,  0.6028,
        -0.0811, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.6469,  0.0604, -0.1177,  0.1231,  0.5726,  0.6028,
        -0.0811, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.6469,  0.0604, -0.1177,  0.1231,  0.5726,  0.6028,
        -0.0811, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.6469,  0.0604, -0.1177,  0.1231,  0.5726,  0.6028,
        -0.0811, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.6469,  0.0604, -0.1177,  0.1231,  0.5726,  0.6028,
        -0.0811, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-6.4571e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01, -1.1768e-01,
        -8.3683e-02, -1.5523e-01,  9.6988e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01, -1.1768e-01,
        -8.3683e-02, -1.5523e-01,  9.6988e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 7

Layer 2


Tokens embeddings:
tensor([-6.4571e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01, -1.1768e-01,
        -8.3683e-02, -1.5523e-01,  9.6988e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01, -1.1768e-01,
        -8.3683e-02, -1.5523e-01,  9.6988e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 7

Layer 3


Tokens embeddings:
tensor([-6.4571e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01, -1.1768e-01,
        -8.3683e-02, -1.5523e-01,  9.6988e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01, -1.1768e-01,
        -8.3683e-02, -1.5523e-01,  9.6988e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 7

Layer 4


Tokens embeddings:
tensor([-6.4571e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01, -1.1768e-01,
        -8.3683e-02, -1.5523e-01,  9.6988e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01, -1.1768e-01,
        -8.3683e-02, -1.5523e-01,  9.6988e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 7

Layer 5


Tokens embeddings:
tensor([-6.4571e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01, -1.1768e-01,
        -8.3683e-02, -1.5523e-01,  9.6988e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01, -1.1768e-01,
        -8.3683e-02, -1.5523e-01,  9.6988e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 7

Layer 6


Tokens embeddings:
tensor([-6.4571e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01, -1.1768e-01,
        -8.3683e-02, -1.5523e-01,  9.6988e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01, -1.1768e-01,
        -8.3683e-02, -1.5523e-01,  9.6988e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 7

Layer 7


Tokens embeddings:
tensor([-6.4571e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01, -1.1768e-01,
        -8.3683e-02, -1.5523e-01,  9.6988e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01, -1.1768e-01,
        -8.3683e-02, -1.5523e-01,  9.6988e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 7

Layer 8


Tokens embeddings:
tensor([-6.4571e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01, -1.1768e-01,
        -8.3683e-02, -1.5523e-01,  9.6988e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01, -1.1768e-01,
        -8.3683e-02, -1.5523e-01,  9.6988e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 7

Layer 9


Tokens embeddings:
tensor([-6.4571e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01, -1.1768e-01,
        -8.3683e-02, -1.5523e-01,  9.6988e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01, -1.1768e-01,
        -8.3683e-02, -1.5523e-01,  9.6988e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 7

Layer 10


Tokens embeddings:
tensor([-6.4571e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01, -1.1768e-01,
        -8.3683e-02, -1.5523e-01,  9.6988e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01, -1.1768e-01,
        -8.3683e-02, -1.5523e-01,  9.6988e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 7

Layer 11


Tokens embeddings:
tensor([-6.4571e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01, -1.1768e-01,
        -8.3683e-02, -1.5523e-01,  9.6988e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01, -1.1768e-01,
        -8.3683e-02, -1.5523e-01,  9.6988e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 7

Layer 12


Tokens embeddings:
tensor([-6.4571e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01, -1.1768e-01,
        -8.3683e-02, -1.5523e-01,  9.6988e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -1.0342e+00,  3.7102e-02,  4.5119e-01, -1.1768e-01,
        -8.3683e-02, -1.5523e-01,  9.6988e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1782,  0.1030, -0.6015, -0.1170,  0.3545, -0.0837,
        -0.2771,  0.1232, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1782,  0.1030, -0.6015, -0.1170,  0.3545, -0.0837,
        -0.2771,  0.1232, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1782,  0.1030, -0.6015, -0.1170,  0.3545, -0.0837,
        -0.2771,  0.1232, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1782,  0.1030, -0.6015, -0.1170,  0.3545, -0.0837,
        -0.2771,  0.1232, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1782,  0.1030, -0.6015, -0.1170,  0.3545, -0.0837,
        -0.2771,  0.1232, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1782,  0.1030, -0.6015, -0.1170,  0.3545, -0.0837,
        -0.2771,  0.1232, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1782,  0.1030, -0.6015, -0.1170,  0.3545, -0.0837,
        -0.2771,  0.1232, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1782,  0.1030, -0.6015, -0.1170,  0.3545, -0.0837,
        -0.2771,  0.1232, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1782,  0.1030, -0.6015, -0.1170,  0.3545, -0.0837,
        -0.2771,  0.1232, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1782,  0.1030, -0.6015, -0.1170,  0.3545, -0.0837,
        -0.2771,  0.1232, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1782,  0.1030, -0.6015, -0.1170,  0.3545, -0.0837,
        -0.2771,  0.1232, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1782,  0.1030, -0.6015, -0.1170,  0.3545, -0.0837,
        -0.2771,  0.1232, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1782,  0.1030, -0.6015, -0.1170,  0.3545, -0.0837,
        -0.2771,  0.1232, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1782,  0.1030, -0.6015, -0.1170,  0.3545, -0.0837,
        -0.2771,  0.1232, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1782,  0.1030, -0.6015, -0.1170,  0.3545, -0.0837,
        -0.2771,  0.1232, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1782,  0.1030, -0.6015, -0.1170,  0.3545, -0.0837,
        -0.2771,  0.1232, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1782,  0.1030, -0.6015, -0.1170,  0.3545, -0.0837,
        -0.2771,  0.1232, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1782,  0.1030, -0.6015, -0.1170,  0.3545, -0.0837,
        -0.2771,  0.1232, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1782,  0.1030, -0.6015, -0.1170,  0.3545, -0.0837,
        -0.2771,  0.1232, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1782,  0.1030, -0.6015, -0.1170,  0.3545, -0.0837,
        -0.2771,  0.1232, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1782,  0.1030, -0.6015, -0.1170,  0.3545, -0.0837,
        -0.2771,  0.1232, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1782,  0.1030, -0.6015, -0.1170,  0.3545, -0.0837,
        -0.2771,  0.1232, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1782,  0.1030, -0.6015, -0.1170,  0.3545, -0.0837,
        -0.2771,  0.1232, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1782,  0.1030, -0.6015, -0.1170,  0.3545, -0.0837,
        -0.2771,  0.1232, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02, -1.6952e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01, -1.9409e-01, -4.0497e-01,
        -4.1106e-01, -2.0351e-01, -9.7911e-01,  5.8110e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02, -1.6952e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01, -1.9409e-01, -4.0497e-01,
        -4.1106e-01, -2.0351e-01, -9.7911e-01,  5.8110e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 2


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02, -1.6952e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01, -1.9409e-01, -4.0497e-01,
        -4.1106e-01, -2.0351e-01, -9.7911e-01,  5.8110e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02, -1.6952e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01, -1.9409e-01, -4.0497e-01,
        -4.1106e-01, -2.0351e-01, -9.7911e-01,  5.8110e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 3


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02, -1.6952e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01, -1.9409e-01, -4.0497e-01,
        -4.1106e-01, -2.0351e-01, -9.7911e-01,  5.8110e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02, -1.6952e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01, -1.9409e-01, -4.0497e-01,
        -4.1106e-01, -2.0351e-01, -9.7911e-01,  5.8110e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 4


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02, -1.6952e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01, -1.9409e-01, -4.0497e-01,
        -4.1106e-01, -2.0351e-01, -9.7911e-01,  5.8110e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02, -1.6952e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01, -1.9409e-01, -4.0497e-01,
        -4.1106e-01, -2.0351e-01, -9.7911e-01,  5.8110e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 5


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02, -1.6952e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01, -1.9409e-01, -4.0497e-01,
        -4.1106e-01, -2.0351e-01, -9.7911e-01,  5.8110e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02, -1.6952e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01, -1.9409e-01, -4.0497e-01,
        -4.1106e-01, -2.0351e-01, -9.7911e-01,  5.8110e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 6


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02, -1.6952e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01, -1.9409e-01, -4.0497e-01,
        -4.1106e-01, -2.0351e-01, -9.7911e-01,  5.8110e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02, -1.6952e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01, -1.9409e-01, -4.0497e-01,
        -4.1106e-01, -2.0351e-01, -9.7911e-01,  5.8110e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 7


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02, -1.6952e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01, -1.9409e-01, -4.0497e-01,
        -4.1106e-01, -2.0351e-01, -9.7911e-01,  5.8110e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02, -1.6952e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01, -1.9409e-01, -4.0497e-01,
        -4.1106e-01, -2.0351e-01, -9.7911e-01,  5.8110e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 8


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02, -1.6952e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01, -1.9409e-01, -4.0497e-01,
        -4.1106e-01, -2.0351e-01, -9.7911e-01,  5.8110e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02, -1.6952e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01, -1.9409e-01, -4.0497e-01,
        -4.1106e-01, -2.0351e-01, -9.7911e-01,  5.8110e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 9


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02, -1.6952e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01, -1.9409e-01, -4.0497e-01,
        -4.1106e-01, -2.0351e-01, -9.7911e-01,  5.8110e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02, -1.6952e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01, -1.9409e-01, -4.0497e-01,
        -4.1106e-01, -2.0351e-01, -9.7911e-01,  5.8110e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 10


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02, -1.6952e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01, -1.9409e-01, -4.0497e-01,
        -4.1106e-01, -2.0351e-01, -9.7911e-01,  5.8110e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02, -1.6952e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01, -1.9409e-01, -4.0497e-01,
        -4.1106e-01, -2.0351e-01, -9.7911e-01,  5.8110e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 11


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02, -1.6952e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01, -1.9409e-01, -4.0497e-01,
        -4.1106e-01, -2.0351e-01, -9.7911e-01,  5.8110e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02, -1.6952e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01, -1.9409e-01, -4.0497e-01,
        -4.1106e-01, -2.0351e-01, -9.7911e-01,  5.8110e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 12


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02, -1.6952e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01, -1.9409e-01, -4.0497e-01,
        -4.1106e-01, -2.0351e-01, -9.7911e-01,  5.8110e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01,  8.6649e-03,  3.7102e-02, -1.6952e-01,
         8.6543e-01,  9.4951e-02, -7.0458e-01, -1.9409e-01, -4.0497e-01,
        -4.1106e-01, -2.0351e-01, -9.7911e-01,  5.8110e-01, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.2426, -0.2971,  0.3211,  0.5526, -0.5874,  0.0887,
        -0.7508, -0.7440, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.2426, -0.2971,  0.3211,  0.5526, -0.5874,  0.0887,
        -0.7508, -0.7440, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.2426, -0.2971,  0.3211,  0.5526, -0.5874,  0.0887,
        -0.7508, -0.7440, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.2426, -0.2971,  0.3211,  0.5526, -0.5874,  0.0887,
        -0.7508, -0.7440, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.2426, -0.2971,  0.3211,  0.5526, -0.5874,  0.0887,
        -0.7508, -0.7440, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.2426, -0.2971,  0.3211,  0.5526, -0.5874,  0.0887,
        -0.7508, -0.7440, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.2426, -0.2971,  0.3211,  0.5526, -0.5874,  0.0887,
        -0.7508, -0.7440, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.2426, -0.2971,  0.3211,  0.5526, -0.5874,  0.0887,
        -0.7508, -0.7440, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.2426, -0.2971,  0.3211,  0.5526, -0.5874,  0.0887,
        -0.7508, -0.7440, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.2426, -0.2971,  0.3211,  0.5526, -0.5874,  0.0887,
        -0.7508, -0.7440, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.2426, -0.2971,  0.3211,  0.5526, -0.5874,  0.0887,
        -0.7508, -0.7440, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.2426, -0.2971,  0.3211,  0.5526, -0.5874,  0.0887,
        -0.7508, -0.7440, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.2426, -0.2971,  0.3211,  0.5526, -0.5874,  0.0887,
        -0.7508, -0.7440, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.2426, -0.2971,  0.3211,  0.5526, -0.5874,  0.0887,
        -0.7508, -0.7440, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.2426, -0.2971,  0.3211,  0.5526, -0.5874,  0.0887,
        -0.7508, -0.7440, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.2426, -0.2971,  0.3211,  0.5526, -0.5874,  0.0887,
        -0.7508, -0.7440, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.2426, -0.2971,  0.3211,  0.5526, -0.5874,  0.0887,
        -0.7508, -0.7440, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.2426, -0.2971,  0.3211,  0.5526, -0.5874,  0.0887,
        -0.7508, -0.7440, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.2426, -0.2971,  0.3211,  0.5526, -0.5874,  0.0887,
        -0.7508, -0.7440, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.2426, -0.2971,  0.3211,  0.5526, -0.5874,  0.0887,
        -0.7508, -0.7440, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.2426, -0.2971,  0.3211,  0.5526, -0.5874,  0.0887,
        -0.7508, -0.7440, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.2426, -0.2971,  0.3211,  0.5526, -0.5874,  0.0887,
        -0.7508, -0.7440, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.2426, -0.2971,  0.3211,  0.5526, -0.5874,  0.0887,
        -0.7508, -0.7440, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.2426, -0.2971,  0.3211,  0.5526, -0.5874,  0.0887,
        -0.7508, -0.7440, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-6.4571e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01, -1.1768e-01,
        -9.7911e-01, -1.2866e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01, -1.1768e-01,
        -9.7911e-01, -1.2866e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 2


Tokens embeddings:
tensor([-6.4571e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01, -1.1768e-01,
        -9.7911e-01, -1.2866e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01, -1.1768e-01,
        -9.7911e-01, -1.2866e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 3


Tokens embeddings:
tensor([-6.4571e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01, -1.1768e-01,
        -9.7911e-01, -1.2866e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01, -1.1768e-01,
        -9.7911e-01, -1.2866e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 4


Tokens embeddings:
tensor([-6.4571e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01, -1.1768e-01,
        -9.7911e-01, -1.2866e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01, -1.1768e-01,
        -9.7911e-01, -1.2866e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 5


Tokens embeddings:
tensor([-6.4571e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01, -1.1768e-01,
        -9.7911e-01, -1.2866e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01, -1.1768e-01,
        -9.7911e-01, -1.2866e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 6


Tokens embeddings:
tensor([-6.4571e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01, -1.1768e-01,
        -9.7911e-01, -1.2866e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01, -1.1768e-01,
        -9.7911e-01, -1.2866e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 7


Tokens embeddings:
tensor([-6.4571e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01, -1.1768e-01,
        -9.7911e-01, -1.2866e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01, -1.1768e-01,
        -9.7911e-01, -1.2866e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 8


Tokens embeddings:
tensor([-6.4571e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01, -1.1768e-01,
        -9.7911e-01, -1.2866e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01, -1.1768e-01,
        -9.7911e-01, -1.2866e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 9


Tokens embeddings:
tensor([-6.4571e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01, -1.1768e-01,
        -9.7911e-01, -1.2866e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01, -1.1768e-01,
        -9.7911e-01, -1.2866e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 10


Tokens embeddings:
tensor([-6.4571e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01, -1.1768e-01,
        -9.7911e-01, -1.2866e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01, -1.1768e-01,
        -9.7911e-01, -1.2866e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 11


Tokens embeddings:
tensor([-6.4571e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01, -1.1768e-01,
        -9.7911e-01, -1.2866e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01, -1.1768e-01,
        -9.7911e-01, -1.2866e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 12


Tokens embeddings:
tensor([-6.4571e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01, -1.1768e-01,
        -9.7911e-01, -1.2866e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.9882e-01,  1.7266e-01, -1.3025e-01, -1.1768e-01,
        -9.7911e-01, -1.2866e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.6152, -0.1391,  0.2534, -0.0691,  0.0847, -0.2408,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.6152, -0.1391,  0.2534, -0.0691,  0.0847, -0.2408,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.6152, -0.1391,  0.2534, -0.0691,  0.0847, -0.2408,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.6152, -0.1391,  0.2534, -0.0691,  0.0847, -0.2408,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.6152, -0.1391,  0.2534, -0.0691,  0.0847, -0.2408,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.6152, -0.1391,  0.2534, -0.0691,  0.0847, -0.2408,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.6152, -0.1391,  0.2534, -0.0691,  0.0847, -0.2408,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.6152, -0.1391,  0.2534, -0.0691,  0.0847, -0.2408,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.6152, -0.1391,  0.2534, -0.0691,  0.0847, -0.2408,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.6152, -0.1391,  0.2534, -0.0691,  0.0847, -0.2408,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.6152, -0.1391,  0.2534, -0.0691,  0.0847, -0.2408,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.6152, -0.1391,  0.2534, -0.0691,  0.0847, -0.2408,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.6152, -0.1391,  0.2534, -0.0691,  0.0847, -0.2408,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.6152, -0.1391,  0.2534, -0.0691,  0.0847, -0.2408,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.6152, -0.1391,  0.2534, -0.0691,  0.0847, -0.2408,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.6152, -0.1391,  0.2534, -0.0691,  0.0847, -0.2408,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.6152, -0.1391,  0.2534, -0.0691,  0.0847, -0.2408,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.6152, -0.1391,  0.2534, -0.0691,  0.0847, -0.2408,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.6152, -0.1391,  0.2534, -0.0691,  0.0847, -0.2408,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.6152, -0.1391,  0.2534, -0.0691,  0.0847, -0.2408,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.6152, -0.1391,  0.2534, -0.0691,  0.0847, -0.2408,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.6152, -0.1391,  0.2534, -0.0691,  0.0847, -0.2408,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.6152, -0.1391,  0.2534, -0.0691,  0.0847, -0.2408,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.6152, -0.1391,  0.2534, -0.0691,  0.0847, -0.2408,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.0675, -0.3930, -0.3539,  0.5275, -0.2035,  0.3236, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0675, -0.3930, -0.3539,  0.5275, -0.2035,  0.3236, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.0675, -0.3930, -0.3539,  0.5275, -0.2035,  0.3236, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0675, -0.3930, -0.3539,  0.5275, -0.2035,  0.3236, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.0675, -0.3930, -0.3539,  0.5275, -0.2035,  0.3236, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0675, -0.3930, -0.3539,  0.5275, -0.2035,  0.3236, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.0675, -0.3930, -0.3539,  0.5275, -0.2035,  0.3236, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0675, -0.3930, -0.3539,  0.5275, -0.2035,  0.3236, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.0675, -0.3930, -0.3539,  0.5275, -0.2035,  0.3236, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0675, -0.3930, -0.3539,  0.5275, -0.2035,  0.3236, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.0675, -0.3930, -0.3539,  0.5275, -0.2035,  0.3236, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0675, -0.3930, -0.3539,  0.5275, -0.2035,  0.3236, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.0675, -0.3930, -0.3539,  0.5275, -0.2035,  0.3236, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0675, -0.3930, -0.3539,  0.5275, -0.2035,  0.3236, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.0675, -0.3930, -0.3539,  0.5275, -0.2035,  0.3236, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0675, -0.3930, -0.3539,  0.5275, -0.2035,  0.3236, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.0675, -0.3930, -0.3539,  0.5275, -0.2035,  0.3236, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0675, -0.3930, -0.3539,  0.5275, -0.2035,  0.3236, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.0675, -0.3930, -0.3539,  0.5275, -0.2035,  0.3236, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0675, -0.3930, -0.3539,  0.5275, -0.2035,  0.3236, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.0675, -0.3930, -0.3539,  0.5275, -0.2035,  0.3236, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0675, -0.3930, -0.3539,  0.5275, -0.2035,  0.3236, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.0675, -0.3930, -0.3539,  0.5275, -0.2035,  0.3236, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0675, -0.3930, -0.3539,  0.5275, -0.2035,  0.3236, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.0367, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
        -0.3930, -0.5077, -0.3467,  0.0950, -0.3930,  0.8606, -0.1280,  0.6591,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.0367, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
        -0.3930, -0.5077, -0.3467,  0.0950, -0.3930,  0.8606, -0.1280,  0.6591,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437]): 4

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.0367, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
        -0.3930, -0.5077, -0.3467,  0.0950, -0.3930,  0.8606, -0.1280,  0.6591,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.0367, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
        -0.3930, -0.5077, -0.3467,  0.0950, -0.3930,  0.8606, -0.1280,  0.6591,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437]): 4

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.0367, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
        -0.3930, -0.5077, -0.3467,  0.0950, -0.3930,  0.8606, -0.1280,  0.6591,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.0367, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
        -0.3930, -0.5077, -0.3467,  0.0950, -0.3930,  0.8606, -0.1280,  0.6591,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437]): 4

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.0367, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
        -0.3930, -0.5077, -0.3467,  0.0950, -0.3930,  0.8606, -0.1280,  0.6591,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.0367, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
        -0.3930, -0.5077, -0.3467,  0.0950, -0.3930,  0.8606, -0.1280,  0.6591,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437]): 4

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.0367, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
        -0.3930, -0.5077, -0.3467,  0.0950, -0.3930,  0.8606, -0.1280,  0.6591,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.0367, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
        -0.3930, -0.5077, -0.3467,  0.0950, -0.3930,  0.8606, -0.1280,  0.6591,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437]): 4

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.0367, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
        -0.3930, -0.5077, -0.3467,  0.0950, -0.3930,  0.8606, -0.1280,  0.6591,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.0367, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
        -0.3930, -0.5077, -0.3467,  0.0950, -0.3930,  0.8606, -0.1280,  0.6591,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437]): 4

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.0367, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
        -0.3930, -0.5077, -0.3467,  0.0950, -0.3930,  0.8606, -0.1280,  0.6591,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.0367, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
        -0.3930, -0.5077, -0.3467,  0.0950, -0.3930,  0.8606, -0.1280,  0.6591,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437]): 4

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.0367, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
        -0.3930, -0.5077, -0.3467,  0.0950, -0.3930,  0.8606, -0.1280,  0.6591,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.0367, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
        -0.3930, -0.5077, -0.3467,  0.0950, -0.3930,  0.8606, -0.1280,  0.6591,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437]): 4

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.0367, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
        -0.3930, -0.5077, -0.3467,  0.0950, -0.3930,  0.8606, -0.1280,  0.6591,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.0367, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
        -0.3930, -0.5077, -0.3467,  0.0950, -0.3930,  0.8606, -0.1280,  0.6591,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437]): 4

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.0367, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
        -0.3930, -0.5077, -0.3467,  0.0950, -0.3930,  0.8606, -0.1280,  0.6591,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.0367, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
        -0.3930, -0.5077, -0.3467,  0.0950, -0.3930,  0.8606, -0.1280,  0.6591,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437]): 4

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.0367, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
        -0.3930, -0.5077, -0.3467,  0.0950, -0.3930,  0.8606, -0.1280,  0.6591,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.0367, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
        -0.3930, -0.5077, -0.3467,  0.0950, -0.3930,  0.8606, -0.1280,  0.6591,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437]): 4

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.0367, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
        -0.3930, -0.5077, -0.3467,  0.0950, -0.3930,  0.8606, -0.1280,  0.6591,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.0367, -0.0152,  0.9402,  0.0887,  0.1225, -0.2025,
        -0.3930, -0.5077, -0.3467,  0.0950, -0.3930,  0.8606, -0.1280,  0.6591,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-6.4571e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01, -1.0577e+00,
        -1.2175e-01, -2.0340e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01, -1.0577e+00,
        -1.2175e-01, -2.0340e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 2


Tokens embeddings:
tensor([-6.4571e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01, -1.0577e+00,
        -1.2175e-01, -2.0340e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01, -1.0577e+00,
        -1.2175e-01, -2.0340e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 3


Tokens embeddings:
tensor([-6.4571e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01, -1.0577e+00,
        -1.2175e-01, -2.0340e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01, -1.0577e+00,
        -1.2175e-01, -2.0340e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 4


Tokens embeddings:
tensor([-6.4571e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01, -1.0577e+00,
        -1.2175e-01, -2.0340e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01, -1.0577e+00,
        -1.2175e-01, -2.0340e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 5


Tokens embeddings:
tensor([-6.4571e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01, -1.0577e+00,
        -1.2175e-01, -2.0340e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01, -1.0577e+00,
        -1.2175e-01, -2.0340e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 6


Tokens embeddings:
tensor([-6.4571e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01, -1.0577e+00,
        -1.2175e-01, -2.0340e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01, -1.0577e+00,
        -1.2175e-01, -2.0340e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 7


Tokens embeddings:
tensor([-6.4571e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01, -1.0577e+00,
        -1.2175e-01, -2.0340e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01, -1.0577e+00,
        -1.2175e-01, -2.0340e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 8


Tokens embeddings:
tensor([-6.4571e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01, -1.0577e+00,
        -1.2175e-01, -2.0340e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01, -1.0577e+00,
        -1.2175e-01, -2.0340e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 9


Tokens embeddings:
tensor([-6.4571e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01, -1.0577e+00,
        -1.2175e-01, -2.0340e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01, -1.0577e+00,
        -1.2175e-01, -2.0340e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 10


Tokens embeddings:
tensor([-6.4571e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01, -1.0577e+00,
        -1.2175e-01, -2.0340e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01, -1.0577e+00,
        -1.2175e-01, -2.0340e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 11


Tokens embeddings:
tensor([-6.4571e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01, -1.0577e+00,
        -1.2175e-01, -2.0340e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01, -1.0577e+00,
        -1.2175e-01, -2.0340e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 12


Tokens embeddings:
tensor([-6.4571e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01, -1.0577e+00,
        -1.2175e-01, -2.0340e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -4.0320e-01,  2.0520e-01,  3.9382e-01, -1.0577e+00,
        -1.2175e-01, -2.0340e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3


In [None]:
# Pooling First token [CLS] for each sentence - argmax
#
# For every selected test sentence this cell:
#   1. runs `model_e` once on the sentence's tokens and takes index 0 along the
#      second axis of the model's first output,
#   2. for each of the 12 layers, renders the [CLS] attention over the tokens and
#      prints the pooled values together with their argmax.

# Inspect a fixed, contiguous slice of the test set (indices 21..64 inclusive).
sent_index = list(range(21, 65))

for s in sent_index:
  print("*" * 100)
  # Get the sentence's words
  tokens = test_inputs[s]

  # The forward pass depends only on the sentence, not on the layer being
  # displayed, so it is computed once here instead of once per layer.
  encoded_tokens = bert_tokenizer(tokens, padding=True, truncation=True, max_length=128, return_tensors='pt')
  encoded_tokens = encoded_tokens.to(device)
  with torch.no_grad():
    model_output1 = model_e(**encoded_tokens)
  # Move to CPU so the tensor can be printed and passed to argmax.
  tokens_embeddings = model_output1[0][:, 0].cpu()
  arg = argmax(tokens_embeddings)

  # For each layer...
  for layer in range(12):
    print("\nLayer", layer + 1)
    attention = np.squeeze(test_attentions[s][layer].detach().cpu().numpy(), axis=0)
    # Get the attention for the cls token.
    # The per-head loop is disabled, so average over the leading axis and take
    # row 0 rather than relying on a `head` variable left over from another cell.
    # NOTE(review): assumes `attention` is (heads, seq, seq) with [CLS] at row 0,
    # as implied by the former `for h, head in enumerate(attention): head[0]` - confirm.
    cls_attentions = attention.mean(axis=0)[0]
    display(HTML(colorize(tokens, cls_attentions)))
    # These values are identical for every layer of the same sentence; they are
    # repeated under each layer heading to keep the printed layout unchanged.
    print("Tokens embeddings:")
    print(tokens_embeddings)
    print('arg max of %s: %d' % (tokens_embeddings, arg))

****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-6.4571e-02, -2.9882e-01, -2.6628e-01,  8.8741e-02, -5.2115e-02,
        -1.0577e+00, -8.6423e-01,  9.4951e-02, -9.7911e-01, -2.0302e-01,
        -1.3025e-01,  8.8741e-02, -3.3548e-01, -7.6757e-02, -1.0577e+00,
         7.1513e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.9882e-01, -2.6628e-01,  8.8741e-02, -5.2115e-02,
        -1.0577e+00, -8.6423e-01,  9.4951e-02, -9.7911e-01, -2.0302e-01,
        -1.3025e-01,  8.8741e-02, -3.3548e-01, -7.6757e-02, -1.0577e+00,
         7.1513e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01]): 7

Layer 2


Tokens embeddings:
tensor([-6.4571e-02, -2.9882e-01, -2.6628e-01,  8.8741e-02, -5.2115e-02,
        -1.0577e+00, -8.6423e-01,  9.4951e-02, -9.7911e-01, -2.0302e-01,
        -1.3025e-01,  8.8741e-02, -3.3548e-01, -7.6757e-02, -1.0577e+00,
         7.1513e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.9882e-01, -2.6628e-01,  8.8741e-02, -5.2115e-02,
        -1.0577e+00, -8.6423e-01,  9.4951e-02, -9.7911e-01, -2.0302e-01,
        -1.3025e-01,  8.8741e-02, -3.3548e-01, -7.6757e-02, -1.0577e+00,
         7.1513e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01]): 7

Layer 3


Tokens embeddings:
tensor([-6.4571e-02, -2.9882e-01, -2.6628e-01,  8.8741e-02, -5.2115e-02,
        -1.0577e+00, -8.6423e-01,  9.4951e-02, -9.7911e-01, -2.0302e-01,
        -1.3025e-01,  8.8741e-02, -3.3548e-01, -7.6757e-02, -1.0577e+00,
         7.1513e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.9882e-01, -2.6628e-01,  8.8741e-02, -5.2115e-02,
        -1.0577e+00, -8.6423e-01,  9.4951e-02, -9.7911e-01, -2.0302e-01,
        -1.3025e-01,  8.8741e-02, -3.3548e-01, -7.6757e-02, -1.0577e+00,
         7.1513e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01]): 7

Layer 4


Tokens embeddings:
tensor([-6.4571e-02, -2.9882e-01, -2.6628e-01,  8.8741e-02, -5.2115e-02,
        -1.0577e+00, -8.6423e-01,  9.4951e-02, -9.7911e-01, -2.0302e-01,
        -1.3025e-01,  8.8741e-02, -3.3548e-01, -7.6757e-02, -1.0577e+00,
         7.1513e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.9882e-01, -2.6628e-01,  8.8741e-02, -5.2115e-02,
        -1.0577e+00, -8.6423e-01,  9.4951e-02, -9.7911e-01, -2.0302e-01,
        -1.3025e-01,  8.8741e-02, -3.3548e-01, -7.6757e-02, -1.0577e+00,
         7.1513e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01]): 7

Layer 5


Tokens embeddings:
tensor([-6.4571e-02, -2.9882e-01, -2.6628e-01,  8.8741e-02, -5.2115e-02,
        -1.0577e+00, -8.6423e-01,  9.4951e-02, -9.7911e-01, -2.0302e-01,
        -1.3025e-01,  8.8741e-02, -3.3548e-01, -7.6757e-02, -1.0577e+00,
         7.1513e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.9882e-01, -2.6628e-01,  8.8741e-02, -5.2115e-02,
        -1.0577e+00, -8.6423e-01,  9.4951e-02, -9.7911e-01, -2.0302e-01,
        -1.3025e-01,  8.8741e-02, -3.3548e-01, -7.6757e-02, -1.0577e+00,
         7.1513e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01]): 7

Layer 6


Tokens embeddings:
tensor([-6.4571e-02, -2.9882e-01, -2.6628e-01,  8.8741e-02, -5.2115e-02,
        -1.0577e+00, -8.6423e-01,  9.4951e-02, -9.7911e-01, -2.0302e-01,
        -1.3025e-01,  8.8741e-02, -3.3548e-01, -7.6757e-02, -1.0577e+00,
         7.1513e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.9882e-01, -2.6628e-01,  8.8741e-02, -5.2115e-02,
        -1.0577e+00, -8.6423e-01,  9.4951e-02, -9.7911e-01, -2.0302e-01,
        -1.3025e-01,  8.8741e-02, -3.3548e-01, -7.6757e-02, -1.0577e+00,
         7.1513e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01]): 7

Layer 7


Tokens embeddings:
tensor([-6.4571e-02, -2.9882e-01, -2.6628e-01,  8.8741e-02, -5.2115e-02,
        -1.0577e+00, -8.6423e-01,  9.4951e-02, -9.7911e-01, -2.0302e-01,
        -1.3025e-01,  8.8741e-02, -3.3548e-01, -7.6757e-02, -1.0577e+00,
         7.1513e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.9882e-01, -2.6628e-01,  8.8741e-02, -5.2115e-02,
        -1.0577e+00, -8.6423e-01,  9.4951e-02, -9.7911e-01, -2.0302e-01,
        -1.3025e-01,  8.8741e-02, -3.3548e-01, -7.6757e-02, -1.0577e+00,
         7.1513e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01]): 7

Layer 8


Tokens embeddings:
tensor([-6.4571e-02, -2.9882e-01, -2.6628e-01,  8.8741e-02, -5.2115e-02,
        -1.0577e+00, -8.6423e-01,  9.4951e-02, -9.7911e-01, -2.0302e-01,
        -1.3025e-01,  8.8741e-02, -3.3548e-01, -7.6757e-02, -1.0577e+00,
         7.1513e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.9882e-01, -2.6628e-01,  8.8741e-02, -5.2115e-02,
        -1.0577e+00, -8.6423e-01,  9.4951e-02, -9.7911e-01, -2.0302e-01,
        -1.3025e-01,  8.8741e-02, -3.3548e-01, -7.6757e-02, -1.0577e+00,
         7.1513e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01]): 7

Layer 9


Tokens embeddings:
tensor([-6.4571e-02, -2.9882e-01, -2.6628e-01,  8.8741e-02, -5.2115e-02,
        -1.0577e+00, -8.6423e-01,  9.4951e-02, -9.7911e-01, -2.0302e-01,
        -1.3025e-01,  8.8741e-02, -3.3548e-01, -7.6757e-02, -1.0577e+00,
         7.1513e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.9882e-01, -2.6628e-01,  8.8741e-02, -5.2115e-02,
        -1.0577e+00, -8.6423e-01,  9.4951e-02, -9.7911e-01, -2.0302e-01,
        -1.3025e-01,  8.8741e-02, -3.3548e-01, -7.6757e-02, -1.0577e+00,
         7.1513e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01]): 7

Layer 10


Tokens embeddings:
tensor([-6.4571e-02, -2.9882e-01, -2.6628e-01,  8.8741e-02, -5.2115e-02,
        -1.0577e+00, -8.6423e-01,  9.4951e-02, -9.7911e-01, -2.0302e-01,
        -1.3025e-01,  8.8741e-02, -3.3548e-01, -7.6757e-02, -1.0577e+00,
         7.1513e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.9882e-01, -2.6628e-01,  8.8741e-02, -5.2115e-02,
        -1.0577e+00, -8.6423e-01,  9.4951e-02, -9.7911e-01, -2.0302e-01,
        -1.3025e-01,  8.8741e-02, -3.3548e-01, -7.6757e-02, -1.0577e+00,
         7.1513e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01]): 7

Layer 11


Tokens embeddings:
tensor([-6.4571e-02, -2.9882e-01, -2.6628e-01,  8.8741e-02, -5.2115e-02,
        -1.0577e+00, -8.6423e-01,  9.4951e-02, -9.7911e-01, -2.0302e-01,
        -1.3025e-01,  8.8741e-02, -3.3548e-01, -7.6757e-02, -1.0577e+00,
         7.1513e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.9882e-01, -2.6628e-01,  8.8741e-02, -5.2115e-02,
        -1.0577e+00, -8.6423e-01,  9.4951e-02, -9.7911e-01, -2.0302e-01,
        -1.3025e-01,  8.8741e-02, -3.3548e-01, -7.6757e-02, -1.0577e+00,
         7.1513e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01]): 7

Layer 12


Tokens embeddings:
tensor([-6.4571e-02, -2.9882e-01, -2.6628e-01,  8.8741e-02, -5.2115e-02,
        -1.0577e+00, -8.6423e-01,  9.4951e-02, -9.7911e-01, -2.0302e-01,
        -1.3025e-01,  8.8741e-02, -3.3548e-01, -7.6757e-02, -1.0577e+00,
         7.1513e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.9882e-01, -2.6628e-01,  8.8741e-02, -5.2115e-02,
        -1.0577e+00, -8.6423e-01,  9.4951e-02, -9.7911e-01, -2.0302e-01,
        -1.3025e-01,  8.8741e-02, -3.3548e-01, -7.6757e-02, -1.0577e+00,
         7.1513e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.2030,  0.2052,  0.3938, -0.1941, -0.3930, -0.8033,
         0.0364, -0.3930,  0.2233, -0.3822, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.2030,  0.2052,  0.3938, -0.1941, -0.3930, -0.8033,
         0.0364, -0.3930,  0.2233, -0.3822, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.2030,  0.2052,  0.3938, -0.1941, -0.3930, -0.8033,
         0.0364, -0.3930,  0.2233, -0.3822, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.2030,  0.2052,  0.3938, -0.1941, -0.3930, -0.8033,
         0.0364, -0.3930,  0.2233, -0.3822, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.2030,  0.2052,  0.3938, -0.1941, -0.3930, -0.8033,
         0.0364, -0.3930,  0.2233, -0.3822, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.2030,  0.2052,  0.3938, -0.1941, -0.3930, -0.8033,
         0.0364, -0.3930,  0.2233, -0.3822, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.2030,  0.2052,  0.3938, -0.1941, -0.3930, -0.8033,
         0.0364, -0.3930,  0.2233, -0.3822, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.2030,  0.2052,  0.3938, -0.1941, -0.3930, -0.8033,
         0.0364, -0.3930,  0.2233, -0.3822, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.2030,  0.2052,  0.3938, -0.1941, -0.3930, -0.8033,
         0.0364, -0.3930,  0.2233, -0.3822, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.2030,  0.2052,  0.3938, -0.1941, -0.3930, -0.8033,
         0.0364, -0.3930,  0.2233, -0.3822, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.2030,  0.2052,  0.3938, -0.1941, -0.3930, -0.8033,
         0.0364, -0.3930,  0.2233, -0.3822, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.2030,  0.2052,  0.3938, -0.1941, -0.3930, -0.8033,
         0.0364, -0.3930,  0.2233, -0.3822, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.2030,  0.2052,  0.3938, -0.1941, -0.3930, -0.8033,
         0.0364, -0.3930,  0.2233, -0.3822, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.2030,  0.2052,  0.3938, -0.1941, -0.3930, -0.8033,
         0.0364, -0.3930,  0.2233, -0.3822, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.2030,  0.2052,  0.3938, -0.1941, -0.3930, -0.8033,
         0.0364, -0.3930,  0.2233, -0.3822, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.2030,  0.2052,  0.3938, -0.1941, -0.3930, -0.8033,
         0.0364, -0.3930,  0.2233, -0.3822, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.2030,  0.2052,  0.3938, -0.1941, -0.3930, -0.8033,
         0.0364, -0.3930,  0.2233, -0.3822, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.2030,  0.2052,  0.3938, -0.1941, -0.3930, -0.8033,
         0.0364, -0.3930,  0.2233, -0.3822, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.2030,  0.2052,  0.3938, -0.1941, -0.3930, -0.8033,
         0.0364, -0.3930,  0.2233, -0.3822, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.2030,  0.2052,  0.3938, -0.1941, -0.3930, -0.8033,
         0.0364, -0.3930,  0.2233, -0.3822, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.2030,  0.2052,  0.3938, -0.1941, -0.3930, -0.8033,
         0.0364, -0.3930,  0.2233, -0.3822, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.2030,  0.2052,  0.3938, -0.1941, -0.3930, -0.8033,
         0.0364, -0.3930,  0.2233, -0.3822, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.2030,  0.2052,  0.3938, -0.1941, -0.3930, -0.8033,
         0.0364, -0.3930,  0.2233, -0.3822, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.2030,  0.2052,  0.3938, -0.1941, -0.3930, -0.8033,
         0.0364, -0.3930,  0.2233, -0.3822, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-6.4571e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01, -3.9304e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01, -1.0577e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01, -3.9304e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01, -1.0577e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 10

Layer 2


Tokens embeddings:
tensor([-6.4571e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01, -3.9304e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01, -1.0577e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01, -3.9304e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01, -1.0577e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 10

Layer 3


Tokens embeddings:
tensor([-6.4571e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01, -3.9304e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01, -1.0577e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01, -3.9304e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01, -1.0577e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 10

Layer 4


Tokens embeddings:
tensor([-6.4571e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01, -3.9304e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01, -1.0577e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01, -3.9304e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01, -1.0577e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 10

Layer 5


Tokens embeddings:
tensor([-6.4571e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01, -3.9304e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01, -1.0577e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01, -3.9304e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01, -1.0577e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 10

Layer 6


Tokens embeddings:
tensor([-6.4571e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01, -3.9304e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01, -1.0577e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01, -3.9304e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01, -1.0577e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 10

Layer 7


Tokens embeddings:
tensor([-6.4571e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01, -3.9304e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01, -1.0577e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01, -3.9304e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01, -1.0577e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 10

Layer 8


Tokens embeddings:
tensor([-6.4571e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01, -3.9304e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01, -1.0577e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01, -3.9304e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01, -1.0577e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 10

Layer 9


Tokens embeddings:
tensor([-6.4571e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01, -3.9304e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01, -1.0577e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01, -3.9304e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01, -1.0577e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 10

Layer 10


Tokens embeddings:
tensor([-6.4571e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01, -3.9304e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01, -1.0577e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01, -3.9304e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01, -1.0577e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 10

Layer 11


Tokens embeddings:
tensor([-6.4571e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01, -3.9304e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01, -1.0577e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01, -3.9304e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01, -1.0577e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 10

Layer 12


Tokens embeddings:
tensor([-6.4571e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01, -3.9304e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01, -1.0577e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  7.0188e-01,  8.2651e-01, -1.1768e-01, -3.9304e-01,
        -2.4522e-01, -1.5160e-02, -1.9737e-01, -1.0577e+00, -2.2379e-02,
         1.1826e+00, -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 10
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.1339, -0.0617, -0.2230, -0.6609, -0.3930,  0.4762, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.1339, -0.0617, -0.2230, -0.6609, -0.3930,  0.4762, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.1339, -0.0617, -0.2230, -0.6609, -0.3930,  0.4762, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.1339, -0.0617, -0.2230, -0.6609, -0.3930,  0.4762, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.1339, -0.0617, -0.2230, -0.6609, -0.3930,  0.4762, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.1339, -0.0617, -0.2230, -0.6609, -0.3930,  0.4762, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.1339, -0.0617, -0.2230, -0.6609, -0.3930,  0.4762, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.1339, -0.0617, -0.2230, -0.6609, -0.3930,  0.4762, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.1339, -0.0617, -0.2230, -0.6609, -0.3930,  0.4762, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.1339, -0.0617, -0.2230, -0.6609, -0.3930,  0.4762, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.1339, -0.0617, -0.2230, -0.6609, -0.3930,  0.4762, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.1339, -0.0617, -0.2230, -0.6609, -0.3930,  0.4762, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.1339, -0.0617, -0.2230, -0.6609, -0.3930,  0.4762, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.1339, -0.0617, -0.2230, -0.6609, -0.3930,  0.4762, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.1339, -0.0617, -0.2230, -0.6609, -0.3930,  0.4762, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.1339, -0.0617, -0.2230, -0.6609, -0.3930,  0.4762, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.1339, -0.0617, -0.2230, -0.6609, -0.3930,  0.4762, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.1339, -0.0617, -0.2230, -0.6609, -0.3930,  0.4762, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.1339, -0.0617, -0.2230, -0.6609, -0.3930,  0.4762, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.1339, -0.0617, -0.2230, -0.6609, -0.3930,  0.4762, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.1339, -0.0617, -0.2230, -0.6609, -0.3930,  0.4762, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.1339, -0.0617, -0.2230, -0.6609, -0.3930,  0.4762, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.1339, -0.0617, -0.2230, -0.6609, -0.3930,  0.4762, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.1339, -0.0617, -0.2230, -0.6609, -0.3930,  0.4762, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646,  0.0615, -0.1073, -0.3584, -0.0152, -0.1529, -0.2993, -0.0694,
        -0.3930, -0.1749, -0.1510, -0.1436, -0.0152, -0.5619, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0615, -0.1073, -0.3584, -0.0152, -0.1529, -0.2993, -0.0694,
        -0.3930, -0.1749, -0.1510, -0.1436, -0.0152, -0.5619, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 2


Tokens embeddings:
tensor([-0.0646,  0.0615, -0.1073, -0.3584, -0.0152, -0.1529, -0.2993, -0.0694,
        -0.3930, -0.1749, -0.1510, -0.1436, -0.0152, -0.5619, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0615, -0.1073, -0.3584, -0.0152, -0.1529, -0.2993, -0.0694,
        -0.3930, -0.1749, -0.1510, -0.1436, -0.0152, -0.5619, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 3


Tokens embeddings:
tensor([-0.0646,  0.0615, -0.1073, -0.3584, -0.0152, -0.1529, -0.2993, -0.0694,
        -0.3930, -0.1749, -0.1510, -0.1436, -0.0152, -0.5619, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0615, -0.1073, -0.3584, -0.0152, -0.1529, -0.2993, -0.0694,
        -0.3930, -0.1749, -0.1510, -0.1436, -0.0152, -0.5619, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 4


Tokens embeddings:
tensor([-0.0646,  0.0615, -0.1073, -0.3584, -0.0152, -0.1529, -0.2993, -0.0694,
        -0.3930, -0.1749, -0.1510, -0.1436, -0.0152, -0.5619, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0615, -0.1073, -0.3584, -0.0152, -0.1529, -0.2993, -0.0694,
        -0.3930, -0.1749, -0.1510, -0.1436, -0.0152, -0.5619, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 5


Tokens embeddings:
tensor([-0.0646,  0.0615, -0.1073, -0.3584, -0.0152, -0.1529, -0.2993, -0.0694,
        -0.3930, -0.1749, -0.1510, -0.1436, -0.0152, -0.5619, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0615, -0.1073, -0.3584, -0.0152, -0.1529, -0.2993, -0.0694,
        -0.3930, -0.1749, -0.1510, -0.1436, -0.0152, -0.5619, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 6


Tokens embeddings:
tensor([-0.0646,  0.0615, -0.1073, -0.3584, -0.0152, -0.1529, -0.2993, -0.0694,
        -0.3930, -0.1749, -0.1510, -0.1436, -0.0152, -0.5619, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0615, -0.1073, -0.3584, -0.0152, -0.1529, -0.2993, -0.0694,
        -0.3930, -0.1749, -0.1510, -0.1436, -0.0152, -0.5619, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 7


Tokens embeddings:
tensor([-0.0646,  0.0615, -0.1073, -0.3584, -0.0152, -0.1529, -0.2993, -0.0694,
        -0.3930, -0.1749, -0.1510, -0.1436, -0.0152, -0.5619, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0615, -0.1073, -0.3584, -0.0152, -0.1529, -0.2993, -0.0694,
        -0.3930, -0.1749, -0.1510, -0.1436, -0.0152, -0.5619, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 8


Tokens embeddings:
tensor([-0.0646,  0.0615, -0.1073, -0.3584, -0.0152, -0.1529, -0.2993, -0.0694,
        -0.3930, -0.1749, -0.1510, -0.1436, -0.0152, -0.5619, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0615, -0.1073, -0.3584, -0.0152, -0.1529, -0.2993, -0.0694,
        -0.3930, -0.1749, -0.1510, -0.1436, -0.0152, -0.5619, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 9


Tokens embeddings:
tensor([-0.0646,  0.0615, -0.1073, -0.3584, -0.0152, -0.1529, -0.2993, -0.0694,
        -0.3930, -0.1749, -0.1510, -0.1436, -0.0152, -0.5619, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0615, -0.1073, -0.3584, -0.0152, -0.1529, -0.2993, -0.0694,
        -0.3930, -0.1749, -0.1510, -0.1436, -0.0152, -0.5619, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 10


Tokens embeddings:
tensor([-0.0646,  0.0615, -0.1073, -0.3584, -0.0152, -0.1529, -0.2993, -0.0694,
        -0.3930, -0.1749, -0.1510, -0.1436, -0.0152, -0.5619, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0615, -0.1073, -0.3584, -0.0152, -0.1529, -0.2993, -0.0694,
        -0.3930, -0.1749, -0.1510, -0.1436, -0.0152, -0.5619, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 11


Tokens embeddings:
tensor([-0.0646,  0.0615, -0.1073, -0.3584, -0.0152, -0.1529, -0.2993, -0.0694,
        -0.3930, -0.1749, -0.1510, -0.1436, -0.0152, -0.5619, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0615, -0.1073, -0.3584, -0.0152, -0.1529, -0.2993, -0.0694,
        -0.3930, -0.1749, -0.1510, -0.1436, -0.0152, -0.5619, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 12


Tokens embeddings:
tensor([-0.0646,  0.0615, -0.1073, -0.3584, -0.0152, -0.1529, -0.2993, -0.0694,
        -0.3930, -0.1749, -0.1510, -0.1436, -0.0152, -0.5619, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0615, -0.1073, -0.3584, -0.0152, -0.1529, -0.2993, -0.0694,
        -0.3930, -0.1749, -0.1510, -0.1436, -0.0152, -0.5619, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437]): 1
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.4336,  0.2495, -0.0105,  0.0950,  0.8318, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.4336,  0.2495, -0.0105,  0.0950,  0.8318, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.4336,  0.2495, -0.0105,  0.0950,  0.8318, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.4336,  0.2495, -0.0105,  0.0950,  0.8318, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.4336,  0.2495, -0.0105,  0.0950,  0.8318, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.4336,  0.2495, -0.0105,  0.0950,  0.8318, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.4336,  0.2495, -0.0105,  0.0950,  0.8318, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.4336,  0.2495, -0.0105,  0.0950,  0.8318, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.4336,  0.2495, -0.0105,  0.0950,  0.8318, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.4336,  0.2495, -0.0105,  0.0950,  0.8318, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.4336,  0.2495, -0.0105,  0.0950,  0.8318, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.4336,  0.2495, -0.0105,  0.0950,  0.8318, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.4336,  0.2495, -0.0105,  0.0950,  0.8318, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.4336,  0.2495, -0.0105,  0.0950,  0.8318, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.4336,  0.2495, -0.0105,  0.0950,  0.8318, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.4336,  0.2495, -0.0105,  0.0950,  0.8318, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.4336,  0.2495, -0.0105,  0.0950,  0.8318, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.4336,  0.2495, -0.0105,  0.0950,  0.8318, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.4336,  0.2495, -0.0105,  0.0950,  0.8318, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.4336,  0.2495, -0.0105,  0.0950,  0.8318, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.4336,  0.2495, -0.0105,  0.0950,  0.8318, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.4336,  0.2495, -0.0105,  0.0950,  0.8318, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.4336,  0.2495, -0.0105,  0.0950,  0.8318, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.4336,  0.2495, -0.0105,  0.0950,  0.8318, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646,  0.0435, -0.2703, -0.0894, -0.3930,  0.4395, -0.6234, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0435, -0.2703, -0.0894, -0.3930,  0.4395, -0.6234, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 2


Tokens embeddings:
tensor([-0.0646,  0.0435, -0.2703, -0.0894, -0.3930,  0.4395, -0.6234, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0435, -0.2703, -0.0894, -0.3930,  0.4395, -0.6234, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 3


Tokens embeddings:
tensor([-0.0646,  0.0435, -0.2703, -0.0894, -0.3930,  0.4395, -0.6234, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0435, -0.2703, -0.0894, -0.3930,  0.4395, -0.6234, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 4


Tokens embeddings:
tensor([-0.0646,  0.0435, -0.2703, -0.0894, -0.3930,  0.4395, -0.6234, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0435, -0.2703, -0.0894, -0.3930,  0.4395, -0.6234, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 5


Tokens embeddings:
tensor([-0.0646,  0.0435, -0.2703, -0.0894, -0.3930,  0.4395, -0.6234, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0435, -0.2703, -0.0894, -0.3930,  0.4395, -0.6234, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 6


Tokens embeddings:
tensor([-0.0646,  0.0435, -0.2703, -0.0894, -0.3930,  0.4395, -0.6234, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0435, -0.2703, -0.0894, -0.3930,  0.4395, -0.6234, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 7


Tokens embeddings:
tensor([-0.0646,  0.0435, -0.2703, -0.0894, -0.3930,  0.4395, -0.6234, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0435, -0.2703, -0.0894, -0.3930,  0.4395, -0.6234, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 8


Tokens embeddings:
tensor([-0.0646,  0.0435, -0.2703, -0.0894, -0.3930,  0.4395, -0.6234, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0435, -0.2703, -0.0894, -0.3930,  0.4395, -0.6234, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 9


Tokens embeddings:
tensor([-0.0646,  0.0435, -0.2703, -0.0894, -0.3930,  0.4395, -0.6234, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0435, -0.2703, -0.0894, -0.3930,  0.4395, -0.6234, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 10


Tokens embeddings:
tensor([-0.0646,  0.0435, -0.2703, -0.0894, -0.3930,  0.4395, -0.6234, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0435, -0.2703, -0.0894, -0.3930,  0.4395, -0.6234, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 11


Tokens embeddings:
tensor([-0.0646,  0.0435, -0.2703, -0.0894, -0.3930,  0.4395, -0.6234, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0435, -0.2703, -0.0894, -0.3930,  0.4395, -0.6234, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 12


Tokens embeddings:
tensor([-0.0646,  0.0435, -0.2703, -0.0894, -0.3930,  0.4395, -0.6234, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0435, -0.2703, -0.0894, -0.3930,  0.4395, -0.6234, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646,  0.4753,  0.3848, -0.1177, -0.4961, -0.1125, -0.3930,  0.3940,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.4753,  0.3848, -0.1177, -0.4961, -0.1125, -0.3930,  0.3940,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 2


Tokens embeddings:
tensor([-0.0646,  0.4753,  0.3848, -0.1177, -0.4961, -0.1125, -0.3930,  0.3940,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.4753,  0.3848, -0.1177, -0.4961, -0.1125, -0.3930,  0.3940,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 3


Tokens embeddings:
tensor([-0.0646,  0.4753,  0.3848, -0.1177, -0.4961, -0.1125, -0.3930,  0.3940,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.4753,  0.3848, -0.1177, -0.4961, -0.1125, -0.3930,  0.3940,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 4


Tokens embeddings:
tensor([-0.0646,  0.4753,  0.3848, -0.1177, -0.4961, -0.1125, -0.3930,  0.3940,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.4753,  0.3848, -0.1177, -0.4961, -0.1125, -0.3930,  0.3940,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 5


Tokens embeddings:
tensor([-0.0646,  0.4753,  0.3848, -0.1177, -0.4961, -0.1125, -0.3930,  0.3940,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.4753,  0.3848, -0.1177, -0.4961, -0.1125, -0.3930,  0.3940,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 6


Tokens embeddings:
tensor([-0.0646,  0.4753,  0.3848, -0.1177, -0.4961, -0.1125, -0.3930,  0.3940,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.4753,  0.3848, -0.1177, -0.4961, -0.1125, -0.3930,  0.3940,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 7


Tokens embeddings:
tensor([-0.0646,  0.4753,  0.3848, -0.1177, -0.4961, -0.1125, -0.3930,  0.3940,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.4753,  0.3848, -0.1177, -0.4961, -0.1125, -0.3930,  0.3940,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 8


Tokens embeddings:
tensor([-0.0646,  0.4753,  0.3848, -0.1177, -0.4961, -0.1125, -0.3930,  0.3940,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.4753,  0.3848, -0.1177, -0.4961, -0.1125, -0.3930,  0.3940,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 9


Tokens embeddings:
tensor([-0.0646,  0.4753,  0.3848, -0.1177, -0.4961, -0.1125, -0.3930,  0.3940,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.4753,  0.3848, -0.1177, -0.4961, -0.1125, -0.3930,  0.3940,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 10


Tokens embeddings:
tensor([-0.0646,  0.4753,  0.3848, -0.1177, -0.4961, -0.1125, -0.3930,  0.3940,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.4753,  0.3848, -0.1177, -0.4961, -0.1125, -0.3930,  0.3940,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 11


Tokens embeddings:
tensor([-0.0646,  0.4753,  0.3848, -0.1177, -0.4961, -0.1125, -0.3930,  0.3940,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.4753,  0.3848, -0.1177, -0.4961, -0.1125, -0.3930,  0.3940,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 12


Tokens embeddings:
tensor([-0.0646,  0.4753,  0.3848, -0.1177, -0.4961, -0.1125, -0.3930,  0.3940,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.4753,  0.3848, -0.1177, -0.4961, -0.1125, -0.3930,  0.3940,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.2988,  0.0371,  0.3652, -0.2206,  0.1585, -0.0837,  0.6885,
        -0.2958,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716, -0.1181,  0.1585,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2988,  0.0371,  0.3652, -0.2206,  0.1585, -0.0837,  0.6885,
        -0.2958,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716, -0.1181,  0.1585,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437]): 7

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.2988,  0.0371,  0.3652, -0.2206,  0.1585, -0.0837,  0.6885,
        -0.2958,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716, -0.1181,  0.1585,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2988,  0.0371,  0.3652, -0.2206,  0.1585, -0.0837,  0.6885,
        -0.2958,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716, -0.1181,  0.1585,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437]): 7

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.2988,  0.0371,  0.3652, -0.2206,  0.1585, -0.0837,  0.6885,
        -0.2958,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716, -0.1181,  0.1585,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2988,  0.0371,  0.3652, -0.2206,  0.1585, -0.0837,  0.6885,
        -0.2958,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716, -0.1181,  0.1585,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437]): 7

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.2988,  0.0371,  0.3652, -0.2206,  0.1585, -0.0837,  0.6885,
        -0.2958,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716, -0.1181,  0.1585,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2988,  0.0371,  0.3652, -0.2206,  0.1585, -0.0837,  0.6885,
        -0.2958,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716, -0.1181,  0.1585,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437]): 7

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.2988,  0.0371,  0.3652, -0.2206,  0.1585, -0.0837,  0.6885,
        -0.2958,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716, -0.1181,  0.1585,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2988,  0.0371,  0.3652, -0.2206,  0.1585, -0.0837,  0.6885,
        -0.2958,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716, -0.1181,  0.1585,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437]): 7

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.2988,  0.0371,  0.3652, -0.2206,  0.1585, -0.0837,  0.6885,
        -0.2958,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716, -0.1181,  0.1585,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2988,  0.0371,  0.3652, -0.2206,  0.1585, -0.0837,  0.6885,
        -0.2958,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716, -0.1181,  0.1585,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437]): 7

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.2988,  0.0371,  0.3652, -0.2206,  0.1585, -0.0837,  0.6885,
        -0.2958,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716, -0.1181,  0.1585,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2988,  0.0371,  0.3652, -0.2206,  0.1585, -0.0837,  0.6885,
        -0.2958,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716, -0.1181,  0.1585,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437]): 7

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.2988,  0.0371,  0.3652, -0.2206,  0.1585, -0.0837,  0.6885,
        -0.2958,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716, -0.1181,  0.1585,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2988,  0.0371,  0.3652, -0.2206,  0.1585, -0.0837,  0.6885,
        -0.2958,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716, -0.1181,  0.1585,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437]): 7

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.2988,  0.0371,  0.3652, -0.2206,  0.1585, -0.0837,  0.6885,
        -0.2958,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716, -0.1181,  0.1585,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2988,  0.0371,  0.3652, -0.2206,  0.1585, -0.0837,  0.6885,
        -0.2958,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716, -0.1181,  0.1585,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437]): 7

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.2988,  0.0371,  0.3652, -0.2206,  0.1585, -0.0837,  0.6885,
        -0.2958,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716, -0.1181,  0.1585,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2988,  0.0371,  0.3652, -0.2206,  0.1585, -0.0837,  0.6885,
        -0.2958,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716, -0.1181,  0.1585,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437]): 7

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.2988,  0.0371,  0.3652, -0.2206,  0.1585, -0.0837,  0.6885,
        -0.2958,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716, -0.1181,  0.1585,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2988,  0.0371,  0.3652, -0.2206,  0.1585, -0.0837,  0.6885,
        -0.2958,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716, -0.1181,  0.1585,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437]): 7

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.2988,  0.0371,  0.3652, -0.2206,  0.1585, -0.0837,  0.6885,
        -0.2958,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716, -0.1181,  0.1585,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2988,  0.0371,  0.3652, -0.2206,  0.1585, -0.0837,  0.6885,
        -0.2958,  0.2457,  0.1637, -0.4050, -0.0235, -0.1716, -0.1181,  0.1585,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-6.4571e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00, -2.2055e-01,  1.0034e-01, -6.2328e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00, -2.2055e-01,  1.0034e-01, -6.2328e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 2


Tokens embeddings:
tensor([-6.4571e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00, -2.2055e-01,  1.0034e-01, -6.2328e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00, -2.2055e-01,  1.0034e-01, -6.2328e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 3


Tokens embeddings:
tensor([-6.4571e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00, -2.2055e-01,  1.0034e-01, -6.2328e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00, -2.2055e-01,  1.0034e-01, -6.2328e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 4


Tokens embeddings:
tensor([-6.4571e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00, -2.2055e-01,  1.0034e-01, -6.2328e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00, -2.2055e-01,  1.0034e-01, -6.2328e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 5


Tokens embeddings:
tensor([-6.4571e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00, -2.2055e-01,  1.0034e-01, -6.2328e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00, -2.2055e-01,  1.0034e-01, -6.2328e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 6


Tokens embeddings:
tensor([-6.4571e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00, -2.2055e-01,  1.0034e-01, -6.2328e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00, -2.2055e-01,  1.0034e-01, -6.2328e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 7


Tokens embeddings:
tensor([-6.4571e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00, -2.2055e-01,  1.0034e-01, -6.2328e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00, -2.2055e-01,  1.0034e-01, -6.2328e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 8


Tokens embeddings:
tensor([-6.4571e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00, -2.2055e-01,  1.0034e-01, -6.2328e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00, -2.2055e-01,  1.0034e-01, -6.2328e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 9


Tokens embeddings:
tensor([-6.4571e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00, -2.2055e-01,  1.0034e-01, -6.2328e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00, -2.2055e-01,  1.0034e-01, -6.2328e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 10


Tokens embeddings:
tensor([-6.4571e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00, -2.2055e-01,  1.0034e-01, -6.2328e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00, -2.2055e-01,  1.0034e-01, -6.2328e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 11


Tokens embeddings:
tensor([-6.4571e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00, -2.2055e-01,  1.0034e-01, -6.2328e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00, -2.2055e-01,  1.0034e-01, -6.2328e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 12


Tokens embeddings:
tensor([-6.4571e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00, -2.2055e-01,  1.0034e-01, -6.2328e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  8.0882e-02,  3.3164e-01, -6.3929e-01,  4.0226e-01,
         1.0023e+00, -2.2055e-01,  1.0034e-01, -6.2328e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.4032,  0.8265, -0.1177, -0.3930,  0.3208,  0.2096, -0.1133,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032,  0.8265, -0.1177, -0.3930,  0.3208,  0.2096, -0.1133,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.4032,  0.8265, -0.1177, -0.3930,  0.3208,  0.2096, -0.1133,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032,  0.8265, -0.1177, -0.3930,  0.3208,  0.2096, -0.1133,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.4032,  0.8265, -0.1177, -0.3930,  0.3208,  0.2096, -0.1133,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032,  0.8265, -0.1177, -0.3930,  0.3208,  0.2096, -0.1133,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.4032,  0.8265, -0.1177, -0.3930,  0.3208,  0.2096, -0.1133,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032,  0.8265, -0.1177, -0.3930,  0.3208,  0.2096, -0.1133,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.4032,  0.8265, -0.1177, -0.3930,  0.3208,  0.2096, -0.1133,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032,  0.8265, -0.1177, -0.3930,  0.3208,  0.2096, -0.1133,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.4032,  0.8265, -0.1177, -0.3930,  0.3208,  0.2096, -0.1133,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032,  0.8265, -0.1177, -0.3930,  0.3208,  0.2096, -0.1133,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.4032,  0.8265, -0.1177, -0.3930,  0.3208,  0.2096, -0.1133,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032,  0.8265, -0.1177, -0.3930,  0.3208,  0.2096, -0.1133,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.4032,  0.8265, -0.1177, -0.3930,  0.3208,  0.2096, -0.1133,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032,  0.8265, -0.1177, -0.3930,  0.3208,  0.2096, -0.1133,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.4032,  0.8265, -0.1177, -0.3930,  0.3208,  0.2096, -0.1133,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032,  0.8265, -0.1177, -0.3930,  0.3208,  0.2096, -0.1133,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.4032,  0.8265, -0.1177, -0.3930,  0.3208,  0.2096, -0.1133,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032,  0.8265, -0.1177, -0.3930,  0.3208,  0.2096, -0.1133,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.4032,  0.8265, -0.1177, -0.3930,  0.3208,  0.2096, -0.1133,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032,  0.8265, -0.1177, -0.3930,  0.3208,  0.2096, -0.1133,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.4032,  0.8265, -0.1177, -0.3930,  0.3208,  0.2096, -0.1133,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4032,  0.8265, -0.1177, -0.3930,  0.3208,  0.2096, -0.1133,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646,  0.0745, -0.1825,  0.2939, -0.1177, -0.4987, -0.2762, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0745, -0.1825,  0.2939, -0.1177, -0.4987, -0.2762, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 10

Layer 2


Tokens embeddings:
tensor([-0.0646,  0.0745, -0.1825,  0.2939, -0.1177, -0.4987, -0.2762, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0745, -0.1825,  0.2939, -0.1177, -0.4987, -0.2762, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 10

Layer 3


Tokens embeddings:
tensor([-0.0646,  0.0745, -0.1825,  0.2939, -0.1177, -0.4987, -0.2762, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0745, -0.1825,  0.2939, -0.1177, -0.4987, -0.2762, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 10

Layer 4


Tokens embeddings:
tensor([-0.0646,  0.0745, -0.1825,  0.2939, -0.1177, -0.4987, -0.2762, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0745, -0.1825,  0.2939, -0.1177, -0.4987, -0.2762, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 10

Layer 5


Tokens embeddings:
tensor([-0.0646,  0.0745, -0.1825,  0.2939, -0.1177, -0.4987, -0.2762, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0745, -0.1825,  0.2939, -0.1177, -0.4987, -0.2762, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 10

Layer 6


Tokens embeddings:
tensor([-0.0646,  0.0745, -0.1825,  0.2939, -0.1177, -0.4987, -0.2762, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0745, -0.1825,  0.2939, -0.1177, -0.4987, -0.2762, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 10

Layer 7


Tokens embeddings:
tensor([-0.0646,  0.0745, -0.1825,  0.2939, -0.1177, -0.4987, -0.2762, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0745, -0.1825,  0.2939, -0.1177, -0.4987, -0.2762, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 10

Layer 8


Tokens embeddings:
tensor([-0.0646,  0.0745, -0.1825,  0.2939, -0.1177, -0.4987, -0.2762, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0745, -0.1825,  0.2939, -0.1177, -0.4987, -0.2762, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 10

Layer 9


Tokens embeddings:
tensor([-0.0646,  0.0745, -0.1825,  0.2939, -0.1177, -0.4987, -0.2762, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0745, -0.1825,  0.2939, -0.1177, -0.4987, -0.2762, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 10

Layer 10


Tokens embeddings:
tensor([-0.0646,  0.0745, -0.1825,  0.2939, -0.1177, -0.4987, -0.2762, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0745, -0.1825,  0.2939, -0.1177, -0.4987, -0.2762, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 10

Layer 11


Tokens embeddings:
tensor([-0.0646,  0.0745, -0.1825,  0.2939, -0.1177, -0.4987, -0.2762, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0745, -0.1825,  0.2939, -0.1177, -0.4987, -0.2762, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 10

Layer 12


Tokens embeddings:
tensor([-0.0646,  0.0745, -0.1825,  0.2939, -0.1177, -0.4987, -0.2762, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0745, -0.1825,  0.2939, -0.1177, -0.4987, -0.2762, -0.1697,
        -0.0152,  0.0887,  0.8786, -0.4190, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 10
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-6.4571e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01, -9.7911e-01,
         9.4951e-02,  2.6842e-01, -1.5160e-02, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01, -9.7911e-01,
         9.4951e-02,  2.6842e-01, -1.5160e-02, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 2


Tokens embeddings:
tensor([-6.4571e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01, -9.7911e-01,
         9.4951e-02,  2.6842e-01, -1.5160e-02, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01, -9.7911e-01,
         9.4951e-02,  2.6842e-01, -1.5160e-02, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 3


Tokens embeddings:
tensor([-6.4571e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01, -9.7911e-01,
         9.4951e-02,  2.6842e-01, -1.5160e-02, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01, -9.7911e-01,
         9.4951e-02,  2.6842e-01, -1.5160e-02, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 4


Tokens embeddings:
tensor([-6.4571e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01, -9.7911e-01,
         9.4951e-02,  2.6842e-01, -1.5160e-02, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01, -9.7911e-01,
         9.4951e-02,  2.6842e-01, -1.5160e-02, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 5


Tokens embeddings:
tensor([-6.4571e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01, -9.7911e-01,
         9.4951e-02,  2.6842e-01, -1.5160e-02, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01, -9.7911e-01,
         9.4951e-02,  2.6842e-01, -1.5160e-02, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 6


Tokens embeddings:
tensor([-6.4571e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01, -9.7911e-01,
         9.4951e-02,  2.6842e-01, -1.5160e-02, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01, -9.7911e-01,
         9.4951e-02,  2.6842e-01, -1.5160e-02, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 7


Tokens embeddings:
tensor([-6.4571e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01, -9.7911e-01,
         9.4951e-02,  2.6842e-01, -1.5160e-02, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01, -9.7911e-01,
         9.4951e-02,  2.6842e-01, -1.5160e-02, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 8


Tokens embeddings:
tensor([-6.4571e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01, -9.7911e-01,
         9.4951e-02,  2.6842e-01, -1.5160e-02, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01, -9.7911e-01,
         9.4951e-02,  2.6842e-01, -1.5160e-02, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 9


Tokens embeddings:
tensor([-6.4571e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01, -9.7911e-01,
         9.4951e-02,  2.6842e-01, -1.5160e-02, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01, -9.7911e-01,
         9.4951e-02,  2.6842e-01, -1.5160e-02, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 10


Tokens embeddings:
tensor([-6.4571e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01, -9.7911e-01,
         9.4951e-02,  2.6842e-01, -1.5160e-02, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01, -9.7911e-01,
         9.4951e-02,  2.6842e-01, -1.5160e-02, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 11


Tokens embeddings:
tensor([-6.4571e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01, -9.7911e-01,
         9.4951e-02,  2.6842e-01, -1.5160e-02, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01, -9.7911e-01,
         9.4951e-02,  2.6842e-01, -1.5160e-02, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 12


Tokens embeddings:
tensor([-6.4571e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01, -9.7911e-01,
         9.4951e-02,  2.6842e-01, -1.5160e-02, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -4.0320e-01,  6.6011e-01, -1.1768e-01, -9.7911e-01,
         9.4951e-02,  2.6842e-01, -1.5160e-02, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
        -1.0534, -0.4650, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
        -1.0534, -0.4650, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
        -1.0534, -0.4650, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
        -1.0534, -0.4650, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
        -1.0534, -0.4650, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
        -1.0534, -0.4650, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
        -1.0534, -0.4650, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
        -1.0534, -0.4650, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
        -1.0534, -0.4650, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
        -1.0534, -0.4650, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
        -1.0534, -0.4650, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
        -1.0534, -0.4650, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
        -1.0534, -0.4650, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
        -1.0534, -0.4650, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
        -1.0534, -0.4650, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
        -1.0534, -0.4650, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
        -1.0534, -0.4650, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
        -1.0534, -0.4650, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
        -1.0534, -0.4650, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
        -1.0534, -0.4650, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
        -1.0534, -0.4650, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
        -1.0534, -0.4650, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
        -1.0534, -0.4650, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.6494,  0.1329, -0.0392,  0.0887, -0.1650,  0.8071,  0.1637,
        -1.0534, -0.4650, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 6
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -6.4283e-02,  8.3658e-01,  3.1421e-02,
        -1.2282e-01,  1.9830e-01, -2.4185e-01, -1.3299e-01, -1.9737e-01,
        -3.9304e-01,  3.1362e-01, -1.4357e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -6.4283e-02,  8.3658e-01,  3.1421e-02,
        -1.2282e-01,  1.9830e-01, -2.4185e-01, -1.3299e-01, -1.9737e-01,
        -3.9304e-01,  3.1362e-01, -1.4357e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 14

Layer 2


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -6.4283e-02,  8.3658e-01,  3.1421e-02,
        -1.2282e-01,  1.9830e-01, -2.4185e-01, -1.3299e-01, -1.9737e-01,
        -3.9304e-01,  3.1362e-01, -1.4357e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -6.4283e-02,  8.3658e-01,  3.1421e-02,
        -1.2282e-01,  1.9830e-01, -2.4185e-01, -1.3299e-01, -1.9737e-01,
        -3.9304e-01,  3.1362e-01, -1.4357e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 14

Layer 3


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -6.4283e-02,  8.3658e-01,  3.1421e-02,
        -1.2282e-01,  1.9830e-01, -2.4185e-01, -1.3299e-01, -1.9737e-01,
        -3.9304e-01,  3.1362e-01, -1.4357e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -6.4283e-02,  8.3658e-01,  3.1421e-02,
        -1.2282e-01,  1.9830e-01, -2.4185e-01, -1.3299e-01, -1.9737e-01,
        -3.9304e-01,  3.1362e-01, -1.4357e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 14

Layer 4


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -6.4283e-02,  8.3658e-01,  3.1421e-02,
        -1.2282e-01,  1.9830e-01, -2.4185e-01, -1.3299e-01, -1.9737e-01,
        -3.9304e-01,  3.1362e-01, -1.4357e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -6.4283e-02,  8.3658e-01,  3.1421e-02,
        -1.2282e-01,  1.9830e-01, -2.4185e-01, -1.3299e-01, -1.9737e-01,
        -3.9304e-01,  3.1362e-01, -1.4357e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 14

Layer 5


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -6.4283e-02,  8.3658e-01,  3.1421e-02,
        -1.2282e-01,  1.9830e-01, -2.4185e-01, -1.3299e-01, -1.9737e-01,
        -3.9304e-01,  3.1362e-01, -1.4357e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -6.4283e-02,  8.3658e-01,  3.1421e-02,
        -1.2282e-01,  1.9830e-01, -2.4185e-01, -1.3299e-01, -1.9737e-01,
        -3.9304e-01,  3.1362e-01, -1.4357e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 14

Layer 6


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -6.4283e-02,  8.3658e-01,  3.1421e-02,
        -1.2282e-01,  1.9830e-01, -2.4185e-01, -1.3299e-01, -1.9737e-01,
        -3.9304e-01,  3.1362e-01, -1.4357e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -6.4283e-02,  8.3658e-01,  3.1421e-02,
        -1.2282e-01,  1.9830e-01, -2.4185e-01, -1.3299e-01, -1.9737e-01,
        -3.9304e-01,  3.1362e-01, -1.4357e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 14

Layer 7


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -6.4283e-02,  8.3658e-01,  3.1421e-02,
        -1.2282e-01,  1.9830e-01, -2.4185e-01, -1.3299e-01, -1.9737e-01,
        -3.9304e-01,  3.1362e-01, -1.4357e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -6.4283e-02,  8.3658e-01,  3.1421e-02,
        -1.2282e-01,  1.9830e-01, -2.4185e-01, -1.3299e-01, -1.9737e-01,
        -3.9304e-01,  3.1362e-01, -1.4357e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 14

Layer 8


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -6.4283e-02,  8.3658e-01,  3.1421e-02,
        -1.2282e-01,  1.9830e-01, -2.4185e-01, -1.3299e-01, -1.9737e-01,
        -3.9304e-01,  3.1362e-01, -1.4357e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -6.4283e-02,  8.3658e-01,  3.1421e-02,
        -1.2282e-01,  1.9830e-01, -2.4185e-01, -1.3299e-01, -1.9737e-01,
        -3.9304e-01,  3.1362e-01, -1.4357e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 14

Layer 9


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -6.4283e-02,  8.3658e-01,  3.1421e-02,
        -1.2282e-01,  1.9830e-01, -2.4185e-01, -1.3299e-01, -1.9737e-01,
        -3.9304e-01,  3.1362e-01, -1.4357e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -6.4283e-02,  8.3658e-01,  3.1421e-02,
        -1.2282e-01,  1.9830e-01, -2.4185e-01, -1.3299e-01, -1.9737e-01,
        -3.9304e-01,  3.1362e-01, -1.4357e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 14

Layer 10


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -6.4283e-02,  8.3658e-01,  3.1421e-02,
        -1.2282e-01,  1.9830e-01, -2.4185e-01, -1.3299e-01, -1.9737e-01,
        -3.9304e-01,  3.1362e-01, -1.4357e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -6.4283e-02,  8.3658e-01,  3.1421e-02,
        -1.2282e-01,  1.9830e-01, -2.4185e-01, -1.3299e-01, -1.9737e-01,
        -3.9304e-01,  3.1362e-01, -1.4357e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 14

Layer 11


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -6.4283e-02,  8.3658e-01,  3.1421e-02,
        -1.2282e-01,  1.9830e-01, -2.4185e-01, -1.3299e-01, -1.9737e-01,
        -3.9304e-01,  3.1362e-01, -1.4357e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -6.4283e-02,  8.3658e-01,  3.1421e-02,
        -1.2282e-01,  1.9830e-01, -2.4185e-01, -1.3299e-01, -1.9737e-01,
        -3.9304e-01,  3.1362e-01, -1.4357e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 14

Layer 12


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -6.4283e-02,  8.3658e-01,  3.1421e-02,
        -1.2282e-01,  1.9830e-01, -2.4185e-01, -1.3299e-01, -1.9737e-01,
        -3.9304e-01,  3.1362e-01, -1.4357e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -6.4283e-02,  8.3658e-01,  3.1421e-02,
        -1.2282e-01,  1.9830e-01, -2.4185e-01, -1.3299e-01, -1.9737e-01,
        -3.9304e-01,  3.1362e-01, -1.4357e-01, -1.5160e-02,  9.9638e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 14
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.4739,  0.1736, -0.1410, -0.1177, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4739,  0.1736, -0.1410, -0.1177, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 10

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.4739,  0.1736, -0.1410, -0.1177, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4739,  0.1736, -0.1410, -0.1177, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 10

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.4739,  0.1736, -0.1410, -0.1177, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4739,  0.1736, -0.1410, -0.1177, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 10

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.4739,  0.1736, -0.1410, -0.1177, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4739,  0.1736, -0.1410, -0.1177, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 10

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.4739,  0.1736, -0.1410, -0.1177, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4739,  0.1736, -0.1410, -0.1177, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 10

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.4739,  0.1736, -0.1410, -0.1177, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4739,  0.1736, -0.1410, -0.1177, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 10

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.4739,  0.1736, -0.1410, -0.1177, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4739,  0.1736, -0.1410, -0.1177, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 10

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.4739,  0.1736, -0.1410, -0.1177, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4739,  0.1736, -0.1410, -0.1177, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 10

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.4739,  0.1736, -0.1410, -0.1177, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4739,  0.1736, -0.1410, -0.1177, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 10

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.4739,  0.1736, -0.1410, -0.1177, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4739,  0.1736, -0.1410, -0.1177, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 10

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.4739,  0.1736, -0.1410, -0.1177, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4739,  0.1736, -0.1410, -0.1177, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 10

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.4739,  0.1736, -0.1410, -0.1177, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4739,  0.1736, -0.1410, -0.1177, -0.2852,  0.1329,  0.0371,
         0.0923, -0.1479,  0.2380, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 10
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646,  0.2021,  0.0998, -0.1177, -0.2986, -0.3240, -0.3930, -0.8604,
        -0.2035, -0.3730, -0.1261, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.2021,  0.0998, -0.1177, -0.2986, -0.3240, -0.3930, -0.8604,
        -0.2035, -0.3730, -0.1261, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 2


Tokens embeddings:
tensor([-0.0646,  0.2021,  0.0998, -0.1177, -0.2986, -0.3240, -0.3930, -0.8604,
        -0.2035, -0.3730, -0.1261, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.2021,  0.0998, -0.1177, -0.2986, -0.3240, -0.3930, -0.8604,
        -0.2035, -0.3730, -0.1261, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 3


Tokens embeddings:
tensor([-0.0646,  0.2021,  0.0998, -0.1177, -0.2986, -0.3240, -0.3930, -0.8604,
        -0.2035, -0.3730, -0.1261, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.2021,  0.0998, -0.1177, -0.2986, -0.3240, -0.3930, -0.8604,
        -0.2035, -0.3730, -0.1261, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 4


Tokens embeddings:
tensor([-0.0646,  0.2021,  0.0998, -0.1177, -0.2986, -0.3240, -0.3930, -0.8604,
        -0.2035, -0.3730, -0.1261, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.2021,  0.0998, -0.1177, -0.2986, -0.3240, -0.3930, -0.8604,
        -0.2035, -0.3730, -0.1261, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 5


Tokens embeddings:
tensor([-0.0646,  0.2021,  0.0998, -0.1177, -0.2986, -0.3240, -0.3930, -0.8604,
        -0.2035, -0.3730, -0.1261, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.2021,  0.0998, -0.1177, -0.2986, -0.3240, -0.3930, -0.8604,
        -0.2035, -0.3730, -0.1261, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 6


Tokens embeddings:
tensor([-0.0646,  0.2021,  0.0998, -0.1177, -0.2986, -0.3240, -0.3930, -0.8604,
        -0.2035, -0.3730, -0.1261, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.2021,  0.0998, -0.1177, -0.2986, -0.3240, -0.3930, -0.8604,
        -0.2035, -0.3730, -0.1261, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 7


Tokens embeddings:
tensor([-0.0646,  0.2021,  0.0998, -0.1177, -0.2986, -0.3240, -0.3930, -0.8604,
        -0.2035, -0.3730, -0.1261, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.2021,  0.0998, -0.1177, -0.2986, -0.3240, -0.3930, -0.8604,
        -0.2035, -0.3730, -0.1261, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 8


Tokens embeddings:
tensor([-0.0646,  0.2021,  0.0998, -0.1177, -0.2986, -0.3240, -0.3930, -0.8604,
        -0.2035, -0.3730, -0.1261, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.2021,  0.0998, -0.1177, -0.2986, -0.3240, -0.3930, -0.8604,
        -0.2035, -0.3730, -0.1261, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 9


Tokens embeddings:
tensor([-0.0646,  0.2021,  0.0998, -0.1177, -0.2986, -0.3240, -0.3930, -0.8604,
        -0.2035, -0.3730, -0.1261, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.2021,  0.0998, -0.1177, -0.2986, -0.3240, -0.3930, -0.8604,
        -0.2035, -0.3730, -0.1261, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 10


Tokens embeddings:
tensor([-0.0646,  0.2021,  0.0998, -0.1177, -0.2986, -0.3240, -0.3930, -0.8604,
        -0.2035, -0.3730, -0.1261, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.2021,  0.0998, -0.1177, -0.2986, -0.3240, -0.3930, -0.8604,
        -0.2035, -0.3730, -0.1261, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 11


Tokens embeddings:
tensor([-0.0646,  0.2021,  0.0998, -0.1177, -0.2986, -0.3240, -0.3930, -0.8604,
        -0.2035, -0.3730, -0.1261, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.2021,  0.0998, -0.1177, -0.2986, -0.3240, -0.3930, -0.8604,
        -0.2035, -0.3730, -0.1261, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 12


Tokens embeddings:
tensor([-0.0646,  0.2021,  0.0998, -0.1177, -0.2986, -0.3240, -0.3930, -0.8604,
        -0.2035, -0.3730, -0.1261, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.2021,  0.0998, -0.1177, -0.2986, -0.3240, -0.3930, -0.8604,
        -0.2035, -0.3730, -0.1261, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1266,  0.0314, -0.1228, -0.0340, -0.1302, -0.5874,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1266,  0.0314, -0.1228, -0.0340, -0.1302, -0.5874,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1266,  0.0314, -0.1228, -0.0340, -0.1302, -0.5874,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1266,  0.0314, -0.1228, -0.0340, -0.1302, -0.5874,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1266,  0.0314, -0.1228, -0.0340, -0.1302, -0.5874,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1266,  0.0314, -0.1228, -0.0340, -0.1302, -0.5874,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1266,  0.0314, -0.1228, -0.0340, -0.1302, -0.5874,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1266,  0.0314, -0.1228, -0.0340, -0.1302, -0.5874,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1266,  0.0314, -0.1228, -0.0340, -0.1302, -0.5874,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1266,  0.0314, -0.1228, -0.0340, -0.1302, -0.5874,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1266,  0.0314, -0.1228, -0.0340, -0.1302, -0.5874,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1266,  0.0314, -0.1228, -0.0340, -0.1302, -0.5874,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1266,  0.0314, -0.1228, -0.0340, -0.1302, -0.5874,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1266,  0.0314, -0.1228, -0.0340, -0.1302, -0.5874,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1266,  0.0314, -0.1228, -0.0340, -0.1302, -0.5874,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1266,  0.0314, -0.1228, -0.0340, -0.1302, -0.5874,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1266,  0.0314, -0.1228, -0.0340, -0.1302, -0.5874,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1266,  0.0314, -0.1228, -0.0340, -0.1302, -0.5874,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1266,  0.0314, -0.1228, -0.0340, -0.1302, -0.5874,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1266,  0.0314, -0.1228, -0.0340, -0.1302, -0.5874,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1266,  0.0314, -0.1228, -0.0340, -0.1302, -0.5874,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1266,  0.0314, -0.1228, -0.0340, -0.1302, -0.5874,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1266,  0.0314, -0.1228, -0.0340, -0.1302, -0.5874,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1266,  0.0314, -0.1228, -0.0340, -0.1302, -0.5874,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930, -0.1963, -0.0848,
         0.2972,  0.1231, -0.3930, -0.0018, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930, -0.1963, -0.0848,
         0.2972,  0.1231, -0.3930, -0.0018, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930, -0.1963, -0.0848,
         0.2972,  0.1231, -0.3930, -0.0018, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930, -0.1963, -0.0848,
         0.2972,  0.1231, -0.3930, -0.0018, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930, -0.1963, -0.0848,
         0.2972,  0.1231, -0.3930, -0.0018, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930, -0.1963, -0.0848,
         0.2972,  0.1231, -0.3930, -0.0018, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930, -0.1963, -0.0848,
         0.2972,  0.1231, -0.3930, -0.0018, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930, -0.1963, -0.0848,
         0.2972,  0.1231, -0.3930, -0.0018, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930, -0.1963, -0.0848,
         0.2972,  0.1231, -0.3930, -0.0018, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930, -0.1963, -0.0848,
         0.2972,  0.1231, -0.3930, -0.0018, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930, -0.1963, -0.0848,
         0.2972,  0.1231, -0.3930, -0.0018, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930, -0.1963, -0.0848,
         0.2972,  0.1231, -0.3930, -0.0018, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930, -0.1963, -0.0848,
         0.2972,  0.1231, -0.3930, -0.0018, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930, -0.1963, -0.0848,
         0.2972,  0.1231, -0.3930, -0.0018, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930, -0.1963, -0.0848,
         0.2972,  0.1231, -0.3930, -0.0018, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930, -0.1963, -0.0848,
         0.2972,  0.1231, -0.3930, -0.0018, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930, -0.1963, -0.0848,
         0.2972,  0.1231, -0.3930, -0.0018, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930, -0.1963, -0.0848,
         0.2972,  0.1231, -0.3930, -0.0018, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930, -0.1963, -0.0848,
         0.2972,  0.1231, -0.3930, -0.0018, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930, -0.1963, -0.0848,
         0.2972,  0.1231, -0.3930, -0.0018, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930, -0.1963, -0.0848,
         0.2972,  0.1231, -0.3930, -0.0018, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930, -0.1963, -0.0848,
         0.2972,  0.1231, -0.3930, -0.0018, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930, -0.1963, -0.0848,
         0.2972,  0.1231, -0.3930, -0.0018, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.9171, -0.1289,  0.3412, -0.3930, -0.1963, -0.0848,
         0.2972,  0.1231, -0.3930, -0.0018, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.4707, -0.1905, -0.6609, -0.1100,  0.5098,  0.1231, -0.3930,
        -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4707, -0.1905, -0.6609, -0.1100,  0.5098,  0.1231, -0.3930,
        -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.4707, -0.1905, -0.6609, -0.1100,  0.5098,  0.1231, -0.3930,
        -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4707, -0.1905, -0.6609, -0.1100,  0.5098,  0.1231, -0.3930,
        -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.4707, -0.1905, -0.6609, -0.1100,  0.5098,  0.1231, -0.3930,
        -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4707, -0.1905, -0.6609, -0.1100,  0.5098,  0.1231, -0.3930,
        -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.4707, -0.1905, -0.6609, -0.1100,  0.5098,  0.1231, -0.3930,
        -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4707, -0.1905, -0.6609, -0.1100,  0.5098,  0.1231, -0.3930,
        -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.4707, -0.1905, -0.6609, -0.1100,  0.5098,  0.1231, -0.3930,
        -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4707, -0.1905, -0.6609, -0.1100,  0.5098,  0.1231, -0.3930,
        -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.4707, -0.1905, -0.6609, -0.1100,  0.5098,  0.1231, -0.3930,
        -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4707, -0.1905, -0.6609, -0.1100,  0.5098,  0.1231, -0.3930,
        -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.4707, -0.1905, -0.6609, -0.1100,  0.5098,  0.1231, -0.3930,
        -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4707, -0.1905, -0.6609, -0.1100,  0.5098,  0.1231, -0.3930,
        -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.4707, -0.1905, -0.6609, -0.1100,  0.5098,  0.1231, -0.3930,
        -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4707, -0.1905, -0.6609, -0.1100,  0.5098,  0.1231, -0.3930,
        -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.4707, -0.1905, -0.6609, -0.1100,  0.5098,  0.1231, -0.3930,
        -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4707, -0.1905, -0.6609, -0.1100,  0.5098,  0.1231, -0.3930,
        -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.4707, -0.1905, -0.6609, -0.1100,  0.5098,  0.1231, -0.3930,
        -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4707, -0.1905, -0.6609, -0.1100,  0.5098,  0.1231, -0.3930,
        -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.4707, -0.1905, -0.6609, -0.1100,  0.5098,  0.1231, -0.3930,
        -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4707, -0.1905, -0.6609, -0.1100,  0.5098,  0.1231, -0.3930,
        -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.4707, -0.1905, -0.6609, -0.1100,  0.5098,  0.1231, -0.3930,
        -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4707, -0.1905, -0.6609, -0.1100,  0.5098,  0.1231, -0.3930,
        -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646,  0.1727, -0.1302, -0.4804, -0.1436, -0.0152,  0.1075, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.1727, -0.1302, -0.4804, -0.1436, -0.0152,  0.1075, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 2


Tokens embeddings:
tensor([-0.0646,  0.1727, -0.1302, -0.4804, -0.1436, -0.0152,  0.1075, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.1727, -0.1302, -0.4804, -0.1436, -0.0152,  0.1075, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 3


Tokens embeddings:
tensor([-0.0646,  0.1727, -0.1302, -0.4804, -0.1436, -0.0152,  0.1075, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.1727, -0.1302, -0.4804, -0.1436, -0.0152,  0.1075, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 4


Tokens embeddings:
tensor([-0.0646,  0.1727, -0.1302, -0.4804, -0.1436, -0.0152,  0.1075, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.1727, -0.1302, -0.4804, -0.1436, -0.0152,  0.1075, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 5


Tokens embeddings:
tensor([-0.0646,  0.1727, -0.1302, -0.4804, -0.1436, -0.0152,  0.1075, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.1727, -0.1302, -0.4804, -0.1436, -0.0152,  0.1075, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 6


Tokens embeddings:
tensor([-0.0646,  0.1727, -0.1302, -0.4804, -0.1436, -0.0152,  0.1075, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.1727, -0.1302, -0.4804, -0.1436, -0.0152,  0.1075, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 7


Tokens embeddings:
tensor([-0.0646,  0.1727, -0.1302, -0.4804, -0.1436, -0.0152,  0.1075, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.1727, -0.1302, -0.4804, -0.1436, -0.0152,  0.1075, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 8


Tokens embeddings:
tensor([-0.0646,  0.1727, -0.1302, -0.4804, -0.1436, -0.0152,  0.1075, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.1727, -0.1302, -0.4804, -0.1436, -0.0152,  0.1075, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 9


Tokens embeddings:
tensor([-0.0646,  0.1727, -0.1302, -0.4804, -0.1436, -0.0152,  0.1075, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.1727, -0.1302, -0.4804, -0.1436, -0.0152,  0.1075, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 10


Tokens embeddings:
tensor([-0.0646,  0.1727, -0.1302, -0.4804, -0.1436, -0.0152,  0.1075, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.1727, -0.1302, -0.4804, -0.1436, -0.0152,  0.1075, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 11


Tokens embeddings:
tensor([-0.0646,  0.1727, -0.1302, -0.4804, -0.1436, -0.0152,  0.1075, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.1727, -0.1302, -0.4804, -0.1436, -0.0152,  0.1075, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 12


Tokens embeddings:
tensor([-0.0646,  0.1727, -0.1302, -0.4804, -0.1436, -0.0152,  0.1075, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.1727, -0.1302, -0.4804, -0.1436, -0.0152,  0.1075, -0.0643,
        -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.0265, -0.1201, -0.1094,  0.1231, -0.3930,  0.0612, -0.3079,
         0.6591, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0265, -0.1201, -0.1094,  0.1231, -0.3930,  0.0612, -0.3079,
         0.6591, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 8

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.0265, -0.1201, -0.1094,  0.1231, -0.3930,  0.0612, -0.3079,
         0.6591, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0265, -0.1201, -0.1094,  0.1231, -0.3930,  0.0612, -0.3079,
         0.6591, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 8

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.0265, -0.1201, -0.1094,  0.1231, -0.3930,  0.0612, -0.3079,
         0.6591, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0265, -0.1201, -0.1094,  0.1231, -0.3930,  0.0612, -0.3079,
         0.6591, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 8

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.0265, -0.1201, -0.1094,  0.1231, -0.3930,  0.0612, -0.3079,
         0.6591, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0265, -0.1201, -0.1094,  0.1231, -0.3930,  0.0612, -0.3079,
         0.6591, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 8

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.0265, -0.1201, -0.1094,  0.1231, -0.3930,  0.0612, -0.3079,
         0.6591, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0265, -0.1201, -0.1094,  0.1231, -0.3930,  0.0612, -0.3079,
         0.6591, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 8

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.0265, -0.1201, -0.1094,  0.1231, -0.3930,  0.0612, -0.3079,
         0.6591, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0265, -0.1201, -0.1094,  0.1231, -0.3930,  0.0612, -0.3079,
         0.6591, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 8

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.0265, -0.1201, -0.1094,  0.1231, -0.3930,  0.0612, -0.3079,
         0.6591, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0265, -0.1201, -0.1094,  0.1231, -0.3930,  0.0612, -0.3079,
         0.6591, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 8

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.0265, -0.1201, -0.1094,  0.1231, -0.3930,  0.0612, -0.3079,
         0.6591, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0265, -0.1201, -0.1094,  0.1231, -0.3930,  0.0612, -0.3079,
         0.6591, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 8

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.0265, -0.1201, -0.1094,  0.1231, -0.3930,  0.0612, -0.3079,
         0.6591, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0265, -0.1201, -0.1094,  0.1231, -0.3930,  0.0612, -0.3079,
         0.6591, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 8

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.0265, -0.1201, -0.1094,  0.1231, -0.3930,  0.0612, -0.3079,
         0.6591, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0265, -0.1201, -0.1094,  0.1231, -0.3930,  0.0612, -0.3079,
         0.6591, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 8

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.0265, -0.1201, -0.1094,  0.1231, -0.3930,  0.0612, -0.3079,
         0.6591, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0265, -0.1201, -0.1094,  0.1231, -0.3930,  0.0612, -0.3079,
         0.6591, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 8

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.0265, -0.1201, -0.1094,  0.1231, -0.3930,  0.0612, -0.3079,
         0.6591, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0265, -0.1201, -0.1094,  0.1231, -0.3930,  0.0612, -0.3079,
         0.6591, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 8
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.0094, -0.6133,  0.7810,  0.5202, -0.2430, -0.0590, -0.6133,
         0.2029, -0.1177, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0094, -0.6133,  0.7810,  0.5202, -0.2430, -0.0590, -0.6133,
         0.2029, -0.1177, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.0094, -0.6133,  0.7810,  0.5202, -0.2430, -0.0590, -0.6133,
         0.2029, -0.1177, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0094, -0.6133,  0.7810,  0.5202, -0.2430, -0.0590, -0.6133,
         0.2029, -0.1177, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.0094, -0.6133,  0.7810,  0.5202, -0.2430, -0.0590, -0.6133,
         0.2029, -0.1177, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0094, -0.6133,  0.7810,  0.5202, -0.2430, -0.0590, -0.6133,
         0.2029, -0.1177, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.0094, -0.6133,  0.7810,  0.5202, -0.2430, -0.0590, -0.6133,
         0.2029, -0.1177, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0094, -0.6133,  0.7810,  0.5202, -0.2430, -0.0590, -0.6133,
         0.2029, -0.1177, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.0094, -0.6133,  0.7810,  0.5202, -0.2430, -0.0590, -0.6133,
         0.2029, -0.1177, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0094, -0.6133,  0.7810,  0.5202, -0.2430, -0.0590, -0.6133,
         0.2029, -0.1177, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.0094, -0.6133,  0.7810,  0.5202, -0.2430, -0.0590, -0.6133,
         0.2029, -0.1177, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0094, -0.6133,  0.7810,  0.5202, -0.2430, -0.0590, -0.6133,
         0.2029, -0.1177, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.0094, -0.6133,  0.7810,  0.5202, -0.2430, -0.0590, -0.6133,
         0.2029, -0.1177, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0094, -0.6133,  0.7810,  0.5202, -0.2430, -0.0590, -0.6133,
         0.2029, -0.1177, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.0094, -0.6133,  0.7810,  0.5202, -0.2430, -0.0590, -0.6133,
         0.2029, -0.1177, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0094, -0.6133,  0.7810,  0.5202, -0.2430, -0.0590, -0.6133,
         0.2029, -0.1177, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.0094, -0.6133,  0.7810,  0.5202, -0.2430, -0.0590, -0.6133,
         0.2029, -0.1177, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0094, -0.6133,  0.7810,  0.5202, -0.2430, -0.0590, -0.6133,
         0.2029, -0.1177, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.0094, -0.6133,  0.7810,  0.5202, -0.2430, -0.0590, -0.6133,
         0.2029, -0.1177, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0094, -0.6133,  0.7810,  0.5202, -0.2430, -0.0590, -0.6133,
         0.2029, -0.1177, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.0094, -0.6133,  0.7810,  0.5202, -0.2430, -0.0590, -0.6133,
         0.2029, -0.1177, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0094, -0.6133,  0.7810,  0.5202, -0.2430, -0.0590, -0.6133,
         0.2029, -0.1177, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.0094, -0.6133,  0.7810,  0.5202, -0.2430, -0.0590, -0.6133,
         0.2029, -0.1177, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.0094, -0.6133,  0.7810,  0.5202, -0.2430, -0.0590, -0.6133,
         0.2029, -0.1177, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-6.4571e-02,  5.3506e-03, -7.2944e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01, -4.7399e-01, -2.0351e-01, -6.4283e-02, -1.1824e+00,
        -3.9304e-01, -4.7894e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  5.3506e-03, -7.2944e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01, -4.7399e-01, -2.0351e-01, -6.4283e-02, -1.1824e+00,
        -3.9304e-01, -4.7894e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 4

Layer 2


Tokens embeddings:
tensor([-6.4571e-02,  5.3506e-03, -7.2944e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01, -4.7399e-01, -2.0351e-01, -6.4283e-02, -1.1824e+00,
        -3.9304e-01, -4.7894e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  5.3506e-03, -7.2944e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01, -4.7399e-01, -2.0351e-01, -6.4283e-02, -1.1824e+00,
        -3.9304e-01, -4.7894e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 4

Layer 3


Tokens embeddings:
tensor([-6.4571e-02,  5.3506e-03, -7.2944e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01, -4.7399e-01, -2.0351e-01, -6.4283e-02, -1.1824e+00,
        -3.9304e-01, -4.7894e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  5.3506e-03, -7.2944e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01, -4.7399e-01, -2.0351e-01, -6.4283e-02, -1.1824e+00,
        -3.9304e-01, -4.7894e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 4

Layer 4


Tokens embeddings:
tensor([-6.4571e-02,  5.3506e-03, -7.2944e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01, -4.7399e-01, -2.0351e-01, -6.4283e-02, -1.1824e+00,
        -3.9304e-01, -4.7894e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  5.3506e-03, -7.2944e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01, -4.7399e-01, -2.0351e-01, -6.4283e-02, -1.1824e+00,
        -3.9304e-01, -4.7894e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 4

Layer 5


Tokens embeddings:
tensor([-6.4571e-02,  5.3506e-03, -7.2944e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01, -4.7399e-01, -2.0351e-01, -6.4283e-02, -1.1824e+00,
        -3.9304e-01, -4.7894e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  5.3506e-03, -7.2944e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01, -4.7399e-01, -2.0351e-01, -6.4283e-02, -1.1824e+00,
        -3.9304e-01, -4.7894e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 4

Layer 6


Tokens embeddings:
tensor([-6.4571e-02,  5.3506e-03, -7.2944e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01, -4.7399e-01, -2.0351e-01, -6.4283e-02, -1.1824e+00,
        -3.9304e-01, -4.7894e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  5.3506e-03, -7.2944e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01, -4.7399e-01, -2.0351e-01, -6.4283e-02, -1.1824e+00,
        -3.9304e-01, -4.7894e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 4

Layer 7


Tokens embeddings:
tensor([-6.4571e-02,  5.3506e-03, -7.2944e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01, -4.7399e-01, -2.0351e-01, -6.4283e-02, -1.1824e+00,
        -3.9304e-01, -4.7894e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  5.3506e-03, -7.2944e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01, -4.7399e-01, -2.0351e-01, -6.4283e-02, -1.1824e+00,
        -3.9304e-01, -4.7894e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 4

Layer 8


Tokens embeddings:
tensor([-6.4571e-02,  5.3506e-03, -7.2944e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01, -4.7399e-01, -2.0351e-01, -6.4283e-02, -1.1824e+00,
        -3.9304e-01, -4.7894e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  5.3506e-03, -7.2944e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01, -4.7399e-01, -2.0351e-01, -6.4283e-02, -1.1824e+00,
        -3.9304e-01, -4.7894e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 4

Layer 9


Tokens embeddings:
tensor([-6.4571e-02,  5.3506e-03, -7.2944e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01, -4.7399e-01, -2.0351e-01, -6.4283e-02, -1.1824e+00,
        -3.9304e-01, -4.7894e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  5.3506e-03, -7.2944e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01, -4.7399e-01, -2.0351e-01, -6.4283e-02, -1.1824e+00,
        -3.9304e-01, -4.7894e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 4

Layer 10


Tokens embeddings:
tensor([-6.4571e-02,  5.3506e-03, -7.2944e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01, -4.7399e-01, -2.0351e-01, -6.4283e-02, -1.1824e+00,
        -3.9304e-01, -4.7894e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  5.3506e-03, -7.2944e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01, -4.7399e-01, -2.0351e-01, -6.4283e-02, -1.1824e+00,
        -3.9304e-01, -4.7894e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 4

Layer 11


Tokens embeddings:
tensor([-6.4571e-02,  5.3506e-03, -7.2944e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01, -4.7399e-01, -2.0351e-01, -6.4283e-02, -1.1824e+00,
        -3.9304e-01, -4.7894e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  5.3506e-03, -7.2944e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01, -4.7399e-01, -2.0351e-01, -6.4283e-02, -1.1824e+00,
        -3.9304e-01, -4.7894e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 4

Layer 12


Tokens embeddings:
tensor([-6.4571e-02,  5.3506e-03, -7.2944e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01, -4.7399e-01, -2.0351e-01, -6.4283e-02, -1.1824e+00,
        -3.9304e-01, -4.7894e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  5.3506e-03, -7.2944e-01, -1.2336e+00,  5.4679e-01,
         1.4331e-01, -4.7399e-01, -2.0351e-01, -6.4283e-02, -1.1824e+00,
        -3.9304e-01, -4.7894e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1622, -0.2805,  0.1030,  0.4687, -0.1177, -0.3240,
        -0.3930, -0.4676, -0.4313, -0.2805, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1622, -0.2805,  0.1030,  0.4687, -0.1177, -0.3240,
        -0.3930, -0.4676, -0.4313, -0.2805, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1622, -0.2805,  0.1030,  0.4687, -0.1177, -0.3240,
        -0.3930, -0.4676, -0.4313, -0.2805, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1622, -0.2805,  0.1030,  0.4687, -0.1177, -0.3240,
        -0.3930, -0.4676, -0.4313, -0.2805, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1622, -0.2805,  0.1030,  0.4687, -0.1177, -0.3240,
        -0.3930, -0.4676, -0.4313, -0.2805, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1622, -0.2805,  0.1030,  0.4687, -0.1177, -0.3240,
        -0.3930, -0.4676, -0.4313, -0.2805, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1622, -0.2805,  0.1030,  0.4687, -0.1177, -0.3240,
        -0.3930, -0.4676, -0.4313, -0.2805, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1622, -0.2805,  0.1030,  0.4687, -0.1177, -0.3240,
        -0.3930, -0.4676, -0.4313, -0.2805, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1622, -0.2805,  0.1030,  0.4687, -0.1177, -0.3240,
        -0.3930, -0.4676, -0.4313, -0.2805, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1622, -0.2805,  0.1030,  0.4687, -0.1177, -0.3240,
        -0.3930, -0.4676, -0.4313, -0.2805, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1622, -0.2805,  0.1030,  0.4687, -0.1177, -0.3240,
        -0.3930, -0.4676, -0.4313, -0.2805, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1622, -0.2805,  0.1030,  0.4687, -0.1177, -0.3240,
        -0.3930, -0.4676, -0.4313, -0.2805, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1622, -0.2805,  0.1030,  0.4687, -0.1177, -0.3240,
        -0.3930, -0.4676, -0.4313, -0.2805, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1622, -0.2805,  0.1030,  0.4687, -0.1177, -0.3240,
        -0.3930, -0.4676, -0.4313, -0.2805, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1622, -0.2805,  0.1030,  0.4687, -0.1177, -0.3240,
        -0.3930, -0.4676, -0.4313, -0.2805, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1622, -0.2805,  0.1030,  0.4687, -0.1177, -0.3240,
        -0.3930, -0.4676, -0.4313, -0.2805, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1622, -0.2805,  0.1030,  0.4687, -0.1177, -0.3240,
        -0.3930, -0.4676, -0.4313, -0.2805, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1622, -0.2805,  0.1030,  0.4687, -0.1177, -0.3240,
        -0.3930, -0.4676, -0.4313, -0.2805, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1622, -0.2805,  0.1030,  0.4687, -0.1177, -0.3240,
        -0.3930, -0.4676, -0.4313, -0.2805, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1622, -0.2805,  0.1030,  0.4687, -0.1177, -0.3240,
        -0.3930, -0.4676, -0.4313, -0.2805, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1622, -0.2805,  0.1030,  0.4687, -0.1177, -0.3240,
        -0.3930, -0.4676, -0.4313, -0.2805, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1622, -0.2805,  0.1030,  0.4687, -0.1177, -0.3240,
        -0.3930, -0.4676, -0.4313, -0.2805, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1622, -0.2805,  0.1030,  0.4687, -0.1177, -0.3240,
        -0.3930, -0.4676, -0.4313, -0.2805, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1622, -0.2805,  0.1030,  0.4687, -0.1177, -0.3240,
        -0.3930, -0.4676, -0.4313, -0.2805, -0.0643, -0.0010, -0.0650, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1800, -0.5237,  0.3506,  0.1622, -0.2971, -0.4961,
        -0.1125, -0.3930, -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1800, -0.5237,  0.3506,  0.1622, -0.2971, -0.4961,
        -0.1125, -0.3930, -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1800, -0.5237,  0.3506,  0.1622, -0.2971, -0.4961,
        -0.1125, -0.3930, -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1800, -0.5237,  0.3506,  0.1622, -0.2971, -0.4961,
        -0.1125, -0.3930, -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1800, -0.5237,  0.3506,  0.1622, -0.2971, -0.4961,
        -0.1125, -0.3930, -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1800, -0.5237,  0.3506,  0.1622, -0.2971, -0.4961,
        -0.1125, -0.3930, -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1800, -0.5237,  0.3506,  0.1622, -0.2971, -0.4961,
        -0.1125, -0.3930, -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1800, -0.5237,  0.3506,  0.1622, -0.2971, -0.4961,
        -0.1125, -0.3930, -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1800, -0.5237,  0.3506,  0.1622, -0.2971, -0.4961,
        -0.1125, -0.3930, -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1800, -0.5237,  0.3506,  0.1622, -0.2971, -0.4961,
        -0.1125, -0.3930, -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1800, -0.5237,  0.3506,  0.1622, -0.2971, -0.4961,
        -0.1125, -0.3930, -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1800, -0.5237,  0.3506,  0.1622, -0.2971, -0.4961,
        -0.1125, -0.3930, -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1800, -0.5237,  0.3506,  0.1622, -0.2971, -0.4961,
        -0.1125, -0.3930, -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1800, -0.5237,  0.3506,  0.1622, -0.2971, -0.4961,
        -0.1125, -0.3930, -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1800, -0.5237,  0.3506,  0.1622, -0.2971, -0.4961,
        -0.1125, -0.3930, -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1800, -0.5237,  0.3506,  0.1622, -0.2971, -0.4961,
        -0.1125, -0.3930, -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1800, -0.5237,  0.3506,  0.1622, -0.2971, -0.4961,
        -0.1125, -0.3930, -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1800, -0.5237,  0.3506,  0.1622, -0.2971, -0.4961,
        -0.1125, -0.3930, -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1800, -0.5237,  0.3506,  0.1622, -0.2971, -0.4961,
        -0.1125, -0.3930, -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1800, -0.5237,  0.3506,  0.1622, -0.2971, -0.4961,
        -0.1125, -0.3930, -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1800, -0.5237,  0.3506,  0.1622, -0.2971, -0.4961,
        -0.1125, -0.3930, -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1800, -0.5237,  0.3506,  0.1622, -0.2971, -0.4961,
        -0.1125, -0.3930, -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.1800, -0.5237,  0.3506,  0.1622, -0.2971, -0.4961,
        -0.1125, -0.3930, -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.1800, -0.5237,  0.3506,  0.1622, -0.2971, -0.4961,
        -0.1125, -0.3930, -0.3232, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -1.6626e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -1.6626e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 4

Layer 2


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -1.6626e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -1.6626e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 4

Layer 3


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -1.6626e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -1.6626e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 4

Layer 4


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -1.6626e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -1.6626e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 4

Layer 5


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -1.6626e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -1.6626e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 4

Layer 6


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -1.6626e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -1.6626e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 4

Layer 7


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -1.6626e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -1.6626e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 4

Layer 8


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -1.6626e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -1.6626e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 4

Layer 9


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -1.6626e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -1.6626e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 4

Layer 10


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -1.6626e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -1.6626e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 4

Layer 11


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -1.6626e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -1.6626e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 4

Layer 12


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -1.6626e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -1.6626e-01, -3.5135e-01,  1.4133e+00,
        -3.9304e-01,  1.0463e-02, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 4
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.4167,  0.0371, -0.0437, -0.0253, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.4167,  0.0371, -0.0437, -0.0253, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.4167,  0.0371, -0.0437, -0.0253, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.4167,  0.0371, -0.0437, -0.0253, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.4167,  0.0371, -0.0437, -0.0253, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.4167,  0.0371, -0.0437, -0.0253, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.4167,  0.0371, -0.0437, -0.0253, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.4167,  0.0371, -0.0437, -0.0253, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.4167,  0.0371, -0.0437, -0.0253, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.4167,  0.0371, -0.0437, -0.0253, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.4167,  0.0371, -0.0437, -0.0253, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.4167,  0.0371, -0.0437, -0.0253, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.4167,  0.0371, -0.0437, -0.0253, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.4167,  0.0371, -0.0437, -0.0253, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.4167,  0.0371, -0.0437, -0.0253, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.4167,  0.0371, -0.0437, -0.0253, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.4167,  0.0371, -0.0437, -0.0253, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.4167,  0.0371, -0.0437, -0.0253, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.4167,  0.0371, -0.0437, -0.0253, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.4167,  0.0371, -0.0437, -0.0253, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.4167,  0.0371, -0.0437, -0.0253, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.4167,  0.0371, -0.0437, -0.0253, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.4167,  0.0371, -0.0437, -0.0253, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.4167,  0.0371, -0.0437, -0.0253, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.4489,  0.5232,  0.1841,  0.0887, -0.3043, -0.2025, -0.8463,
        -0.6393,  0.4093, -0.3566, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4489,  0.5232,  0.1841,  0.0887, -0.3043, -0.2025, -0.8463,
        -0.6393,  0.4093, -0.3566, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.4489,  0.5232,  0.1841,  0.0887, -0.3043, -0.2025, -0.8463,
        -0.6393,  0.4093, -0.3566, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4489,  0.5232,  0.1841,  0.0887, -0.3043, -0.2025, -0.8463,
        -0.6393,  0.4093, -0.3566, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.4489,  0.5232,  0.1841,  0.0887, -0.3043, -0.2025, -0.8463,
        -0.6393,  0.4093, -0.3566, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4489,  0.5232,  0.1841,  0.0887, -0.3043, -0.2025, -0.8463,
        -0.6393,  0.4093, -0.3566, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.4489,  0.5232,  0.1841,  0.0887, -0.3043, -0.2025, -0.8463,
        -0.6393,  0.4093, -0.3566, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4489,  0.5232,  0.1841,  0.0887, -0.3043, -0.2025, -0.8463,
        -0.6393,  0.4093, -0.3566, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.4489,  0.5232,  0.1841,  0.0887, -0.3043, -0.2025, -0.8463,
        -0.6393,  0.4093, -0.3566, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4489,  0.5232,  0.1841,  0.0887, -0.3043, -0.2025, -0.8463,
        -0.6393,  0.4093, -0.3566, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.4489,  0.5232,  0.1841,  0.0887, -0.3043, -0.2025, -0.8463,
        -0.6393,  0.4093, -0.3566, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4489,  0.5232,  0.1841,  0.0887, -0.3043, -0.2025, -0.8463,
        -0.6393,  0.4093, -0.3566, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.4489,  0.5232,  0.1841,  0.0887, -0.3043, -0.2025, -0.8463,
        -0.6393,  0.4093, -0.3566, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4489,  0.5232,  0.1841,  0.0887, -0.3043, -0.2025, -0.8463,
        -0.6393,  0.4093, -0.3566, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.4489,  0.5232,  0.1841,  0.0887, -0.3043, -0.2025, -0.8463,
        -0.6393,  0.4093, -0.3566, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4489,  0.5232,  0.1841,  0.0887, -0.3043, -0.2025, -0.8463,
        -0.6393,  0.4093, -0.3566, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.4489,  0.5232,  0.1841,  0.0887, -0.3043, -0.2025, -0.8463,
        -0.6393,  0.4093, -0.3566, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4489,  0.5232,  0.1841,  0.0887, -0.3043, -0.2025, -0.8463,
        -0.6393,  0.4093, -0.3566, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.4489,  0.5232,  0.1841,  0.0887, -0.3043, -0.2025, -0.8463,
        -0.6393,  0.4093, -0.3566, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4489,  0.5232,  0.1841,  0.0887, -0.3043, -0.2025, -0.8463,
        -0.6393,  0.4093, -0.3566, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.4489,  0.5232,  0.1841,  0.0887, -0.3043, -0.2025, -0.8463,
        -0.6393,  0.4093, -0.3566, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4489,  0.5232,  0.1841,  0.0887, -0.3043, -0.2025, -0.8463,
        -0.6393,  0.4093, -0.3566, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.4489,  0.5232,  0.1841,  0.0887, -0.3043, -0.2025, -0.8463,
        -0.6393,  0.4093, -0.3566, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.4489,  0.5232,  0.1841,  0.0887, -0.3043, -0.2025, -0.8463,
        -0.6393,  0.4093, -0.3566, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463, -0.1977, -0.0121,
         0.0887,  0.3295, -0.3930, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463, -0.1977, -0.0121,
         0.0887,  0.3295, -0.3930, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 2


Tokens embeddings:
tensor([-0.0646,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463, -0.1977, -0.0121,
         0.0887,  0.3295, -0.3930, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463, -0.1977, -0.0121,
         0.0887,  0.3295, -0.3930, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 3


Tokens embeddings:
tensor([-0.0646,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463, -0.1977, -0.0121,
         0.0887,  0.3295, -0.3930, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463, -0.1977, -0.0121,
         0.0887,  0.3295, -0.3930, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 4


Tokens embeddings:
tensor([-0.0646,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463, -0.1977, -0.0121,
         0.0887,  0.3295, -0.3930, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463, -0.1977, -0.0121,
         0.0887,  0.3295, -0.3930, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 5


Tokens embeddings:
tensor([-0.0646,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463, -0.1977, -0.0121,
         0.0887,  0.3295, -0.3930, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463, -0.1977, -0.0121,
         0.0887,  0.3295, -0.3930, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 6


Tokens embeddings:
tensor([-0.0646,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463, -0.1977, -0.0121,
         0.0887,  0.3295, -0.3930, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463, -0.1977, -0.0121,
         0.0887,  0.3295, -0.3930, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 7


Tokens embeddings:
tensor([-0.0646,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463, -0.1977, -0.0121,
         0.0887,  0.3295, -0.3930, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463, -0.1977, -0.0121,
         0.0887,  0.3295, -0.3930, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 8


Tokens embeddings:
tensor([-0.0646,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463, -0.1977, -0.0121,
         0.0887,  0.3295, -0.3930, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463, -0.1977, -0.0121,
         0.0887,  0.3295, -0.3930, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 9


Tokens embeddings:
tensor([-0.0646,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463, -0.1977, -0.0121,
         0.0887,  0.3295, -0.3930, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463, -0.1977, -0.0121,
         0.0887,  0.3295, -0.3930, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 10


Tokens embeddings:
tensor([-0.0646,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463, -0.1977, -0.0121,
         0.0887,  0.3295, -0.3930, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463, -0.1977, -0.0121,
         0.0887,  0.3295, -0.3930, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 11


Tokens embeddings:
tensor([-0.0646,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463, -0.1977, -0.0121,
         0.0887,  0.3295, -0.3930, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463, -0.1977, -0.0121,
         0.0887,  0.3295, -0.3930, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 12


Tokens embeddings:
tensor([-0.0646,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463, -0.1977, -0.0121,
         0.0887,  0.3295, -0.3930, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.2021,  0.9402,  0.0887,  0.5104, -0.8463, -0.1977, -0.0121,
         0.0887,  0.3295, -0.3930, -0.0643,  0.7108, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-6.4571e-02,  8.0882e-02, -6.4283e-02, -6.2366e-01, -2.0252e-01,
        -1.1512e-01, -1.0891e+00,  8.8741e-02, -4.9606e-01, -5.6287e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  8.0882e-02, -6.4283e-02, -6.2366e-01, -2.0252e-01,
        -1.1512e-01, -1.0891e+00,  8.8741e-02, -4.9606e-01, -5.6287e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 7

Layer 2


Tokens embeddings:
tensor([-6.4571e-02,  8.0882e-02, -6.4283e-02, -6.2366e-01, -2.0252e-01,
        -1.1512e-01, -1.0891e+00,  8.8741e-02, -4.9606e-01, -5.6287e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  8.0882e-02, -6.4283e-02, -6.2366e-01, -2.0252e-01,
        -1.1512e-01, -1.0891e+00,  8.8741e-02, -4.9606e-01, -5.6287e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 7

Layer 3


Tokens embeddings:
tensor([-6.4571e-02,  8.0882e-02, -6.4283e-02, -6.2366e-01, -2.0252e-01,
        -1.1512e-01, -1.0891e+00,  8.8741e-02, -4.9606e-01, -5.6287e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  8.0882e-02, -6.4283e-02, -6.2366e-01, -2.0252e-01,
        -1.1512e-01, -1.0891e+00,  8.8741e-02, -4.9606e-01, -5.6287e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 7

Layer 4


Tokens embeddings:
tensor([-6.4571e-02,  8.0882e-02, -6.4283e-02, -6.2366e-01, -2.0252e-01,
        -1.1512e-01, -1.0891e+00,  8.8741e-02, -4.9606e-01, -5.6287e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  8.0882e-02, -6.4283e-02, -6.2366e-01, -2.0252e-01,
        -1.1512e-01, -1.0891e+00,  8.8741e-02, -4.9606e-01, -5.6287e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 7

Layer 5


Tokens embeddings:
tensor([-6.4571e-02,  8.0882e-02, -6.4283e-02, -6.2366e-01, -2.0252e-01,
        -1.1512e-01, -1.0891e+00,  8.8741e-02, -4.9606e-01, -5.6287e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  8.0882e-02, -6.4283e-02, -6.2366e-01, -2.0252e-01,
        -1.1512e-01, -1.0891e+00,  8.8741e-02, -4.9606e-01, -5.6287e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 7

Layer 6


Tokens embeddings:
tensor([-6.4571e-02,  8.0882e-02, -6.4283e-02, -6.2366e-01, -2.0252e-01,
        -1.1512e-01, -1.0891e+00,  8.8741e-02, -4.9606e-01, -5.6287e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  8.0882e-02, -6.4283e-02, -6.2366e-01, -2.0252e-01,
        -1.1512e-01, -1.0891e+00,  8.8741e-02, -4.9606e-01, -5.6287e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 7

Layer 7


Tokens embeddings:
tensor([-6.4571e-02,  8.0882e-02, -6.4283e-02, -6.2366e-01, -2.0252e-01,
        -1.1512e-01, -1.0891e+00,  8.8741e-02, -4.9606e-01, -5.6287e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  8.0882e-02, -6.4283e-02, -6.2366e-01, -2.0252e-01,
        -1.1512e-01, -1.0891e+00,  8.8741e-02, -4.9606e-01, -5.6287e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 7

Layer 8


Tokens embeddings:
tensor([-6.4571e-02,  8.0882e-02, -6.4283e-02, -6.2366e-01, -2.0252e-01,
        -1.1512e-01, -1.0891e+00,  8.8741e-02, -4.9606e-01, -5.6287e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  8.0882e-02, -6.4283e-02, -6.2366e-01, -2.0252e-01,
        -1.1512e-01, -1.0891e+00,  8.8741e-02, -4.9606e-01, -5.6287e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 7

Layer 9


Tokens embeddings:
tensor([-6.4571e-02,  8.0882e-02, -6.4283e-02, -6.2366e-01, -2.0252e-01,
        -1.1512e-01, -1.0891e+00,  8.8741e-02, -4.9606e-01, -5.6287e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  8.0882e-02, -6.4283e-02, -6.2366e-01, -2.0252e-01,
        -1.1512e-01, -1.0891e+00,  8.8741e-02, -4.9606e-01, -5.6287e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 7

Layer 10


Tokens embeddings:
tensor([-6.4571e-02,  8.0882e-02, -6.4283e-02, -6.2366e-01, -2.0252e-01,
        -1.1512e-01, -1.0891e+00,  8.8741e-02, -4.9606e-01, -5.6287e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  8.0882e-02, -6.4283e-02, -6.2366e-01, -2.0252e-01,
        -1.1512e-01, -1.0891e+00,  8.8741e-02, -4.9606e-01, -5.6287e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 7

Layer 11


Tokens embeddings:
tensor([-6.4571e-02,  8.0882e-02, -6.4283e-02, -6.2366e-01, -2.0252e-01,
        -1.1512e-01, -1.0891e+00,  8.8741e-02, -4.9606e-01, -5.6287e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  8.0882e-02, -6.4283e-02, -6.2366e-01, -2.0252e-01,
        -1.1512e-01, -1.0891e+00,  8.8741e-02, -4.9606e-01, -5.6287e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 7

Layer 12


Tokens embeddings:
tensor([-6.4571e-02,  8.0882e-02, -6.4283e-02, -6.2366e-01, -2.0252e-01,
        -1.1512e-01, -1.0891e+00,  8.8741e-02, -4.9606e-01, -5.6287e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  8.0882e-02, -6.4283e-02, -6.2366e-01, -2.0252e-01,
        -1.1512e-01, -1.0891e+00,  8.8741e-02, -4.9606e-01, -5.6287e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950, -0.1266,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950, -0.1266,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 2


Tokens embeddings:
tensor([-0.0646,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950, -0.1266,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950, -0.1266,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 3


Tokens embeddings:
tensor([-0.0646,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950, -0.1266,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950, -0.1266,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 4


Tokens embeddings:
tensor([-0.0646,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950, -0.1266,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950, -0.1266,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 5


Tokens embeddings:
tensor([-0.0646,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950, -0.1266,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950, -0.1266,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 6


Tokens embeddings:
tensor([-0.0646,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950, -0.1266,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950, -0.1266,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 7


Tokens embeddings:
tensor([-0.0646,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950, -0.1266,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950, -0.1266,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 8


Tokens embeddings:
tensor([-0.0646,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950, -0.1266,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950, -0.1266,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 9


Tokens embeddings:
tensor([-0.0646,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950, -0.1266,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950, -0.1266,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 10


Tokens embeddings:
tensor([-0.0646,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950, -0.1266,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950, -0.1266,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 11


Tokens embeddings:
tensor([-0.0646,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950, -0.1266,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950, -0.1266,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7

Layer 12


Tokens embeddings:
tensor([-0.0646,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950, -0.1266,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.6885,  0.5861, -0.3930,  0.5958,  0.0950, -0.1266,  0.7385,
         0.1248, -0.1177, -0.0152, -0.0643, -0.0010, -0.0650, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 7
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-6.4571e-02,  7.4549e-02, -2.6369e-01,  1.0309e+00, -1.1768e-01,
        -6.8482e-02, -3.9304e-01, -8.1924e-02, -9.6229e-01, -2.7025e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  7.4549e-02, -2.6369e-01,  1.0309e+00, -1.1768e-01,
        -6.8482e-02, -3.9304e-01, -8.1924e-02, -9.6229e-01, -2.7025e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 2


Tokens embeddings:
tensor([-6.4571e-02,  7.4549e-02, -2.6369e-01,  1.0309e+00, -1.1768e-01,
        -6.8482e-02, -3.9304e-01, -8.1924e-02, -9.6229e-01, -2.7025e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  7.4549e-02, -2.6369e-01,  1.0309e+00, -1.1768e-01,
        -6.8482e-02, -3.9304e-01, -8.1924e-02, -9.6229e-01, -2.7025e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 3


Tokens embeddings:
tensor([-6.4571e-02,  7.4549e-02, -2.6369e-01,  1.0309e+00, -1.1768e-01,
        -6.8482e-02, -3.9304e-01, -8.1924e-02, -9.6229e-01, -2.7025e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  7.4549e-02, -2.6369e-01,  1.0309e+00, -1.1768e-01,
        -6.8482e-02, -3.9304e-01, -8.1924e-02, -9.6229e-01, -2.7025e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 4


Tokens embeddings:
tensor([-6.4571e-02,  7.4549e-02, -2.6369e-01,  1.0309e+00, -1.1768e-01,
        -6.8482e-02, -3.9304e-01, -8.1924e-02, -9.6229e-01, -2.7025e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  7.4549e-02, -2.6369e-01,  1.0309e+00, -1.1768e-01,
        -6.8482e-02, -3.9304e-01, -8.1924e-02, -9.6229e-01, -2.7025e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 5


Tokens embeddings:
tensor([-6.4571e-02,  7.4549e-02, -2.6369e-01,  1.0309e+00, -1.1768e-01,
        -6.8482e-02, -3.9304e-01, -8.1924e-02, -9.6229e-01, -2.7025e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  7.4549e-02, -2.6369e-01,  1.0309e+00, -1.1768e-01,
        -6.8482e-02, -3.9304e-01, -8.1924e-02, -9.6229e-01, -2.7025e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 6


Tokens embeddings:
tensor([-6.4571e-02,  7.4549e-02, -2.6369e-01,  1.0309e+00, -1.1768e-01,
        -6.8482e-02, -3.9304e-01, -8.1924e-02, -9.6229e-01, -2.7025e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  7.4549e-02, -2.6369e-01,  1.0309e+00, -1.1768e-01,
        -6.8482e-02, -3.9304e-01, -8.1924e-02, -9.6229e-01, -2.7025e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 7


Tokens embeddings:
tensor([-6.4571e-02,  7.4549e-02, -2.6369e-01,  1.0309e+00, -1.1768e-01,
        -6.8482e-02, -3.9304e-01, -8.1924e-02, -9.6229e-01, -2.7025e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  7.4549e-02, -2.6369e-01,  1.0309e+00, -1.1768e-01,
        -6.8482e-02, -3.9304e-01, -8.1924e-02, -9.6229e-01, -2.7025e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 8


Tokens embeddings:
tensor([-6.4571e-02,  7.4549e-02, -2.6369e-01,  1.0309e+00, -1.1768e-01,
        -6.8482e-02, -3.9304e-01, -8.1924e-02, -9.6229e-01, -2.7025e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  7.4549e-02, -2.6369e-01,  1.0309e+00, -1.1768e-01,
        -6.8482e-02, -3.9304e-01, -8.1924e-02, -9.6229e-01, -2.7025e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 9


Tokens embeddings:
tensor([-6.4571e-02,  7.4549e-02, -2.6369e-01,  1.0309e+00, -1.1768e-01,
        -6.8482e-02, -3.9304e-01, -8.1924e-02, -9.6229e-01, -2.7025e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  7.4549e-02, -2.6369e-01,  1.0309e+00, -1.1768e-01,
        -6.8482e-02, -3.9304e-01, -8.1924e-02, -9.6229e-01, -2.7025e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 10


Tokens embeddings:
tensor([-6.4571e-02,  7.4549e-02, -2.6369e-01,  1.0309e+00, -1.1768e-01,
        -6.8482e-02, -3.9304e-01, -8.1924e-02, -9.6229e-01, -2.7025e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  7.4549e-02, -2.6369e-01,  1.0309e+00, -1.1768e-01,
        -6.8482e-02, -3.9304e-01, -8.1924e-02, -9.6229e-01, -2.7025e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 11


Tokens embeddings:
tensor([-6.4571e-02,  7.4549e-02, -2.6369e-01,  1.0309e+00, -1.1768e-01,
        -6.8482e-02, -3.9304e-01, -8.1924e-02, -9.6229e-01, -2.7025e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  7.4549e-02, -2.6369e-01,  1.0309e+00, -1.1768e-01,
        -6.8482e-02, -3.9304e-01, -8.1924e-02, -9.6229e-01, -2.7025e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 12


Tokens embeddings:
tensor([-6.4571e-02,  7.4549e-02, -2.6369e-01,  1.0309e+00, -1.1768e-01,
        -6.8482e-02, -3.9304e-01, -8.1924e-02, -9.6229e-01, -2.7025e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  7.4549e-02, -2.6369e-01,  1.0309e+00, -1.1768e-01,
        -6.8482e-02, -3.9304e-01, -8.1924e-02, -9.6229e-01, -2.7025e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.5775, -0.0152,  0.3652, -0.2206,  0.1585, -0.3794,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.5775, -0.0152,  0.3652, -0.2206,  0.1585, -0.3794,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.5775, -0.0152,  0.3652, -0.2206,  0.1585, -0.3794,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.5775, -0.0152,  0.3652, -0.2206,  0.1585, -0.3794,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.5775, -0.0152,  0.3652, -0.2206,  0.1585, -0.3794,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.5775, -0.0152,  0.3652, -0.2206,  0.1585, -0.3794,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.5775, -0.0152,  0.3652, -0.2206,  0.1585, -0.3794,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.5775, -0.0152,  0.3652, -0.2206,  0.1585, -0.3794,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.5775, -0.0152,  0.3652, -0.2206,  0.1585, -0.3794,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.5775, -0.0152,  0.3652, -0.2206,  0.1585, -0.3794,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.5775, -0.0152,  0.3652, -0.2206,  0.1585, -0.3794,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.5775, -0.0152,  0.3652, -0.2206,  0.1585, -0.3794,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.5775, -0.0152,  0.3652, -0.2206,  0.1585, -0.3794,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.5775, -0.0152,  0.3652, -0.2206,  0.1585, -0.3794,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.5775, -0.0152,  0.3652, -0.2206,  0.1585, -0.3794,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.5775, -0.0152,  0.3652, -0.2206,  0.1585, -0.3794,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.5775, -0.0152,  0.3652, -0.2206,  0.1585, -0.3794,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.5775, -0.0152,  0.3652, -0.2206,  0.1585, -0.3794,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.5775, -0.0152,  0.3652, -0.2206,  0.1585, -0.3794,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.5775, -0.0152,  0.3652, -0.2206,  0.1585, -0.3794,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.5775, -0.0152,  0.3652, -0.2206,  0.1585, -0.3794,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.5775, -0.0152,  0.3652, -0.2206,  0.1585, -0.3794,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.2066,  0.5775, -0.0152,  0.3652, -0.2206,  0.1585, -0.3794,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066,  0.5775, -0.0152,  0.3652, -0.2206,  0.1585, -0.3794,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 2


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 3


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 4


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 5


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 6


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 7


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 8


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 9


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 10


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 11


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 12


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01,  1.8529e-01,  6.2007e-01, -1.5160e-02,
        -9.7911e-01,  1.8296e-01,  3.9043e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646,  0.7773, -0.3930, -0.0803,  0.0950, -0.1170, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.7773, -0.3930, -0.0803,  0.0950, -0.1170, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 2


Tokens embeddings:
tensor([-0.0646,  0.7773, -0.3930, -0.0803,  0.0950, -0.1170, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.7773, -0.3930, -0.0803,  0.0950, -0.1170, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 3


Tokens embeddings:
tensor([-0.0646,  0.7773, -0.3930, -0.0803,  0.0950, -0.1170, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.7773, -0.3930, -0.0803,  0.0950, -0.1170, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 4


Tokens embeddings:
tensor([-0.0646,  0.7773, -0.3930, -0.0803,  0.0950, -0.1170, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.7773, -0.3930, -0.0803,  0.0950, -0.1170, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 5


Tokens embeddings:
tensor([-0.0646,  0.7773, -0.3930, -0.0803,  0.0950, -0.1170, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.7773, -0.3930, -0.0803,  0.0950, -0.1170, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 6


Tokens embeddings:
tensor([-0.0646,  0.7773, -0.3930, -0.0803,  0.0950, -0.1170, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.7773, -0.3930, -0.0803,  0.0950, -0.1170, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 7


Tokens embeddings:
tensor([-0.0646,  0.7773, -0.3930, -0.0803,  0.0950, -0.1170, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.7773, -0.3930, -0.0803,  0.0950, -0.1170, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 8


Tokens embeddings:
tensor([-0.0646,  0.7773, -0.3930, -0.0803,  0.0950, -0.1170, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.7773, -0.3930, -0.0803,  0.0950, -0.1170, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 9


Tokens embeddings:
tensor([-0.0646,  0.7773, -0.3930, -0.0803,  0.0950, -0.1170, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.7773, -0.3930, -0.0803,  0.0950, -0.1170, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 10


Tokens embeddings:
tensor([-0.0646,  0.7773, -0.3930, -0.0803,  0.0950, -0.1170, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.7773, -0.3930, -0.0803,  0.0950, -0.1170, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 11


Tokens embeddings:
tensor([-0.0646,  0.7773, -0.3930, -0.0803,  0.0950, -0.1170, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.7773, -0.3930, -0.0803,  0.0950, -0.1170, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1

Layer 12


Tokens embeddings:
tensor([-0.0646,  0.7773, -0.3930, -0.0803,  0.0950, -0.1170, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.7773, -0.3930, -0.0803,  0.0950, -0.1170, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 1
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -1.3218e-01,  1.2314e-01, -1.4071e-01,
         1.2215e+00, -1.9737e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -1.3218e-01,  1.2314e-01, -1.4071e-01,
         1.2215e+00, -1.9737e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 2


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -1.3218e-01,  1.2314e-01, -1.4071e-01,
         1.2215e+00, -1.9737e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -1.3218e-01,  1.2314e-01, -1.4071e-01,
         1.2215e+00, -1.9737e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 3


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -1.3218e-01,  1.2314e-01, -1.4071e-01,
         1.2215e+00, -1.9737e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -1.3218e-01,  1.2314e-01, -1.4071e-01,
         1.2215e+00, -1.9737e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 4


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -1.3218e-01,  1.2314e-01, -1.4071e-01,
         1.2215e+00, -1.9737e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -1.3218e-01,  1.2314e-01, -1.4071e-01,
         1.2215e+00, -1.9737e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 5


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -1.3218e-01,  1.2314e-01, -1.4071e-01,
         1.2215e+00, -1.9737e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -1.3218e-01,  1.2314e-01, -1.4071e-01,
         1.2215e+00, -1.9737e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 6


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -1.3218e-01,  1.2314e-01, -1.4071e-01,
         1.2215e+00, -1.9737e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -1.3218e-01,  1.2314e-01, -1.4071e-01,
         1.2215e+00, -1.9737e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 7


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -1.3218e-01,  1.2314e-01, -1.4071e-01,
         1.2215e+00, -1.9737e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -1.3218e-01,  1.2314e-01, -1.4071e-01,
         1.2215e+00, -1.9737e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 8


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -1.3218e-01,  1.2314e-01, -1.4071e-01,
         1.2215e+00, -1.9737e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -1.3218e-01,  1.2314e-01, -1.4071e-01,
         1.2215e+00, -1.9737e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 9


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -1.3218e-01,  1.2314e-01, -1.4071e-01,
         1.2215e+00, -1.9737e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -1.3218e-01,  1.2314e-01, -1.4071e-01,
         1.2215e+00, -1.9737e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 10


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -1.3218e-01,  1.2314e-01, -1.4071e-01,
         1.2215e+00, -1.9737e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -1.3218e-01,  1.2314e-01, -1.4071e-01,
         1.2215e+00, -1.9737e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 11


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -1.3218e-01,  1.2314e-01, -1.4071e-01,
         1.2215e+00, -1.9737e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -1.3218e-01,  1.2314e-01, -1.4071e-01,
         1.2215e+00, -1.9737e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5

Layer 12


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -1.3218e-01,  1.2314e-01, -1.4071e-01,
         1.2215e+00, -1.9737e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -1.3218e-01,  1.2314e-01, -1.4071e-01,
         1.2215e+00, -1.9737e-01, -7.6529e-02,  8.8741e-02,  6.4991e-01,
        -6.4283e-02, -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 5
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -2.5311e-01,  1.1239e+00, -3.4574e-02,
         5.0980e-01, -1.9409e-01, -9.8891e-01, -5.4825e-01, -1.1768e-01,
        -1.0577e+00, -1.7977e-01, -1.1161e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -2.5311e-01,  1.1239e+00, -3.4574e-02,
         5.0980e-01, -1.9409e-01, -9.8891e-01, -5.4825e-01, -1.1768e-01,
        -1.0577e+00, -1.7977e-01, -1.1161e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 2


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -2.5311e-01,  1.1239e+00, -3.4574e-02,
         5.0980e-01, -1.9409e-01, -9.8891e-01, -5.4825e-01, -1.1768e-01,
        -1.0577e+00, -1.7977e-01, -1.1161e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -2.5311e-01,  1.1239e+00, -3.4574e-02,
         5.0980e-01, -1.9409e-01, -9.8891e-01, -5.4825e-01, -1.1768e-01,
        -1.0577e+00, -1.7977e-01, -1.1161e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 3


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -2.5311e-01,  1.1239e+00, -3.4574e-02,
         5.0980e-01, -1.9409e-01, -9.8891e-01, -5.4825e-01, -1.1768e-01,
        -1.0577e+00, -1.7977e-01, -1.1161e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -2.5311e-01,  1.1239e+00, -3.4574e-02,
         5.0980e-01, -1.9409e-01, -9.8891e-01, -5.4825e-01, -1.1768e-01,
        -1.0577e+00, -1.7977e-01, -1.1161e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 4


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -2.5311e-01,  1.1239e+00, -3.4574e-02,
         5.0980e-01, -1.9409e-01, -9.8891e-01, -5.4825e-01, -1.1768e-01,
        -1.0577e+00, -1.7977e-01, -1.1161e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -2.5311e-01,  1.1239e+00, -3.4574e-02,
         5.0980e-01, -1.9409e-01, -9.8891e-01, -5.4825e-01, -1.1768e-01,
        -1.0577e+00, -1.7977e-01, -1.1161e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 5


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -2.5311e-01,  1.1239e+00, -3.4574e-02,
         5.0980e-01, -1.9409e-01, -9.8891e-01, -5.4825e-01, -1.1768e-01,
        -1.0577e+00, -1.7977e-01, -1.1161e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -2.5311e-01,  1.1239e+00, -3.4574e-02,
         5.0980e-01, -1.9409e-01, -9.8891e-01, -5.4825e-01, -1.1768e-01,
        -1.0577e+00, -1.7977e-01, -1.1161e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 6


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -2.5311e-01,  1.1239e+00, -3.4574e-02,
         5.0980e-01, -1.9409e-01, -9.8891e-01, -5.4825e-01, -1.1768e-01,
        -1.0577e+00, -1.7977e-01, -1.1161e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -2.5311e-01,  1.1239e+00, -3.4574e-02,
         5.0980e-01, -1.9409e-01, -9.8891e-01, -5.4825e-01, -1.1768e-01,
        -1.0577e+00, -1.7977e-01, -1.1161e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 7


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -2.5311e-01,  1.1239e+00, -3.4574e-02,
         5.0980e-01, -1.9409e-01, -9.8891e-01, -5.4825e-01, -1.1768e-01,
        -1.0577e+00, -1.7977e-01, -1.1161e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -2.5311e-01,  1.1239e+00, -3.4574e-02,
         5.0980e-01, -1.9409e-01, -9.8891e-01, -5.4825e-01, -1.1768e-01,
        -1.0577e+00, -1.7977e-01, -1.1161e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 8


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -2.5311e-01,  1.1239e+00, -3.4574e-02,
         5.0980e-01, -1.9409e-01, -9.8891e-01, -5.4825e-01, -1.1768e-01,
        -1.0577e+00, -1.7977e-01, -1.1161e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -2.5311e-01,  1.1239e+00, -3.4574e-02,
         5.0980e-01, -1.9409e-01, -9.8891e-01, -5.4825e-01, -1.1768e-01,
        -1.0577e+00, -1.7977e-01, -1.1161e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 9


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -2.5311e-01,  1.1239e+00, -3.4574e-02,
         5.0980e-01, -1.9409e-01, -9.8891e-01, -5.4825e-01, -1.1768e-01,
        -1.0577e+00, -1.7977e-01, -1.1161e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -2.5311e-01,  1.1239e+00, -3.4574e-02,
         5.0980e-01, -1.9409e-01, -9.8891e-01, -5.4825e-01, -1.1768e-01,
        -1.0577e+00, -1.7977e-01, -1.1161e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 10


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -2.5311e-01,  1.1239e+00, -3.4574e-02,
         5.0980e-01, -1.9409e-01, -9.8891e-01, -5.4825e-01, -1.1768e-01,
        -1.0577e+00, -1.7977e-01, -1.1161e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -2.5311e-01,  1.1239e+00, -3.4574e-02,
         5.0980e-01, -1.9409e-01, -9.8891e-01, -5.4825e-01, -1.1768e-01,
        -1.0577e+00, -1.7977e-01, -1.1161e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 11


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -2.5311e-01,  1.1239e+00, -3.4574e-02,
         5.0980e-01, -1.9409e-01, -9.8891e-01, -5.4825e-01, -1.1768e-01,
        -1.0577e+00, -1.7977e-01, -1.1161e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -2.5311e-01,  1.1239e+00, -3.4574e-02,
         5.0980e-01, -1.9409e-01, -9.8891e-01, -5.4825e-01, -1.1768e-01,
        -1.0577e+00, -1.7977e-01, -1.1161e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 12


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -2.5311e-01,  1.1239e+00, -3.4574e-02,
         5.0980e-01, -1.9409e-01, -9.8891e-01, -5.4825e-01, -1.1768e-01,
        -1.0577e+00, -1.7977e-01, -1.1161e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -2.5311e-01,  1.1239e+00, -3.4574e-02,
         5.0980e-01, -1.9409e-01, -9.8891e-01, -5.4825e-01, -1.1768e-01,
        -1.0577e+00, -1.7977e-01, -1.1161e-01, -6.4283e-02, -9.5306e-04,
        -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -4.2092e-01,  9.4017e-01, -1.8005e-01,
        -6.9396e-02, -9.7911e-01, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -4.2092e-01,  9.4017e-01, -1.8005e-01,
        -6.9396e-02, -9.7911e-01, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 2


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -4.2092e-01,  9.4017e-01, -1.8005e-01,
        -6.9396e-02, -9.7911e-01, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -4.2092e-01,  9.4017e-01, -1.8005e-01,
        -6.9396e-02, -9.7911e-01, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 3


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -4.2092e-01,  9.4017e-01, -1.8005e-01,
        -6.9396e-02, -9.7911e-01, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -4.2092e-01,  9.4017e-01, -1.8005e-01,
        -6.9396e-02, -9.7911e-01, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 4


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -4.2092e-01,  9.4017e-01, -1.8005e-01,
        -6.9396e-02, -9.7911e-01, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -4.2092e-01,  9.4017e-01, -1.8005e-01,
        -6.9396e-02, -9.7911e-01, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 5


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -4.2092e-01,  9.4017e-01, -1.8005e-01,
        -6.9396e-02, -9.7911e-01, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -4.2092e-01,  9.4017e-01, -1.8005e-01,
        -6.9396e-02, -9.7911e-01, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 6


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -4.2092e-01,  9.4017e-01, -1.8005e-01,
        -6.9396e-02, -9.7911e-01, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -4.2092e-01,  9.4017e-01, -1.8005e-01,
        -6.9396e-02, -9.7911e-01, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 7


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -4.2092e-01,  9.4017e-01, -1.8005e-01,
        -6.9396e-02, -9.7911e-01, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -4.2092e-01,  9.4017e-01, -1.8005e-01,
        -6.9396e-02, -9.7911e-01, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 8


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -4.2092e-01,  9.4017e-01, -1.8005e-01,
        -6.9396e-02, -9.7911e-01, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -4.2092e-01,  9.4017e-01, -1.8005e-01,
        -6.9396e-02, -9.7911e-01, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 9


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -4.2092e-01,  9.4017e-01, -1.8005e-01,
        -6.9396e-02, -9.7911e-01, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -4.2092e-01,  9.4017e-01, -1.8005e-01,
        -6.9396e-02, -9.7911e-01, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 10


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -4.2092e-01,  9.4017e-01, -1.8005e-01,
        -6.9396e-02, -9.7911e-01, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -4.2092e-01,  9.4017e-01, -1.8005e-01,
        -6.9396e-02, -9.7911e-01, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 11


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -4.2092e-01,  9.4017e-01, -1.8005e-01,
        -6.9396e-02, -9.7911e-01, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -4.2092e-01,  9.4017e-01, -1.8005e-01,
        -6.9396e-02, -9.7911e-01, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3

Layer 12


Tokens embeddings:
tensor([-6.4571e-02, -2.0659e-01, -4.2092e-01,  9.4017e-01, -1.8005e-01,
        -6.9396e-02, -9.7911e-01, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02, -2.0659e-01, -4.2092e-01,  9.4017e-01, -1.8005e-01,
        -6.9396e-02, -9.7911e-01, -2.1015e-02, -1.5160e-02, -6.4283e-02,
        -9.5306e-04, -6.5009e-02, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.0188,  0.7810, -0.1177, -0.4961, -0.3930, -0.0010,
         0.0887, -0.3930, -0.0765, -0.2035, -0.3930,  0.4737, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.0188,  0.7810, -0.1177, -0.4961, -0.3930, -0.0010,
         0.0887, -0.3930, -0.0765, -0.2035, -0.3930,  0.4737, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 2


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.0188,  0.7810, -0.1177, -0.4961, -0.3930, -0.0010,
         0.0887, -0.3930, -0.0765, -0.2035, -0.3930,  0.4737, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.0188,  0.7810, -0.1177, -0.4961, -0.3930, -0.0010,
         0.0887, -0.3930, -0.0765, -0.2035, -0.3930,  0.4737, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 3


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.0188,  0.7810, -0.1177, -0.4961, -0.3930, -0.0010,
         0.0887, -0.3930, -0.0765, -0.2035, -0.3930,  0.4737, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.0188,  0.7810, -0.1177, -0.4961, -0.3930, -0.0010,
         0.0887, -0.3930, -0.0765, -0.2035, -0.3930,  0.4737, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 4


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.0188,  0.7810, -0.1177, -0.4961, -0.3930, -0.0010,
         0.0887, -0.3930, -0.0765, -0.2035, -0.3930,  0.4737, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.0188,  0.7810, -0.1177, -0.4961, -0.3930, -0.0010,
         0.0887, -0.3930, -0.0765, -0.2035, -0.3930,  0.4737, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 5


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.0188,  0.7810, -0.1177, -0.4961, -0.3930, -0.0010,
         0.0887, -0.3930, -0.0765, -0.2035, -0.3930,  0.4737, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.0188,  0.7810, -0.1177, -0.4961, -0.3930, -0.0010,
         0.0887, -0.3930, -0.0765, -0.2035, -0.3930,  0.4737, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 6


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.0188,  0.7810, -0.1177, -0.4961, -0.3930, -0.0010,
         0.0887, -0.3930, -0.0765, -0.2035, -0.3930,  0.4737, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.0188,  0.7810, -0.1177, -0.4961, -0.3930, -0.0010,
         0.0887, -0.3930, -0.0765, -0.2035, -0.3930,  0.4737, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 7


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.0188,  0.7810, -0.1177, -0.4961, -0.3930, -0.0010,
         0.0887, -0.3930, -0.0765, -0.2035, -0.3930,  0.4737, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.0188,  0.7810, -0.1177, -0.4961, -0.3930, -0.0010,
         0.0887, -0.3930, -0.0765, -0.2035, -0.3930,  0.4737, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 8


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.0188,  0.7810, -0.1177, -0.4961, -0.3930, -0.0010,
         0.0887, -0.3930, -0.0765, -0.2035, -0.3930,  0.4737, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.0188,  0.7810, -0.1177, -0.4961, -0.3930, -0.0010,
         0.0887, -0.3930, -0.0765, -0.2035, -0.3930,  0.4737, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 9


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.0188,  0.7810, -0.1177, -0.4961, -0.3930, -0.0010,
         0.0887, -0.3930, -0.0765, -0.2035, -0.3930,  0.4737, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.0188,  0.7810, -0.1177, -0.4961, -0.3930, -0.0010,
         0.0887, -0.3930, -0.0765, -0.2035, -0.3930,  0.4737, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 10


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.0188,  0.7810, -0.1177, -0.4961, -0.3930, -0.0010,
         0.0887, -0.3930, -0.0765, -0.2035, -0.3930,  0.4737, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.0188,  0.7810, -0.1177, -0.4961, -0.3930, -0.0010,
         0.0887, -0.3930, -0.0765, -0.2035, -0.3930,  0.4737, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 11


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.0188,  0.7810, -0.1177, -0.4961, -0.3930, -0.0010,
         0.0887, -0.3930, -0.0765, -0.2035, -0.3930,  0.4737, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.0188,  0.7810, -0.1177, -0.4961, -0.3930, -0.0010,
         0.0887, -0.3930, -0.0765, -0.2035, -0.3930,  0.4737, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 12


Tokens embeddings:
tensor([-0.0646, -0.2066, -0.0188,  0.7810, -0.1177, -0.4961, -0.3930, -0.0010,
         0.0887, -0.3930, -0.0765, -0.2035, -0.3930,  0.4737, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646, -0.2066, -0.0188,  0.7810, -0.1177, -0.4961, -0.3930, -0.0010,
         0.0887, -0.3930, -0.0765, -0.2035, -0.3930,  0.4737, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646,  0.3848,  0.8318, -0.1302, -0.4804, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.3848,  0.8318, -0.1302, -0.4804, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 2


Tokens embeddings:
tensor([-0.0646,  0.3848,  0.8318, -0.1302, -0.4804, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.3848,  0.8318, -0.1302, -0.4804, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 3


Tokens embeddings:
tensor([-0.0646,  0.3848,  0.8318, -0.1302, -0.4804, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.3848,  0.8318, -0.1302, -0.4804, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 4


Tokens embeddings:
tensor([-0.0646,  0.3848,  0.8318, -0.1302, -0.4804, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.3848,  0.8318, -0.1302, -0.4804, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 5


Tokens embeddings:
tensor([-0.0646,  0.3848,  0.8318, -0.1302, -0.4804, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.3848,  0.8318, -0.1302, -0.4804, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 6


Tokens embeddings:
tensor([-0.0646,  0.3848,  0.8318, -0.1302, -0.4804, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.3848,  0.8318, -0.1302, -0.4804, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 7


Tokens embeddings:
tensor([-0.0646,  0.3848,  0.8318, -0.1302, -0.4804, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.3848,  0.8318, -0.1302, -0.4804, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 8


Tokens embeddings:
tensor([-0.0646,  0.3848,  0.8318, -0.1302, -0.4804, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.3848,  0.8318, -0.1302, -0.4804, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 9


Tokens embeddings:
tensor([-0.0646,  0.3848,  0.8318, -0.1302, -0.4804, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.3848,  0.8318, -0.1302, -0.4804, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 10


Tokens embeddings:
tensor([-0.0646,  0.3848,  0.8318, -0.1302, -0.4804, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.3848,  0.8318, -0.1302, -0.4804, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 11


Tokens embeddings:
tensor([-0.0646,  0.3848,  0.8318, -0.1302, -0.4804, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.3848,  0.8318, -0.1302, -0.4804, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 12


Tokens embeddings:
tensor([-0.0646,  0.3848,  0.8318, -0.1302, -0.4804, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.3848,  0.8318, -0.1302, -0.4804, -0.0643, -0.0010, -0.0650,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-6.4571e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01, -8.1832e-01,
        -1.5160e-02, -2.0351e-01, -6.3929e-01, -5.4795e-01, -3.2398e-01,
        -6.3929e-01, -2.5877e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01, -8.1832e-01,
        -1.5160e-02, -2.0351e-01, -6.3929e-01, -5.4795e-01, -3.2398e-01,
        -6.3929e-01, -2.5877e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 2


Tokens embeddings:
tensor([-6.4571e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01, -8.1832e-01,
        -1.5160e-02, -2.0351e-01, -6.3929e-01, -5.4795e-01, -3.2398e-01,
        -6.3929e-01, -2.5877e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01, -8.1832e-01,
        -1.5160e-02, -2.0351e-01, -6.3929e-01, -5.4795e-01, -3.2398e-01,
        -6.3929e-01, -2.5877e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 3


Tokens embeddings:
tensor([-6.4571e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01, -8.1832e-01,
        -1.5160e-02, -2.0351e-01, -6.3929e-01, -5.4795e-01, -3.2398e-01,
        -6.3929e-01, -2.5877e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01, -8.1832e-01,
        -1.5160e-02, -2.0351e-01, -6.3929e-01, -5.4795e-01, -3.2398e-01,
        -6.3929e-01, -2.5877e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 4


Tokens embeddings:
tensor([-6.4571e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01, -8.1832e-01,
        -1.5160e-02, -2.0351e-01, -6.3929e-01, -5.4795e-01, -3.2398e-01,
        -6.3929e-01, -2.5877e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01, -8.1832e-01,
        -1.5160e-02, -2.0351e-01, -6.3929e-01, -5.4795e-01, -3.2398e-01,
        -6.3929e-01, -2.5877e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 5


Tokens embeddings:
tensor([-6.4571e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01, -8.1832e-01,
        -1.5160e-02, -2.0351e-01, -6.3929e-01, -5.4795e-01, -3.2398e-01,
        -6.3929e-01, -2.5877e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01, -8.1832e-01,
        -1.5160e-02, -2.0351e-01, -6.3929e-01, -5.4795e-01, -3.2398e-01,
        -6.3929e-01, -2.5877e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 6


Tokens embeddings:
tensor([-6.4571e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01, -8.1832e-01,
        -1.5160e-02, -2.0351e-01, -6.3929e-01, -5.4795e-01, -3.2398e-01,
        -6.3929e-01, -2.5877e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01, -8.1832e-01,
        -1.5160e-02, -2.0351e-01, -6.3929e-01, -5.4795e-01, -3.2398e-01,
        -6.3929e-01, -2.5877e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 7


Tokens embeddings:
tensor([-6.4571e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01, -8.1832e-01,
        -1.5160e-02, -2.0351e-01, -6.3929e-01, -5.4795e-01, -3.2398e-01,
        -6.3929e-01, -2.5877e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01, -8.1832e-01,
        -1.5160e-02, -2.0351e-01, -6.3929e-01, -5.4795e-01, -3.2398e-01,
        -6.3929e-01, -2.5877e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 8


Tokens embeddings:
tensor([-6.4571e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01, -8.1832e-01,
        -1.5160e-02, -2.0351e-01, -6.3929e-01, -5.4795e-01, -3.2398e-01,
        -6.3929e-01, -2.5877e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01, -8.1832e-01,
        -1.5160e-02, -2.0351e-01, -6.3929e-01, -5.4795e-01, -3.2398e-01,
        -6.3929e-01, -2.5877e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 9


Tokens embeddings:
tensor([-6.4571e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01, -8.1832e-01,
        -1.5160e-02, -2.0351e-01, -6.3929e-01, -5.4795e-01, -3.2398e-01,
        -6.3929e-01, -2.5877e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01, -8.1832e-01,
        -1.5160e-02, -2.0351e-01, -6.3929e-01, -5.4795e-01, -3.2398e-01,
        -6.3929e-01, -2.5877e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 10


Tokens embeddings:
tensor([-6.4571e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01, -8.1832e-01,
        -1.5160e-02, -2.0351e-01, -6.3929e-01, -5.4795e-01, -3.2398e-01,
        -6.3929e-01, -2.5877e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01, -8.1832e-01,
        -1.5160e-02, -2.0351e-01, -6.3929e-01, -5.4795e-01, -3.2398e-01,
        -6.3929e-01, -2.5877e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 11


Tokens embeddings:
tensor([-6.4571e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01, -8.1832e-01,
        -1.5160e-02, -2.0351e-01, -6.3929e-01, -5.4795e-01, -3.2398e-01,
        -6.3929e-01, -2.5877e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01, -8.1832e-01,
        -1.5160e-02, -2.0351e-01, -6.3929e-01, -5.4795e-01, -3.2398e-01,
        -6.3929e-01, -2.5877e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2

Layer 12


Tokens embeddings:
tensor([-6.4571e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01, -8.1832e-01,
        -1.5160e-02, -2.0351e-01, -6.3929e-01, -5.4795e-01, -3.2398e-01,
        -6.3929e-01, -2.5877e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01])
arg max of tensor([-6.4571e-02,  4.7718e-01,  1.5636e+00, -3.9304e-01, -8.1832e-01,
        -1.5160e-02, -2.0351e-01, -6.3929e-01, -5.4795e-01, -3.2398e-01,
        -6.3929e-01, -2.5877e-01, -6.4283e-02, -9.5306e-04, -6.5009e-02,
        -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01, -1.4367e-01,
        -1.4367e-01]): 2
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646,  0.0054, -0.0430,  0.4419, -0.7478, -0.5108, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0054, -0.0430,  0.4419, -0.7478, -0.5108, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 2


Tokens embeddings:
tensor([-0.0646,  0.0054, -0.0430,  0.4419, -0.7478, -0.5108, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0054, -0.0430,  0.4419, -0.7478, -0.5108, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 3


Tokens embeddings:
tensor([-0.0646,  0.0054, -0.0430,  0.4419, -0.7478, -0.5108, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0054, -0.0430,  0.4419, -0.7478, -0.5108, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 4


Tokens embeddings:
tensor([-0.0646,  0.0054, -0.0430,  0.4419, -0.7478, -0.5108, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0054, -0.0430,  0.4419, -0.7478, -0.5108, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 5


Tokens embeddings:
tensor([-0.0646,  0.0054, -0.0430,  0.4419, -0.7478, -0.5108, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0054, -0.0430,  0.4419, -0.7478, -0.5108, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 6


Tokens embeddings:
tensor([-0.0646,  0.0054, -0.0430,  0.4419, -0.7478, -0.5108, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0054, -0.0430,  0.4419, -0.7478, -0.5108, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 7


Tokens embeddings:
tensor([-0.0646,  0.0054, -0.0430,  0.4419, -0.7478, -0.5108, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0054, -0.0430,  0.4419, -0.7478, -0.5108, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 8


Tokens embeddings:
tensor([-0.0646,  0.0054, -0.0430,  0.4419, -0.7478, -0.5108, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0054, -0.0430,  0.4419, -0.7478, -0.5108, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 9


Tokens embeddings:
tensor([-0.0646,  0.0054, -0.0430,  0.4419, -0.7478, -0.5108, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0054, -0.0430,  0.4419, -0.7478, -0.5108, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 10


Tokens embeddings:
tensor([-0.0646,  0.0054, -0.0430,  0.4419, -0.7478, -0.5108, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0054, -0.0430,  0.4419, -0.7478, -0.5108, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 11


Tokens embeddings:
tensor([-0.0646,  0.0054, -0.0430,  0.4419, -0.7478, -0.5108, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0054, -0.0430,  0.4419, -0.7478, -0.5108, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3

Layer 12


Tokens embeddings:
tensor([-0.0646,  0.0054, -0.0430,  0.4419, -0.7478, -0.5108, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0054, -0.0430,  0.4419, -0.7478, -0.5108, -0.0643, -0.0010,
        -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 3
****************************************************************************************************

Layer 1


Tokens embeddings:
tensor([-0.0646,  0.0054,  0.7995,  0.7017, -0.0152,  0.2344,  0.1727, -0.1302,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0054,  0.7995,  0.7017, -0.0152,  0.2344,  0.1727, -0.1302,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 2


Tokens embeddings:
tensor([-0.0646,  0.0054,  0.7995,  0.7017, -0.0152,  0.2344,  0.1727, -0.1302,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0054,  0.7995,  0.7017, -0.0152,  0.2344,  0.1727, -0.1302,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 3


Tokens embeddings:
tensor([-0.0646,  0.0054,  0.7995,  0.7017, -0.0152,  0.2344,  0.1727, -0.1302,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0054,  0.7995,  0.7017, -0.0152,  0.2344,  0.1727, -0.1302,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 4


Tokens embeddings:
tensor([-0.0646,  0.0054,  0.7995,  0.7017, -0.0152,  0.2344,  0.1727, -0.1302,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0054,  0.7995,  0.7017, -0.0152,  0.2344,  0.1727, -0.1302,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 5


Tokens embeddings:
tensor([-0.0646,  0.0054,  0.7995,  0.7017, -0.0152,  0.2344,  0.1727, -0.1302,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0054,  0.7995,  0.7017, -0.0152,  0.2344,  0.1727, -0.1302,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 6


Tokens embeddings:
tensor([-0.0646,  0.0054,  0.7995,  0.7017, -0.0152,  0.2344,  0.1727, -0.1302,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0054,  0.7995,  0.7017, -0.0152,  0.2344,  0.1727, -0.1302,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 7


Tokens embeddings:
tensor([-0.0646,  0.0054,  0.7995,  0.7017, -0.0152,  0.2344,  0.1727, -0.1302,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0054,  0.7995,  0.7017, -0.0152,  0.2344,  0.1727, -0.1302,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 8


Tokens embeddings:
tensor([-0.0646,  0.0054,  0.7995,  0.7017, -0.0152,  0.2344,  0.1727, -0.1302,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0054,  0.7995,  0.7017, -0.0152,  0.2344,  0.1727, -0.1302,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 9


Tokens embeddings:
tensor([-0.0646,  0.0054,  0.7995,  0.7017, -0.0152,  0.2344,  0.1727, -0.1302,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0054,  0.7995,  0.7017, -0.0152,  0.2344,  0.1727, -0.1302,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 10


Tokens embeddings:
tensor([-0.0646,  0.0054,  0.7995,  0.7017, -0.0152,  0.2344,  0.1727, -0.1302,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0054,  0.7995,  0.7017, -0.0152,  0.2344,  0.1727, -0.1302,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 11


Tokens embeddings:
tensor([-0.0646,  0.0054,  0.7995,  0.7017, -0.0152,  0.2344,  0.1727, -0.1302,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0054,  0.7995,  0.7017, -0.0152,  0.2344,  0.1727, -0.1302,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2

Layer 12


Tokens embeddings:
tensor([-0.0646,  0.0054,  0.7995,  0.7017, -0.0152,  0.2344,  0.1727, -0.1302,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437])
arg max of tensor([-0.0646,  0.0054,  0.7995,  0.7017, -0.0152,  0.2344,  0.1727, -0.1302,
        -0.0643, -0.0010, -0.0650, -0.1437, -0.1437, -0.1437, -0.1437, -0.1437,
        -0.1437, -0.1437, -0.1437, -0.1437, -0.1437]): 2


#### Visualize attentions for specific types of grammatical errors

In [None]:
# Example sentences whose [CLS] attention maps are inspected in the cells below.
# Both are pre-tokenized in the sense that the final period is split off by a space.
# NOTE(review): the error-type labels below are the author's; sentence1 reads as
# grammatical on its face -- confirm each label matches the intended example.
# Lack of Subject-Verb Agreement
sentence1 = "I wrestled with this decision for years ."
# Pronoun Disagreement
sentence2 = "lined books are more enduring ."

In [None]:
# Encode the first sentence. `batch_encode_plus` works on a list of texts, so the
# single sentence is wrapped in a one-element batch.
encoded_sentence1 = bert_tokenizer.batch_encode_plus([sentence1], padding=True)

# Build the token-id and attention-mask tensors and move them to `device`
inputs = torch.tensor(encoded_sentence1["input_ids"]).to(device)
att = torch.tensor(encoded_sentence1["attention_mask"]).to(device)

# Inference only: run the forward pass without autograd so that no computation
# graph is kept alive for the logits and attentions read by the following cells.
# NOTE(review): assumes `model_e` is already in eval mode -- confirm upstream.
with torch.no_grad():
  outputs = model_e(inputs, attention_mask=att)

In [None]:
# Pull the classifier logits back to the host as a NumPy array
sentence1_logits = outputs.logits.detach().cpu().numpy()
# Normalise each row into class probabilities, then keep the most likely class
output_probs = softmax(sentence1_logits, axis=1)
predictions = output_probs.argmax(axis=1)
# Report the predicted label next to the sentence it belongs to
print(sentence1, ":", predictions[0])

I wrestled with this decision for years . : 1


In [None]:
# Visualize the attention heatmaps for the CLS token.
# For every layer and every head, the first attention row (query position 0) is
# rendered over the sentence tokens with `colorize`.
tokens = bert_tokenizer.convert_ids_to_tokens(inputs.detach().cpu().numpy()[0])
# Iterate over whatever layers the model returned instead of assuming 12
for l, layer_attention in enumerate(outputs.attentions):
  print("\nLayer", l+1)
  # Drop the batch axis; squeeze(axis=0) requires a batch of exactly one sentence
  attention = np.squeeze(layer_attention.detach().cpu().numpy(), axis=0)
  for h, head in enumerate(attention):
    print("Head", h+1)
    # Get the attention for the cls token (row 0 of this head's matrix)
    cls_attentions = head[0]
    display(HTML(colorize(tokens, cls_attentions)))


Layer 1
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 2
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 3
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 4
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 5
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 6
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 7
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 8
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 9
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 10
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 11
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 12
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12


In [None]:
# Encode the second sentence. `batch_encode_plus` works on a list of texts, so the
# single sentence is wrapped in a one-element batch.
encoded_sentence2 = bert_tokenizer.batch_encode_plus([sentence2], padding=True)

# Build the token-id and attention-mask tensors and move them to `device`
inputs = torch.tensor(encoded_sentence2["input_ids"]).to(device)
att = torch.tensor(encoded_sentence2["attention_mask"]).to(device)

# Inference only: run the forward pass without autograd so that no computation
# graph is kept alive for the logits and attentions read below and in the next cell.
# NOTE(review): assumes `model_e` is already in eval mode -- confirm upstream.
with torch.no_grad():
  outputs = model_e(inputs, attention_mask=att)

# Get the predictions: softmax over the class axis, then the most likely class
output_probs = softmax(outputs.logits.detach().cpu().numpy(), axis=1)
predictions = (np.argmax(output_probs, axis=1))
print(sentence2, ":", predictions[0])

lined books are more enduring . : 1


In [None]:
# Visualize the attention heatmaps for the CLS token.
# For every layer and every head, the first attention row (query position 0) is
# rendered over the sentence tokens with `colorize`.
tokens = bert_tokenizer.convert_ids_to_tokens(inputs.detach().cpu().numpy()[0])
# Iterate over whatever layers the model returned instead of assuming 12
for l, layer_attention in enumerate(outputs.attentions):
  print("\nLayer", l+1)
  # Drop the batch axis; squeeze(axis=0) requires a batch of exactly one sentence
  attention = np.squeeze(layer_attention.detach().cpu().numpy(), axis=0)
  for h, head in enumerate(attention):
    print("Head", h+1)
    # Get the attention for the cls token (row 0 of this head's matrix)
    cls_attentions = head[0]
    display(HTML(colorize(tokens, cls_attentions)))


Layer 1
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 2
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 3
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 4
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 5
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 6
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 7
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 8
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 9
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 10
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 11
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12



Layer 12
Head 1


Head 2


Head 3


Head 4


Head 5


Head 6


Head 7


Head 8


Head 9


Head 10


Head 11


Head 12
