In [1]:
# Preprocessing with a tokenizer
# Load the tokenizer that belongs to the checkpoint used in this section, so
# text is converted to IDs with that checkpoint's own vocabulary.
from transformers import AutoTokenizer

# Checkpoint identifier; the same name is used later to load the model.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# On first use the tokenizer files are downloaded (see the progress bars below).
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [2]:
# Two customer reviews of different lengths, used as the example batch.
raw_inputs = [
    "I fell for the hype. When shopping for a new TV I made the mistake of getting swayed by the price and the salesperson influence. I wound up buying a 65 inches OLED LG TV. And once installed my troubles started. Right at the outset I noticed the slowness of downloading, opening apps and streaming. Of course I contacted support and after hours and days of arguing they agreed to send a repair crew. It was determined that the motherboard was faulty and replaced it.",
    "My 32 GB Picture Keeper suddenly stopped working. Fearing the worst case scenario. I contacted P.K. Tech Team, Aaron P sent a Link to recover my pictures. It worked, over 15,000 images recovered. Many Thanks Aaron P. I thoroughly recommend P.K.",
]

# padding=True pads the shorter review so both rows have the same length,
# truncation=True cuts inputs that exceed the model's maximum length, and
# return_tensors="pt" returns PyTorch tensors instead of Python lists.
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print(inputs)

{'input_ids': tensor([[  101,  1045,  3062,  2005,  1996,  1044, 18863,  1012,  2043,  6023,
          2005,  1037,  2047,  2694,  1045,  2081,  1996,  6707,  1997,  2893,
         20122,  2011,  1996,  3976,  1998,  1996,  4341, 27576,  3747,  1012,
          1045,  6357,  2039,  9343,  1037,  3515,  5282, 15589,  2094,  1048,
          2290,  2694,  1012,  1998,  2320,  5361,  2026, 13460,  2318,  1012,
          2157,  2012,  1996, 26674,  1045,  4384,  1996,  4030,  2791,  1997,
          8816,  2075,  1010,  3098, 18726,  1998, 11058,  1012,  1997,  2607,
          1045, 11925,  2490,  1998,  2044,  2847,  1998,  2420,  1997,  9177,
          2027,  3530,  2000,  4604,  1037,  7192,  3626,  1012,  2009,  2001,
          4340,  2008,  1996,  2388,  6277,  2001, 28927,  1998,  2999,  2009,
          1012,   102],
        [  101,  2026,  3590, 16351,  3861, 10684,  3402,  3030,  2551,  1012,
         14892,  1996,  5409,  2553, 11967,  1012,  1045, 11925,  1052,  1012,
          1047

In [3]:
# Going through the model
from transformers import AutoModel

# AutoModel loads the checkpoint as a base model whose output exposes the
# hidden states (no classification scores yet).
model = AutoModel.from_pretrained(checkpoint)
# `inputs` is the tokenizer output from the previous cell; ** passes its
# entries to the model as keyword arguments.
outputs = model(**inputs)
# Printed shape is (batch size, sequence length, hidden size).
print(outputs.last_hidden_state.shape)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

torch.Size([2, 102, 768])


In [4]:
# There are many different architectures available in 🤗 Transformers, each one designed around tackling a specific task.
# Here is a non-exhaustive list (the leading * stands for the architecture name, e.g. BertModel or BertForMaskedLM):

# *Model (retrieve the hidden states)
# *ForCausalLM
# *ForMaskedLM
# *ForMultipleChoice
# *ForQuestionAnswering
# *ForSequenceClassification
# *ForTokenClassification
# and others 🤗


In [5]:
# For our example, we need a model with a sequence classification head (to be able to classify the sentences as positive or negative).
# So we won’t actually use the AutoModel class, but AutoModelForSequenceClassification:

from transformers import AutoModelForSequenceClassification

# Same checkpoint as before; `model` and `outputs` are rebound here and replace
# the base-model objects created in the previous code cell.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)
# One row per input sentence, one column per label.
print(outputs.logits.shape)

torch.Size([2, 2])


In [6]:
# Postprocessing the output
# The logits are raw scores, not probabilities (they can be negative and the
# rows do not sum to 1).

print(outputs.logits)

tensor([[ 4.3398, -3.5608],
        [-2.4272,  2.5886]], grad_fn=<AddmmBackward0>)


In [7]:
# Logits become probabilities once they pass through a softmax. Normalizing
# over dim 1 (the label axis) makes every row sum to 1.
import torch

predictions = outputs.logits.softmax(dim=1)
print(predictions)

tensor([[9.9963e-01, 3.7040e-04],
        [6.5886e-03, 9.9341e-01]], grad_fn=<SoftmaxBackward0>)


In [8]:
# To get the labels corresponding to each position, we can inspect the id2label attribute of the model config.
# Its integer keys are the column indices of the logits / probabilities above.
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

In [9]:
# Now we can conclude that the model predicted the following:
# First sentence: NEGATIVE: 0.99963, POSITIVE: 0.00037
# Second sentence: NEGATIVE: 0.00659, POSITIVE: 0.99341

In [10]:
# Creating a Transformer
# Creating a model from the default configuration initializes it with random values:
from transformers import BertConfig, BertModel
# Building the config (no arguments, so every attribute keeps its default value)
config = BertConfig()

# Building the model from the config
# Model is randomly initialized!
# NOTE: this rebinds `model`, replacing the classification model used above.
model = BertModel(config)

# The configuration contains many attributes that are used to build the model:
print(config)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [11]:
# Loading a Transformer model that is already trained is simple — we can do this using the from_pretrained() method:
from transformers import BertModel
# Replaces the randomly initialized model from the previous cell with the
# weights of the "bert-base-cased" checkpoint (downloaded on first use).
model = BertModel.from_pretrained("bert-base-cased")

# Printing the model shows its module tree (embeddings, encoder layers, ...).
print(model)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [12]:
# Show the configuration that came with the pretrained checkpoint; compare it
# with the default BertConfig printed earlier (e.g. vocab_size differs).
print(model.config)

BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}



In [13]:
# Using a Transformer model for inference
from transformers import AutoTokenizer
# Tokenizer for the same checkpoint as the BertModel loaded above.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
# These three sentences each tokenize to 4 IDs. No padding is requested below,
# so sentences that tokenize to different lengths would give rows of different
# lengths (a ragged list), which cannot be converted directly with torch.tensor.
sequences = ["Hello!", "Cool.", "Nice!"]

# Returns plain Python lists: input_ids, token_type_ids and attention_mask.
encoded_sequences = tokenizer(sequences)
print(encoded_sequences)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

{'input_ids': [[101, 8667, 106, 102], [101, 13297, 119, 102], [101, 8835, 106, 102]], 'token_type_ids': [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]}


In [14]:
import torch

# Run the tokenized batch from the previous cell through the BERT model.
print(encoded_sequences['input_ids'])

# torch.tensor() on the raw ID lists only works while every sentence happens to
# tokenize to the same length, and it drops the attention mask. tokenizer.pad()
# pads the batch to a common length and returns PyTorch tensors for every field,
# so the cell keeps working when the example sentences are changed.
padded_batch = tokenizer.pad(encoded_sequences, padding=True, return_tensors="pt")
# Kept under its original name: the (padded) input-ID tensor.
model_inputs = padded_batch["input_ids"]

print("--------output-------")
# Pass input IDs together with the attention mask (and token type IDs) so that
# padding positions, if any, are ignored by the model.
output = model(**padded_batch)
print(output)

[[101, 8667, 106, 102], [101, 13297, 119, 102], [101, 8835, 106, 102]]
--------output-------
BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.6283,  0.2166,  0.5605,  ...,  0.0136,  0.6158, -0.1712],
         [ 0.6108, -0.2253,  0.9263,  ..., -0.3028,  0.4500, -0.0714],
         [ 0.8040,  0.1809,  0.7076,  ..., -0.0685,  0.4837, -0.0774],
         [ 1.3290,  0.2360,  0.4567,  ...,  0.1509,  0.9621, -0.4841]],

        [[ 0.3128,  0.1718,  0.2099,  ..., -0.0721,  0.4919, -0.1383],
         [ 0.1545, -0.3757,  0.7187,  ..., -0.3130,  0.2822,  0.1883],
         [ 0.4123,  0.3721,  0.5484,  ...,  0.0788,  0.5681, -0.2757],
         [ 0.8356,  0.3964, -0.4121,  ...,  0.1838,  1.6365, -0.4806]],

        [[ 0.5399,  0.2564,  0.2511,  ..., -0.1760,  0.6063, -0.1803],
         [ 0.2609, -0.3164,  0.5548,  ..., -0.3439,  0.3909,  0.0900],
         [ 0.5161,  0.0721,  0.5606,  ...,  0.0077,  0.3685, -0.2272],
         [ 0.6560,  0.8475, -0.1606,  ..., -0.0468,  1.6309

In [15]:
# Tokenizers

# Loading the BERT tokenizer trained with the same checkpoint as BERT is done the same way as loading the model, except we use the BertTokenizer class:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# Similar to AutoModel, the AutoTokenizer class will grab the proper tokenizer class in the library based on the checkpoint name, and can be used directly with any checkpoint:
from transformers import AutoTokenizer
# NOTE: this assignment replaces the BertTokenizer instance created above;
# only the AutoTokenizer-loaded tokenizer is used from here on.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Calling the tokenizer on a string returns input_ids, token_type_ids and
# attention_mask. The input_ids start with 101 and end with 102, extra IDs that
# the manual tokenize / convert_tokens_to_ids steps below do not produce.
tokenizer("Loading the BERT tokenizer trained with the same checkpoint as BERT is done the same way as loading the model")

{'input_ids': [101, 10605, 17465, 1103, 139, 9637, 1942, 22559, 17260, 3972, 1114, 1103, 1269, 4031, 7587, 1112, 139, 9637, 1942, 1110, 1694, 1103, 1269, 1236, 1112, 10745, 1103, 2235, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [16]:
# Encoding

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

sequence = ["Loading the BERT tokenizer trained with the same checkpoint as BERT is done the same way as loading the model.", 
            "AutoTokenizer class will grab the proper tokenizer class in the library based on the checkpoint name."]
tokens = tokenizer.tokenize(sequence)
print(tokens)


['Lo', '##ading', 'the', 'B', '##ER', '##T', 'token', '##izer', 'trained', 'with', 'the', 'same', 'check', '##point', 'as', 'B', '##ER', '##T', 'is', 'done', 'the', 'same', 'way', 'as', 'loading', 'the', 'model', '.', 'Auto', '##T', '##oken', '##izer', 'class', 'will', 'grab', 'the', 'proper', 'token', '##izer', 'class', 'in', 'the', 'library', 'based', 'on', 'the', 'check', '##point', 'name', '.']


In [17]:
# From tokens to input IDs
# Maps every token string from the previous cell to an integer ID. Only the
# given tokens are converted; unlike calling the tokenizer directly, no extra
# 101 / 102 IDs appear in the result.
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[10605, 17465, 1103, 139, 9637, 1942, 22559, 17260, 3972, 1114, 1103, 1269, 4031, 7587, 1112, 139, 9637, 1942, 1110, 1694, 1103, 1269, 1236, 1112, 10745, 1103, 2235, 119, 12983, 1942, 27443, 17260, 1705, 1209, 6387, 1103, 4778, 22559, 17260, 1705, 1107, 1103, 3340, 1359, 1113, 1103, 4031, 7587, 1271, 119]


In [18]:
# Decoding
# decode() turns the IDs back into a single string; the '##' subword pieces
# seen in the token list are merged back into whole words.

string = tokenizer.decode(ids)
print(string)

2024-07-12 13:27:06.846143: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-12 13:27:06.846294: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-12 13:27:07.011875: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading the BERT tokenizer trained with the same checkpoint as BERT is done the same way as loading the model. AutoTokenizer class will grab the proper tokenizer class in the library based on the checkpoint name.


In [20]:
# Models expect a batch of inputs
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Switch back to the sentiment checkpoint; `tokenizer` and `model` are rebound.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "Loading the BERT tokenizer trained with the same checkpoint as BERT is done the same way as loading the model."

# Built by hand, so these IDs do not include the 101 / 102 IDs that calling the
# tokenizer directly adds at the start and end.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor(ids) # a single sequence: 1-D tensor without a batch dimension.
# Passing this 1-D tensor to the model fails, because the model expects a batch
# (a list of sequences), not a single sequence:
# model(input_ids)
# Wrapping `ids` in [] turns it into a list of sequences, i.e. a batch of size 1.
input_ids = torch.tensor([ids]) # shape (1, sequence length)
print("Input IDs = ", input_ids)

output = model(input_ids)
print("Logits = ", output.logits)

Input IDs =  tensor([[10578,  1996, 14324, 19204, 17629,  4738,  2007,  1996,  2168, 26520,
          2004, 14324,  2003,  2589,  1996,  2168,  2126,  2004, 10578,  1996,
          2944,  1012]])
Logits =  tensor([[ 3.8296, -3.1792]], grad_fn=<AddmmBackward0>)


In [23]:
# Batching means sending several sentences through the model in one call.
# Here the batch is simply the same ID list twice, so both rows of the
# resulting logits are expected to match the single-sequence result above.
batched_ids = [ids] * 2
batched_input_ids = torch.tensor(batched_ids)
print("Bached Input IDs = ", batched_input_ids)

batched_output = model(batched_input_ids)
print("Bached Logits = ", batched_output.logits)

Bached Input IDs =  tensor([[10578,  1996, 14324, 19204, 17629,  4738,  2007,  1996,  2168, 26520,
          2004, 14324,  2003,  2589,  1996,  2168,  2126,  2004, 10578,  1996,
          2944,  1012],
        [10578,  1996, 14324, 19204, 17629,  4738,  2007,  1996,  2168, 26520,
          2004, 14324,  2003,  2589,  1996,  2168,  2126,  2004, 10578,  1996,
          2944,  1012]])
Bached Logits =  tensor([[ 3.8296, -3.1792],
        [ 3.8296, -3.1792]], grad_fn=<AddmmBackward0>)


In [25]:
# Attention masks
# Attention masks are tensors with the exact same shape as the input IDs tensor, 
# filled with 0s and 1s: 1s indicate the corresponding tokens should be attended to, 
# and 0s indicate the corresponding tokens should not be attended to (i.e., they should be ignored by the attention layers of the model).
# Toy batch: 200 appears to be an arbitrary placeholder token ID; the second row
# is one token shorter and is filled up with the tokenizer's padding ID.
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

# Same shape as batched_ids; the single 0 marks the padding position.
attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]

# `model` is the sequence-classification model loaded in an earlier cell.
outputs = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
print(outputs.logits)

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


In [29]:
# Putting it all together
# In the last few sections, we’ve been trying our best to do most of the work by hand. 
# We’ve explored how tokenizers work and looked at tokenization, conversion to input IDs, padding, truncation, and attention masks.
# However, as we saw in section 2, the 🤗 Transformers API can handle all of this for us with a high-level function that we’ll dive into here. 
# When you call your tokenizer directly on the sentence, you get back inputs that are ready to pass through your model:

from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sequences = ["Loading the BERT tokenizer trained with the same checkpoint as BERT is done the same way as loading the model.", 
            "AutoTokenizer class will grab the proper tokenizer class in the library based on the checkpoint name."]
# Plain call with no padding or truncation options. The result is not displayed
# and `model_inputs` is reassigned immediately below.
model_inputs = tokenizer(sequences)

# Will pad the sequences up to the length of the longest sequence in the batch
model_inputs = tokenizer(sequences, padding="longest")
print("Longest = ", model_inputs)
print("---------------------------------")

# Will pad the sequences up to the model max length
# (512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, padding="max_length")
print("max_length = ", model_inputs)
print("---------------------------------")

# Will pad the sequences up to the specified max length
# NOTE(review): both sequences are already longer than 8 IDs (24 and 23 in the
# "Longest" output) and no truncation is requested here, so presumably nothing
# is padded in this call -- confirm, or use a larger max_length to see padding.
model_inputs = tokenizer(sequences, padding="max_length", max_length=8)
print("specified max length = ", model_inputs)

Longest =  {'input_ids': [[101, 10578, 1996, 14324, 19204, 17629, 4738, 2007, 1996, 2168, 26520, 2004, 14324, 2003, 2589, 1996, 2168, 2126, 2004, 10578, 1996, 2944, 1012, 102], [101, 8285, 18715, 18595, 6290, 2465, 2097, 6723, 1996, 5372, 19204, 17629, 2465, 1999, 1996, 3075, 2241, 2006, 1996, 26520, 2171, 1012, 102, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]}
---------------------------------
max_length =  {'input_ids': [[101, 10578, 1996, 14324, 19204, 17629, 4738, 2007, 1996, 2168, 26520, 2004, 14324, 2003, 2589, 1996, 2168, 2126, 2004, 10578, 1996, 2944, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [30]:
# Will truncate the sequences that are longer than the model max length
# (512 for BERT or DistilBERT)
# Neither example sentence is that long and no padding is requested, so the
# rows keep their own lengths (24 and 23 IDs).
model_inputs = tokenizer(sequences, truncation=True)
print("truncate = ", model_inputs)
print("---------------------------------")

# Will truncate the sequences that are longer than the specified max length
# Each row is cut to 8 IDs; the first and last IDs (101 and 102) are kept.
model_inputs = tokenizer(sequences, max_length=8, truncation=True)
print("truncate specified max length = ", model_inputs)

truncate =  {'input_ids': [[101, 10578, 1996, 14324, 19204, 17629, 4738, 2007, 1996, 2168, 26520, 2004, 14324, 2003, 2589, 1996, 2168, 2126, 2004, 10578, 1996, 2944, 1012, 102], [101, 8285, 18715, 18595, 6290, 2465, 2097, 6723, 1996, 5372, 19204, 17629, 2465, 1999, 1996, 3075, 2241, 2006, 1996, 26520, 2171, 1012, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
---------------------------------
truncate specified max length =  {'input_ids': [[101, 10578, 1996, 14324, 19204, 17629, 4738, 102], [101, 8285, 18715, 18595, 6290, 2465, 2097, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}


In [33]:
# The return_tensors argument selects the container type of the result;
# padding=True is needed so both rows have the same length.

# Returns PyTorch tensors
model_inputs = tokenizer(sequences, padding=True, return_tensors="pt")
print("PyTorch = ", model_inputs)
print("---------------------------------")

# Returns TensorFlow tensors
# NOTE: presumably requires TensorFlow to be installed in the kernel environment.
model_inputs = tokenizer(sequences, padding=True, return_tensors="tf")
print("TensorFlow = ", model_inputs)
print("---------------------------------")

# Returns NumPy arrays
model_inputs = tokenizer(sequences, padding=True, return_tensors="np")
print("NumPy = ", model_inputs)
print("---------------------------------")

PyTorch =  {'input_ids': tensor([[  101, 10578,  1996, 14324, 19204, 17629,  4738,  2007,  1996,  2168,
         26520,  2004, 14324,  2003,  2589,  1996,  2168,  2126,  2004, 10578,
          1996,  2944,  1012,   102],
        [  101,  8285, 18715, 18595,  6290,  2465,  2097,  6723,  1996,  5372,
         19204, 17629,  2465,  1999,  1996,  3075,  2241,  2006,  1996, 26520,
          2171,  1012,   102,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}
---------------------------------
TensorFlow =  {'input_ids': <tf.Tensor: shape=(2, 24), dtype=int32, numpy=
array([[  101, 10578,  1996, 14324, 19204, 17629,  4738,  2007,  1996,
         2168, 26520,  2004, 14324,  2003,  2589,  1996,  2168,  2126,
         2004, 10578,  1996,  2944,  1012,   102],
       [  101,  8285, 18715, 18595,  6290,  2465,  2097,  6723,  1996,
         5372, 19204, 17

In [34]:
# Wrapping up: From tokenizer to model
# Now that we’ve seen all the individual steps the tokenizer object uses when applied on texts,
# let’s see one final time how it can handle multiple sequences (padding!), very long sequences (truncation!),
# and multiple types of tensors with its main API:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = ["Loading the BERT tokenizer trained with the same checkpoint as BERT is done the same way as loading the model.",
            "AutoTokenizer class will grab the proper tokenizer class in the library based on the checkpoint name."]

# Despite its name, `tokens` holds the tokenizer's full output (padded input IDs
# plus attention mask, as PyTorch tensors), not a list of token strings.
tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# Inference only: disable gradient tracking so no autograd graph is built for
# the forward pass (the printed logits then carry no grad_fn).
with torch.no_grad():
    output = model(**tokens)
print(output)

SequenceClassifierOutput(loss=None, logits=tensor([[ 3.3540, -2.8169],
        [ 3.0878, -2.5283]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
