In [1]:
import torch
from transformers import AutoModel, AutoTokenizer, AutoConfig
from torchvision import transforms

from src.models.utils.data import FoodPricingDataset
from src.models.nlp.pretrained_bert import PreTrainedBERT

Using the PreTrainedBERT wrapper class

In [2]:
# Name of the checkpoint loaded both by the wrapper and by the manual
# AutoConfig / AutoTokenizer / AutoModel calls later in the notebook.
pretrained_model_name_or_path = "dbmdz/bert-base-italian-xxl-uncased"

# Keyword arguments handed to the PreTrainedBERT wrapper.
model_kwargs = dict(pretrained_model_name_or_path=pretrained_model_name_or_path)

In [3]:
# No explicit output dimension is requested.
# NOTE(review): presumably the wrapper then keeps BERT's hidden size (the
# shape check further down suggests so) — confirm against PreTrainedBERT.
feature_dim = None
# Build the wrapper from the checkpoint kwargs defined above.
model = PreTrainedBERT(model_kwargs, feature_dim=feature_dim)

Some weights of the model checkpoint at dbmdz/bert-base-italian-xxl-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
INFO:root:Loading the tokenizer with the same name of the model for the PretrainedBERT.


In [4]:
# Fixed seed for the shuffling generator so the batch drawn below is the same
# on every Restart & Run All; without it each run embeds different texts and
# the printed tokens / similarities are not reproducible.
SHUFFLE_SEED = 42

# Image pipeline: resize to 224x224 and convert to a tensor.
# No Normalize step is applied; only the "txt" field of the batch is used below.
img_transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
    ]
)

# The text transform is the identity, so raw strings reach the model/tokenizer.
training_data = FoodPricingDataset(
    img_transform=img_transform, txt_transform=lambda x: x, split="train"
)
dataloader = torch.utils.data.DataLoader(
    training_data,
    shuffle=True,
    batch_size=4,
    num_workers=8,
    # Seeded generator drives the random sampler, making the shuffle order
    # deterministic across runs.
    generator=torch.Generator().manual_seed(SHUFFLE_SEED),
)
# Keep only the text field of the first batch.
sample = next(iter(dataloader))["txt"]

In [5]:
# Embed the sampled texts with the wrapper; `sample` is the "txt" field of one batch.
sent_emb = model(sample)

In [6]:
# One embedding row per input text.
assert sent_emb.shape[0] == len(sample)

# Parenthesization matters here: `a == b or c` parses as `(a == b) or c`,
# which is always truthy for a non-zero hidden size, so such a check can
# never fail. Resolve the expected width first, then compare against it.
expected_dim = feature_dim or model.bert.config.hidden_size
assert sent_emb.shape[1] == expected_dim, (
    f"expected embedding width {expected_dim}, got {sent_emb.shape[1]}"
)

Manual approach: loading the tokenizer and model directly with AutoTokenizer / AutoModel

In [7]:
# Load the configuration, tokenizer and model straight from the same
# checkpoint, without going through the PreTrainedBERT wrapper.
config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
# NOTE: this rebinds `model`, replacing the PreTrainedBERT instance created
# earlier; every cell from here on uses the plain Hugging Face model.
model = AutoModel.from_pretrained(pretrained_model_name_or_path)

Some weights of the model checkpoint at dbmdz/bert-base-italian-xxl-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Tokenize the whole batch at once, padding to the longest text. Texts longer
# than the model's position-embedding table are truncated; without this an
# over-long description would fail inside the forward pass.
encoded_batch = tokenizer(
    sample,
    padding=True,
    truncation=True,
    max_length=config.max_position_embeddings,
    return_tensors="pt",
)

In [9]:
# Print every token of the encoded batch, one per line, skipping the
# [CLS] and [PAD] markers. [SEP] is still printed, so it visually separates
# consecutive texts in the output.
token_ids: torch.Tensor = encoded_batch["input_ids"]
hidden_markers = ("[CLS]", "[PAD]")
for token_id in token_ids.flatten().tolist():
    token = tokenizer.convert_ids_to_tokens(token_id)
    if token in hidden_markers:
        continue
    print(token)

che
##ese
##burg
##er
emp
##ty
_
descri
##ption
[SEP]
carre
##fo
##ur
caffe
'
espresso
cre
##mos
##o
250
g
-
89
##93
caffe
'
macina
##to
espresso
cr
##f
[SEP]
val
##cale
##pio
rosso
magri
sereno
doc
75
##cl
emp
##ty
_
descri
##ption
[SEP]
pane
##angeli
lievito
vani
##gli
##nato
x
##3
48
g
-
00
##97
136
##10
lievito
x
3
[SEP]


In [10]:
# Inference only: run the forward pass without autograd bookkeeping so no
# graph (and no extra activation memory) is kept for the returned tensors.
with torch.no_grad():
    model_outputs = model(
        input_ids=encoded_batch["input_ids"],
        attention_mask=encoded_batch["attention_mask"],
    )
# The first element of the model output holds the per-token hidden states.
word_embeddings = model_outputs[0]

In [11]:
# Use the vector at sequence position 0 of each text as its sentence
# embedding (for BERT-style tokenizers this is the [CLS] position).
sentence_embeddings = word_embeddings[:, 0]

In [12]:
# Cosine similarity for every unordered pair of texts in the batch: each text
# is compared with all texts that come before it.
for later in range(len(sample)):
    for earlier in range(later):
        sim = torch.cosine_similarity(
            sentence_embeddings[later],
            sentence_embeddings[earlier],
            dim=0,
        ).item()
        print(f"Similarity: {sim} between sentences: \n{sample[later]}\n{sample[earlier]}\n")

Similarity: 0.4502798318862915 between sentences: 
Carrefour Caffe' Espresso Cremoso 250 g - 8993 CAFFE' MACINATO ESPRESSO CRF
Cheeseburger EMPTY_DESCRIPTION

Similarity: 0.2242051661014557 between sentences: 
Valcalepio Rosso Magri Sereno Doc 75cl EMPTY_DESCRIPTION
Cheeseburger EMPTY_DESCRIPTION

Similarity: 0.8813031911849976 between sentences: 
Valcalepio Rosso Magri Sereno Doc 75cl EMPTY_DESCRIPTION
Carrefour Caffe' Espresso Cremoso 250 g - 8993 CAFFE' MACINATO ESPRESSO CRF

Similarity: 0.3547227084636688 between sentences: 
Paneangeli Lievito vaniglinato X3 48 g - 0097 13610 LIEVITO X 3
Cheeseburger EMPTY_DESCRIPTION

Similarity: 0.9442523717880249 between sentences: 
Paneangeli Lievito vaniglinato X3 48 g - 0097 13610 LIEVITO X 3
Carrefour Caffe' Espresso Cremoso 250 g - 8993 CAFFE' MACINATO ESPRESSO CRF

Similarity: 0.920947790145874 between sentences: 
Paneangeli Lievito vaniglinato X3 48 g - 0097 13610 LIEVITO X 3
Valcalepio Rosso Magri Sereno Doc 75cl EMPTY_DESCRIPTION



In [13]:
# Encode a single hand-written sentence and run it through the model.
s = "Panino con bacon e formaggio asiago: offerta speciale!"
encoded_input = tokenizer.encode(s)
# Wrapping the id list in another list adds the batch dimension (batch size 1).
input_ids = torch.tensor([encoded_input])
outputs = model(input_ids)