In [47]:
pip install transformers



In [48]:
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
import torch

# Load the pre-trained tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")  # For embeddings
classifier_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)  # For classification

# Set the model to evaluation mode
bert_model.eval()
classifier_model.eval()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [50]:
sentence = "white hat"  # Example sentence
inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)

# Inputs for embedding extraction
input_ids = inputs["input_ids"]  # Token IDs
attention_mask = inputs["attention_mask"]  # Attention mask

In [51]:
with torch.no_grad():
    outputs = bert_model(input_ids, attention_mask=attention_mask)
    contextual_embeddings = outputs.last_hidden_state  # Shape: [batch_size, seq_length, hidden_size]
    # Original text embeddings: [1, 11, 768]
    pooled_text_embedding = contextual_embeddings.mean(dim=1)  # Shape: [1, 768]


print(f"pooled_text_embedding.shape: {pooled_text_embedding.shape}")

pooled_text_embedding.shape: torch.Size([1, 768])


In [52]:
with torch.no_grad():
    logits = classifier_model(input_ids, attention_mask=attention_mask).logits
    predictions = torch.argmax(logits, dim=-1)

# Interpret the result
if predictions.item() == 1:
    print("The sentence contains an idiom.")
else:
    print("The sentence does not contain an idiom.")

The sentence does not contain an idiom.


In [53]:
pip install torch torchvision transformers



In [54]:
import torch
from transformers import ViTModel, ViTFeatureExtractor
from PIL import Image
import os


In [55]:
# Load pre-trained ViT model and feature extractor
model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")

# Set the model to evaluation mode
model.eval()





ViTModel(
  (embeddings): ViTEmbeddings(
    (patch_embeddings): ViTPatchEmbeddings(
      (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ViTEncoder(
    (layer): ModuleList(
      (0-11): 12 x ViTLayer(
        (attention): ViTSdpaAttention(
          (attention): ViTSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): ViTSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): ViTIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUAct

In [70]:
# Path to the folder containing images
folder_path = "/content"

# List all image files in the folder
image_files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith(('png', 'jpg', 'jpeg'))]
print(image_files)


['/content/printer.png', '/content/hat.png', '/content/idiomatic.png', '/content/Literal.png', '/content/idiomatic2.png']


In [57]:
# Preprocess images
images = [Image.open(file).convert("RGB") for file in image_files]  # Open and convert to RGB
inputs = feature_extractor(images=images, return_tensors="pt")  # Batch preprocessing


In [58]:
# Pass the preprocessed images through the ViT model
with torch.no_grad():
    outputs = model(**inputs)

# Extract the [CLS] token embeddings for all images
cls_embeddings = outputs.last_hidden_state[:, 0, :]  # Shape: [batch_size, hidden_size]

print(f"CLS Embeddings Shape: {cls_embeddings.shape}")  # Shape: [5, 768] for 5 images


CLS Embeddings Shape: torch.Size([5, 768])


In [59]:
pip install scikit-learn



In [65]:
import torch
from sklearn.decomposition import PCA

# Define a linear projection layer to reduce dimensions from 768 to 512
linear_projection = nn.Linear(768, 512)

# Apply the linear transformation
cls_embeddings_reduced = linear_projection(cls_embeddings)

# Print the shapes to verify
print(f"Original Shape: {cls_embeddings.shape}")  # Output: [5, 768]
print(f"Reduced Shape: {cls_embeddings_reduced.shape}")  # Output: [5, 512]



Original Shape: torch.Size([5, 768])
Reduced Shape: torch.Size([5, 512])


In [66]:
# Path to the .pt file
file_path = "/content/white_hat.pt"

# Load the embeddings
data = torch.load(file_path)

# Check the type of the loaded data
print(data.shape)

(1, 512)


  data = torch.load(file_path)


In [69]:
# Example sentence embedding: Shape [1, 512]
sentence_embedding = torch.tensor(data) # Replace with your actual embedding

# Example image embeddings: Shape [5, 512]
image_embeddings = cls_embeddings_reduced  # Replace with your actual embeddings

# Step 1: Compute cosine similarity
# Cosine similarity is computed separately for each image embedding
similarities = F.cosine_similarity(sentence_embedding, image_embeddings, dim=-1)  # Shape: [5]

# Step 2: Rank similarities
# Sort indices of the images based on similarity scores (descending order)
ranked_indices = torch.argsort(similarities, descending=True)

# Print results
print("Similarity scores:", similarities)  # Shape: [5]
print("Ranked indices:", ranked_indices)   # Indices sorted by similarity

Similarity scores: tensor([ 0.0471, -0.0222, -0.0402, -0.0177, -0.0599], grad_fn=<SumBackward1>)
Ranked indices: tensor([0, 3, 1, 2, 4])


In [None]:
import torch
import torch.nn.functional as F

# Example sentence embedding: Shape [1, 768]
sentence_embedding = pooled_text_embedding  # Replace with your actual embedding

# Example image embeddings: Shape [5, 768]
image_embeddings = cls_embeddings  # Replace with your actual embeddings

# Step 1: Compute cosine similarity
# Broadcast sentence embedding to match image embeddings
similarities = F.cosine_similarity(sentence_embedding, image_embeddings, dim=-1)  # Shape: [5]

# Step 2: Rank similarities
ranked_indices = torch.argsort(similarities, descending=True)  # Indices of images sorted by similarity

# Print results
print("Similarity scores:", similarities)
print("Ranked indices:", ranked_indices)

Similarity scores: tensor([-0.0189,  0.0437,  0.0418, -0.0418, -0.0515])
Ranked indices: tensor([1, 2, 0, 3, 4])


Distances: tensor([[10.0676, 10.1501,  9.6485,  9.8801, 10.6379]])
Ranked indices: tensor([2, 3, 0, 1, 4])


Dot Products: tensor([-2.0198,  2.0034,  1.6785, -0.7186, -2.0665])
Ranked indices: tensor([1, 2, 3, 0, 4])
