In [12]:
pip install transformers



In [13]:
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
import torch

# Load the pre-trained tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")  # For embeddings
classifier_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)  # For classification

# Set the model to evaluation mode
bert_model.eval()
classifier_model.eval()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [14]:
sentence = "white hat"  # Example sentence
inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)

# Inputs for embedding extraction
input_ids = inputs["input_ids"]  # Token IDs
attention_mask = inputs["attention_mask"]  # Attention mask

In [36]:
with torch.no_grad():
    outputs = bert_model(input_ids, attention_mask=attention_mask)
    contextual_embeddings = outputs.last_hidden_state  # Shape: [batch_size, seq_length, hidden_size]
    # Original text embeddings: [1, 11, 768]
    pooled_text_embedding = contextual_embeddings.mean(dim=1)  # Shape: [1, 768]


print(f"pooled_text_embedding.shape: {pooled_text_embedding.shape}")
print(type(pooled_text_embedding))

pooled_text_embedding.shape: torch.Size([1, 768])
<class 'torch.Tensor'>


In [16]:
with torch.no_grad():
    logits = classifier_model(input_ids, attention_mask=attention_mask).logits
    predictions = torch.argmax(logits, dim=-1)

# Interpret the result
if predictions.item() == 1:
    print("The sentence contains an idiom.")
else:
    print("The sentence does not contain an idiom.")

The sentence contains an idiom.


In [17]:
pip install torch torchvision transformers



In [51]:
import torch
from transformers import ViTModel, ViTFeatureExtractor
from PIL import Image
import os


In [52]:
# Load pre-trained ViT model and feature extractor
model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")

# Set the model to evaluation mode
model.eval()





ViTModel(
  (embeddings): ViTEmbeddings(
    (patch_embeddings): ViTPatchEmbeddings(
      (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ViTEncoder(
    (layer): ModuleList(
      (0-11): 12 x ViTLayer(
        (attention): ViTSdpaAttention(
          (attention): ViTSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): ViTSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): ViTIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUAct

In [53]:
# 文件夹路径 file path
folder_path = "/content"

# 列出所有图片文件 all the image under the folder
image_files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith(('png', 'jpg', 'jpeg'))]

# 获取图片ID（文件名）get the image id of each image
image_ids = [os.path.basename(file).split('.')[0] for file in image_files]

print(image_ids)


['50305046415', '11696820520', '12124292214', '39481587509', '13755461305']


In [54]:
# Preprocess images
images = [Image.open(file).convert("RGB") for file in image_files]
inputs = feature_extractor(images=images, return_tensors="pt")  # Batch preprocessing


In [55]:
# Pass the preprocessed images through the ViT model
with torch.no_grad():
    outputs = model(**inputs)
    cls_embeddings = outputs.last_hidden_state[:, 0, :]  # Shape: [batch_size, hidden_size]

print(f"CLS Embeddings Shape: {cls_embeddings.shape}")  # Shape: [5, 768] for 5 images


CLS Embeddings Shape: torch.Size([5, 768])


In [60]:
# 生成图片 ID 和对应的 CLS embedding
id_embedding_map = {image_id: image_embedding for image_id, image_embedding in zip(image_ids, cls_embeddings)}

# 打印每个图片 ID 和其对应的 embedding 形状
for image_id, image_emb in id_embedding_map.items():
    print(f"Image ID: {image_id}, Image Emb Shape: {image_emb.shape}")
    print(f"Pooled Text Emb Shape: {pooled_text_embedding.shape}")

    # 修复零维 embedding
    if image_emb.dim() == 0:
        image_emb = image_emb.unsqueeze(0)  # 添加一个维度
        print(f"Fixed Image Emb Shape for ID {image_id}: {image_emb.shape}")



Image ID: 50305046415, Image Emb Shape: torch.Size([768])
Pooled Text Emb Shape: torch.Size([768])
Image ID: 11696820520, Image Emb Shape: torch.Size([768])
Pooled Text Emb Shape: torch.Size([768])
Image ID: 12124292214, Image Emb Shape: torch.Size([768])
Pooled Text Emb Shape: torch.Size([768])
Image ID: 39481587509, Image Emb Shape: torch.Size([768])
Pooled Text Emb Shape: torch.Size([768])
Image ID: 13755461305, Image Emb Shape: torch.Size([768])
Pooled Text Emb Shape: torch.Size([768])


In [62]:
import torch
import torch.nn as nn

# define the nerual network model
class VectorComparisonNet(nn.Module):
    def __init__(self, embedding_dim, hidden_dim):
        super(VectorComparisonNet, self).__init__()
        self.fc1 = nn.Linear(embedding_dim * 2, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, image_emb, text_emb):
        # concat the features
        combined = torch.cat([image_emb, text_emb], dim=-1)
        hidden = self.relu(self.fc1(combined))
        output = self.sigmoid(self.fc2(hidden))
        return output

embedding_dim = 768  # image and text embedding dimension
hidden_dim = 128     # hidden layer dimension
model = VectorComparisonNet(embedding_dim, hidden_dim)

# pooling
pooled_text_embedding = pooled_text_embedding.squeeze(0)

results = []
for image_id, image_emb in id_embedding_map.items():
    # forward computing probability
    probability = model(image_emb, pooled_text_embedding)  # input the model
    results.append((image_id, probability.item()))  # save result

# print the result
for image_id, prob in results:
    print(f"Image ID: {image_id}, Match Probability: {prob:.4f}")


Image ID: 50305046415, Match Probability: 0.4602
Image ID: 11696820520, Match Probability: 0.4556
Image ID: 12124292214, Match Probability: 0.4629
Image ID: 39481587509, Match Probability: 0.4583
Image ID: 13755461305, Match Probability: 0.4636


In [None]:
import torch
from sklearn.decomposition import PCA

# Define a linear projection layer to reduce dimensions from 768 to 512
linear_projection = nn.Linear(768, 512)

# Apply the linear transformation
cls_embeddings_reduced = linear_projection(cls_embeddings)

# Print the shapes to verify
print(f"Original Shape: {cls_embeddings.shape}")  # Output: [5, 768]
print(f"Reduced Shape: {cls_embeddings_reduced.shape}")  # Output: [5, 512]



Original Shape: torch.Size([5, 768])
Reduced Shape: torch.Size([5, 512])
