This file used to generate text embeddings using pre-trained "openai/clip-vit-large-patch14" model.

## CONFIGURATIONS

In [None]:
CONFIG = {
    "model_name": "openai/clip-vit-large-patch14",
    "text_path": "data\\metadata\\video_descriptions_en_short.json", 
    "save_path": "D:\\sjtu文件夹\\PLUS课程文件夹\\PRP\\my_EEG2Video\\data\\metadata\\text_embedding.pt", 
}

## Load tokenizer and text embedding model

In [3]:
import torch
from transformers import CLIPTextModel, CLIPTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("[INFO] Using device: {device}")

print("Loading tokenizer...")
tokenizer = CLIPTokenizer.from_pretrained(CONFIG["model_name"])

print("Loading text encoder...")
text_encoder = CLIPTextModel.from_pretrained(CONFIG["model_name"]).to(device)
text_encoder.eval() # 设置为评估模式


  from .autonotebook import tqdm as notebook_tqdm


[INFO] Using device: {device}
Loading tokenizer...
Loading text encoder...


CLIPTextModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e

## Tokenize the text

In [14]:
import json

prompts_dict = json.load(open(CONFIG["text_path"], "r"))
prompts = []
for i in range(len(prompts_dict)):
    prompts.append(prompts_dict[f"{i+1}.mp4"])

text_inputs = tokenizer(
        prompts,
        padding="max_length",
        max_length=tokenizer.model_max_length, # 通常是 77
        truncation=True,
        return_tensors="pt",
    )
    
input_ids = text_inputs.input_ids.to(device)

## Text Embedding

In [17]:
import os

with torch.no_grad():
    text_embeddings = text_encoder(input_ids)[0] # 输出的元组中，第一个元素是 last_hidden_state

text_embeddings = text_embeddings.cpu()

# 打印形状以供验证
# 形状应该是 (N, 77, 768)，其中 N 是提示的数量，77 是 token 数量，768 是嵌入维度
print(f"Generated embeddings shape: {text_embeddings.shape}")

# 4. 保存嵌入向量
save_path = CONFIG["save_path"]
output_path = os.path.join(save_path)
torch.save(text_embeddings, output_path)

print(f"Successfully saved text embeddings to: save_path")

Generated embeddings shape: torch.Size([250, 77, 768])
Successfully saved text embeddings to: save_path
