In [4]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPModel, CLIPProcessor, AutoModel, AutoTokenizer
from PIL import Image
import pandas as pd
import numpy as np
from tqdm import tqdm

In [9]:
# ===================== Configuration =====================
# Everything a reader might need to tune lives in this cell.
INPUT_CSV = "../../train_master.csv"          # CSV listing the samples to embed
CHECKPOINT_PATH = "../[2_training]/checkpoints/best_model.pt"  # best checkpoint
OUTPUT_DIR = "./embeddings"                   # destination for the .npy / .csv exports
BATCH_SIZE = 64
MAX_TEXT_LEN = 128                            # tokenizer max_length (pad / truncate target)
# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (slowly) on a machine without CUDA instead of failing at `.to(DEVICE)`.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

IMAGE_MODEL = "openai/clip-vit-large-patch14"
TEXT_MODEL = "hfl/chinese-roberta-wwm-ext-large"
PROJ_DIM = 512                                # width of the shared embedding space

# Create the output directory up front; a no-op when it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# ===================== Reuse the model definition from training =====================
class DualEncoder(nn.Module):
    """Two-tower image/text encoder that maps both modalities into one space.

    The image tower is the ``vision_model`` of a pretrained CLIP checkpoint and
    the text tower is a pretrained ``AutoModel``. Each tower is followed by its
    own Linear -> GELU -> Linear projection head producing ``proj_dim`` features.

    NOTE(review): attribute names and their registration order determine the
    ``state_dict`` keys; presumably they must stay in sync with the training
    code that produced the checkpoint -- confirm before renaming anything.
    """

    def __init__(self, image_model_name, text_model_name, proj_dim=512):
        """Load both pretrained backbones and create the projection heads.

        Args:
            image_model_name: Identifier passed to ``CLIPModel.from_pretrained``.
            text_model_name: Identifier passed to ``AutoModel.from_pretrained``.
            proj_dim: Output width of both projection heads.
        """
        super().__init__()

        # Only the vision half of CLIP is kept as the image backbone.
        self.image_encoder = CLIPModel.from_pretrained(image_model_name).vision_model
        self.text_encoder = AutoModel.from_pretrained(text_model_name)

        vision_width = self.image_encoder.config.hidden_size
        text_width = self.text_encoder.config.hidden_size

        # Heads are created image-first, then text, mirroring the attribute order.
        self.image_proj = self._projection_head(vision_width, proj_dim)
        self.text_proj = self._projection_head(text_width, proj_dim)

        # Learnable scalar initialised to log(1 / 0.07); not used by the
        # encode_* methods in this class.
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

    @staticmethod
    def _projection_head(in_dim, out_dim):
        """Build the Linear -> GELU -> Linear head shared by both towers."""
        return nn.Sequential(
            nn.Linear(in_dim, out_dim),
            nn.GELU(),
            nn.Linear(out_dim, out_dim),
        )

    def encode_image(self, pixel_values):
        """Embed images.

        Args:
            pixel_values: Preprocessed image batch forwarded to the vision
                backbone (assumes (batch, channels, height, width) -- TODO confirm).

        Returns:
            The projected ``pooler_output``, L2-normalised along the last dim.
        """
        pooled = self.image_encoder(pixel_values=pixel_values).pooler_output
        projected = self.image_proj(pooled)
        return F.normalize(projected, dim=-1)

    def encode_text(self, input_ids, attention_mask):
        """Embed tokenised text.

        Args:
            input_ids: Token ids forwarded to the text backbone.
            attention_mask: Attention mask forwarded alongside ``input_ids``.

        Returns:
            The projected ``pooler_output``, L2-normalised along the last dim.
        """
        pooled = self.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).pooler_output
        projected = self.text_proj(pooled)
        return F.normalize(projected, dim=-1)

In [11]:
# ===================== Full dataset (every row, no split filtering) =====================
class FullDataset(Dataset):
    """Serve every row of a CSV as a model-ready image/text sample.

    The CSV must provide the columns read in ``__getitem__``:
    ``id``, ``image_path``, ``audio_path`` and ``text``.
    """

    def __init__(self, csv_path, image_processor, tokenizer, max_len=128):
        """Read the CSV and keep the preprocessing callables.

        Args:
            csv_path: Path of the CSV file to load with ``pandas.read_csv``.
            image_processor: Callable invoked as
                ``image_processor(images=..., return_tensors="pt")`` whose result
                is indexed with ``"pixel_values"``.
            tokenizer: Callable invoked on the text whose result is indexed with
                ``"input_ids"`` and ``"attention_mask"``.
            max_len: ``max_length`` handed to the tokenizer (pad / truncate target).
        """
        self.df = pd.read_csv(csv_path)
        self.image_processor = image_processor
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Build the sample at positional index ``idx``.

        Returns:
            dict with the tensors ``image``, ``input_ids`` and ``attention_mask``
            (leading batch axis squeezed away) plus the string metadata ``id``,
            ``image_path``, ``audio_path`` and ``text``.
        """
        row = self.df.iloc[idx]

        # Image: the context manager closes the underlying file promptly.
        # `convert` hands back a separate, fully loaded image, so it remains
        # usable after the source handle is closed.
        with Image.open(row["image_path"]) as raw_image:
            image = raw_image.convert("RGB")
        image_input = self.image_processor(images=image, return_tensors="pt")["pixel_values"].squeeze(0)

        # Text: a missing cell becomes the empty string.
        text = str(row["text"]) if pd.notna(row["text"]) else ""
        text_input = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # A missing audio path would otherwise reach the collate function as a
        # float NaN mixed with strings; normalise it the same way as the text.
        audio_path = str(row["audio_path"]) if pd.notna(row["audio_path"]) else ""

        return {
            "id": str(row["id"]),
            "image": image_input,
            "input_ids": text_input["input_ids"].squeeze(0),
            "attention_mask": text_input["attention_mask"].squeeze(0),
            "image_path": row["image_path"],
            "audio_path": audio_path,
            "text": text
        }


In [None]:
print(f"Device: {DEVICE}")
print("Loading model...")

# Build the architecture, then restore the trained weights.
model = DualEncoder(IMAGE_MODEL, TEXT_MODEL, PROJ_DIM).to(DEVICE)
# Deserialize onto the CPU: `load_state_dict` copies the values into the
# parameters already on DEVICE, so mapping the checkpoint to the GPU would
# only keep a second full copy of the weights there for as long as
# `state_dict` stays bound in the kernel.
# `weights_only=True` restricts unpickling to tensors and plain containers,
# which is all a state dict needs.
state_dict = torch.load(CHECKPOINT_PATH, map_location="cpu", weights_only=True)
model.load_state_dict(state_dict)
model.eval()

# Load the image processor and the tokenizer.
image_processor = CLIPProcessor.from_pretrained(IMAGE_MODEL)
tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL)

# Dataset and loader. shuffle=False keeps embedding rows aligned with the
# metadata rows collected below.
dataset = FullDataset(INPUT_CSV, image_processor, tokenizer, MAX_TEXT_LEN)
loader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
    # Pinned host memory only helps host-to-GPU copies; skip it on CPU runs.
    pin_memory=str(DEVICE).startswith("cuda"),
)

print(f"Total samples: {len(dataset)}")

# Accumulators for the export: per-sample metadata plus per-batch embedding arrays.
results = {
    "id": [],
    "image_path": [],
    "audio_path": [],
    "text": [],
    "text_emb": [],
    "image_emb": []
}

print("Exporting embeddings...")
with torch.no_grad():
    for batch in tqdm(loader, desc="Processing"):
        images = batch["image"].to(DEVICE)
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)

        text_emb = model.encode_text(input_ids, attention_mask).cpu().numpy()
        image_emb = model.encode_image(images).cpu().numpy()

        results["id"].extend(batch["id"])
        results["image_path"].extend(batch["image_path"])
        results["audio_path"].extend(batch["audio_path"])
        results["text"].extend(batch["text"])
        results["text_emb"].append(text_emb)
        results["image_emb"].append(image_emb)

# Stack the per-batch arrays into one (num_samples, dim) matrix per modality.
results["text_emb"] = np.vstack(results["text_emb"])
results["image_emb"] = np.vstack(results["image_emb"])

# Row i of each matrix must describe metadata row i.
assert len(results["id"]) == results["text_emb"].shape[0] == results["image_emb"].shape[0], \
    "embedding rows are out of sync with metadata rows"

# Save the embeddings and the matching metadata table.
print("Saving...")
np.save(os.path.join(OUTPUT_DIR, "text_embeddings.npy"), results["text_emb"])
np.save(os.path.join(OUTPUT_DIR, "image_embeddings.npy"), results["image_emb"])

metadata = pd.DataFrame({
    "id": results["id"],
    "image_path":  results["image_path"],
    "audio_path": results["audio_path"],
    "text":  results["text"]
})
metadata.to_csv(os.path.join(OUTPUT_DIR, "metadata.csv"), index=False, encoding="utf-8")

print(f"\n✓ Done! Exported {len(results['id'])} entries to {OUTPUT_DIR}/")
print(f"  - text_embeddings.npy: {results['text_emb'].shape}")
print(f"  - image_embeddings.npy: {results['image_emb'].shape}")
print(f"  - metadata.csv")

Device: cuda
Loading model...


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Total samples: 1000
Exporting embeddings...


Processing: 100%|██████████| 16/16 [00:09<00:00,  1.76it/s]

Saving...

✓ Done! Exported 1000 entries to ./embeddings/
  - text_embeddings.npy: (1000, 512)
  - image_embeddings.npy: (1000, 512)
  - metadata.csv



