In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
import os
import json
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

# 初始化 CLIP 模型和处理器
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# 文件路径
main_folder_path = "/content/drive/MyDrive/train"
json_file_path = "/content/train_idiom_mean .json"

# 加载 JSON 文件
with open(json_file_path, "r", encoding="utf-8") as json_file:
    compound_definitions = json.load(json_file)

# 初始化 compound_data
compound_data = []

# 遍历 JSON 数据
for compound_name, compound_text in compound_definitions.items():
    # 替换单引号为下划线
    processed_name = compound_name.replace("'", "_")
    compound_path = os.path.join(main_folder_path, processed_name)

    if not os.path.isdir(compound_path):
        print(f"Skipping compound: {processed_name} (folder does not exist)")
        continue

    print(f"Processing compound: {processed_name}")

    # 处理文本嵌入
    text_inputs = processor(text=[compound_text], return_tensors="pt", padding=True)
    with torch.no_grad():
        text_embedding = model.get_text_features(**text_inputs).squeeze().unsqueeze(0)

    # 初始化图片列表
    images = []

    # 遍历图像文件
    for filename in os.listdir(compound_path):
        if filename.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".gif")):
            image_path = os.path.join(compound_path, filename)

            try:
                image = Image.open(image_path).convert("RGB")
            except Exception as e:
                print(f"Error opening image {filename}: {e}")
                continue

            image_inputs = processor(images=image, return_tensors="pt", padding=True)
            with torch.no_grad():
                image_embedding = model.get_image_features(**image_inputs).squeeze().unsqueeze(0)

            images.append({
                "image_id": filename,
                "image_embedding": image_embedding
            })

    # 构建 compound_data 条目
    compound_data.append({
        "compound_name": compound_name,  # 原始名称保存
        "text_embedding": text_embedding,
        "images": images
    })

# 保存到文件时恢复原始名称
for compound in compound_data:
    compound["compound_name"] = compound["compound_name"].replace("_", "'")

# 保存到文件
output_file = "clip_train_idiom_embeddings.pt"
torch.save(compound_data, output_file)
print(f"Embeddings saved to {output_file}")

# 加载保存的文件
loaded_data = torch.load(output_file)

# 查看文件结构
print("\nLoaded .pt File Structure:")
print(f"Type of data: {type(loaded_data)}")
print(f"Number of elements: {len(loaded_data)}")

# 查看第一个元素
if len(loaded_data) > 0:
    print("\nFirst Element:")
    first_element = loaded_data[0]
    for key, value in first_element.items():
        if isinstance(value, torch.Tensor):
            print(f"{key}: Tensor of shape {value.shape}")
        elif isinstance(value, list):
            print(f"{key}: List of {len(value)} elements")
        else:
            print(f"{key}: {value}")
else:
    print("\nThe loaded file is empty.")

# 模拟处理每个 compound
for compound in loaded_data:
    print(f"\nProcessing compound: {compound['compound_name']}")
    folder_name = os.path.join(main_folder_path, compound["compound_name"].replace("'", "_"))  # 使用替换后的名字
    if not os.path.exists(folder_name):
        print(f"Skipping compound: {compound['compound_name']} (folder does not exist)")
        continue

    print(f"Processing folder: {folder_name}")
    print(f"Text Embedding Shape: {compound['text_embedding'].shape}")
    for image in compound["images"]:
        print(f"  Image ID: {image['image_id']}")
        print(f"  Image Embedding Shape: {image['image_embedding'].shape}")
    print("-" * 50)


Processing compound: elbow grease
Processing compound: night owl
Processing compound: heart of gold
Processing compound: shrinking violet
Processing compound: banana republic
Processing compound: private eye
Processing compound: pipe dream
Processing compound: rocket science
Processing compound: nest egg
Processing compound: bull market
Processing compound: beached whale
Processing compound: lounge lizard
Processing compound: bear market
Processing compound: white hat
Processing compound: smoking gun
Processing compound: old flame
Processing compound: ivory tower
Processing compound: black sheep
Processing compound: gravy train
Processing compound: spring chicken
Processing compound: inner circle
Processing compound: honey trap
Processing compound: open book
Processing compound: brain surgery
Processing compound: red flag
Processing compound: white elephant
Processing compound: rat run
Processing compound: graveyard shift
Processing compound: dirty money
Processing compound: high life


  loaded_data = torch.load(output_file)


  Image Embedding Shape: torch.Size([1, 512])
  Image ID: 50206720330.png
  Image Embedding Shape: torch.Size([1, 512])
--------------------------------------------------


In [13]:
import torch

# 加载 .pt 文件
output_file = "clip_train_idiom_embeddings.pt"
loaded_data = torch.load(output_file)

# 打印 compound 的数量
print(f"Number of compounds in the .pt file: {len(loaded_data)}\n")

# 打印每个 compound 的结构
print("Structure of each compound:\n")
for i, compound in enumerate(loaded_data):
    print(f"Compound {i + 1}:")
    for key, value in compound.items():
        if isinstance(value, torch.Tensor):
            print(f"  {key}: Tensor of shape {value.shape}")
        elif isinstance(value, list):
            print(f"  {key}: List with {len(value)} elements")
            if len(value) > 0 and isinstance(value[0], dict):
                print(f"    Example element keys: {list(value[0].keys())}")
        else:
            print(f"  {key}: {value}")
    print("-" * 50)


Number of compounds in the .pt file: 34

Structure of each compound:

Compound 1:
  compound_name: elbow grease
  text_embedding: Tensor of shape torch.Size([1, 512])
  images: List with 5 elements
    Example element keys: ['image_id', 'image_embedding']
--------------------------------------------------
Compound 2:
  compound_name: night owl
  text_embedding: Tensor of shape torch.Size([1, 512])
  images: List with 5 elements
    Example element keys: ['image_id', 'image_embedding']
--------------------------------------------------
Compound 3:
  compound_name: heart of gold
  text_embedding: Tensor of shape torch.Size([1, 512])
  images: List with 5 elements
    Example element keys: ['image_id', 'image_embedding']
--------------------------------------------------
Compound 4:
  compound_name: shrinking violet
  text_embedding: Tensor of shape torch.Size([1, 512])
  images: List with 5 elements
    Example element keys: ['image_id', 'image_embedding']
-------------------------------

  loaded_data = torch.load(output_file)


In [None]:
OUTPUT

{

    "compound_name": "silver_bullet",
    "text_embedding": tensor([[...]]),  # text embedding，(1, 512)
    "images": [
        {
            "image_id": "image1.jpg",
            "image_embedding": tensor([[...]]),  #picture1 embedding，shape (1, 512)
        },
        {
            "image_id": "image2.jpg",
            "image_embedding": tensor([[...]]),  # picture2 embedding，shape is (1, 512)
        },
    ]

}


In [10]:
# 文件路径
main_folder_path = "/content/drive/MyDrive/train"
json_file_path = "/content/train_literal_mean .json"

# 加载 JSON 文件
with open(json_file_path, "r", encoding="utf-8") as json_file:
    compound_definitions = json.load(json_file)

# 初始化 compound_data
compound_data = []

# 遍历 JSON 数据
for compound_name, compound_text in compound_definitions.items():
    # 替换单引号为下划线
    processed_name = compound_name.replace("'", "_")
    compound_path = os.path.join(main_folder_path, processed_name)

    if not os.path.isdir(compound_path):
        print(f"Skipping compound: {processed_name} (folder does not exist)")
        continue

    print(f"Processing compound: {processed_name}")

    # 处理文本嵌入
    text_inputs = processor(text=[compound_text], return_tensors="pt", padding=True)
    with torch.no_grad():
        text_embedding = model.get_text_features(**text_inputs).squeeze().unsqueeze(0)

    # 初始化图片列表
    images = []

    # 遍历图像文件
    for filename in os.listdir(compound_path):
        if filename.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".gif")):
            image_path = os.path.join(compound_path, filename)

            try:
                image = Image.open(image_path).convert("RGB")
            except Exception as e:
                print(f"Error opening image {filename}: {e}")
                continue

            image_inputs = processor(images=image, return_tensors="pt", padding=True)
            with torch.no_grad():
                image_embedding = model.get_image_features(**image_inputs).squeeze().unsqueeze(0)

            images.append({
                "image_id": filename,
                "image_embedding": image_embedding
            })

    # 构建 compound_data 条目
    compound_data.append({
        "compound_name": compound_name,  # 原始名称保存
        "text_embedding": text_embedding,
        "images": images
    })

# 保存到文件时恢复原始名称
for compound in compound_data:
    compound["compound_name"] = compound["compound_name"].replace("_", "'")

# 保存到文件
output_file = "clip_train_literal_embeddings.pt"
torch.save(compound_data, output_file)
print(f"Embeddings saved to {output_file}")

# 加载保存的文件
loaded_data = torch.load(output_file)

# 查看文件结构
print("\nLoaded .pt File Structure:")
print(f"Type of data: {type(loaded_data)}")
print(f"Number of elements: {len(loaded_data)}")

# 查看第一个元素
if len(loaded_data) > 0:
    print("\nFirst Element:")
    first_element = loaded_data[0]
    for key, value in first_element.items():
        if isinstance(value, torch.Tensor):
            print(f"{key}: Tensor of shape {value.shape}")
        elif isinstance(value, list):
            print(f"{key}: List of {len(value)} elements")
        else:
            print(f"{key}: {value}")
else:
    print("\nThe loaded file is empty.")

# 模拟处理每个 compound
for compound in loaded_data:
    print(f"\nProcessing compound: {compound['compound_name']}")
    folder_name = os.path.join(main_folder_path, compound["compound_name"].replace("'", "_"))  # 使用替换后的名字
    if not os.path.exists(folder_name):
        print(f"Skipping compound: {compound['compound_name']} (folder does not exist)")
        continue

    print(f"Processing folder: {folder_name}")
    print(f"Text Embedding Shape: {compound['text_embedding'].shape}")
    for image in compound["images"]:
        print(f"  Image ID: {image['image_id']}")
        print(f"  Image Embedding Shape: {image['image_embedding'].shape}")
    print("-" * 50)

Processing compound: green fingers
Processing compound: ancient history
Processing compound: devil_s advocate
Processing compound: piece of cake
Processing compound: brass ring
Processing compound: apples and oranges
Processing compound: ghost town
Processing compound: copy cat
Processing compound: secret santa
Processing compound: dirty word
Processing compound: close shave
Processing compound: donkey work
Processing compound: top dog
Processing compound: zebra crossing
Processing compound: eye candy
Processing compound: armchair critic
Processing compound: hot potato
Processing compound: love triangle
Processing compound: black box
Processing compound: hen party
Processing compound: pins and needles
Processing compound: bun in the oven
Processing compound: loan shark
Processing compound: two-way street
Processing compound: wet blanket
Processing compound: chicken feed
Embeddings saved to clip_train_literal_embeddings.pt

Loaded .pt File Structure:
Type of data: <class 'list'>
Number 

  loaded_data = torch.load(output_file)


In [11]:
import torch

# 加载 .pt 文件
output_file = "clip_train_literal_embeddings.pt"
loaded_data = torch.load(output_file)

# 打印 compound 的数量
print(f"Number of compounds in the .pt file: {len(loaded_data)}\n")

# 打印每个 compound 的结构
print("Structure of each compound:\n")
for i, compound in enumerate(loaded_data):
    print(f"Compound {i + 1}:")
    for key, value in compound.items():
        if isinstance(value, torch.Tensor):
            print(f"  {key}: Tensor of shape {value.shape}")
        elif isinstance(value, list):
            print(f"  {key}: List with {len(value)} elements")
            if len(value) > 0 and isinstance(value[0], dict):
                print(f"    Example element keys: {list(value[0].keys())}")
        else:
            print(f"  {key}: {value}")
    print("-" * 50)


Number of compounds in the .pt file: 26

Structure of each compound:

Compound 1:
  compound_name: green fingers
  text_embedding: Tensor of shape torch.Size([1, 512])
  images: List with 5 elements
    Example element keys: ['image_id', 'image_embedding']
--------------------------------------------------
Compound 2:
  compound_name: ancient history
  text_embedding: Tensor of shape torch.Size([1, 512])
  images: List with 5 elements
    Example element keys: ['image_id', 'image_embedding']
--------------------------------------------------
Compound 3:
  compound_name: devil's advocate
  text_embedding: Tensor of shape torch.Size([1, 512])
  images: List with 5 elements
    Example element keys: ['image_id', 'image_embedding']
--------------------------------------------------
Compound 4:
  compound_name: piece of cake
  text_embedding: Tensor of shape torch.Size([1, 512])
  images: List with 5 elements
    Example element keys: ['image_id', 'image_embedding']
------------------------

  loaded_data = torch.load(output_file)
