In [1]:
import json
import os
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from imagebind import data
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType
import numpy as np

# Load JSON file
with open('/data1/dxw_data/llm/redbook/captions_labeled.json', 'r') as f:
    data_json = json.load(f)

# Set device
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Instantiate and load imagebind model
model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()
model.to(device)




ImageBindModel(
  (modality_preprocessors): ModuleDict(
    (vision): RGBDTPreprocessor(
      (cls_token): tensor((1, 1, 1280), requires_grad=True)
      
      (rgbt_stem): PatchEmbedGeneric(
        (proj): Sequential(
          (0): PadIm2Video()
          (1): Conv3d(3, 1280, kernel_size=(2, 14, 14), stride=(2, 14, 14), bias=False)
        )
      )
      (pos_embedding_helper): SpatioTemporalPosEmbeddingHelper(
        (pos_embed): tensor((1, 257, 1280), requires_grad=True)
        
      )
    )
    (text): TextPreprocessor(
      (pos_embed): tensor((1, 77, 1024), requires_grad=True)
      (mask): tensor((77, 77), requires_grad=False)
      
      (token_embedding): Embedding(49408, 1024)
    )
    (audio): AudioPreprocessor(
      (cls_token): tensor((1, 1, 768), requires_grad=True)
      
      (rgbt_stem): PatchEmbedGeneric(
        (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10), bias=False)
        (norm_layer): LayerNorm((768,), eps=1e-05, elementwise_affine=

In [6]:
# Preprocess transforms for images
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Function to load and transform vision data
def load_and_transform_vision_data_mask(image_paths, mask_paths, device):
    images = []
    for image_path, mask_path in zip(image_paths, mask_paths):
        image = Image.open(image_path).convert('RGB')
        mask = Image.open(mask_path).convert('L')

        # Apply mask to the image
        image_np = np.array(image)
        mask_np = np.array(mask)
        masked_image_np = np.multiply(image_np, mask_np[:, :, None] / 255.0)
        masked_image = Image.fromarray(masked_image_np.astype(np.uint8))

        # Transform the masked image
        transformed_image = transform(masked_image).unsqueeze(0)
        images.append(transformed_image)

    images = torch.cat(images).to(device)
    return images

# Function to load and transform text data
def load_and_transform_text(text_list, device):
    return data.load_and_transform_text(text_list, device)

# Function to load and transform vision data
def load_and_transform_vision_data(image_paths, device):
    images = [transform(Image.open(image_path).convert('RGB')).unsqueeze(0) for image_path in image_paths]
    images = torch.cat(images).to(device)
    return images

# Function to extract embeddings using imagebind model
def extract_text_embeddings(texts, images, masks, device):
    inputs = {
        ModalityType.TEXT: load_and_transform_text(texts, device),
    }
    with torch.no_grad():
        embeddings = model(inputs)
    return embeddings

def extract_image_embeddings(texts, images, masks, device):
    inputs = {
        ModalityType.VISION: load_and_transform_vision_data(images, device),
    }
    with torch.no_grad():
        embeddings = model(inputs)
    return embeddings

def extract_mask_embeddings(texts, images, masks, device):
    inputs = {
        ModalityType.VISION: load_and_transform_vision_data_mask(images, masks, device),
    }
    with torch.no_grad():
        embeddings = model(inputs)
    return embeddings

# Prepare dataset
image_paths = []
mask_paths = []
captions = []
labels = []


# Ensure the cover directory exists
cover_path = '/data1/dxw_data/llm/redbook/cover/'
os.makedirs(cover_path, exist_ok=True)

for item in data_json:
    image_path = os.path.join('/data1/dxw_data/llm/redbook/data', item['image'])
    mask_path = os.path.join('/data1/dxw_data/llm/redbook/processed', item['image'])
    if os.path.exists(image_path) and os.path.exists(mask_path):
        image_paths.append(image_path)
        mask_paths.append(mask_path)
        captions.append(item['caption'])
        labels.append(item['label'])

# Extract embeddings
text_embeddings = extract_text_embeddings(captions, image_paths,mask_paths, device)
image_embeddings = extract_image_embeddings(captions, image_paths,mask_paths, device)
mask_embeddings = extract_mask_embeddings(captions, image_paths,mask_paths, device)
image_embeddings = image_embeddings[ModalityType.VISION].cpu().numpy()
mask_embeddings = mask_embeddings[ModalityType.VISION].cpu().numpy()
text_embeddings = text_embeddings[ModalityType.TEXT].cpu().numpy()


In [None]:
# ---------------------------text和mask和image同时--------------------------- #

In [8]:
# Combine image and text features
combined_features = np.hstack((mask_embeddings, image_embeddings))
combined_features = np.hstack((combined_features, text_embeddings))

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_features, labels, test_size=0.2, random_state=42)

# Define and train the MLP model
mlp = make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(512, 256), max_iter=500, random_state=42))
mlp.fit(X_train, y_train)

# Evaluate the model
train_accuracy = mlp.score(X_train, y_train)
test_accuracy = mlp.score(X_test, y_test)

print(f'Train Accuracy: {train_accuracy:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

Train Accuracy: 1.0000
Test Accuracy: 0.6383


In [None]:
# ---------------------------mask和image同时--------------------------- #

In [9]:
# Combine image and text features
combined_features = np.hstack((mask_embeddings, image_embeddings))

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_features, labels, test_size=0.2, random_state=42)

# Define and train the MLP model
mlp = make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(512, 256), max_iter=500, random_state=42))
mlp.fit(X_train, y_train)

# Evaluate the model
train_accuracy = mlp.score(X_train, y_train)
test_accuracy = mlp.score(X_test, y_test)

print(f'Train Accuracy: {train_accuracy:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

Train Accuracy: 1.0000
Test Accuracy: 0.5957


In [None]:
# ---------------------------mask和text同时--------------------------- #

In [10]:
# Combine image and text features
combined_features = np.hstack((mask_embeddings, text_embeddings))

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_features, labels, test_size=0.2, random_state=42)

# Define and train the MLP model
mlp = make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(512, 256), max_iter=500, random_state=42))
mlp.fit(X_train, y_train)

# Evaluate the model
train_accuracy = mlp.score(X_train, y_train)
test_accuracy = mlp.score(X_test, y_test)

print(f'Train Accuracy: {train_accuracy:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

Train Accuracy: 0.9946
Test Accuracy: 0.6170


In [None]:
# ---------------------------image和text同时--------------------------- #

In [11]:
# Combine image and text features
combined_features = np.hstack((image_embeddings, text_embeddings))

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_features, labels, test_size=0.2, random_state=42)

# Define and train the MLP model
mlp = make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(512, 256), max_iter=500, random_state=42))
mlp.fit(X_train, y_train)

# Evaluate the model
train_accuracy = mlp.score(X_train, y_train)
test_accuracy = mlp.score(X_test, y_test)

print(f'Train Accuracy: {train_accuracy:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

Train Accuracy: 1.0000
Test Accuracy: 0.5745


In [None]:
# ---------------------------只用mask--------------------------- #

In [12]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(mask_embeddings, labels, test_size=0.2, random_state=42)

# Define and train the MLP model
mlp = make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(512, 256), max_iter=500, random_state=42))
mlp.fit(X_train, y_train)

# Evaluate the model
train_accuracy = mlp.score(X_train, y_train)
test_accuracy = mlp.score(X_test, y_test)

print(f'Train Accuracy: {train_accuracy:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

Train Accuracy: 0.9784
Test Accuracy: 0.5745


In [None]:
# ---------------------------只用image--------------------------- #

In [13]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(image_embeddings, labels, test_size=0.2, random_state=42)

# Define and train the MLP model
mlp = make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(512, 256), max_iter=500, random_state=42))
mlp.fit(X_train, y_train)

# Evaluate the model
train_accuracy = mlp.score(X_train, y_train)
test_accuracy = mlp.score(X_test, y_test)

print(f'Train Accuracy: {train_accuracy:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

Train Accuracy: 1.0000
Test Accuracy: 0.6170


In [None]:
# ---------------------------只用text--------------------------- #

# 其实也等价于多模态LLM模型合并图片文本--->文本

In [14]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(text_embeddings, labels, test_size=0.2, random_state=42)

# Define and train the MLP model
mlp = make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(512, 256), max_iter=500, random_state=42))
mlp.fit(X_train, y_train)

# Evaluate the model
train_accuracy = mlp.score(X_train, y_train)
test_accuracy = mlp.score(X_test, y_test)

print(f'Train Accuracy: {train_accuracy:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

Train Accuracy: 0.7135
Test Accuracy: 0.6809
