In [1]:
import json
import os
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torchvision.models import resnet50
from PIL import Image
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
import numpy as np
from sklearn.metrics import mean_squared_error, accuracy_score

# Load JSON file
with open('/data1/dxw_data/llm/MKT_data_mining/Multimodal/data/captions_labeled_with_time.json', 'r') as f:
    data = json.load(f)

# Load pretrained ResNet
resnet = resnet50(pretrained=False)
resnet.load_state_dict(torch.load('/data1/dxw_data/llm/resnet/resnet50-19c8e357.pth'))
resnet = nn.Sequential(*list(resnet.children())[:-1])  # Remove the classification layer
resnet.eval()

word2vec_path = '/data1/dxw_data/llm/word2vec/GoogleNews-vectors-negative300.bin.gz'
word2vec_model  = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)




In [2]:
# Preprocess transforms for images
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Function to apply mask, save the merged image, and extract image features
def extract_imagemask_features(image_path, mask_path, cover_path):
    image = Image.open(image_path).convert('RGB')
    mask = Image.open(mask_path).convert('L')

    # Apply mask to the image
    image_np = np.array(image)
    mask_np = np.array(mask)
    masked_image_np = np.multiply(image_np, mask_np[:, :, None] / 255.0)
    masked_image = Image.fromarray(masked_image_np.astype(np.uint8))

    # Save the masked image
    masked_image_save_path = os.path.join(cover_path, os.path.basename(image_path))
    masked_image.save(masked_image_save_path)

    # Apply transformations
    masked_image = transform(masked_image).unsqueeze(0)

    # Extract features using ResNet
    with torch.no_grad():
        features = resnet(masked_image).squeeze().numpy()
    return features

# Ensure the cover directory exists
cover_path = '/data1/dxw_data/llm/redbook/cover'
os.makedirs(cover_path, exist_ok=True)

# Function to extract image features
def extract_image_features(image_path):
    image = Image.open(image_path).convert('RGB')
    image = transform(image).unsqueeze(0)
    with torch.no_grad():
        features = resnet(image).squeeze().numpy()
    return features

# Function to extract text features
def extract_text_features(caption):
    words = caption.split()
    word_vectors = []
    for word in words:
        if word in word2vec_model:
            vector = word2vec_model[word]
            word_vectors.append(vector)
    if not word_vectors:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(word_vectors, axis=0)

# Prepare dataset
image_features = []
text_features = []
mask_features= []
labels = []
times = []

for item in data:
    image_path = os.path.join('/data1/dxw_data/llm/redbook/data', item['image'])
    mask_path = os.path.join('/data1/dxw_data/llm/redbook/processed', item['image'])
    if os.path.exists(image_path) and os.path.exists(mask_path):
        mask_feat = extract_imagemask_features(image_path, mask_path, cover_path)
        img_feat = extract_image_features(image_path)
        txt_feat = extract_text_features(item['caption'])
        mask_features.append(mask_feat)
        image_features.append(img_feat)
        text_features.append(txt_feat)
        labels.append(item['label'])
        times.append(item['time'])


# Device configuration
device = torch.device('cuda:7' if torch.cuda.is_available() else 'cpu')

# Convert to numpy arrays
image_features = np.array(image_features)
mask_features = np.array(mask_features)
text_features = np.array(text_features)
labels = np.array(labels)
times = np.array(times)

In [None]:
# ---------------------------text和mask和image同时--------------------------- #

In [3]:
# Combine image and text features
combined_features = np.hstack((mask_features, image_features))
combined_embeddings  = np.hstack((combined_features, text_features))

# Sequence length for time series prediction
sequence_length = 2

# Create sequences for time series prediction
def create_sequences(data, labels, times, seq_length):
    xs, ys = [], []
    for i in range(len(data) - seq_length):
        # Ensure that we have at least `seq_length` consecutive time steps
        if times[i+seq_length-1] - times[i] < seq_length:
            x = data[i:i+seq_length]
            y = labels[i+seq_length]
            xs.append(x)
            ys.append(y)
    return np.array(xs), np.array(ys)

X, y = create_sequences(combined_embeddings, labels, times, sequence_length)

print("X.shape:", X.shape)  # Expected to be non-zero
print("y.shape:", y.shape)  # Expected to be non-zero

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Define the Transformer model
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:x.size(0), :]

class TransformerModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, nhead):
        super(TransformerModel, self).__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(input_size)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=input_size, nhead=nhead, dim_feedforward=hidden_size)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.decoder = nn.Linear(input_size, output_size)

    def forward(self, src):
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        output = self.decoder(output[-1])
        return output

# Hyperparameters
input_size = combined_embeddings.shape[1]
hidden_size = 128
output_size = 1  # Assuming binary classification (use appropriate value for your task)
num_layers = 2
num_heads = 4  # Number of heads in the multiheadattention models
num_epochs = 20
learning_rate = 0.001

# Model, loss function, and optimizer
model = TransformerModel(input_size, hidden_size, output_size, num_layers, num_heads).to(device)
criterion = nn.MSELoss()  # For regression tasks
# criterion = nn.BCELoss()  # For binary classification tasks
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

X.shape: (230, 2, 4396)
y.shape: (230,)


In [6]:
from sklearn.metrics import mean_squared_error, accuracy_score
# Train the model
# Train the model
for epoch in range(num_epochs):
    model.train()
    outputs = model(torch.tensor(X_train, dtype=torch.float32).to(device).permute(1, 0, 2))  # [seq_len, batch_size, input_size]
    optimizer.zero_grad()
    loss = criterion(outputs, torch.tensor(y_train, dtype=torch.float32).to(device))
    loss.backward()
    optimizer.step()
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluate the model
model.eval()
with torch.no_grad():
    predicted = model(torch.tensor(X_test, dtype=torch.float32).to(device).permute(1, 0, 2)).cpu().numpy()
    test_loss = mean_squared_error(y_test, predicted)
    print(f'Test Loss: {test_loss:.4f}')
    
    # If the task is classification, calculate accuracy
    if output_size == 1:  # Adjust the condition based on your task
        predicted_labels = (predicted > 0.5).astype(int)
        y_test_labels = (y_test > 0.5).astype(int)
        accuracy = accuracy_score(y_test_labels, predicted_labels)
        print(f'Test Accuracy: {accuracy:.4f}')

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch [1/20], Loss: 0.3735
Epoch [2/20], Loss: 0.3108
Epoch [3/20], Loss: 0.2749
Epoch [4/20], Loss: 0.2708
Epoch [5/20], Loss: 0.2940
Epoch [6/20], Loss: 0.3296
Epoch [7/20], Loss: 0.3613
Epoch [8/20], Loss: 0.3447
Epoch [9/20], Loss: 0.3196
Epoch [10/20], Loss: 0.2890
Epoch [11/20], Loss: 0.2686
Epoch [12/20], Loss: 0.2705
Epoch [13/20], Loss: 0.2731
Epoch [14/20], Loss: 0.2922
Epoch [15/20], Loss: 0.3052
Epoch [16/20], Loss: 0.2987
Epoch [17/20], Loss: 0.2910
Epoch [18/20], Loss: 0.2763
Epoch [19/20], Loss: 0.2658
Epoch [20/20], Loss: 0.2669
Test Loss: 0.2559
Test Accuracy: 0.5435


In [None]:
# ---------------------------mask和image同时--------------------------- #

In [None]:
# ---------------------------mask和text同时--------------------------- #

In [None]:
# ---------------------------image和text同时--------------------------- #

In [None]:
# ---------------------------只用mask--------------------------- #

In [4]:
# ---------------------------只用image--------------------------- #

In [6]:
# ---------------------------只用text--------------------------- #

# 其实也等价于多模态LLM模型合并图片文本--->文本