In [None]:
# 使用四舍五入的数值做label，小数点后2位，最后回归拟合的也是msr等值-离散程度，不是准确率。

In [11]:
import pandas as pd
from dateutil import parser
import os
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader, Subset
from torchvision import transforms
from tqdm import tqdm
import numpy as np

# Define file paths
data_dir = "/data1/dxw_data/llm/redbook_final/script_next/"
rawdata_path = os.path.join(data_dir, "rawdata_20%.csv")
after2monthdata_path = os.path.join(data_dir, "after2monthdata_20%.csv")
image_dir = os.path.join(data_dir, "data_img_20%")

# Read CSV files
rawdata = pd.read_csv(rawdata_path)
after2monthdata = pd.read_csv(after2monthdata_path)

# Convert date columns to standard format
def parse_date(date_str):
    try:
        return parser.parse(date_str)
    except ValueError:
        return None

rawdata['post_date'] = rawdata['post_date'].apply(parse_date)
rawdata = rawdata.dropna(subset=['post_date'])

# Split data into training and testing sets
train_rawdata = rawdata[(rawdata['post_date'].dt.month >= 1) & (rawdata['post_date'].dt.month <= 9)]
test_rawdata = rawdata[(rawdata['post_date'].dt.month >= 10) & (rawdata['post_date'].dt.month <= 12)]

train_after2monthdata = after2monthdata[after2monthdata['post_id'].isin(train_rawdata['post_id'])]
test_after2monthdata = after2monthdata[after2monthdata['post_id'].isin(test_rawdata['post_id'])]

# Round the proportion values to 2 decimal places
train_after2monthdata['proportion'] = train_after2monthdata['proportion'].round(2)
test_after2monthdata['proportion'] = test_after2monthdata['proportion'].round(2)

# Clean the labels
train_after2monthdata = train_after2monthdata.replace([np.inf, -np.inf], np.nan).dropna(subset=['proportion'])
test_after2monthdata = test_after2monthdata.replace([np.inf, -np.inf], np.nan).dropna(subset=['proportion'])

# Only use 1/10th of the data
def get_subset_indices(data, fraction=0.1):
    data_size = len(data)
    indices = list(range(data_size))
    np.random.shuffle(indices)
    split = int(np.floor(fraction * data_size))
    return indices[:split]

train_indices = get_subset_indices(train_rawdata)
test_indices = get_subset_indices(test_rawdata)

train_rawdata = train_rawdata.iloc[train_indices]
test_rawdata = test_rawdata.iloc[test_indices]

train_after2monthdata = train_after2monthdata[train_after2monthdata['post_id'].isin(train_rawdata['post_id'])]
test_after2monthdata = test_after2monthdata[test_after2monthdata['post_id'].isin(test_rawdata['post_id'])]

# Custom Dataset Class
class MultimodalDataset(Dataset):
    def __init__(self, rawdata, after2monthdata, image_dir, transform=None, max_images=1):
        self.rawdata = rawdata
        self.after2monthdata = after2monthdata
        self.image_dir = image_dir
        self.transform = transform
        self.max_images = max_images
        self.data = self._prepare_data()
    
    def _prepare_data(self):
        data = []
        for _, row in tqdm(self.rawdata.iterrows(), total=self.rawdata.shape[0], desc="Processing data"):
            poster_id = row['poster_id']
            post_id = row['post_id']
            image_files = [f for f in os.listdir(self.image_dir) if f"{poster_id}_{post_id}" in f]
            images = []
            for image_file in image_files[:self.max_images]:
                image_path = os.path.join(self.image_dir, image_file)
                image = Image.open(image_path).convert('RGB')
                if self.transform:
                    image = self.transform(image)
                images.append(image)
            # If less than max_images, pad with zeros
            while len(images) < self.max_images:
                images.append(torch.zeros((3, 224, 224)))
            if images:
                summary = row['summary']
                label_data = self.after2monthdata[self.after2monthdata['post_id'] == post_id]['proportion']
                if not label_data.empty:
                    label = label_data.values[0]
                    data.append((summary, images, label))
        return data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        summary, images, label = self.data[idx]
        return summary, torch.stack(images).float(), torch.tensor(label).float()

# Image Transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

# Create Datasets
train_dataset = MultimodalDataset(train_rawdata, train_after2monthdata, image_dir, transform=transform)
test_dataset = MultimodalDataset(test_rawdata, test_after2monthdata, image_dir, transform=transform)

# Create Data Loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_after2monthdata['proportion'] = train_after2monthdata['proportion'].round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_after2monthdata['proportion'] = test_after2monthdata['proportion'].round(2)
Processing data: 100%|██████████| 3467/3467 [03:15<00:00, 17.73it/s]
Processing data: 100%|██████████| 802/802 [00:45<00:00, 17.81it/s]


In [12]:
# Model Definition
import torch
import torch.nn as nn
import torch.optim as optim
from imagebind import data
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType
from tqdm import tqdm

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load imagebind model
imagebind_model = imagebind_model.imagebind_huge(pretrained=True)
imagebind_model.eval()
imagebind_model.to(device)

class CrossAttentionFusion(nn.Module):
    def __init__(self, text_embedding_dim, vision_embedding_dim, common_embedding_dim, num_heads):
        super(CrossAttentionFusion, self).__init__()
        self.text_linear = nn.Linear(text_embedding_dim, common_embedding_dim)
        self.vision_linear = nn.Linear(vision_embedding_dim, common_embedding_dim)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=common_embedding_dim, nhead=num_heads), num_layers=2)
        self.lstm = nn.LSTM(common_embedding_dim, common_embedding_dim, batch_first=True)
        self.fc = nn.Linear(common_embedding_dim, 1)
    
    def forward(self, text_embeddings, vision_embeddings):
        text_embeddings = self.text_linear(text_embeddings)
        vision_embeddings = self.vision_linear(vision_embeddings)
        # Flatten embeddings to match Transformer input requirements
        text_embeddings = text_embeddings.view(text_embeddings.size(0), -1, text_embeddings.size(-1))
        vision_embeddings = vision_embeddings.view(vision_embeddings.size(0), -1, vision_embeddings.size(-1))
        multimodal_embeddings = torch.cat((text_embeddings, vision_embeddings), dim=1)
        multimodal_embeddings = self.transformer_encoder(multimodal_embeddings)
        lstm_out, _ = self.lstm(multimodal_embeddings)
        lstm_out = lstm_out[:, -1, :]  # Take output of the last time step
        output = self.fc(lstm_out)
        return output

def get_embeddings(text_list, image_tensors):
    inputs = {
        ModalityType.TEXT: data.load_and_transform_text(text_list, device),
        ModalityType.VISION: image_tensors.to(device)
    }
    
    with torch.no_grad():
        embeddings = imagebind_model(inputs)
    
    # Check for NaNs and Infinities in embeddings
    if torch.isnan(embeddings[ModalityType.TEXT]).any() or torch.isinf(embeddings[ModalityType.TEXT]).any():
        print("NaN or Infinity values found in text embeddings")
    if torch.isnan(embeddings[ModalityType.VISION]).any() or torch.isinf(embeddings[ModalityType.VISION]).any():
        print("NaN or Infinity values found in vision embeddings")
    
    return embeddings

def train_model(model, train_loader, criterion, optimizer, num_epochs):
    model.train()
    log_file = open("log.txt", "w")
    for epoch in range(num_epochs):
        running_loss = 0.0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{num_epochs}"):
            summaries, images, labels = batch
            images, labels = images.to(device), labels.to(device)
            text_embeddings = get_embeddings(summaries, images)
            optimizer.zero_grad()
            outputs = model(text_embeddings[ModalityType.TEXT], text_embeddings[ModalityType.VISION])
            loss = criterion(outputs, labels.unsqueeze(1))
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        
        epoch_loss = running_loss / len(train_loader)
        log_file.write(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss}\n')
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss}')
    log_file.close()

# Initialize model, criterion, and optimizer
text_embedding_dim = 1024
vision_embedding_dim = 1024
common_embedding_dim = 768
num_heads = 8

model = CrossAttentionFusion(text_embedding_dim, vision_embedding_dim, common_embedding_dim, num_heads).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 10
train_model(model, train_loader, criterion, optimizer, num_epochs)

# Save the model
torch.save(model.state_dict(), os.path.join(data_dir, "multimodal_model.pth"))

print("Model training completed and saved!")

Training Epoch 1/10: 100%|██████████| 96/96 [00:47<00:00,  2.00it/s]


Epoch [1/10], Loss: 0.06847658870674422


Training Epoch 2/10: 100%|██████████| 96/96 [00:48<00:00,  1.99it/s]


Epoch [2/10], Loss: 0.0005873890116466404


Training Epoch 3/10: 100%|██████████| 96/96 [00:48<00:00,  1.99it/s]


Epoch [3/10], Loss: 0.0005854149333875588


Training Epoch 4/10: 100%|██████████| 96/96 [00:48<00:00,  1.97it/s]


Epoch [4/10], Loss: 0.0005911607766696155


Training Epoch 5/10: 100%|██████████| 96/96 [00:48<00:00,  1.99it/s]


Epoch [5/10], Loss: 0.0005790043707444662


Training Epoch 6/10: 100%|██████████| 96/96 [00:48<00:00,  1.99it/s]


Epoch [6/10], Loss: 0.0005807790165211676


Training Epoch 7/10: 100%|██████████| 96/96 [00:47<00:00,  2.00it/s]


Epoch [7/10], Loss: 0.0005931772725489282


Training Epoch 8/10: 100%|██████████| 96/96 [00:48<00:00,  2.00it/s]


Epoch [8/10], Loss: 0.0005708792235357881


Training Epoch 9/10: 100%|██████████| 96/96 [00:48<00:00,  1.99it/s]


Epoch [9/10], Loss: 0.0005663760195299498


Training Epoch 10/10: 100%|██████████| 96/96 [00:48<00:00,  1.98it/s]


Epoch [10/10], Loss: 0.0005702972586429192
Model training completed and saved!


In [13]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Testing the model
def test_model(model, test_loader, criterion):
    model.eval()  # Set the model to evaluation mode
    test_loss = 0.0
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for summaries, images, labels in tqdm(test_loader, desc="Testing"):
            images, labels = images.to(device), labels.to(device)
            
            # Check for NaNs or Infinities in input data
            if torch.isnan(images).any() or torch.isinf(images).any():
                print("NaN or Infinity values found in input images")
                continue
            if torch.isnan(labels).any() or torch.isinf(labels).any():
                print("NaN or Infinity values found in input labels")
                continue

            text_embeddings = get_embeddings(summaries, images)
            outputs = model(text_embeddings[ModalityType.TEXT], text_embeddings[ModalityType.VISION])
            
            # Check for NaNs and Infinities in outputs
            if torch.isnan(outputs).any() or torch.isinf(outputs).any():
                print("NaN or Infinity values found in outputs")
                continue
            
            loss = criterion(outputs, labels.unsqueeze(1))
            if torch.isnan(loss).any() or torch.isinf(loss).any():
                print("NaN or Infinity values found in loss")
                continue
            
            test_loss += loss.item()
            
            all_predictions.extend(outputs.cpu().numpy().flatten())
            all_labels.extend(labels.cpu().numpy().flatten())
    
    avg_test_loss = test_loss / len(test_loader)
    
    # Convert lists to numpy arrays
    all_predictions = np.array(all_predictions)
    all_labels = np.array(all_labels)
    
    # Check for NaNs and Infinities in all_predictions and all_labels
    mask = ~np.isnan(all_predictions) & ~np.isinf(all_predictions) & ~np.isnan(all_labels) & ~np.isinf(all_labels)
    all_predictions = all_predictions[mask]
    all_labels = all_labels[mask]

    mse = mean_squared_error(all_labels, all_predictions)
    mae = mean_absolute_error(all_labels, all_predictions)
    r2 = r2_score(all_labels, all_predictions)
    
    print(f'Test Loss: {avg_test_loss}')
    print(f'Mean Squared Error: {mse}')
    print(f'Mean Absolute Error: {mae}')
    print(f'R2 Score: {r2}')
    
    return avg_test_loss, mse, mae, r2

# Initialize model, criterion
model = CrossAttentionFusion(text_embedding_dim, vision_embedding_dim, common_embedding_dim, num_heads).to(device)
model.load_state_dict(torch.load(os.path.join(data_dir, "multimodal_model.pth")))
criterion = nn.MSELoss()

# Test the model
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
avg_test_loss, mse, mae, r2 = test_model(model, test_loader, criterion)

print("Model testing completed!")


Testing: 100%|██████████| 25/25 [00:11<00:00,  2.10it/s]

Test Loss: 0.0005750005086883902
Mean Squared Error: 0.000559596810489893
Mean Absolute Error: 0.01885504089295864
R2 Score: -0.007188295743023154
Model testing completed!



