In [None]:
import pandas as pd

# Load fashion-related data (positive samples)
fashion_df = pd.read_csv('/home/disk1/red_disk1/fashion/topics_filtered.csv')

# Extract the 'keyword' column which contains fashion-related keywords
fashion_samples = fashion_df['keyword group']

# Load non-fashion-related data (negative samples)
non_fashion_df = pd.read_csv('/home/disk1/red_disk1/fashion/non_fashion_texts.csv')

# Assuming the non-fashion data has a column 'word'
non_fashion_samples = non_fashion_df['word']

# Step 2: Count the number of samples in each dataset
num_fashion_samples = fashion_samples.shape[0]
num_non_fashion_samples = non_fashion_samples.shape[0]

# Print the results
print(f"Number of fashion-related samples (positive): {num_fashion_samples}")
print(f"Number of non-fashion-related samples (negative): {num_non_fashion_samples}")

# Step 3: Check if they are balanced
if num_fashion_samples == num_non_fashion_samples:
    print("The datasets are balanced.")
else:
    print(f"The datasets are not balanced. Difference: {abs(num_fashion_samples - num_non_fashion_samples)}")


In [8]:
import torch
from torch import nn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import os

# Step 1: Read Positive Samples from 'topics_filtered.csv'
# Load the fashion lexicon (positive samples)
pos_file = "/home/disk1/red_disk1/fashion/topics_filtered.csv"
positive_samples_df = pd.read_csv(pos_file)
positive_samples = positive_samples_df['keyword group'].dropna().tolist()  # Extract keywords column

# Assign label 1 to positive samples
positive_samples = pd.DataFrame(positive_samples, columns=['word'])
positive_samples['label'] = 1

# Step 2: Read Negative Samples from non_fashion_texts
def read_negative_samples(directory):
    all_lines = []
    for foldername in os.listdir(directory):
        folder_path = os.path.join(directory, foldername)
        if os.path.isdir(folder_path):
            for filename in os.listdir(folder_path):
                file_path = os.path.join(folder_path, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    all_lines.extend([line.strip() for line in file if line.strip() != ""])
    return all_lines

neg_dir = "/home/disk1/red_disk1/Multimodal_MKT/non_fashion_texts"
negative_samples_list = read_negative_samples(neg_dir)

# Assign label 0 to negative samples
negative_samples = pd.DataFrame(negative_samples_list, columns=['word'])
negative_samples['label'] = 0

# Step 3: Combine Positive and Negative Samples
data = pd.concat([positive_samples, negative_samples], ignore_index=True)

# Shuffle the dataset
data = data.sample(frac=1).reset_index(drop=True)

# Print the shape and format of the data
print(f"Combined Data Shape: {data.shape}")
print(f"Data Format Example:\n{data.head()}")

# Step 4: Load text2vec-large-chinese model
model_path = "/home/disk1/red_disk1/fashion/text2vec-large-chinese"
text2vec_model = SentenceTransformer(model_path)

# Function to get embeddings from text2vec-large-chinese
def get_text2vec_embeddings(texts, model):
    print("Generating embeddings...")
    embeddings = []
    for i in tqdm(range(0, len(texts), 32), desc="Embedding Progress"):
        batch_texts = texts[i:i+32].tolist()
        batch_embeddings = model.encode(batch_texts, convert_to_tensor=True)
        embeddings.append(batch_embeddings)
    return torch.cat(embeddings, dim=0)

# Step 5: Get embeddings for the dataset
embeddings = get_text2vec_embeddings(data['word'], text2vec_model)

# Print the shape and format of the embeddings
print(f"Embeddings Shape: {embeddings.shape}")
print(f"Embedding Example:\n{embeddings[0]}")

# Convert labels to tensor
labels = torch.tensor(data['label'].values)

# Print the labels format
print(f"Labels Shape: {labels.shape}")
print(f"Labels Example:\n{labels[:5]}")

# Step 6: Train-Test Split
train_embeddings, val_embeddings, train_labels, val_labels = train_test_split(
    embeddings, labels, test_size=0.2, random_state=42
)

# Print the split data shapes
print(f"Train Embeddings Shape: {train_embeddings.shape}, Train Labels Shape: {train_labels.shape}")
print(f"Val Embeddings Shape: {val_embeddings.shape}, Val Labels Shape: {val_labels.shape}")

# Create TensorDataset and DataLoader for training and validation
train_dataset = TensorDataset(train_embeddings, train_labels)
val_dataset = TensorDataset(val_embeddings, val_labels)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Print the data loader batch size
print(f"Train Loader Batch Size: {len(train_loader)} batches")
print(f"Val Loader Batch Size: {len(val_loader)} batches")


No sentence-transformers model found with name /home/disk1/red_disk1/fashion/text2vec-large-chinese. Creating a new one with mean pooling.


Combined Data Shape: (356852, 2)
Data Format Example:
                  word  label
0                染指间岛.      0
1                中華民國.      0
2  以下介紹一種老虎棋，稱為十八子围老虎。      0
3                *火雲聖衣      0
4              ΛCDM模型.      0
Generating embeddings...


Embedding Progress:  18%|█▊        | 2056/11152 [02:52<11:28, 13.21it/s]

In [None]:
import pandas as pd

# Assuming your fashion and non-fashion datasets are in CSV format

# Load fashion dataset
fashion_df = pd.read_csv('/path/to/fashion_data.csv')
# Load non-fashion dataset
non_fashion_df = pd.read_csv('/path/to/non_fashion_data.csv')

# Check the number of samples in each dataset
num_fashion = len(fashion_df)
num_non_fashion = len(non_fashion_df)

# Print the size of both datasets
print(f"Number of fashion samples: {num_fashion}")
print(f"Number of non-fashion samples: {num_non_fashion}")

# Check if the sizes are balanced
if num_fashion == num_non_fashion:
    print("The datasets are balanced.")
else:
    print("The datasets are not balanced.")
    print(f"Difference: {abs(num_fashion - num_non_fashion)} samples.")


In [13]:
import torch
from torch import nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

# Step 5: Define ResNet-Based Binary Classification Model
class ResNetBinaryClassifier(nn.Module):
    def __init__(self, embedding_dim=1024):  # Adjust embedding_dim based on text2vec output (1024)
        super(ResNetBinaryClassifier, self).__init__()
        # Use pre-trained ResNet18, but replace the first conv layer to accept text2vec embeddings
        self.resnet = resnet18(pretrained=False)
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, 2)  # Binary classification
        
        # Define a simple MLP layer to map embeddings to ResNet input size
        self.embedding_to_resnet = nn.Sequential(
            nn.Linear(embedding_dim, 512),  # Match the expected ResNet input size
            nn.ReLU(),
            nn.Linear(512, 512),  # Match the expected ResNet input size after projection
        )
        
        # Adding a Conv layer to convert 1D to a 2D feature map
        self.embedding_conv = nn.Conv2d(1, 3, kernel_size=(1, 1))

        # Adaptive pooling layer to transform the spatial dimensions to (7, 7)
        self.adaptive_pool = nn.AdaptiveAvgPool2d((7, 7))

    def forward(self, x):
        # print(f"Input to forward (embedding): {x.shape}")
        x = self.embedding_to_resnet(x)  # Project text2vec embeddings to match ResNet input size
        # print(f"After embedding_to_resnet: {x.shape}")
        x = x.unsqueeze(1).unsqueeze(2)  # Reshape to fit Conv2D (batch_size, channels, height, width)
        # print(f"After unsqueeze: {x.shape}")
        x = self.embedding_conv(x)  # Use conv layer to convert 1D feature to 2D feature map
        # print(f"After embedding_conv: {x.shape}")
        x = self.adaptive_pool(x)  # Apply adaptive pooling to resize to (7, 7)
        # print(f"After adaptive_pool: {x.shape}")
        x = self.resnet(x)
        # print(f"Output of ResNet: {x.shape}")
        return x

# Initialize the model with the correct embedding dimension (1024 as per the output shape)
model = ResNetBinaryClassifier(embedding_dim=1024)

# Step 6: Training Loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training function
def train(model, train_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc="Training Progress"):
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    return total_loss / len(train_loader)

# Validation function
def evaluate(model, val_loader, criterion, device):
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation Progress"):
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')
    return total_loss / len(val_loader), accuracy, precision, recall, f1

# Train the model
epochs = 5
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    train_loss = train(model, train_loader, criterion, optimizer, device)
    val_loss, accuracy, precision, recall, f1 = evaluate(model, val_loader, criterion, device)
    
    print(f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

# Final evaluation
val_loss, accuracy, precision, recall, f1 = evaluate(model, val_loader, criterion, device)
print(f"Final Results: Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")




Epoch 1/5


Training Progress: 100%|██████████| 183/183 [00:02<00:00, 81.26it/s]
Validation Progress: 100%|██████████| 46/46 [00:00<00:00, 295.66it/s]


Train Loss: 0.5419, Val Loss: 0.6195, Accuracy: 0.6869
Precision: 0.6384, Recall: 0.9419, F1 Score: 0.7610
Epoch 2/5


Training Progress: 100%|██████████| 183/183 [00:02<00:00, 83.71it/s]
Validation Progress: 100%|██████████| 46/46 [00:00<00:00, 409.51it/s]


Train Loss: 0.3753, Val Loss: 0.3509, Accuracy: 0.8578
Precision: 0.8529, Recall: 0.8837, F1 Score: 0.8680
Epoch 3/5


Training Progress: 100%|██████████| 183/183 [00:02<00:00, 90.38it/s]
Validation Progress: 100%|██████████| 46/46 [00:00<00:00, 412.54it/s]


Train Loss: 0.3124, Val Loss: 0.4529, Accuracy: 0.7963
Precision: 0.7254, Recall: 0.9897, F1 Score: 0.8372
Epoch 4/5


Training Progress: 100%|██████████| 183/183 [00:02<00:00, 90.41it/s]
Validation Progress: 100%|██████████| 46/46 [00:00<00:00, 410.06it/s]


Train Loss: 0.2721, Val Loss: 0.3878, Accuracy: 0.8312
Precision: 0.9189, Recall: 0.7468, F1 Score: 0.8239
Epoch 5/5


Training Progress: 100%|██████████| 183/183 [00:02<00:00, 90.49it/s]
Validation Progress: 100%|██████████| 46/46 [00:00<00:00, 410.37it/s]


Train Loss: 0.2293, Val Loss: 0.3180, Accuracy: 0.8906
Precision: 0.9126, Recall: 0.8773, F1 Score: 0.8946


Validation Progress: 100%|██████████| 46/46 [00:00<00:00, 413.46it/s]

Final Results: Accuracy: 0.8906, Precision: 0.9126, Recall: 0.8773, F1 Score: 0.8946





In [14]:
# Step 7: Save the model
model_path = "/data1/dxw_data/llm/RA/cuhk_xinyu/resnet_binary_classifier.pth"
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

Model saved to /data1/dxw_data/llm/RA/cuhk_xinyu/resnet_binary_classifier.pth


In [None]:
# 应用模型测试

In [15]:
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Load the pre-trained SentenceTransformer model (text2vec-large-chinese)
model_path = "/data1/dxw_data/llm/text2vec-large-chinese"
text2vec_model = SentenceTransformer(model_path)

# Function to get embeddings from text2vec-large-chinese
def get_text2vec_embeddings(texts, model):
    embeddings = []
    for i in tqdm(range(0, len(texts), 32), desc="Generating Embeddings"):
        batch_texts = texts[i:i+32].tolist()
        batch_embeddings = model.encode(batch_texts, convert_to_tensor=True)
        embeddings.append(batch_embeddings)
    return torch.cat(embeddings, dim=0)

# Step 1: Load the test dataset
test_file = "/data1/dxw_data/llm/RA/cuhk_xinyu/dataset/test-dataset.csv"
test_data = pd.read_csv(test_file)

# Get the words column from the test dataset
test_words = test_data['word']

# Step 2: Get embeddings for the test words
test_embeddings = get_text2vec_embeddings(test_words, text2vec_model)

# Step 3: Create a DataLoader for the test data
test_dataset = TensorDataset(test_embeddings)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Step 4: Load the trained model
class ResNetBinaryClassifier(nn.Module):
    def __init__(self, embedding_dim=1024):
        super(ResNetBinaryClassifier, self).__init__()
        self.resnet = resnet18(pretrained=False)
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, 2)
        self.embedding_to_resnet = nn.Sequential(
            nn.Linear(embedding_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
        )
        self.embedding_conv = nn.Conv2d(1, 3, kernel_size=(1, 1))
        self.adaptive_pool = nn.AdaptiveAvgPool2d((7, 7))

    def forward(self, x):
        x = self.embedding_to_resnet(x)
        x = x.unsqueeze(1).unsqueeze(2)
        x = self.embedding_conv(x)
        x = self.adaptive_pool(x)
        x = self.resnet(x)
        return x

# Assuming the trained model is saved as 'resnet_binary_classifier.pth'
model_path = "/data1/dxw_data/llm/RA/cuhk_xinyu/resnet_binary_classifier.pth"
model = ResNetBinaryClassifier(embedding_dim=1024)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
model.eval()

# Step 5: Perform inference on the test dataset
output_labels = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing Progress"):
        inputs = batch[0].to(device)
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1)
        output_labels.extend(preds.cpu().numpy())

# Step 6: Save the results
# Add the output_labels as a new column to the original test_data DataFrame
test_data['output_label'] = output_labels

# Save the updated DataFrame to a new CSV file
output_file = "/data1/dxw_data/llm/RA/cuhk_xinyu/dataset/test-dataset-output.csv"
test_data.to_csv(output_file, index=False)

print(f"Results saved to {output_file}")


No sentence-transformers model found with name /data1/dxw_data/llm/text2vec-large-chinese. Creating a new one with MEAN pooling.
Generating Embeddings: 100%|██████████| 2/2 [00:00<00:00,  5.67it/s]
Testing Progress: 100%|██████████| 2/2 [00:00<00:00, 322.75it/s]

Results saved to /data1/dxw_data/llm/RA/cuhk_xinyu/dataset/test-dataset-output.csv





In [None]:
# 更多测试


In [2]:
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Load the pre-trained SentenceTransformer model (text2vec-large-chinese)
model_path = "/data1/dxw_data/llm/text2vec-large-chinese"
text2vec_model = SentenceTransformer(model_path)

# Function to get embeddings from text2vec-large-chinese
def get_text2vec_embeddings(texts, model):
    embeddings = []
    for i in tqdm(range(0, len(texts), 32), desc="Generating Embeddings"):
        batch_texts = texts[i:i+32].tolist()
        batch_embeddings = model.encode(batch_texts, convert_to_tensor=True)
        embeddings.append(batch_embeddings)
    return torch.cat(embeddings, dim=0)

# Step 1: Load the test dataset
test_file = "/data1/dxw_data/llm/RA/cuhk_xinyu/dataset/filtered_notstyle_dataset.csv"
test_data = pd.read_csv(test_file)

# Get the words column from the test dataset
test_words = test_data['word']

# Step 2: Get embeddings for the test words
test_embeddings = get_text2vec_embeddings(test_words, text2vec_model)

# Step 3: Create a DataLoader for the test data
test_dataset = TensorDataset(test_embeddings)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Step 4: Load the trained model
class ResNetBinaryClassifier(nn.Module):
    def __init__(self, embedding_dim=1024):
        super(ResNetBinaryClassifier, self).__init__()
        self.resnet = resnet18(pretrained=False)
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, 2)
        self.embedding_to_resnet = nn.Sequential(
            nn.Linear(embedding_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
        )
        self.embedding_conv = nn.Conv2d(1, 3, kernel_size=(1, 1))
        self.adaptive_pool = nn.AdaptiveAvgPool2d((7, 7))

    def forward(self, x):
        x = self.embedding_to_resnet(x)
        x = x.unsqueeze(1).unsqueeze(2)
        x = self.embedding_conv(x)
        x = self.adaptive_pool(x)
        x = self.resnet(x)
        return x

# Assuming the trained model is saved as 'resnet_binary_classifier.pth'
model_path = "/data1/dxw_data/llm/RA/cuhk_xinyu/resnet_binary_classifier.pth"
model = ResNetBinaryClassifier(embedding_dim=1024)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
model.eval()

# Step 5: Perform inference on the test dataset
output_labels = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing Progress"):
        inputs = batch[0].to(device)
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1)
        output_labels.extend(preds.cpu().numpy())

# Step 6: Save the results
# Add the output_labels as a new column to the original test_data DataFrame
test_data['output_label'] = output_labels

# Save the updated DataFrame to a new CSV file
output_file = "/data1/dxw_data/llm/RA/cuhk_xinyu/dataset/filtered_notstyle_dataset-output.csv"
test_data.to_csv(output_file, index=False)

print(f"Results saved to {output_file}")


ValueError: Path /data1/dxw_data/llm/text2vec-large-chinese not found

In [17]:
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# Load the pre-trained SentenceTransformer model (text2vec-large-chinese)
model_path = "/data1/dxw_data/llm/text2vec-large-chinese"
text2vec_model = SentenceTransformer(model_path)

# Function to get embeddings from text2vec-large-chinese
def get_text2vec_embeddings(texts, model):
    embeddings = []
    for i in tqdm(range(0, len(texts), 32), desc="Generating Embeddings"):
        batch_texts = texts[i:i+32].tolist()
        batch_embeddings = model.encode(batch_texts, convert_to_tensor=True)
        embeddings.append(batch_embeddings)
    return torch.cat(embeddings, dim=0)

# Step 1: Load the test dataset
test_file = "/data1/dxw_data/llm/RA/cuhk_xinyu/dataset/filtered_style_dataset.csv"
test_data = pd.read_csv(test_file)

# Get the words column from the test dataset
test_words = test_data['word']

# Step 2: Get embeddings for the test words
test_embeddings = get_text2vec_embeddings(test_words, text2vec_model)

# Step 3: Create a DataLoader for the test data
test_dataset = TensorDataset(test_embeddings)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Step 4: Load the trained model
class ResNetBinaryClassifier(nn.Module):
    def __init__(self, embedding_dim=1024):
        super(ResNetBinaryClassifier, self).__init__()
        self.resnet = resnet18(pretrained=False)
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, 2)
        self.embedding_to_resnet = nn.Sequential(
            nn.Linear(embedding_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
        )
        self.embedding_conv = nn.Conv2d(1, 3, kernel_size=(1, 1))
        self.adaptive_pool = nn.AdaptiveAvgPool2d((7, 7))

    def forward(self, x):
        x = self.embedding_to_resnet(x)
        x = x.unsqueeze(1).unsqueeze(2)
        x = self.embedding_conv(x)
        x = self.adaptive_pool(x)
        x = self.resnet(x)
        return x

# Assuming the trained model is saved as 'resnet_binary_classifier.pth'
model_path = "/data1/dxw_data/llm/RA/cuhk_xinyu/resnet_binary_classifier.pth"
model = ResNetBinaryClassifier(embedding_dim=1024)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
model.eval()

# Step 5: Perform inference on the test dataset
output_labels = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing Progress"):
        inputs = batch[0].to(device)
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1)
        output_labels.extend(preds.cpu().numpy())

# Step 6: Save the results
# Add the output_labels as a new column to the original test_data DataFrame
test_data['output_label'] = output_labels

# Save the updated DataFrame to a new CSV file
output_file = "/data1/dxw_data/llm/RA/cuhk_xinyu/dataset/filtered_style_dataset-output.csv"
test_data.to_csv(output_file, index=False)

print(f"Results saved to {output_file}")


No sentence-transformers model found with name /data1/dxw_data/llm/text2vec-large-chinese. Creating a new one with MEAN pooling.


Generating Embeddings: 100%|██████████| 197/197 [00:07<00:00, 26.29it/s]
Testing Progress: 100%|██████████| 197/197 [00:00<00:00, 463.21it/s]

Results saved to /data1/dxw_data/llm/RA/cuhk_xinyu/dataset/filtered_style_dataset-output.csv





In [None]:
# 用于聚类embedding的数据

In [3]:
import torch
import pandas as pd
import ast
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
import torch
from torch import nn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from torchvision.models import resnet18
import torch.optim as optim
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Step 1: Load the pre-trained SentenceTransformer model (text2vec-large-chinese)
model_path = "/data1/dxw_data/llm/text2vec-large-chinese"
text2vec_model = SentenceTransformer(model_path)

# Step 2: Load the trained ResNet binary classifier
class ResNetBinaryClassifier(nn.Module):
    def __init__(self, embedding_dim=1024):
        super(ResNetBinaryClassifier, self).__init__()
        self.resnet = resnet18(pretrained=False)
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, 2)
        self.embedding_to_resnet = nn.Sequential(
            nn.Linear(embedding_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
        )
        self.embedding_conv = nn.Conv2d(1, 3, kernel_size=(1, 1))
        self.adaptive_pool = nn.AdaptiveAvgPool2d((7, 7))

    def forward(self, x):
        x = self.embedding_to_resnet(x)
        x = x.unsqueeze(1).unsqueeze(2)
        x = self.embedding_conv(x)
        x = self.adaptive_pool(x)
        x = self.resnet(x)
        return x

# Load the model
model_file = "/data1/dxw_data/llm/RA/cuhk_xinyu/resnet_binary_classifier.pth"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ResNetBinaryClassifier(embedding_dim=1024)
model.load_state_dict(torch.load(model_file, map_location=device))
model.to(device)
model.eval()

# Step 3: Load the CSV file containing rake_keywords
csv_file = "/data1/dxw_data/llm/redbook_final/script_next/matching_records.csv"
df = pd.read_csv(csv_file)

# Step 4: Process rake_keywords and filter words based on model prediction
def get_text2vec_embeddings(words, model):
    embeddings = model.encode(words, convert_to_tensor=True)
    return embeddings

filtered_keywords = []

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing Rows"):
    # Check if rake_keywords is NaN or empty
    if pd.isna(row['rake_keywords']) or not row['rake_keywords'].strip():
        filtered_keywords.append([])  # Add an empty list for NaN or invalid rows
        continue
    
    try:
        # Parse the rake_keywords string into a list of words
        words = ast.literal_eval(row['rake_keywords'])
        
        # Get embeddings for the words
        embeddings = get_text2vec_embeddings(words, text2vec_model).to(device)
        
        # Make predictions using the pre-trained ResNet binary classifier
        with torch.no_grad():
            outputs = model(embeddings)
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
        
        # Filter words where the model predicts label 1
        filtered_words = [word for word, label in zip(words, preds) if label == 1]
        filtered_keywords.append(filtered_words)
    except (ValueError, SyntaxError) as e:
        # If parsing fails, append an empty list and continue
        print(f"Error parsing rake_keywords in row {idx}: {e}")
        filtered_keywords.append([])

# Step 5: Add the new column filter_label and save the DataFrame
df['filter_label'] = filtered_keywords

# Save the updated DataFrame to a new CSV file
output_file = "/data1/dxw_data/llm/redbook_final/script_next/matching_records_filtered.csv"
df.to_csv(output_file, index=False)

print(f"Filtered results saved to {output_file}")


No sentence-transformers model found with name /data1/dxw_data/llm/text2vec-large-chinese. Creating a new one with MEAN pooling.
Processing Rows: 100%|██████████| 12807/12807 [03:59<00:00, 53.45it/s]


Filtered results saved to /data1/dxw_data/llm/redbook_final/script_next/matching_records_filtered.csv


In [11]:
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Step 1: Load the pre-trained SentenceTransformer model (text2vec-large-chinese)
model_path = "/data1/dxw_data/llm/text2vec-large-chinese"
text2vec_model = SentenceTransformer(model_path)

# Step 2: Load the trained ResNet binary classifier
class ResNetBinaryClassifier(nn.Module):
    def __init__(self, embedding_dim=1024):
        super(ResNetBinaryClassifier, self).__init__()
        self.resnet = resnet18(pretrained=False)
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, 2)
        self.embedding_to_resnet = nn.Sequential(
            nn.Linear(embedding_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
        )
        self.embedding_conv = nn.Conv2d(1, 3, kernel_size=(1, 1))
        self.adaptive_pool = nn.AdaptiveAvgPool2d((7, 7))

    def forward(self, x):
        x = self.embedding_to_resnet(x)
        x = x.unsqueeze(1).unsqueeze(2)
        x = self.embedding_conv(x)
        x = self.adaptive_pool(x)
        x = self.resnet(x)
        return x

# Load the model
model_file = "/data1/dxw_data/llm/RA/cuhk_xinyu/resnet_binary_classifier.pth"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ResNetBinaryClassifier(embedding_dim=1024)
model.load_state_dict(torch.load(model_file, map_location=device))
model.to(device)
model.eval()

# Step 3: Load the CSV file containing rake_keywords, limiting to the first 500 rows
csv_file = "/data1/dxw_data/llm/redbook_final/script_next/matching_records.csv"
df = pd.read_csv(csv_file, nrows=500)  # Only load the first 500 rows

# Step 4: Ensure that we split rake_keywords correctly into individual words
def split_keywords(keywords_string):
    # Split by spaces or other separators, depending on the actual format
    return keywords_string.split()

# Step 5: Process rake_keywords and filter words based on model prediction
def get_text2vec_embeddings(words, model):
    embeddings = model.encode(words, convert_to_tensor=True)
    return embeddings

filtered_keywords = []

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing Rows"):
    # Check if rake_keywords is NaN or empty
    if pd.isna(row['rake_keywords']) or not row['rake_keywords'].strip():
        filtered_keywords.append([])  # Add an empty list for NaN or invalid rows
        continue  # Skip NaN or invalid rows
    
    try:
        # Split the rake_keywords string into a list of words
        words = split_keywords(row['rake_keywords'])
        
        # Ensure each word is processed individually
        filtered_words = []
        for word in words:
            # Get embedding for the word
            embeddings = get_text2vec_embeddings([word], text2vec_model).to(device)
            
            # Make predictions using the pre-trained ResNet binary classifier
            with torch.no_grad():
                outputs = model(embeddings)
                pred = torch.argmax(outputs, dim=1).cpu().numpy()[0]
            
            # If the prediction label is 1, append the word to the filtered list
            if pred == 1:
                filtered_words.append(word)

        filtered_keywords.append(filtered_words)

    except (ValueError, SyntaxError) as e:
        print(f"Error parsing rake_keywords in row {idx}: {e}")
        filtered_keywords.append([])

# Step 6: Add the new column filter_label and save the DataFrame
df['filter_label'] = filtered_keywords

# Save the updated DataFrame to a new CSV file
output_file = "/data1/dxw_data/llm/redbook_final/script_next/matching_records_filtered_500.csv"
df.to_csv(output_file, index=False)

print(f"Filtered results saved to {output_file}")


No sentence-transformers model found with name /data1/dxw_data/llm/text2vec-large-chinese. Creating a new one with MEAN pooling.
Processing Rows: 100%|██████████| 500/500 [07:16<00:00,  1.15it/s]

Filtered results saved to /data1/dxw_data/llm/redbook_final/script_next/matching_records_filtered_500.csv





In [13]:
import pandas as pd

# Step 1: 读取文件
csv_file = "/data1/dxw_data/llm/Multimodal-MKT/label/text_predict/dataset/matching_records_filtered_500.csv"
df = pd.read_csv(csv_file)

# Step 2: 随机采样30行
sampled_df = df.sample(n=30, random_state=42)  # 随机采样30行

# Step 3: 去重函数，保持顺序不变
def remove_duplicates(word_list):
    seen = set()
    return [x for x in word_list if not (x in seen or seen.add(x))]

# Step 4: 打印输出其中的 summary_cleaned 列和 filter_label 列，去掉 filter_label 中的重复单词
for i, row in sampled_df.iterrows():
    summary_cleaned = eval(row['summary_cleaned'])  # 转换为列表
    filter_label = eval(row['filter_label'])  # 转换为列表
    filter_label_unique = remove_duplicates(filter_label)  # 去重
    
    print(f"Example {i+1}:")
    print(f"summary_cleaned: {summary_cleaned}")
    print(f"filter_label: {filter_label_unique}")
    print()  # 空行用于美观输出


FileNotFoundError: [Errno 2] No such file or directory: 'matching_records_filtered_500.csv'