##### Check if the datasets are balanced

In [11]:
import os
import pandas as pd

# Load fashion-related data (positive samples)
fashion_df = pd.read_csv('/home/disk1/red_disk1/Multimodal_MKT/topics_filtered.csv')

# Extract the 'keyword' column which contains fashion-related keywords
fashion_samples = fashion_df['keyword group']

# Directory containing non-fashion-related samples
neg_dir = "/home/disk1/red_disk1/Multimodal_MKT/non_fashion_texts"

# Initialize an empty list to store non-fashion samples
non_fashion_samples = []

# Recursively iterate through all subdirectories and files in non_fashion_texts directory
for root, dirs, files in os.walk(neg_dir):
    for file in files:
        # Read only text files
        if file.startswith('wiki'):
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                # Add each line as a sample
                lines = f.readlines()
                non_fashion_samples.extend([line.strip() for line in lines if line.strip()])

# Convert non-fashion samples to a pandas Series for consistency
non_fashion_samples = pd.Series(non_fashion_samples)

# Step 2: Count the number of samples in each dataset
num_fashion_samples = fashion_samples.shape[0]
num_non_fashion_samples = non_fashion_samples.shape[0]

# Print the results
print(f"Number of fashion-related samples (positive): {num_fashion_samples}")
print(f"Number of non-fashion-related samples (negative): {num_non_fashion_samples}")

# Step 3: Check if they are balanced
if num_fashion_samples == num_non_fashion_samples:
    print("The datasets are balanced.")
else:
    print(f"The datasets are not balanced. Difference: {abs(num_fashion_samples - num_non_fashion_samples)}")


Number of fashion-related samples (positive): 52242
Number of non-fashion-related samples (negative): 304611
The datasets are not balanced. Difference: 252369


#### Downsampling:
- randomly reducing the number of samples from the majority class (non-fashion-related samples) to match the number of samples in the minority class (fashion-related samples).

In [12]:
import random

# If the non-fashion dataset is larger, downsample it
if num_non_fashion_samples > num_fashion_samples:
    # Randomly select `num_fashion_samples` non-fashion samples
    downsampled_non_fashion_samples = non_fashion_samples.sample(n=num_fashion_samples, random_state=42)
else:
    downsampled_non_fashion_samples = non_fashion_samples

# Print the number of samples after downsampling
print(f"Number of downsampled non-fashion-related samples: {downsampled_non_fashion_samples.shape[0]}")


Number of downsampled non-fashion-related samples: 52242


#### Step 1: Read Data Files
- read positive samples
- read negative samples
- combine the datasets
#### Step 2: Load pre-trained sentence Transformer Model
- load text2vec-large-chinese model
- generate embeddings for each word
#### Step 3: Get embeddings for the dataset
- apply the embeddings function
- convert labels to tensor
#### Step 4: Train-test split
- split the data
- create PyTorh TensorDataset and DataLoader

In [13]:
import torch
from torch import nn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import os

# Step 1: Read Positive Samples from 'topics_filtered.csv'
# Load the fashion lexicon (positive samples)
pos_file = "/home/disk1/red_disk1/fashion/topics_filtered.csv"
positive_samples_df = pd.read_csv(pos_file)
positive_samples = positive_samples_df['keyword group'].dropna().tolist()  # Extract keywords column

# Assign label 1 to positive samples
positive_samples = pd.DataFrame(positive_samples, columns=['word'])
positive_samples['label'] = 1

# Step 2: Read Negative Samples from non_fashion_texts
def read_negative_samples(directory):
    all_lines = []
    for foldername in os.listdir(directory):
        folder_path = os.path.join(directory, foldername)
        if os.path.isdir(folder_path):
            for filename in os.listdir(folder_path):
                file_path = os.path.join(folder_path, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    all_lines.extend([line.strip() for line in file if line.strip() != ""])
    return all_lines

neg_dir = "/home/disk1/red_disk1/Multimodal_MKT/non_fashion_texts"
negative_samples_list = read_negative_samples(neg_dir)

# Assign label 0 to negative samples
negative_samples = pd.DataFrame(negative_samples_list, columns=['word'])
negative_samples['label'] = 0

# Step 3: Combine Positive and Negative Samples
data = pd.concat([positive_samples, negative_samples], ignore_index=True)

# Shuffle the dataset
data = data.sample(frac=1).reset_index(drop=True)

# Print the shape and format of the data
print(f"Combined Data Shape: {data.shape}")
print(f"Data Format Example:\n{data.head()}")

# Step 4: Load text2vec-large-chinese model
model_path = "/home/disk1/red_disk1/fashion/text2vec-large-chinese"
text2vec_model = SentenceTransformer(model_path)

# Function to get embeddings from text2vec-large-chinese
def get_text2vec_embeddings(texts, model):
    print("Generating embeddings...")
    embeddings = []
    for i in tqdm(range(0, len(texts), 32), desc="Embedding Progress"):
        batch_texts = texts[i:i+32].tolist()
        batch_embeddings = model.encode(batch_texts, convert_to_tensor=True)
        embeddings.append(batch_embeddings)
    return torch.cat(embeddings, dim=0)

# Step 5: Get embeddings for the dataset
embeddings = get_text2vec_embeddings(data['word'], text2vec_model)

# Print the shape and format of the embeddings
print(f"Embeddings Shape: {embeddings.shape}")
print(f"Embedding Example:\n{embeddings[0]}")

# Convert labels to tensor
labels = torch.tensor(data['label'].values)

# Print the labels format
print(f"Labels Shape: {labels.shape}")
print(f"Labels Example:\n{labels[:5]}")

# Step 6: Train-Test Split
train_embeddings, val_embeddings, train_labels, val_labels = train_test_split(
    embeddings, labels, test_size=0.2, random_state=42
)

# Print the split data shapes
print(f"Train Embeddings Shape: {train_embeddings.shape}, Train Labels Shape: {train_labels.shape}")
print(f"Val Embeddings Shape: {val_embeddings.shape}, Val Labels Shape: {val_labels.shape}")

# Create TensorDataset and DataLoader for training and validation
train_dataset = TensorDataset(train_embeddings, train_labels)
val_dataset = TensorDataset(val_embeddings, val_labels)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Print the data loader batch size
print(f"Train Loader Batch Size: {len(train_loader)} batches")
print(f"Val Loader Batch Size: {len(val_loader)} batches")


No sentence-transformers model found with name /home/disk1/red_disk1/fashion/text2vec-large-chinese. Creating a new one with mean pooling.


Combined Data Shape: (356852, 2)
Data Format Example:
      word  label
0     市政府.      0
1   地理与气候.      0
2  綠 (消歧義)      0
3      邯山区      0
4     江戸幕府      0
Generating embeddings...


Embedding Progress: 100%|██████████| 11152/11152 [15:32<00:00, 11.96it/s]


Embeddings Shape: torch.Size([356852, 1024])
Embedding Example:
tensor([ 0.9775, -0.6769, -0.7114,  ..., -0.5798,  1.1212, -2.3075],
       device='cuda:0')
Labels Shape: torch.Size([356852])
Labels Example:
tensor([0, 0, 0, 0, 0])
Train Embeddings Shape: torch.Size([285481, 1024]), Train Labels Shape: torch.Size([285481])
Val Embeddings Shape: torch.Size([71371, 1024]), Val Labels Shape: torch.Size([71371])
Train Loader Batch Size: 8922 batches
Val Loader Batch Size: 2231 batches


### Define and train a RedNet-based binary classification model
1. Model Definition: ResNet-Based Binary Classifier
2. Training Loop
3. Validation Loop
4. Training and Evaluation

#### Summary
- Model Setup: It initializes a ResNet18 model tailored for binary classification using text embeddings as inputs.
- Training and Validation: The code implements a standard training loop with backpropagation and uses validation to track performance.
- Performance Metrics: Accuracy, precision, recall, and F1 score are calculated to evaluate the binary classification model.

In [17]:
import torch.optim as optim
from torch import nn
from tqdm import tqdm
from torchvision.models import resnet18
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Step 7: Define ResNet-Based Binary Classification Model
class ResNetBinaryClassifier(nn.Module):
    def __init__(self, embedding_dim=1024):  # Adjust embedding_dim based on text2vec output (1024)
        super(ResNetBinaryClassifier, self).__init__()
        # Use pre-trained ResNet18, but replace the first conv layer to accept text2vec embeddings
        self.resnet = resnet18(pretrained=False)
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, 2)  # Binary classification
        
        # Define a simple MLP layer to map embeddings to ResNet input size
        self.embedding_to_resnet = nn.Sequential(
            nn.Linear(embedding_dim, 512),  # Match the expected ResNet input size
            nn.ReLU(),
            nn.Linear(512, 512),  # Match the expected ResNet input size after projection
        )
        
        # Adding a Conv layer to convert 1D to a 2D feature map
        self.embedding_conv = nn.Conv2d(1, 3, kernel_size=(1, 1))

        # Adaptive pooling layer to transform the spatial dimensions to (7, 7)
        self.adaptive_pool = nn.AdaptiveAvgPool2d((7, 7))

    def forward(self, x):
        x = self.embedding_to_resnet(x)  # Project text2vec embeddings to match ResNet input size
        x = x.unsqueeze(1).unsqueeze(2)  # Reshape to fit Conv2D (batch_size, channels, height, width)
        x = self.embedding_conv(x)  # Use conv layer to convert 1D feature to 2D feature map
        x = self.adaptive_pool(x)  # Apply adaptive pooling to resize to (7, 7)
        x = self.resnet(x)
        return x

# Initialize the model with the correct embedding dimension (1024 as per the output shape)
model = ResNetBinaryClassifier(embedding_dim=1024)

# Step 8: Define Training Parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training function
def train(model, train_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc="Training Progress"):
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    return total_loss / len(train_loader)

# Validation function
def evaluate(model, val_loader, criterion, device):
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation Progress"):
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')
    return total_loss / len(val_loader), accuracy, precision, recall, f1

# Step 9: Training Loop
epochs = 5
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    train_loss = train(model, train_loader, criterion, optimizer, device)
    val_loss, accuracy, precision, recall, f1 = evaluate(model, val_loader, criterion, device)
    
    print(f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

# Step 10: Final Evaluation
val_loss, accuracy, precision, recall, f1 = evaluate(model, val_loader, criterion, device)
print(f"Final Results: Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")




Epoch 1/5


Training Progress: 100%|██████████| 8922/8922 [03:50<00:00, 38.66it/s]
Validation Progress: 100%|██████████| 2231/2231 [00:06<00:00, 345.93it/s]


Train Loss: 0.0922, Val Loss: 0.0618, Accuracy: 0.9754
Precision: 0.9172, Recall: 0.9143, F1 Score: 0.9158
Epoch 2/5


Training Progress: 100%|██████████| 8922/8922 [03:50<00:00, 38.64it/s]
Validation Progress: 100%|██████████| 2231/2231 [00:06<00:00, 355.96it/s]


Train Loss: 0.0546, Val Loss: 0.0586, Accuracy: 0.9775
Precision: 0.9595, Recall: 0.8829, F1 Score: 0.9196
Epoch 3/5


Training Progress: 100%|██████████| 8922/8922 [03:49<00:00, 38.82it/s]
Validation Progress: 100%|██████████| 2231/2231 [00:05<00:00, 408.58it/s]


Train Loss: 0.0411, Val Loss: 0.0471, Accuracy: 0.9813
Precision: 0.9461, Recall: 0.9245, F1 Score: 0.9352
Epoch 4/5


Training Progress: 100%|██████████| 8922/8922 [03:49<00:00, 38.86it/s]
Validation Progress: 100%|██████████| 2231/2231 [00:06<00:00, 354.06it/s]


Train Loss: 0.0321, Val Loss: 0.0481, Accuracy: 0.9826
Precision: 0.9419, Recall: 0.9390, F1 Score: 0.9404
Epoch 5/5


Training Progress: 100%|██████████| 8922/8922 [03:50<00:00, 38.74it/s]
Validation Progress: 100%|██████████| 2231/2231 [00:05<00:00, 400.07it/s]


Train Loss: 0.0262, Val Loss: 0.0526, Accuracy: 0.9829
Precision: 0.9455, Recall: 0.9367, F1 Score: 0.9411


Validation Progress: 100%|██████████| 2231/2231 [00:05<00:00, 434.08it/s]


Final Results: Accuracy: 0.9829, Precision: 0.9455, Recall: 0.9367, F1 Score: 0.9411


In [18]:
# Step 11: Save the trained model

# Save model and tokenizer to the desired directory
model_save_path = "/home/disk1/red_disk1/fashion/tfashion"

# Make the directory if it doesn't exist
os.makedirs(model_save_path, exist_ok=True)

# Save the entire model
torch.save(model.state_dict(), os.path.join(model_save_path, "tfashion.pth"))

# Optionally, save additional model metadata if needed (e.g., optimizer state, epoch)
checkpoint = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'epoch': epochs,
}
torch.save(checkpoint, os.path.join(model_save_path, "tfashion_checkpoint.pth"))

print(f"Model saved at: {model_save_path}")


Model saved at: /home/disk1/red_disk1/fashion/tfashion


#### Prepare post data
- Combine 'post_title' and 'post_content' into 'post_text'
- Clean 'post_text'

In [30]:
import pandas as pd

# Load the data
df = pd.read_csv('/home/disk1/red_disk1/fashion/poster_test_fashion_nlpclean.csv')
# df = pd.read_csv('/home/disk1/red_disk1/poster_9305.csv')

# Combine 'post_title' and 'post_content' into 'post_text'
df['post_text'] = df['post_title'].fillna('') + ' ' + df['post_content'].fillna('')

# Drop 'post_title' and 'post_content' columns
df = df.drop(columns=['post_title', 'post_content', 'post_tag'])

# Save the updated DataFrame
# df.to_csv('/home/disk1/red_disk1/Multimodal_MKT/poster_9305_combined.csv', index=False)
df.to_csv('/home/disk1/red_disk1/fashion/poster_test_fashion_nlpclean_combined.csv', index=False)

In [40]:
import emoji
import re

# Load stopwords from the provided file
with open('/home/disk1/red_disk1/fashion/stopwords_cn.txt', 'r', encoding='utf-8') as f:
    stopwords = set(f.read().splitlines())

# Function for text cleaning
def clean_text(text, stopwords):
    # Convert emojis to text
    text = emoji.demojize(text)
    
    # Remove specific patterns
    text = re.sub(r'- 小红书,,', '', text)  # Removing "- 小红书,,"
    text = re.sub(r'小红书', '', text)  # Explicitly remove "小红书"
    text = re.sub(r',,\d{2}-\d{2},,', '', text)  # Removing patterns like ",,XX-XX,,"
    text = re.sub(r'#', ' ', text)  # Replace '#' with a space
    text = re.sub(r'\s+', '', text)  # This will remove all whitespace characters

    # Remove digits
    text = re.sub(r'\d+', '', text)
    
    # Remove special characters
    cleaned_text = ''.join(char for char in text if char.isalnum() or char.isspace())
    
    # Remove stopwords (word-based removal)
    cleaned_text = ' '.join([word for word in cleaned_text.split() if word not in stopwords])
    
    return cleaned_text

# Apply cleaning function to 'post_text'
df['cleaned_post_text'] = df['post_text'].apply(lambda x: clean_text(str(x), stopwords))

# Remove duplicates
df = df.drop_duplicates()

# Save the cleaned DataFrame
df.to_csv('/home/disk1/red_disk1/fashion/post_cleaned.csv', index=False)


#### Test the model tfashion
- Step 1: Load the test dataset.
- Step 2: Generate embeddings for each word in the test dataset using text2vec-large-chinese.
- Step 3: Use a pre-trained ResNet-based classifier model to predict whether each word is related to fashion or not.
- Step 4: Save the predictions alongside the original data into a CSV file.

In [41]:
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from torch import nn
import os

# Load the pre-trained SentenceTransformer model (text2vec-large-chinese)
model_path = "/home/disk1/red_disk1/fashion/text2vec-large-chinese"
text2vec_model = SentenceTransformer(model_path)

# Function to get embeddings from text2vec-large-chinese
def get_text2vec_embeddings(texts, model):
    embeddings = []
    for i in tqdm(range(0, len(texts), 32), desc="Generating Embeddings"):
        batch_texts = texts[i:i+32].tolist()
        batch_embeddings = model.encode(batch_texts, convert_to_tensor=True)
        embeddings.append(batch_embeddings)
    return torch.cat(embeddings, dim=0)

# Step 1: Load the test dataset
test_file = "/home/disk1/red_disk1/fashion/post_cleaned.csv"  # Update this path if necessary
test_data = pd.read_csv(test_file)

# Get the words column from the test dataset
test_words = test_data['cleaned_post_text']

# Step 2: Get embeddings for the test words
test_embeddings = get_text2vec_embeddings(test_words, text2vec_model)

# Step 3: Create a DataLoader for the test data
test_dataset = TensorDataset(test_embeddings)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Step 4: Load the trained tfashion model
class ResNetBinaryClassifier(nn.Module):
    def __init__(self, embedding_dim=1024):
        super(ResNetBinaryClassifier, self).__init__()
        self.resnet = resnet18(pretrained=False)
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, 2)
        self.embedding_to_resnet = nn.Sequential(
            nn.Linear(embedding_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
        )
        self.embedding_conv = nn.Conv2d(1, 3, kernel_size=(1, 1))
        self.adaptive_pool = nn.AdaptiveAvgPool2d((7, 7))

    def forward(self, x):
        x = self.embedding_to_resnet(x)
        x = x.unsqueeze(1).unsqueeze(2)
        x = self.embedding_conv(x)
        x = self.adaptive_pool(x)
        x = self.resnet(x)
        return x

# Load the trained tfashion model from your specified path
model_save_path = "/home/disk1/red_disk1/fashion/tfashion/tfashion.pth"
model = ResNetBinaryClassifier(embedding_dim=1024)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.load_state_dict(torch.load(model_save_path, map_location=device))
model.to(device)
model.eval()

# Step 5: Perform inference on the test dataset
output_labels = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing Progress"):
        inputs = batch[0].to(device)
        outputs = model(inputs)
        preds = torch.argmax(outputs, dim=1)
        output_labels.extend(preds.cpu().numpy())

# Step 6: Save the results
# Add the output_labels as a new column to the original test_data DataFrame
test_data['output_label'] = output_labels

# Save the updated DataFrame to a new CSV file
output_file = "/home/disk1/red_disk1/fashion/test-output.csv"  
test_data.to_csv(output_file, index=False)

print(f"Results saved to {output_file}")


No sentence-transformers model found with name /home/disk1/red_disk1/fashion/text2vec-large-chinese. Creating a new one with mean pooling.


Generating Embeddings: 100%|██████████| 489/489 [02:17<00:00,  3.57it/s]
  model.load_state_dict(torch.load(model_save_path, map_location=device))
Testing Progress: 100%|██████████| 489/489 [00:01<00:00, 476.13it/s]


Results saved to /home/disk1/red_disk1/fashion/test-output.csv


#### Steps
1. Tokenization: tokenize the post_text content
2. Word Embedding: pass each word (instead of the entire post) through the text2vec-large-chinese model to generate embeddings for each word
3. Classification: pass embeddings through the tfashion model to classify whether each word is fashion-related or not
4. Filter out any words that are present in the stopwords set before applying the model classification
5. Filter out single-character words
5. Results: The output will be a classification for each word

In [51]:
import torch
import pandas as pd
import jieba
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader, TensorDataset
from torch import nn
import os

# Step 1: Load the stopwords file
stopwords_file = "/home/disk1/red_disk1/fashion/stopwords_cn.txt"
with open(stopwords_file, 'r', encoding='utf-8') as f:
    stopwords = set([line.strip() for line in f.readlines()])

# Step 2: Load the pre-trained SentenceTransformer model (text2vec-large-chinese)
model_path = "/home/disk1/red_disk1/fashion/text2vec-large-chinese"
text2vec_model = SentenceTransformer(model_path)

# Step 3: Define ResNetBinaryClassifier
class ResNetBinaryClassifier(nn.Module):
    def __init__(self, embedding_dim=1024):
        super(ResNetBinaryClassifier, self).__init__()
        self.resnet = resnet18(pretrained=False)
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, 2)
        self.embedding_to_resnet = nn.Sequential(
            nn.Linear(embedding_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
        )
        self.embedding_conv = nn.Conv2d(1, 3, kernel_size=(1, 1))
        self.adaptive_pool = nn.AdaptiveAvgPool2d((7, 7))

    def forward(self, x):
        x = self.embedding_to_resnet(x)
        x = x.unsqueeze(1).unsqueeze(2)
        x = self.embedding_conv(x)
        x = self.adaptive_pool(x)
        x = self.resnet(x)
        return x

# Load the model
model_file = "/home/disk1/red_disk1/fashion/tfashion/tfashion.pth"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ResNetBinaryClassifier(embedding_dim=1024)
model.load_state_dict(torch.load(model_file, map_location=device))
model.to(device)
model.eval()

# Step 4: Load the CSV file containing post_text
csv_file = "/home/disk1/red_disk1/fashion/post_cleaned.csv"
df = pd.read_csv(csv_file)

# Step 5: Tokenize the post_text content into words and remove stopwords
def tokenize_text(text):
    words = list(jieba.cut(text))
    # Filter out stopwords and single-character words
    return [word for word in words if word not in stopwords and len(word) > 1]

df['tokenized_text'] = df['cleaned_post_text'].apply(tokenize_text)

# Step 6: Get embeddings for each word in the tokenized_text column and classify them
def get_text2vec_embeddings(words, model):
    embeddings = model.encode(words, convert_to_tensor=True)
    return embeddings

filtered_keywords = []

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing Rows"):
    words = row['tokenized_text']
    
    # Get embeddings for the words
    embeddings = get_text2vec_embeddings(words, text2vec_model).to(device)
    
    # Make predictions using the pre-trained ResNet binary classifier
    with torch.no_grad():
        outputs = model(embeddings)
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
    
    # Filter words where the model predicts label 1 (fashion-related)
    filtered_words = [word for word, label in zip(words, preds) if label == 1]
    filtered_keywords.append(filtered_words)

# Step 7: Add the new column filtered_keywords and save the DataFrame
df['filtered_keywords'] = filtered_keywords

# Save the updated DataFrame to a new CSV file
output_file = "/home/disk1/red_disk1/fashion/test_filtered.csv"
df.to_csv(output_file, index=False)

print(f"Filtered results saved to {output_file}")


No sentence-transformers model found with name /home/disk1/red_disk1/fashion/text2vec-large-chinese. Creating a new one with mean pooling.


  model.load_state_dict(torch.load(model_file, map_location=device))
Processing Rows:  30%|███       | 4764/15622 [02:21<03:48, 47.48it/s]

##### Check the output

In [50]:
import pandas as pd

# Step 1: 读取文件
csv_file = "/home/disk1/red_disk1/fashion/test_filtered.csv"
df = pd.read_csv(csv_file)

# Step 2: 随机采样30行
sampled_df = df.sample(n=30, random_state=42)  # 随机采样30行

# Step 3: 去重函数，保持顺序不变
def remove_duplicates(word_list):
    seen = set()
    return [x for x in word_list if not (x in seen or seen.add(x))]

# Step 4: 打印输出其中的 summary_cleaned 列和 filter_label 列，去掉 filter_label 中的重复单词
for i, row in sampled_df.iterrows():
    summary_cleaned = eval(row['cleaned_post_text'])  # 转换为列表
    filter_label = eval(row['filtered_keywords'])  # 转换为列表
    filter_label_unique = remove_duplicates(filter_label)  # 去重
    
    print(f"Example {i+1}:")
    print(f"summary_cleaned: {summary_cleaned}")
    print(f"filter_label: {filter_label_unique}")
    print()  # 空行用于美观输出

Example 12997:
summary_cleaned: ['尊嘟', '爱', 'redexclamationmark', '说', '富贵', '小姐姐', '穿', '搭', '尝试', '新', '温柔', '气质', '姐姐', '富家', '千金', '穿', '搭', '秋冬', '温柔', '慵懒', '风', '毛衣', '慵懒', '风穿', '搭', '高级', '感穿', '搭']
filter_label: ['爱', '说', '富贵', '小姐姐', '穿', '搭', '尝试', '新', '温柔', '气质', '姐姐', '富家', '千金', '秋冬', '慵懒', '风', '毛衣', '风穿', '高级', '感穿']

Example 11585:
summary_cleaned: ['爱', '温柔', '白月光', '感觉', '试试', '仙气', '裙子', '新', '中式', '穿', '搭', '民国', '风', '温柔', '连衣裙', '国风', '针织', '新', '中式', '套装']
filter_label: ['爱', '温柔', '白月光', '感觉', '试试', '仙气', '裙子', '新', '中式', '穿', '搭', '民国', '风', '连衣裙', '国风', '针织', '套装']

Example 11226:
summary_cleaned: ['我先', '微胖', '女孩', '微胖', '穿', '搭']
filter_label: ['微胖', '女孩', '穿', '搭']

Example 13552:
summary_cleaned: ['redapple', '型', '跟着', '模特', '穿', 'keycapkeycap', '复古', '百褶裙', '太好', '穿', '身高', 'cm', '体重', '斤斤', '小腿', '围', 'cm', '大腿', 'cm', '腰围', 'cm', '肚围', 'cmredapple', '型', '身材', '我太爱', '这件', '短裙', '前', '几年', '尝试', '百褶裙', '没想到', '穿', '感觉', '不错', '哈哈哈', '材质', '厚', '材质'