Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/5d/lsfntfvj5jbbj9d1b8z0_qw00000gn/T/jieba.cache
Loading model cost 0.282 seconds.
Prefix dict has been built successfully.


Data cleaning complete. Cleaned data saved to 'nlpclean_processed_cleaned.csv'


#### Get a non-fashion Chinese corpus
- Chinese Wikipedia Dump
- Use WikiExtractor to convert the XML dump into plain text
- Get extracted_texts
- Filter Non-Fashion Content from the Extracted Texts, using fashion keywords

In [1]:
import pandas as pd

# Load the topics0611.csv file
fashion_keywords_df = pd.read_csv('/home/disk1/red_disk1/Multimodal_MKT/topics0611.csv')

# Extract the 'keyword group' column as a list of fashion-related keywords
# Convert all values to strings and drop NaN values
fashion_keywords = fashion_keywords_df['keyword group'].dropna().astype(str).tolist()


In [4]:
# Filter Out Fashion-Related Content
import os

def filter_non_fashion_texts(input_dir, output_dir, fashion_keywords):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    for foldername in os.listdir(input_dir):
        folder_path = os.path.join(input_dir, foldername)
        if os.path.isdir(folder_path):
            output_folder_path = os.path.join(output_dir, foldername)
            os.makedirs(output_folder_path, exist_ok=True)
            
            for filename in os.listdir(folder_path):
                input_file_path = os.path.join(folder_path, filename)
                output_file_path = os.path.join(output_folder_path, filename)
                
                with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
                    for line in infile:
                        # Ensure that the comparison is between strings
                        if not any(keyword in line for keyword in fashion_keywords):
                            outfile.write(line)

input_dir = '/home/disk1/red_disk1/extracted_texts'
output_dir = '/home/disk1/red_disk1/Multimodal_MKT/non_fashion_texts'

filter_non_fashion_texts(input_dir, output_dir, fashion_keywords)


#### Prepare Data for Model Training
- Tokenize the Non-Fashion Texts
- Combine with Fashion-Related Tokens

In [5]:
from transformers import BertTokenizer
import torch
import os
import random

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

def tokenize_corpus(directory, max_length):
    all_lines = []
    for foldername in os.listdir(directory):
        folder_path = os.path.join(directory, foldername)
        if os.path.isdir(folder_path):
            for filename in os.listdir(folder_path):
                file_path = os.path.join(folder_path, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    # Collect all lines to tokenize them at once, skipping blank lines
                    all_lines.extend([line.strip() for line in file if line.strip() != ""])
    
    # Tokenize all lines with padding and truncation to a fixed max length
    tokenized_texts = tokenizer(all_lines, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
    
    # Manually adjust padding to ensure uniform sequence length
    input_ids = torch.nn.functional.pad(tokenized_texts['input_ids'], (0, max_length - tokenized_texts['input_ids'].size(1)), value=0)
    attention_mask = torch.nn.functional.pad(tokenized_texts['attention_mask'], (0, max_length - tokenized_texts['attention_mask'].size(1)), value=0)
    
    return {'input_ids': input_ids, 'attention_mask': attention_mask}

# Set a max length for all tokenized sequences
max_length = 512  # You can adjust this based on your dataset

# Tokenize non-fashion related texts
non_fashion_tokens = tokenize_corpus('/home/disk1/red_disk1/Multimodal_MKT/non_fashion_texts', max_length)

# Assuming fashion_keywords is a list of strings
fashion_tokens = tokenizer(fashion_keywords, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

# Combine the tokenized non-fashion and fashion tokens
combined_input_ids = torch.cat((non_fashion_tokens['input_ids'], fashion_tokens['input_ids']), dim=0)
combined_attention_mask = torch.cat((non_fashion_tokens['attention_mask'], fashion_tokens['attention_mask']), dim=0)

# If token_type_ids are available, concatenate them as well
if 'token_type_ids' in non_fashion_tokens and 'token_type_ids' in fashion_tokens:
    combined_token_type_ids = torch.cat((non_fashion_tokens['token_type_ids'], fashion_tokens['token_type_ids']), dim=0)
else:
    combined_token_type_ids = None

# Label fashion-related texts as 1 and non-fashion-related texts as 0
fashion_labels = [1] * fashion_tokens['input_ids'].shape[0]
non_fashion_labels = [0] * non_fashion_tokens['input_ids'].shape[0]

# Combine the labels
all_labels = fashion_labels + non_fashion_labels

# Shuffle the dataset
combined = list(zip(combined_input_ids, combined_attention_mask, all_labels))
random.shuffle(combined)

# Unzip the shuffled dataset
all_input_ids, all_attention_mask, all_labels = zip(*combined)

# Convert back to tensors
all_input_ids = torch.stack(all_input_ids)
all_attention_mask = torch.stack(all_attention_mask)
all_labels = torch.tensor(all_labels)

# If token_type_ids are used, shuffle and stack them too
if combined_token_type_ids is not None:
    combined_token_type_ids = list(zip(combined_token_type_ids))
    random.shuffle(combined_token_type_ids)
    all_token_type_ids = torch.stack([item[0] for item in combined_token_type_ids])
else:
    all_token_type_ids = None

# Print shapes for verification
print("Input IDs shape:", all_input_ids.shape)
print("Attention Mask shape:", all_attention_mask.shape)
if all_token_type_ids is not None:
    print("Token Type IDs shape:", all_token_type_ids.shape)
print("Labels shape:", all_labels.shape)




Input IDs shape: torch.Size([93018, 512])
Attention Mask shape: torch.Size([93018, 512])
Labels shape: torch.Size([93018])


#### Train the Model
- Split the Data into Training and Validation Sets
- Fine-Tune BERT

In [6]:
from sklearn.model_selection import train_test_split
import torch
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Ensure all_input_ids, all_attention_mask, and all_labels are tensors
all_input_ids = torch.tensor(all_input_ids)
all_attention_mask = torch.tensor(all_attention_mask)
all_labels = torch.tensor(all_labels)

# If token_type_ids is used, ensure it's a tensor too
if all_token_type_ids is not None:
    all_token_type_ids = torch.tensor(all_token_type_ids)

# Split the data into training and validation sets, ensuring consistency across all inputs
split_data = [all_input_ids, all_attention_mask, all_labels]
if all_token_type_ids is not None:
    split_data.append(all_token_type_ids)

train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
    *split_data, test_size=0.2, random_state=42)

# If token_type_ids are used, split them as well
if all_token_type_ids is not None:
    train_token_type_ids, val_token_type_ids = train_test_split(all_token_type_ids, test_size=0.2, random_state=42)
else:
    train_token_type_ids, val_token_type_ids = None, None

# Convert splits into PyTorch tensors (no need to re-convert if already tensors)
# Combine inputs, masks, and labels into a TensorDataset
if train_token_type_ids is not None:
    train_dataset = torch.utils.data.TensorDataset(train_inputs, train_masks, train_token_type_ids, train_labels)
    val_dataset = torch.utils.data.TensorDataset(val_inputs, val_masks, val_token_type_ids, val_labels)
else:
    train_dataset = torch.utils.data.TensorDataset(train_inputs, train_masks, train_labels)
    val_dataset = torch.utils.data.TensorDataset(val_inputs, val_masks, val_labels)

# Print out the shapes of the datasets to verify
print("Train input_ids shape:", train_inputs.shape)
print("Validation input_ids shape:", val_inputs.shape)
print("Train labels shape:", train_labels.shape)
print("Validation labels shape:", val_labels.shape)
print("Train attention masks shape:", train_masks.shape)
print("Validation attention masks shape:", val_masks.shape)
if train_token_type_ids is not None:
    print("Train token type ids shape:", train_token_type_ids.shape)
    print("Validation token type ids shape:", val_token_type_ids.shape)



Train input_ids shape: torch.Size([74414, 512])
Validation input_ids shape: torch.Size([18604, 512])
Train labels shape: torch.Size([74414])
Validation labels shape: torch.Size([18604])
Train attention masks shape: torch.Size([74414, 512])
Validation attention masks shape: torch.Size([18604, 512])


  all_input_ids = torch.tensor(all_input_ids)
  all_attention_mask = torch.tensor(all_attention_mask)
  all_labels = torch.tensor(all_labels)


#### Check data before training
- Check Data Shapes
- Check for Any 'NaN' Values
- Check Data Distribution
- Inspect a Few Examples
- Check Label Value Range
- Verify Dataset Splitting
- Check for Duplicates

In [7]:
print("Input IDs shape:", all_input_ids.shape)
print("Attention Mask shape:", all_attention_mask.shape)
print("Labels shape:", all_labels.shape)

# If token_type_ids are used
if all_token_type_ids is not None:
    print("Token Type IDs shape:", all_token_type_ids.shape)

Input IDs shape: torch.Size([93018, 512])
Attention Mask shape: torch.Size([93018, 512])
Labels shape: torch.Size([93018])


In [8]:
import torch

# Check for NaNs in input_ids
print("NaNs in Input IDs:", torch.isnan(all_input_ids).any().item())

# Check for NaNs in attention_mask
print("NaNs in Attention Mask:", torch.isnan(all_attention_mask).any().item())

# Check for NaNs in labels
print("NaNs in Labels:", torch.isnan(all_labels).any().item())

# If using token_type_ids
if all_token_type_ids is not None:
    print("NaNs in Token Type IDs:", torch.isnan(all_token_type_ids).any().item())


NaNs in Input IDs: False
NaNs in Attention Mask: False
NaNs in Labels: False


In [9]:
from collections import Counter

label_counts = Counter(all_labels.tolist())
print("Label distribution:", label_counts)


Label distribution: Counter({0: 73084, 1: 19934})


In [10]:
for i in range(5):  # Check first 5 examples
    print(f"Example {i + 1}:")
    print("Input IDs:", all_input_ids[i])
    print("Attention Mask:", all_attention_mask[i])
    print("Label:", all_labels[i])
    if all_token_type_ids is not None:
        print("Token Type IDs:", all_token_type_ids[i])
    print("-" * 50)


Example 1:
Input IDs: tensor([ 101,  100, 5498,  102,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0

In [11]:
print("Unique labels:", set(all_labels.tolist()))


Unique labels: {0, 1}


In [12]:
print("Train inputs shape:", train_inputs.shape)
print("Validation inputs shape:", val_inputs.shape)
print("Train labels shape:", train_labels.shape)
print("Validation labels shape:", val_labels.shape)


Train inputs shape: torch.Size([74414, 512])
Validation inputs shape: torch.Size([18604, 512])
Train labels shape: torch.Size([74414])
Validation labels shape: torch.Size([18604])


In [13]:
unique_input_ids = torch.unique(all_input_ids, dim=0)
print("Number of unique input IDs:", unique_input_ids.shape[0])


Number of unique input IDs: 86132


In [14]:
from torch.utils.data import DataLoader

# Custom data collator that returns the batch directly
def custom_data_collator(batch):
    input_ids, attention_mask, labels = tuple(torch.stack(samples) for samples in zip(*batch))
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

# If using token_type_ids
if train_token_type_ids is not None:
    def custom_data_collator(batch):
        input_ids, attention_mask, token_type_ids, labels = tuple(torch.stack(samples) for samples in zip(*batch))
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,
            'labels': labels
        }

# DataLoader for the datasets
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=custom_data_collator)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=custom_data_collator)


#### Start Fine tune BERT

In [23]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Load the BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    warmup_steps=300,
    weight_decay=0.02,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    gradient_accumulation_steps=2,
    max_grad_norm=1.0,
)

# Create a Trainer instance with the custom data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=custom_data_collator,
)

# Train the model
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
0,0.413,0.437085
2,0.3,0.44666
3,0.2733,0.483683


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(devic

TrainOutput(global_step=9300, training_loss=0.3797091121955584, metrics={'train_runtime': 4481.9937, 'train_samples_per_second': 66.412, 'train_steps_per_second': 2.075, 'total_flos': 7.830027140880384e+16, 'train_loss': 0.3797091121955584, 'epoch': 3.9991399698989465})

#### Save the Model

In [24]:
model.save_pretrained('/home/disk1/red_disk1/Multimodal_MKT')


#### Evaluate the Model


In [25]:
# Evaluate the model on the validation dataset
eval_results = trainer.evaluate()

# Print the evaluation results
print("Evaluation Results:", eval_results)

Evaluation Results: {'eval_loss': 0.4836825728416443, 'eval_runtime': 85.8285, 'eval_samples_per_second': 216.758, 'eval_steps_per_second': 13.55, 'epoch': 3.9991399698989465}


In [26]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Get predictions and labels
predictions, labels, _ = trainer.predict(val_dataset)
predicted_labels = np.argmax(predictions, axis=1)

# Calculate accuracy
accuracy = accuracy_score(labels, predicted_labels)
print("Accuracy:", accuracy)

# Calculate precision, recall, and F1-score
precision, recall, f1, _ = precision_recall_fscore_support(labels, predicted_labels, average='binary')
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Accuracy: 0.8056331971619006
Precision: 0.5598681801537898
Recall: 0.3877758052244484
F1 Score: 0.45819598441714116


## Apply the model


#### Prepare post data
- Combine 'post_title' and 'post_content' into 'post_text'
- Clean and Tokenize 'post_text'

In [27]:
import pandas as pd

# Load the data
df = pd.read_csv('/home/disk1/red_disk1/Multimodal_MKT/poster_test_fashion_nlpclean.csv')

# Combine 'post_title' and 'post_content' into 'post_text'
df['post_text'] = df['post_title'].fillna('') + ' ' + df['post_content'].fillna('')

# Save the updated DataFrame
df.to_csv('/home/disk1/red_disk1/Multimodal_MKT/poster_test_fashion_nlpclean_combined.csv', index=False)


In [28]:
import emoji
import re
import jieba

# Load stopwords from the provided file
with open('/home/disk1/red_disk1/Multimodal_MKT/stopwords_cn.txt', 'r', encoding='utf-8') as f:
    stopwords = set(f.read().splitlines())

# Function for text cleaning
def clean_text(text, stopwords):
    # Convert emojis to text
    text = emoji.demojize(text)
    
    # Remove specific patterns
    text = re.sub(r'- 小红书,,', '', text)
    text = re.sub(r',,\d{2}-\d{2},,', '', text)
    text = re.sub(r'#', ' ', text)
    
    # Remove digits
    text = re.sub(r'\d+', '', text)
    
    # Remove special characters
    cleaned_text = ''.join(char for char in text if char.isalnum() or char.isspace())
    
    # Tokenize
    words = jieba.cut(cleaned_text)
    
    # Remove stopwords
    filtered_words = [word for word in words if word not in stopwords]
    
    return ' '.join(filtered_words)

# Apply cleaning function to 'post_text'
df['cleaned_post_text'] = df['post_text'].apply(lambda x: clean_text(str(x), stopwords))

# Remove duplicates
df = df.drop_duplicates(subset=['cleaned_post_text'])

# Save the cleaned DataFrame
df.to_csv('/home/disk1/red_disk1/Multimodal_MKT/post_cleaned.csv', index=False)


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.507 seconds.
Prefix dict has been built successfully.


#### Filter Non-Fashion Words Using the Model

In [29]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the model and tokenizer
model = BertForSequenceClassification.from_pretrained('/home/disk1/red_disk1/Multimodal_MKT')
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# Set the model to evaluation mode
model.eval()

# Function to filter fashion-related content
def filter_fashion_text(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_label = torch.argmax(logits, dim=1).item()
    return text if predicted_label == 1 else ''

# Apply the function to filter non-fashion text
df['fashion_text'] = df['cleaned_post_text'].apply(lambda x: filter_fashion_text(x, model, tokenizer))

# Save the final DataFrame with only fashion-related content
df.to_csv('/home/disk1/red_disk1/Multimodal_MKT/post_filtered.csv', index=False)


