#### Prepare data

In [2]:
import pandas as pd

# Load the poster_test_fashion_nlpclean.csv file
poster_df = pd.read_csv('/Users/ddyilin/Documents/GitHub/fashion/poster_test_fashion_nlpclean.csv')

# Combine the post_title and post_content columns into a new column post_text
poster_df['post_text'] = poster_df['post_title'].fillna('') + ' ' + poster_df['post_content'].fillna('')

# Keep only the post_text and post_comment_content columns
poster_df = poster_df[['post_text', 'post_comment_content']]

# Remove duplicate rows
poster_df = poster_df.drop_duplicates()

# Save the resulting DataFrame to a new CSV file
poster_df.to_csv('/Users/ddyilin/Documents/GitHub/fashion/nlpclean_processed.csv', index=False)

print("Data has been processed and saved to 'nlpclean_processed.csv'")


Data has been processed and saved to 'nlpclean_processed.csv'


#### Data cleaning and clustering

In [3]:
import pandas as pd
import re
import emoji
import jieba

# Load stopwords from the provided file
with open('/Users/ddyilin/Documents/GitHub/fashion/stopwords_cn.txt', 'r', encoding='utf-8') as f:
    stopwords = set(f.read().splitlines())

# Load the nlpclean_processed.csv file
poster_df = pd.read_csv('/Users/ddyilin/Documents/GitHub/fashion/nlpclean_processed.csv')

# Function for text cleaning
def clean_text(text, stopwords):
    # Convert emojis to text
    text = emoji.demojize(text)
    
    # Remove specific patterns
    text = re.sub(r'- 小红书,,', '', text)
    text = re.sub(r',,\d{2}-\d{2},,', '', text)
    text = re.sub(r'#', ' ', text)
    
    # Remove digits
    text = re.sub(r'\d+', '', text)
    
    # Remove special characters
    cleaned_text = ''.join(char for char in text if char.isalnum() or char.isspace())
    
    # Tokenize
    words = jieba.cut(cleaned_text)
    
    # Remove stopwords
    filtered_words = [word for word in words if word not in stopwords]
    
    return ' '.join(filtered_words)

# Apply data cleaning to post_text and post_comment_content
poster_df['post_text_clean'] = poster_df['post_text'].apply(lambda x: clean_text(x, stopwords))
poster_df['post_comment_content_clean'] = poster_df['post_comment_content'].apply(lambda x: clean_text(str(x), stopwords))

# Save the cleaned data to a new CSV file
poster_df.to_csv('/Users/ddyilin/Documents/GitHub/fashion/nlpclean_processed_cleaned.csv', index=False)

print("Data cleaning complete. Cleaned data saved to 'nlpclean_processed_cleaned.csv'")


Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/5d/lsfntfvj5jbbj9d1b8z0_qw00000gn/T/jieba.cache
Loading model cost 0.282 seconds.
Prefix dict has been built successfully.


Data cleaning complete. Cleaned data saved to 'nlpclean_processed_cleaned.csv'


#### Get a non-fashion Chinese corpus
- Chinese Wikipedia Dump
- Use WikiExtractor to convert the XML dump into plain text
- Get extracted_texts
- Filter Non-Fashion Content from the Extracted Texts, using fashion keywords

In [7]:
import pandas as pd

# Load the topics0611.csv file
fashion_keywords_df = pd.read_csv('/Users/ddyilin/Documents/GitHub/fashion/topics0611.csv')

# Extract the 'keyword group' column as a list of fashion-related keywords
# Convert all values to strings and drop NaN values
fashion_keywords = fashion_keywords_df['keyword group'].dropna().astype(str).tolist()


In [8]:
# Filter Out Fashion-Related Content
import os

def filter_non_fashion_texts(input_dir, output_dir, fashion_keywords):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    for foldername in os.listdir(input_dir):
        folder_path = os.path.join(input_dir, foldername)
        if os.path.isdir(folder_path):
            output_folder_path = os.path.join(output_dir, foldername)
            os.makedirs(output_folder_path, exist_ok=True)
            
            for filename in os.listdir(folder_path):
                input_file_path = os.path.join(folder_path, filename)
                output_file_path = os.path.join(output_folder_path, filename)
                
                with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
                    for line in infile:
                        # Ensure that the comparison is between strings
                        if not any(keyword in line for keyword in fashion_keywords):
                            outfile.write(line)

input_dir = '/Users/ddyilin/Documents/GitHub/fashion/extracted_texts'
output_dir = '/Users/ddyilin/Documents/GitHub/fashion/non_fashion_texts'

filter_non_fashion_texts(input_dir, output_dir, fashion_keywords)


#### Prepare Data for Model Training
- Tokenize the Non-Fashion Texts
- Combine with Fashion-Related Tokens

In [11]:
from transformers import BertTokenizer
import torch
import os
import random

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

def tokenize_corpus(directory, max_length):
    all_lines = []
    for foldername in os.listdir(directory):
        folder_path = os.path.join(directory, foldername)
        if os.path.isdir(folder_path):
            for filename in os.listdir(folder_path):
                file_path = os.path.join(folder_path, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    # Collect all lines to tokenize them at once, skipping blank lines
                    all_lines.extend([line.strip() for line in file if line.strip() != ""])
    
    # Tokenize all lines with padding and truncation to a fixed max length
    tokenized_texts = tokenizer(all_lines, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
    
    # Manually adjust padding to ensure uniform sequence length
    input_ids = torch.nn.functional.pad(tokenized_texts['input_ids'], (0, max_length - tokenized_texts['input_ids'].size(1)), value=0)
    attention_mask = torch.nn.functional.pad(tokenized_texts['attention_mask'], (0, max_length - tokenized_texts['attention_mask'].size(1)), value=0)
    
    return {'input_ids': input_ids, 'attention_mask': attention_mask}

# Set a max length for all tokenized sequences
max_length = 512  # You can adjust this based on your dataset

# Tokenize non-fashion related texts
non_fashion_tokens = tokenize_corpus('/Users/ddyilin/Documents/GitHub/fashion/non_fashion_texts', max_length)

# Assuming fashion_keywords is a list of strings
fashion_tokens = tokenizer(fashion_keywords, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

# Combine the tokenized non-fashion and fashion tokens
combined_input_ids = torch.cat((non_fashion_tokens['input_ids'], fashion_tokens['input_ids']), dim=0)
combined_attention_mask = torch.cat((non_fashion_tokens['attention_mask'], fashion_tokens['attention_mask']), dim=0)

# If token_type_ids are available, concatenate them as well
if 'token_type_ids' in non_fashion_tokens and 'token_type_ids' in fashion_tokens:
    combined_token_type_ids = torch.cat((non_fashion_tokens['token_type_ids'], fashion_tokens['token_type_ids']), dim=0)
else:
    combined_token_type_ids = None

# Label fashion-related texts as 1 and non-fashion-related texts as 0
fashion_labels = [1] * fashion_tokens['input_ids'].shape[0]
non_fashion_labels = [0] * non_fashion_tokens['input_ids'].shape[0]

# Combine the labels
all_labels = fashion_labels + non_fashion_labels

# Shuffle the dataset
combined = list(zip(combined_input_ids, combined_attention_mask, all_labels))
random.shuffle(combined)

# Unzip the shuffled dataset
all_input_ids, all_attention_mask, all_labels = zip(*combined)

# Convert back to tensors
all_input_ids = torch.stack(all_input_ids)
all_attention_mask = torch.stack(all_attention_mask)
all_labels = torch.tensor(all_labels)

# If token_type_ids are used, shuffle and stack them too
if combined_token_type_ids is not None:
    combined_token_type_ids = list(zip(combined_token_type_ids))
    random.shuffle(combined_token_type_ids)
    all_token_type_ids = torch.stack([item[0] for item in combined_token_type_ids])
else:
    all_token_type_ids = None

# Print shapes for verification
print("Input IDs shape:", all_input_ids.shape)
print("Attention Mask shape:", all_attention_mask.shape)
if all_token_type_ids is not None:
    print("Token Type IDs shape:", all_token_type_ids.shape)
print("Labels shape:", all_labels.shape)


Input IDs shape: torch.Size([93018, 512])
Attention Mask shape: torch.Size([93018, 512])
Labels shape: torch.Size([93018])


#### Train the Model
- Split the Data into Training and Validation Sets
- Fine-Tune BERT

In [25]:
from sklearn.model_selection import train_test_split
import torch
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Ensure all_input_ids, all_attention_mask, and all_labels are tensors
all_input_ids = torch.tensor(all_input_ids)
all_attention_mask = torch.tensor(all_attention_mask)
all_labels = torch.tensor(all_labels)

# If token_type_ids is used, ensure it's a tensor too
if all_token_type_ids is not None:
    all_token_type_ids = torch.tensor(all_token_type_ids)

# Split the data into training and validation sets, ensuring consistency across all inputs
split_data = [all_input_ids, all_attention_mask, all_labels]
if all_token_type_ids is not None:
    split_data.append(all_token_type_ids)

train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
    *split_data, test_size=0.2, random_state=42)

# If token_type_ids are used, split them as well
if all_token_type_ids is not None:
    train_token_type_ids, val_token_type_ids = train_test_split(all_token_type_ids, test_size=0.2, random_state=42)
else:
    train_token_type_ids, val_token_type_ids = None, None

# Convert splits into PyTorch tensors (no need to re-convert if already tensors)
# Combine inputs, masks, and labels into a TensorDataset
if train_token_type_ids is not None:
    train_dataset = torch.utils.data.TensorDataset(train_inputs, train_masks, train_token_type_ids, train_labels)
    val_dataset = torch.utils.data.TensorDataset(val_inputs, val_masks, val_token_type_ids, val_labels)
else:
    train_dataset = torch.utils.data.TensorDataset(train_inputs, train_masks, train_labels)
    val_dataset = torch.utils.data.TensorDataset(val_inputs, val_masks, val_labels)

# Print out the shapes of the datasets to verify
print("Train input_ids shape:", train_inputs.shape)
print("Validation input_ids shape:", val_inputs.shape)
print("Train labels shape:", train_labels.shape)
print("Validation labels shape:", val_labels.shape)
print("Train attention masks shape:", train_masks.shape)
print("Validation attention masks shape:", val_masks.shape)
if train_token_type_ids is not None:
    print("Train token type ids shape:", train_token_type_ids.shape)
    print("Validation token type ids shape:", val_token_type_ids.shape)



  all_input_ids = torch.tensor(all_input_ids)
  all_attention_mask = torch.tensor(all_attention_mask)


Train input_ids shape: torch.Size([74414, 512])
Validation input_ids shape: torch.Size([18604, 512])
Train labels shape: torch.Size([74414])
Validation labels shape: torch.Size([18604])
Train attention masks shape: torch.Size([74414, 512])
Validation attention masks shape: torch.Size([18604, 512])
Input IDs shape: torch.Size([93018, 512])
Attention Mask shape: torch.Size([93018, 512])
Labels shape: torch.Size([93018])


  all_labels = torch.tensor(all_labels)


#### Check data before training
- Check Data Shapes
- Check for Any 'NaN' Values
- Check Data Distribution
- Inspect a Few Examples
- Check Label Value Range
- Verify Dataset Splitting
- Check for Duplicates

In [27]:
print("Input IDs shape:", all_input_ids.shape)
print("Attention Mask shape:", all_attention_mask.shape)
print("Labels shape:", all_labels.shape)

# If token_type_ids are used
if all_token_type_ids is not None:
    print("Token Type IDs shape:", all_token_type_ids.shape)

Input IDs shape: torch.Size([93018, 512])
Attention Mask shape: torch.Size([93018, 512])
Labels shape: torch.Size([93018])


In [26]:
import torch

# Check for NaNs in input_ids
print("NaNs in Input IDs:", torch.isnan(all_input_ids).any().item())

# Check for NaNs in attention_mask
print("NaNs in Attention Mask:", torch.isnan(all_attention_mask).any().item())

# Check for NaNs in labels
print("NaNs in Labels:", torch.isnan(all_labels).any().item())

# If using token_type_ids
if all_token_type_ids is not None:
    print("NaNs in Token Type IDs:", torch.isnan(all_token_type_ids).any().item())


NaNs in Input IDs: False
NaNs in Attention Mask: False
NaNs in Labels: False


In [28]:
from collections import Counter

label_counts = Counter(all_labels.tolist())
print("Label distribution:", label_counts)


Label distribution: Counter({0: 73084, 1: 19934})


In [29]:
for i in range(5):  # Check first 5 examples
    print(f"Example {i + 1}:")
    print("Input IDs:", all_input_ids[i])
    print("Attention Mask:", all_attention_mask[i])
    print("Label:", all_labels[i])
    if all_token_type_ids is not None:
        print("Token Type IDs:", all_token_type_ids[i])
    print("-" * 50)


Example 1:
Input IDs: tensor([ 101,  100,  185,  924, 5384,  185, 2548, 2861, 5384,  784,  102,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0

In [30]:
print("Unique labels:", set(all_labels.tolist()))


Unique labels: {0, 1}


In [31]:
print("Train inputs shape:", train_inputs.shape)
print("Validation inputs shape:", val_inputs.shape)
print("Train labels shape:", train_labels.shape)
print("Validation labels shape:", val_labels.shape)


Train inputs shape: torch.Size([74414, 512])
Validation inputs shape: torch.Size([18604, 512])
Train labels shape: torch.Size([74414])
Validation labels shape: torch.Size([18604])


In [32]:
unique_input_ids = torch.unique(all_input_ids, dim=0)
print("Number of unique input IDs:", unique_input_ids.shape[0])


Number of unique input IDs: 86132


In [23]:
from torch.utils.data import DataLoader

# Custom data collator that returns the batch directly
def custom_data_collator(batch):
    input_ids, attention_mask, labels = tuple(torch.stack(samples) for samples in zip(*batch))
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

# If using token_type_ids
if train_token_type_ids is not None:
    def custom_data_collator(batch):
        input_ids, attention_mask, token_type_ids, labels = tuple(torch.stack(samples) for samples in zip(*batch))
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,
            'labels': labels
        }

# DataLoader for the datasets
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=custom_data_collator)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=custom_data_collator)


#### Start Fine tune BERT

In [34]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Load the BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    max_grad_norm=1.0,
)

# Create a Trainer instance with the custom data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=custom_data_collator,
)

# Train the model
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  1%|          | 103/13953 [41:21<92:42:15, 24.10s/it]
                                         
  0%|          | 0/13953 [04:39<?, ?it/s]             

{'loss': 0.9895, 'grad_norm': 22.080347061157227, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}


                                         
  0%|          | 0/13953 [07:37<?, ?it/s]             

{'loss': 0.9156, 'grad_norm': 27.814926147460938, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.0}


                                         
  0%|          | 0/13953 [10:41<?, ?it/s]             

{'loss': 0.7861, 'grad_norm': 16.592445373535156, 'learning_rate': 3e-06, 'epoch': 0.0}


                                         
  0%|          | 0/13953 [13:59<?, ?it/s]             

{'loss': 0.6445, 'grad_norm': 12.68768310546875, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.0}


                                         
  0%|          | 0/13953 [17:23<?, ?it/s]             

{'loss': 0.5343, 'grad_norm': 20.902494430541992, 'learning_rate': 5e-06, 'epoch': 0.01}


                                         
  0%|          | 0/13953 [20:27<?, ?it/s]             

{'loss': 0.4718, 'grad_norm': 19.137393951416016, 'learning_rate': 6e-06, 'epoch': 0.01}


                                         
  0%|          | 0/13953 [23:57<?, ?it/s]             

{'loss': 0.6053, 'grad_norm': 8.738337516784668, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.01}


                                         
  0%|          | 0/13953 [27:15<?, ?it/s]             

{'loss': 0.5002, 'grad_norm': 9.99236011505127, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.01}


                                         
  0%|          | 0/13953 [30:32<?, ?it/s]             

{'loss': 0.482, 'grad_norm': 6.592872142791748, 'learning_rate': 9e-06, 'epoch': 0.01}


                                         
  0%|          | 0/13953 [34:05<?, ?it/s]              

{'loss': 0.463, 'grad_norm': 7.168025016784668, 'learning_rate': 1e-05, 'epoch': 0.01}


                                         
  0%|          | 0/13953 [37:19<?, ?it/s]              

{'loss': 0.6577, 'grad_norm': 5.1994853019714355, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.01}


                                         
  0%|          | 0/13953 [40:54<?, ?it/s]              

{'loss': 0.5842, 'grad_norm': 3.8844990730285645, 'learning_rate': 1.2e-05, 'epoch': 0.01}


                                         
  0%|          | 0/13953 [44:32<?, ?it/s]              

{'loss': 0.4571, 'grad_norm': 14.444279670715332, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.01}


                                         
  0%|          | 0/13953 [47:55<?, ?it/s]              

{'loss': 0.4301, 'grad_norm': 6.7403364181518555, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.02}


                                         
  0%|          | 0/13953 [51:07<?, ?it/s]              

{'loss': 0.6466, 'grad_norm': 10.062919616699219, 'learning_rate': 1.5e-05, 'epoch': 0.02}


                                         
  0%|          | 0/13953 [54:27<?, ?it/s]              

{'loss': 0.4555, 'grad_norm': 3.7900373935699463, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.02}


                                         
  0%|          | 0/13953 [57:43<?, ?it/s]              

{'loss': 0.5315, 'grad_norm': 7.345664978027344, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.02}


                                         
  0%|          | 0/13953 [1:01:15<?, ?it/s]            

{'loss': 0.5133, 'grad_norm': 5.336297988891602, 'learning_rate': 1.8e-05, 'epoch': 0.02}


                                           
  0%|          | 0/13953 [1:04:26<?, ?it/s]              

{'loss': 0.519, 'grad_norm': 4.229982376098633, 'learning_rate': 1.9e-05, 'epoch': 0.02}


                                           
  0%|          | 0/13953 [1:07:58<?, ?it/s]              

{'loss': 0.5679, 'grad_norm': 9.487654685974121, 'learning_rate': 2e-05, 'epoch': 0.02}


                                           
  0%|          | 0/13953 [1:11:14<?, ?it/s]              

{'loss': 0.4645, 'grad_norm': 4.944336414337158, 'learning_rate': 2.1e-05, 'epoch': 0.02}


                                           
  0%|          | 0/13953 [1:14:13<?, ?it/s]              

{'loss': 0.4214, 'grad_norm': 8.645834922790527, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.02}


                                           
  0%|          | 0/13953 [1:17:09<?, ?it/s]              

{'loss': 0.5536, 'grad_norm': 3.907554864883423, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.02}


                                           
  0%|          | 0/13953 [1:20:23<?, ?it/s]              

{'loss': 0.6197, 'grad_norm': 4.248780250549316, 'learning_rate': 2.4e-05, 'epoch': 0.03}


                                           
  0%|          | 0/13953 [1:23:39<?, ?it/s]              

{'loss': 0.523, 'grad_norm': 8.383756637573242, 'learning_rate': 2.5e-05, 'epoch': 0.03}


                                           
  0%|          | 0/13953 [1:27:35<?, ?it/s]              

{'loss': 0.4431, 'grad_norm': 3.9028663635253906, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.03}


                                           
  0%|          | 0/13953 [1:30:47<?, ?it/s]              

{'loss': 0.6013, 'grad_norm': 4.347005844116211, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.03}


                                           
  0%|          | 0/13953 [1:34:02<?, ?it/s]              

{'loss': 0.5299, 'grad_norm': 3.06646466255188, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.03}


                                           
  0%|          | 0/13953 [1:37:17<?, ?it/s]              

{'loss': 0.47, 'grad_norm': 10.422545433044434, 'learning_rate': 2.9e-05, 'epoch': 0.03}




#### Evaluate and Save the Model
- 

In [None]:
model.save_pretrained('/path/to/save/your/model')
