#### Prepare data

In [3]:
import pandas as pd

# Load the poster_test_fashion_nlpclean.csv file
poster_df = pd.read_csv('/Users/ddyilin/Documents/GitHub/fashion/poster_test_fashion_nlpclean.csv')

# Combine the post_title and post_content columns into a new column post_text
poster_df['post_text'] = poster_df['post_title'].fillna('') + ' ' + poster_df['post_content'].fillna('')

# Keep only the post_text and post_comment_content columns
poster_df = poster_df[['post_text', 'post_comment_content']]

# Remove duplicate rows
poster_df = poster_df.drop_duplicates()

# Save the resulting DataFrame to a new CSV file
poster_df.to_csv('/Users/ddyilin/Documents/GitHub/fashion/nlpclean_processed.csv', index=False)

print("Data has been processed and saved to 'nlpclean_processed.csv'")


Data has been processed and saved to 'nlpclean_processed.csv'


#### Data cleaning and clustering

In [6]:
import pandas as pd
import re
import emoji
import jieba

# Load stopwords from the provided file
with open('/Users/ddyilin/Documents/GitHub/fashion/stopwords_cn.txt', 'r', encoding='utf-8') as f:
    stopwords = set(f.read().splitlines())

# Load the nlpclean_processed.csv file
poster_df = pd.read_csv('/Users/ddyilin/Documents/GitHub/fashion/nlpclean_processed.csv')

# Function for text cleaning
def clean_text(text, stopwords):
    # Convert emojis to text
    text = emoji.demojize(text)
    
    # Remove specific patterns
    text = re.sub(r'- 小红书,,', '', text)
    text = re.sub(r',,\d{2}-\d{2},,', '', text)
    text = re.sub(r'#', ' ', text)
    
    # Remove digits
    text = re.sub(r'\d+', '', text)
    
    # Remove special characters
    cleaned_text = ''.join(char for char in text if char.isalnum() or char.isspace())
    
    # Tokenize
    words = jieba.cut(cleaned_text)
    
    # Remove stopwords
    filtered_words = [word for word in words if word not in stopwords]
    
    return ' '.join(filtered_words)

# Apply data cleaning to post_text and post_comment_content
poster_df['post_text_clean'] = poster_df['post_text'].apply(lambda x: clean_text(x, stopwords))
poster_df['post_comment_content_clean'] = poster_df['post_comment_content'].apply(lambda x: clean_text(str(x), stopwords))

# Save the cleaned data to a new CSV file
poster_df.to_csv('/Users/ddyilin/Documents/GitHub/fashion/nlpclean_processed_cleaned.csv', index=False)

print("Data cleaning complete. Cleaned data saved to 'nlpclean_processed_cleaned.csv'")


Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/5d/lsfntfvj5jbbj9d1b8z0_qw00000gn/T/jieba.cache
Loading model cost 0.314 seconds.
Prefix dict has been built successfully.


Data cleaning complete. Cleaned data saved to 'nlpclean_processed_cleaned.csv'


#### Get a non-fashion Chinese corpus
- Chinese Wikipedia Dump
- Use WikiExtractor to convert the XML dump into plain text
- Get extracted_texts
- Filter Non-Fashion Content from the Extracted Texts

In [10]:
import pandas as pd

# Load the topics0611.csv file
fashion_keywords_df = pd.read_csv('/Users/ddyilin/Documents/GitHub/fashion/topics0611.csv')

# Extract the 'keyword group' column as a list of fashion-related keywords
# Convert all values to strings and drop NaN values
fashion_keywords = fashion_keywords_df['keyword group'].dropna().astype(str).tolist()


In [11]:
# Filter Out Fashion-Related Content
import os

def filter_non_fashion_texts(input_dir, output_dir, fashion_keywords):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    for foldername in os.listdir(input_dir):
        folder_path = os.path.join(input_dir, foldername)
        if os.path.isdir(folder_path):
            output_folder_path = os.path.join(output_dir, foldername)
            os.makedirs(output_folder_path, exist_ok=True)
            
            for filename in os.listdir(folder_path):
                input_file_path = os.path.join(folder_path, filename)
                output_file_path = os.path.join(output_folder_path, filename)
                
                with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
                    for line in infile:
                        # Ensure that the comparison is between strings
                        if not any(keyword in line for keyword in fashion_keywords):
                            outfile.write(line)

input_dir = '/Users/ddyilin/Documents/GitHub/fashion/extracted_texts'
output_dir = '/Users/ddyilin/Documents/GitHub/fashion/non_fashion_texts'

filter_non_fashion_texts(input_dir, output_dir, fashion_keywords)


#### Prepare Data for Model Training
- Tokenize the Non-Fashion Texts
- Combine with Fashion-Related Tokens

In [36]:
from transformers import BertTokenizer
import torch
import os

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

def tokenize_corpus(directory):
    all_lines = []
    for foldername in os.listdir(directory):
        folder_path = os.path.join(directory, foldername)
        if os.path.isdir(folder_path):
            for filename in os.listdir(folder_path):
                file_path = os.path.join(folder_path, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    # Collect all lines to tokenize them at once
                    all_lines.extend([line.strip() for line in file if line.strip()])
    
    # Tokenize all lines with padding and truncation
    tokenized_texts = tokenizer(all_lines, padding=True, truncation=True, return_tensors='pt')
    return tokenized_texts

# Tokenize non-fashion related texts
non_fashion_tokens = tokenize_corpus('/Users/ddyilin/Documents/GitHub/fashion/non_fashion_texts')

# Assuming fashion_keywords is a list of strings
fashion_tokens = tokenizer(fashion_keywords, padding=True, truncation=True, return_tensors='pt')

# Print the keys of the tokenized fashion tokens dictionary
print(fashion_tokens.keys())
# Output: dict_keys(['input_ids', 'attention_mask', 'token_type_ids'])


ImportError: Unable to convert output to PyTorch tensors format, PyTorch is not installed.

In [16]:
import random

# Label fashion-related texts as 1 and non-fashion-related texts as 0
fashion_labels = [1] * len(fashion_tokens)
non_fashion_labels = [0] * len(non_fashion_tokens)

# Combine the data
all_tokens = fashion_tokens + non_fashion_tokens
all_labels = fashion_labels + non_fashion_labels

# Shuffle the dataset
combined = list(zip(all_tokens, all_labels))
random.shuffle(combined)
all_tokens, all_labels = zip(*combined)


#### Train the Model
- Split the Data into Training and Validation Sets
- Fine-Tune BERT

In [22]:
from sklearn.model_selection import train_test_split
import torch

train_inputs, val_inputs, train_labels, val_labels = train_test_split(all_tokens, all_labels, test_size=0.2, random_state=42)

train_inputs = torch.tensor(train_inputs)
val_inputs = torch.tensor(val_inputs)
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)


ValueError: expected sequence of length 5 at dim 1 (got 2)

In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Load the BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=torch.utils.data.TensorDataset(train_inputs, train_labels),
    eval_dataset=torch.utils.data.TensorDataset(val_inputs, val_labels),
)

# Train the model
trainer.train()


#### Evaluate and Save the Model
- 

In [None]:
model.save_pretrained('/path/to/save/your/model')
