#### Prepare data

In [3]:
import pandas as pd

# Read the raw export, merge title and body into a single `post_text` column
# (a missing title or body counts as an empty string), keep only the two text
# columns, and drop rows that are exact duplicates.
# NOTE(review): the paths below are absolute and machine-specific.
poster_df = (
    pd.read_csv('/Users/ddyilin/Documents/GitHub/fashion/poster_test_fashion_nlpclean.csv')
    .assign(
        post_text=lambda frame: frame['post_title'].fillna('') + ' ' + frame['post_content'].fillna('')
    )
    .loc[:, ['post_text', 'post_comment_content']]
    .drop_duplicates()
)

# Persist the reduced frame; the index is not written to the file
poster_df.to_csv('/Users/ddyilin/Documents/GitHub/fashion/nlpclean_processed.csv', index=False)

print("Data has been processed and saved to 'nlpclean_processed.csv'")


Data has been processed and saved to 'nlpclean_processed.csv'


#### Data cleaning and clustering

In [6]:
import pandas as pd
import re
import emoji
import jieba

# Stopword list: every line of the UTF-8 file becomes one member of the set
with open('/Users/ddyilin/Documents/GitHub/fashion/stopwords_cn.txt', encoding='utf-8') as stopword_file:
    stopwords = {entry for entry in stopword_file.read().splitlines()}

# Read nlpclean_processed.csv back into a DataFrame
poster_df = pd.read_csv('/Users/ddyilin/Documents/GitHub/fashion/nlpclean_processed.csv')

# Function for text cleaning
def clean_text(text, stopwords):
    """Clean one piece of text and return its tokens joined by single spaces.

    Steps, in order: emojis are converted to their textual names, the
    '- 小红书,,' fragment, ',,NN-NN,,' fragments (presumably month-day stamps)
    and '#' markers are removed, digits are dropped, every character that is
    neither alphanumeric nor whitespace is discarded, the remainder is
    segmented with jieba, and stopwords are filtered out.

    Args:
        text: Raw text. ``None`` and float NaN (what pandas yields for a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()``.
        stopwords: Collection of tokens to drop; only ``in`` is used on it.

    Returns:
        The surviving tokens separated by single spaces, or ``''`` when
        nothing survives.
    """
    if not isinstance(text, str):
        # NaN is the only float that compares unequal to itself
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    # Convert emojis to text. The special-character filter further down strips
    # the ':' delimiters and any '_' in the emoji name.
    text = emoji.demojize(text)

    # Remove specific patterns
    text = re.sub(r'- 小红书,,', '', text)
    text = re.sub(r',,\d{2}-\d{2},,', '', text)
    text = re.sub(r'#', ' ', text)

    # Remove digits
    text = re.sub(r'\d+', '', text)

    # Remove special characters (keep only alphanumerics and whitespace)
    cleaned_text = ''.join(char for char in text if char.isalnum() or char.isspace())

    # Tokenize, then drop stopwords and whitespace-only tokens. jieba emits
    # whitespace as tokens of its own; keeping them would leave runs of spaces
    # and embedded newlines in the joined result.
    filtered_words = [
        word
        for word in jieba.cut(cleaned_text)
        if word.strip() and word not in stopwords
    ]

    return ' '.join(filtered_words)

# Apply data cleaning to post_text and post_comment_content.
# Missing cells are replaced with '' first: calling str() on NaN would yield the
# literal text 'nan', which would then survive cleaning as if it were a real word.
for column in ('post_text', 'post_comment_content'):
    poster_df[f'{column}_clean'] = (
        poster_df[column]
        .fillna('')
        .astype(str)
        .apply(lambda text: clean_text(text, stopwords))
    )

# Save the cleaned data to a new CSV file (index not written)
poster_df.to_csv('/Users/ddyilin/Documents/GitHub/fashion/nlpclean_processed_cleaned.csv', index=False)

print("Data cleaning complete. Cleaned data saved to 'nlpclean_processed_cleaned.csv'")


Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/5d/lsfntfvj5jbbj9d1b8z0_qw00000gn/T/jieba.cache
Loading model cost 0.314 seconds.
Prefix dict has been built successfully.


Data cleaning complete. Cleaned data saved to 'nlpclean_processed_cleaned.csv'
