### Full Pipeline: Fine-Tuning BERT, Classifying Fashion-Related Content, and Applying Keyword Extraction

#### Step 1: Fine-Tune BERT on Fashion Corpus

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import torch

# Read the fashion topic lexicon that serves as the fine-tuning corpus.
fashion_keywords_df = pd.read_csv('/home/disk1/red_disk1/Multimodal_MKT/topics0611_filtered.csv')

# Training texts come from the 'keyword group' column: missing entries are
# discarded and every remaining value is coerced to str before being
# collected into a plain Python list.
fashion_keywords = list(
    fashion_keywords_df['keyword group']
    .dropna()
    .astype(str)
)

# Custom Dataset for Fashion Corpus
class FashionDataset(Dataset):
    """Torch dataset that tokenizes short texts for sequence classification.

    Args:
        texts: Sequence of strings to encode.
        tokenizer: Tokenizer object; it is called once per item with the
            keyword arguments shown in ``__getitem__``.
        max_len: Each item is padded/truncated to this many tokens.
        labels: Optional sequence of integer class ids, one per text. When
            omitted, every item is labelled 1 (fashion-related), which is
            the original behaviour. Supplying labels makes it possible to
            include negative (label 0) examples.

    Raises:
        ValueError: If ``labels`` is given and its length differs from
            ``texts``.
    """

    def __init__(self, texts, tokenizer, max_len, labels=None):
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"labels has {len(labels)} entries but texts has {len(texts)}"
            )
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.labels = None if labels is None else list(labels)

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded sample at ``idx``.

        The result is a dict with 1-D ``input_ids`` and ``attention_mask``
        tensors plus a scalar ``labels`` tensor of dtype long.
        """
        text = self.texts[idx]
        # Calling the tokenizer directly replaces the deprecated encode_plus.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        # Default label stays 1 (fashion-related) when no labels were given.
        label = 1 if self.labels is None else int(self.labels[idx])
        return {
            # return_tensors='pt' adds a leading batch axis; flatten drops it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=2)

# Set max length for the keywords
max_len = 32  # Adjust as per your dataset
fashion_dataset = FashionDataset(fashion_keywords, tokenizer, max_len)
# Not consumed by Trainer below (it is handed the datasets directly); kept
# so the loader remains available for manual iteration.
train_dataloader = DataLoader(fashion_dataset, batch_size=8, shuffle=True)

# NOTE(review): the dataset is built without explicit labels, so every
# sample carries label 1. A two-class head trained on one class only cannot
# learn to reject non-fashion text; add negative examples before relying on
# the classifier's output.

# Hold out roughly 10% of the corpus for evaluation so that eval_loss (used
# to pick the best checkpoint) is not measured on the training samples.
# The seeded generator keeps the split reproducible across runs.
if len(fashion_dataset) >= 2:
    eval_size = max(1, len(fashion_dataset) // 10)
    train_size = len(fashion_dataset) - eval_size
    train_split, eval_split = torch.utils.data.random_split(
        fashion_dataset,
        [train_size, eval_size],
        generator=torch.Generator().manual_seed(42),
    )
else:
    # Too small to split: fall back to the original behaviour.
    train_split = eval_split = fashion_dataset

# Define training arguments and fine-tune the model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    # load_best_model_at_end requires the save strategy to match the
    # evaluation strategy; the default (step-based) saving is rejected.
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained('/home/disk1/red_disk1/Multimodal_MKT/fashion_bert_model')


#### Step 2: Classify Fashion-Related Sentences in cleaned_post_text


In [None]:
import pandas as pd

# Read the raw posts
df = pd.read_csv('/home/disk1/red_disk1/Multimodal_MKT/poster_test_fashion_nlpclean.csv')
# df = pd.read_csv('/home/disk1/red_disk1/poster_9305.csv')

# Build 'post_text' as "<title> <content>" (missing parts become empty
# strings), then discard the source columns along with 'post_tag'.
df = df.assign(
    post_text=df['post_title'].fillna('') + ' ' + df['post_content'].fillna('')
).drop(columns=['post_title', 'post_content', 'post_tag'])

# Persist the combined frame
# df.to_csv('/home/disk1/red_disk1/Multimodal_MKT/poster_9305_combined.csv', index=False)
df.to_csv(
    '/home/disk1/red_disk1/Multimodal_MKT/poster_test_fashion_nlpclean_combined.csv',
    index=False,
)

import emoji
import re

# Read the stopword file (one entry per line) into a set for fast lookups
with open('/home/disk1/red_disk1/Multimodal_MKT/stopwords_cn.txt', mode='r', encoding='utf-8') as f:
    stopwords = set()
    stopwords.update(f.read().splitlines())

# Function for text cleaning
def clean_text(text, stopwords):
    """Normalize a raw post string and strip stopwords.

    Steps, in order: emojis are converted to their textual names; the
    platform boilerplate patterns are removed; '#' becomes a space; digit
    runs are deleted; every character that is neither alphanumeric nor
    whitespace is dropped; finally whitespace-separated tokens found in
    ``stopwords`` are removed.

    Args:
        text: The string to clean.
        stopwords: Container of tokens to discard (membership-tested).

    Returns:
        The surviving tokens joined by single spaces.
    """
    text = emoji.demojize(text)

    # (pattern, replacement) pairs, applied sequentially. Order matters:
    # the date-like ",,XX-XX,," pattern must run before digits are removed.
    substitutions = (
        (r'- 小红书,,', ''),
        (r'小红书', ''),
        (r',,\d{2}-\d{2},,', ''),
        (r'#', ' '),
        (r'\d+', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Keep only alphanumeric and whitespace characters.
    kept_chars = [ch for ch in text if ch.isalnum() or ch.isspace()]
    tokens = ''.join(kept_chars).split()

    # Token-level stopword removal.
    return ' '.join(token for token in tokens if token not in stopwords)

# Derive 'cleaned_post_text' by running every post through clean_text
df['cleaned_post_text'] = df['post_text'].map(lambda raw: clean_text(str(raw), stopwords))

# Keep only the first occurrence of fully identical rows
df = df.drop_duplicates(keep='first')

# Write the cleaned posts to disk
df.to_csv(
    '/home/disk1/red_disk1/Multimodal_MKT/post_cleaned.csv',
    index=False,
)


In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Tokenizer comes from the base checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# Load the fine-tuned fashion classifier and switch it to evaluation mode
# (eval() returns the module itself, so the call can be chained).
model = BertForSequenceClassification.from_pretrained(
    '/home/disk1/red_disk1/Multimodal_MKT/fashion_bert_model'
).eval()

# Function to classify if a sentence/phrase is fashion-related
def classify_fashion_sentence(sentence, model, tokenizer):
    """Return True when the model's top-scoring class for ``sentence`` is 1.

    Args:
        sentence: Text to classify.
        model: Callable invoked with the tokenizer output as keyword
            arguments; its result must expose a ``logits`` tensor.
        tokenizer: Callable that encodes ``sentence`` into model inputs
            (truncated to at most 512 tokens, returned as PyTorch tensors).

    Returns:
        bool: whether the arg-max class index equals 1 (fashion-related).
    """
    encoded = tokenizer(
        sentence,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )
    # Inference only: no gradients are needed.
    with torch.no_grad():
        class_scores = model(**encoded).logits
    winning_class = class_scores.argmax(dim=1).item()
    return winning_class == 1

# Function to filter fashion-related sentences in a post
def filter_fashion_sentences(text, model, tokenizer):
    """Keep only the fragments of ``text`` that the model deems fashion-related.

    ``text`` is split on the full-width comma '，'; each non-blank fragment
    is passed to ``classify_fashion_sentence`` and the accepted fragments
    are joined with single spaces.

    Args:
        text: Post text. Non-string values (e.g. the NaN that pandas yields
            for an empty CSV cell) are treated as empty text.
        model: Classifier forwarded to ``classify_fashion_sentence``.
        tokenizer: Tokenizer forwarded to ``classify_fashion_sentence``.

    Returns:
        str: the accepted fragments joined by ' ', or '' when there are none.
    """
    # An empty cleaned text is read back from CSV as float NaN; calling
    # .split on it would raise AttributeError.
    if not isinstance(text, str):
        return ''

    # NOTE(review): clean_text drops every non-alphanumeric character, so
    # text that went through it no longer contains '，' and is classified
    # as one fragment. Feed punctuation-preserving text if per-sentence
    # filtering is wanted.
    # Blank fragments carry no content, so they are not sent to the model.
    fragments = [fragment for fragment in text.split('，') if fragment.strip()]
    fashion_sentences = [
        fragment
        for fragment in fragments
        if classify_fashion_sentence(fragment, model, tokenizer)
    ]
    return ' '.join(fashion_sentences)

# Load the cleaned data
df = pd.read_csv('/home/disk1/red_disk1/Multimodal_MKT/post_cleaned.csv')

# Posts whose cleaned text was empty are stored as empty CSV fields and come
# back from read_csv as NaN (a float). Normalise them to '' so the string
# operations below do not fail on such rows.
df['cleaned_post_text'] = df['cleaned_post_text'].fillna('').astype(str)

# Apply the sentence filtering function to each row of 'cleaned_post_text'
df['fashion_text'] = df['cleaned_post_text'].apply(lambda x: filter_fashion_sentences(x, model, tokenizer))

# Save the DataFrame with filtered fashion-related sentences
df.to_csv('/home/disk1/red_disk1/Multimodal_MKT/post_filtered_bert.csv', index=False)

# Display the count of fashion-related posts: a post counts as
# fashion-related when at least one fragment survived the filter.
num_fashion_related = df['fashion_text'].apply(lambda x: len(x.strip()) > 0).sum()
num_non_fashion_related = len(df) - num_fashion_related
print(f"Number of fashion-related posts: {num_fashion_related}")
print(f"Number of non-fashion-related posts: {num_non_fashion_related}")


#### Step 3: Extract Fashion-Related Keywords Using RAKE


In [None]:
import rake_nltk
from rake_nltk import Rake

# Initialize RAKE keyword extractor (module-level instance shared by
# extract_keywords below).
# NOTE(review): constructed with no arguments, so the library defaults apply
# (presumably an English stop-word list and tokenizer) -- confirm this is
# suitable for the Chinese text processed here.
r = Rake()  # You can also pass a stopwords list here, e.g., Rake(stopwords)

# Function to extract fashion-related keywords using RAKE
def extract_keywords(text):
    """Run the shared RAKE extractor on ``text`` and return its ranked phrases.

    Uses the module-level ``r`` instance, whose internal state is
    overwritten on every call.

    Args:
        text: The text to extract key phrases from.

    Returns:
        The value of ``r.get_ranked_phrases()`` after processing ``text``.
    """
    extractor = r
    extractor.extract_keywords_from_text(text)
    ranked_phrases = extractor.get_ranked_phrases()
    return ranked_phrases

# Extract RAKE key phrases for every row of 'fashion_text'
df['fashion_keywords'] = df['fashion_text'].apply(extract_keywords)

# Persist the frame including the extracted key phrases
df.to_csv(
    '/home/disk1/red_disk1/Multimodal_MKT/post_filtered_rake.csv',
    index=False,
)


#### Step 4: Filter Out Non-Fashion Keywords


In [None]:
# Read the fashion lexicon from 'topics0611_filtered.csv'
fashion_lexicon_df = pd.read_csv('/home/disk1/red_disk1/Multimodal_MKT/topics0611_filtered.csv')

# Unique, non-missing 'keyword group' entries as strings; a set gives
# constant-time membership checks when filtering keywords.
fashion_keywords = set(
    fashion_lexicon_df['keyword group']
    .dropna()
    .astype(str)
    .tolist()
)

# Function to keep only fashion-related keywords
def filter_fashion_keywords(keywords):
    """Return the entries of ``keywords`` that appear in the fashion lexicon.

    Membership is an exact match against the module-level
    ``fashion_keywords`` collection; input order is preserved.

    Args:
        keywords: Iterable of candidate keyword strings.

    Returns:
        list: the candidates found in ``fashion_keywords``.
    """
    matched = []
    for candidate in keywords:
        if candidate in fashion_keywords:
            matched.append(candidate)
    return matched

# Restrict each row's extracted phrases to those present in the lexicon
df['filtered_fashion_keywords'] = df['fashion_keywords'].apply(filter_fashion_keywords)

# Write the final result, now including the lexicon-filtered keywords
df.to_csv(
    '/home/disk1/red_disk1/Multimodal_MKT/post_filtered.csv',
    index=False,
)
