#### Expand your keyword list using RAKE (Rapid Automatic Keyword Extraction)
1. Tokenize the keyword group column to break down fashion phrases into individual words.
2. Apply RAKE to identify high-importance words from the tokenized text.
3. Expand the fashion keyword list by combining tokenized words and extracted keywords.

In [7]:
import pandas as pd
import jieba
import rake_nltk
from rake_nltk import Rake
import nltk
nltk.download('stopwords')

# Load the topics0611_filtered.csv file
fashion_keywords_df = pd.read_csv('/Users/ddyilin/Documents/GitHub/fashion/topics0611.csv')

# Step 1: Tokenize the 'keyword group' column
def tokenize_keywords(text):
    tokens = jieba.lcut(text)
    return tokens

# Apply tokenization to the 'keyword group' column
fashion_keywords_df['tokenized_keywords'] = fashion_keywords_df['keyword group'].apply(lambda x: tokenize_keywords(str(x)))

# Step 2: Combine all tokenized words into a single list for keyword extraction
all_tokenized_keywords = [token for sublist in fashion_keywords_df['tokenized_keywords'] for token in sublist]

# Join the tokenized words to create a corpus for RAKE keyword extraction
corpus = ' '.join(all_tokenized_keywords)

# Step 3: Use RAKE to extract important keywords from the tokenized corpus
rake_extractor = Rake(language="chinese")  # Specify 'chinese' for RAKE in Chinese text

# Extract keywords from the tokenized corpus
rake_extractor.extract_keywords_from_text(corpus)

# Get the top keywords and phrases identified by RAKE
ranked_keywords = rake_extractor.get_ranked_phrases()

# Step 4: Combine the tokenized keywords with the extracted keywords for the expanded keyword list
expanded_fashion_keywords = set(all_tokenized_keywords + ranked_keywords)

# Save the expanded keyword list to a CSV file
expanded_keywords_df = pd.DataFrame(list(expanded_fashion_keywords), columns=['fashion_keywords'])
expanded_keywords_df.to_csv('/Users/ddyilin/Documents/GitHub/fashion/expanded_fashion_keywords.csv', index=False)

# Output the number of expanded fashion keywords
print(f"Number of expanded fashion keywords: {len(expanded_fashion_keywords)}")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ddyilin/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/ddyilin/nltk_data'
    - '/Users/ddyilin/miniconda3/envs/d2l/nltk_data'
    - '/Users/ddyilin/miniconda3/envs/d2l/share/nltk_data'
    - '/Users/ddyilin/miniconda3/envs/d2l/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


#### Get a non-fashion Chinese corpus
- Chinese Wikipedia Dump
- Use WikiExtractor to convert the XML dump into plain text
- Get extracted_texts
- Filter Non-Fashion Content from the Extracted Texts, using fashion keywords

In [None]:
import pandas as pd

# Load the topics0611.csv file
fashion_keywords_df = pd.read_csv('/Users/ddyilin/Documents/GitHub/fashion/topics0611_filtered.csv')

# Extract the 'keyword group' column as a list of fashion-related keywords
# Convert all values to strings and drop NaN values
fashion_keywords = fashion_keywords_df['keyword group'].dropna().astype(str).tolist()


In [None]:
# Filter Out Fashion-Related Content
import os

def filter_non_fashion_texts(input_dir, output_dir, fashion_keywords):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    for foldername in os.listdir(input_dir):
        folder_path = os.path.join(input_dir, foldername)
        if os.path.isdir(folder_path):
            output_folder_path = os.path.join(output_dir, foldername)
            os.makedirs(output_folder_path, exist_ok=True)
            
            for filename in os.listdir(folder_path):
                input_file_path = os.path.join(folder_path, filename)
                output_file_path = os.path.join(output_folder_path, filename)
                
                with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
                    for line in infile:
                        # Ensure that the comparison is between strings
                        if not any(keyword in line for keyword in fashion_keywords):
                            outfile.write(line)

input_dir = '/Users/ddyilin/Documents/GitHub/fashion/extracted_texts'
output_dir = '/Users/ddyilin/Documents/GitHub/fashion/non_fashion_texts'

filter_non_fashion_texts(input_dir, output_dir, fashion_keywords)


#### Prepare Data for Model Training
- Tokenize the Non-Fashion Texts
- Combine with Fashion-Related Tokens

In [None]:
from transformers import BertTokenizer
import torch
import os
import random

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

def tokenize_corpus(directory, max_length):
    all_lines = []
    for foldername in os.listdir(directory):
        folder_path = os.path.join(directory, foldername)
        if os.path.isdir(folder_path):
            for filename in os.listdir(folder_path):
                file_path = os.path.join(folder_path, filename)
                with open(file_path, 'r', encoding='utf-8') as file:
                    # Collect all lines to tokenize them at once, skipping blank lines
                    all_lines.extend([line.strip() for line in file if line.strip() != ""])
    
    # Tokenize all lines with padding and truncation to a fixed max length
    tokenized_texts = tokenizer(all_lines, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
    
    # Manually adjust padding to ensure uniform sequence length
    input_ids = torch.nn.functional.pad(tokenized_texts['input_ids'], (0, max_length - tokenized_texts['input_ids'].size(1)), value=0)
    attention_mask = torch.nn.functional.pad(tokenized_texts['attention_mask'], (0, max_length - tokenized_texts['attention_mask'].size(1)), value=0)
    
    return {'input_ids': input_ids, 'attention_mask': attention_mask}

# Set a max length for all tokenized sequences
max_length = 512  

# Tokenize non-fashion related texts
non_fashion_tokens = tokenize_corpus('/Users/ddyilin/Documents/GitHub/fashion/non_fashion_texts', max_length)

# Assuming fashion_keywords is a list of strings
fashion_tokens = tokenizer(fashion_keywords, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

# Combine the tokenized non-fashion and fashion tokens
combined_input_ids = torch.cat((non_fashion_tokens['input_ids'], fashion_tokens['input_ids']), dim=0)
combined_attention_mask = torch.cat((non_fashion_tokens['attention_mask'], fashion_tokens['attention_mask']), dim=0)

# If token_type_ids are available, concatenate them as well
if 'token_type_ids' in non_fashion_tokens and 'token_type_ids' in fashion_tokens:
    combined_token_type_ids = torch.cat((non_fashion_tokens['token_type_ids'], fashion_tokens['token_type_ids']), dim=0)
else:
    combined_token_type_ids = None

# Label fashion-related texts as 1 and non-fashion-related texts as 0
fashion_labels = [1] * fashion_tokens['input_ids'].shape[0]
non_fashion_labels = [0] * non_fashion_tokens['input_ids'].shape[0]

# Combine the labels
all_labels = fashion_labels + non_fashion_labels

# Shuffle the dataset
combined = list(zip(combined_input_ids, combined_attention_mask, all_labels))
random.shuffle(combined)

# Unzip the shuffled dataset
all_input_ids, all_attention_mask, all_labels = zip(*combined)

# Convert back to tensors
all_input_ids = torch.stack(all_input_ids)
all_attention_mask = torch.stack(all_attention_mask)
all_labels = torch.tensor(all_labels)

# If token_type_ids are used, shuffle and stack them too
if combined_token_type_ids is not None:
    combined_token_type_ids = list(zip(combined_token_type_ids))
    random.shuffle(combined_token_type_ids)
    all_token_type_ids = torch.stack([item[0] for item in combined_token_type_ids])
else:
    all_token_type_ids = None

# Print shapes for verification
print("Input IDs shape:", all_input_ids.shape)
print("Attention Mask shape:", all_attention_mask.shape)
if all_token_type_ids is not None:
    print("Token Type IDs shape:", all_token_type_ids.shape)
print("Labels shape:", all_labels.shape)


#### Train the Model
- Split the Data into Training and Validation Sets
- Fine-Tune BERT

In [None]:
from sklearn.model_selection import train_test_split
import torch
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Ensure all_input_ids, all_attention_mask, and all_labels are tensors
all_input_ids = torch.tensor(all_input_ids)
all_attention_mask = torch.tensor(all_attention_mask)
all_labels = torch.tensor(all_labels)

# If token_type_ids is used, ensure it's a tensor too
if all_token_type_ids is not None:
    all_token_type_ids = torch.tensor(all_token_type_ids)

# Split the data into training and validation sets, ensuring consistency across all inputs
split_data = [all_input_ids, all_attention_mask, all_labels]
if all_token_type_ids is not None:
    split_data.append(all_token_type_ids)

train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
    *split_data, test_size=0.2, random_state=42)

# If token_type_ids are used, split them as well
if all_token_type_ids is not None:
    train_token_type_ids, val_token_type_ids = train_test_split(all_token_type_ids, test_size=0.2, random_state=42)
else:
    train_token_type_ids, val_token_type_ids = None, None

# Convert splits into PyTorch tensors (no need to re-convert if already tensors)
# Combine inputs, masks, and labels into a TensorDataset
if train_token_type_ids is not None:
    train_dataset = torch.utils.data.TensorDataset(train_inputs, train_masks, train_token_type_ids, train_labels)
    val_dataset = torch.utils.data.TensorDataset(val_inputs, val_masks, val_token_type_ids, val_labels)
else:
    train_dataset = torch.utils.data.TensorDataset(train_inputs, train_masks, train_labels)
    val_dataset = torch.utils.data.TensorDataset(val_inputs, val_masks, val_labels)

# Print out the shapes of the datasets to verify
print("Train input_ids shape:", train_inputs.shape)
print("Validation input_ids shape:", val_inputs.shape)
print("Train labels shape:", train_labels.shape)
print("Validation labels shape:", val_labels.shape)
print("Train attention masks shape:", train_masks.shape)
print("Validation attention masks shape:", val_masks.shape)
if train_token_type_ids is not None:
    print("Train token type ids shape:", train_token_type_ids.shape)
    print("Validation token type ids shape:", val_token_type_ids.shape)



#### Check data before training
- Check Data Shapes
- Check for Any 'NaN' Values
- Check Data Distribution
- Inspect a Few Examples
- Check Label Value Range
- Verify Dataset Splitting
- Check for Duplicates

In [None]:
print("Input IDs shape:", all_input_ids.shape)
print("Attention Mask shape:", all_attention_mask.shape)
print("Labels shape:", all_labels.shape)

# If token_type_ids are used
if all_token_type_ids is not None:
    print("Token Type IDs shape:", all_token_type_ids.shape)

In [None]:
import torch

# Check for NaNs in input_ids
print("NaNs in Input IDs:", torch.isnan(all_input_ids).any().item())

# Check for NaNs in attention_mask
print("NaNs in Attention Mask:", torch.isnan(all_attention_mask).any().item())

# Check for NaNs in labels
print("NaNs in Labels:", torch.isnan(all_labels).any().item())

# If using token_type_ids
if all_token_type_ids is not None:
    print("NaNs in Token Type IDs:", torch.isnan(all_token_type_ids).any().item())


In [None]:
from collections import Counter

label_counts = Counter(all_labels.tolist())
print("Label distribution:", label_counts)


In [None]:
for i in range(5):  # Check first 5 examples
    print(f"Example {i + 1}:")
    print("Input IDs:", all_input_ids[i])
    print("Attention Mask:", all_attention_mask[i])
    print("Label:", all_labels[i])
    if all_token_type_ids is not None:
        print("Token Type IDs:", all_token_type_ids[i])
    print("-" * 50)


In [None]:
print("Unique labels:", set(all_labels.tolist()))


In [None]:
print("Train inputs shape:", train_inputs.shape)
print("Validation inputs shape:", val_inputs.shape)
print("Train labels shape:", train_labels.shape)
print("Validation labels shape:", val_labels.shape)


In [None]:
unique_input_ids = torch.unique(all_input_ids, dim=0)
print("Number of unique input IDs:", unique_input_ids.shape[0])


In [None]:
from torch.utils.data import DataLoader

# Custom data collator that returns the batch directly
def custom_data_collator(batch):
    input_ids, attention_mask, labels = tuple(torch.stack(samples) for samples in zip(*batch))
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

# If using token_type_ids
if train_token_type_ids is not None:
    def custom_data_collator(batch):
        input_ids, attention_mask, token_type_ids, labels = tuple(torch.stack(samples) for samples in zip(*batch))
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,
            'labels': labels
        }

# DataLoader for the datasets
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=custom_data_collator)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=custom_data_collator)


#### Start Fine tune BERT
- Train BERT for Sequence Classification: classify whether a sentence or post is fashion-related

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split

# Load the BERT model and tokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=2)
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    warmup_steps=500,
    weight_decay=0.03,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",
    gradient_accumulation_steps=2,
    max_grad_norm=1.0,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

# Assuming you already have train_dataset and val_dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()


#### Evaluate and Save the Model
- input: post_cleaned.csv
- output: post_filtered.csv

In [None]:
model.save_pretrained('/Users/ddyilin/Documents/GitHub/fashion')


In [None]:
# Evaluate the model on the validation dataset
eval_results = trainer.evaluate()

# Print the evaluation results
print("Evaluation Results:", eval_results)


## Apply the model


#### Prepare post data
- Combine 'post_title' and 'post_content' into 'post_text'
- Clean and Tokenize 'post_text'

In [None]:
import pandas as pd

# Load the data
df = pd.read_csv('/home/disk1/red_disk1/Multimodal_MKT/poster_test_fashion_nlpclean.csv')
# df = pd.read_csv('/home/disk1/red_disk1/poster_9305.csv')

# Combine 'post_title' and 'post_content' into 'post_text'
df['post_text'] = df['post_title'].fillna('') + ' ' + df['post_content'].fillna('')

# Drop 'post_title' and 'post_content' columns
df = df.drop(columns=['post_title', 'post_content', 'post_tag'])

# Save the updated DataFrame
# df.to_csv('/home/disk1/red_disk1/Multimodal_MKT/poster_9305_combined.csv', index=False)
df.to_csv('/home/disk1/red_disk1/Multimodal_MKT/poster_test_fashion_nlpclean_combined.csv', index=False)


In [None]:
import emoji
import re

# Load stopwords from the provided file
with open('/home/disk1/red_disk1/Multimodal_MKT/stopwords_cn.txt', 'r', encoding='utf-8') as f:
    stopwords = set(f.read().splitlines())

# Function for text cleaning
def clean_text(text, stopwords):
    # Convert emojis to text
    text = emoji.demojize(text)
    
    # Remove specific patterns
    text = re.sub(r'- 小红书,,', '', text)  # Removing "- 小红书,,"
    text = re.sub(r'小红书', '', text)  # Explicitly remove "小红书"
    text = re.sub(r',,\d{2}-\d{2},,', '', text)  # Removing patterns like ",,XX-XX,,"
    text = re.sub(r'#', ' ', text)  # Replace '#' with a space
    
    # Remove digits
    text = re.sub(r'\d+', '', text)
    
    # Remove special characters
    cleaned_text = ''.join(char for char in text if char.isalnum() or char.isspace())
    
    # Remove stopwords (word-based removal)
    cleaned_text = ' '.join([word for word in cleaned_text.split() if word not in stopwords])
    
    return cleaned_text

# Apply cleaning function to 'post_text'
df['cleaned_post_text'] = df['post_text'].apply(lambda x: clean_text(str(x), stopwords))

# Remove duplicates
df = df.drop_duplicates()

# Save the cleaned DataFrame
df.to_csv('/home/disk1/red_disk1/Multimodal_MKT/post_cleaned.csv', index=False)


#### Filter Non-Fashion Words Using the Model
- Use the Trained Model to Classify Fashion-Related Content

In [None]:
# Set the model to evaluation mode
model.eval()

def classify_fashion_text(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_label = torch.argmax(logits, dim=1).item()
    return predicted_label == 1  # Returns True if the text is fashion-related

# Apply classification to the 'cleaned_post_text'
df['is_fashion_related'] = df['cleaned_post_text'].apply(lambda x: classify_fashion_text(x, model, tokenizer))


- Apply Keyword Extraction (RAKE) on Fashion-Related Texts

In [None]:
from rake_nltk import Rake

# Initialize RAKE for keyword extraction
r = Rake()

def extract_fashion_keywords(text):
    r.extract_keywords_from_text(text)
    return r.get_ranked_phrases()  # Returns a list of keywords

# Apply RAKE keyword extraction to fashion-related texts
df['fashion_keywords'] = df.apply(lambda row: extract_fashion_keywords(row['cleaned_post_text']) if row['is_fashion_related'] else '', axis=1)

# Save the results
df.to_csv('/home/disk1/red_disk1/Multimodal_MKT/post_filtered.csv', index=False)
