## No dictionary
##### Inputs: poster_test_fashion_nlpclean.csv
##### Outputs: poster_test_fashion_clustered.csv
##### Steps:
1. Double textual data cleaning
2. RAKE Keyword Extraction
3. Embedding Generation: text2vec-large-chinese
4. Silhouette Method for Optimal Clusters
5. Clustering: KMeans clustering

In [4]:
import re
import jieba
from rake_nltk import Rake
import emoji
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np

# Load the stopwords file
stopwords_file_path = '/home/disk1/red_disk1/Multimodal_MKT/stopwords_cn.txt'
with open(stopwords_file_path, 'r', encoding='utf-8') as file:
    stopwords = set(file.read().splitlines())

# Load the poster_test_fashion_nlpclean.csv file
poster_df = pd.read_csv('/home/disk1/red_disk1/Multimodal_MKT/test/poster_test_fashion_nlpclean.csv')

# Ensure that the post_title and post_content columns are filled
poster_df['post_title'] = poster_df['post_title'].fillna('')
poster_df['post_content'] = poster_df['post_content'].fillna('')

# Combine titles and content for searching
poster_df['combined_text'] = poster_df['post_title'] + ' ' + poster_df['post_content']

# Function for text cleaning
def clean_text(text, stopwords):
    # Convert emojis to text
    text = emoji.demojize(text)
    
    # Remove specific patterns
    text = re.sub(r'- 小红书,,', '', text)
    text = re.sub(r',,\d{2}-\d{2},,', '', text)
    text = re.sub(r'#', ' ', text)
    
    # Remove digits
    text = re.sub(r'\d+', '', text)
    
    # Remove special characters
    cleaned_text = ''.join(char for char in text if char.isalnum() or char.isspace())
    
    # Tokenize
    words = jieba.cut(cleaned_text)
    
    # Remove stopwords
    filtered_words = [word for word in words if word not in stopwords]
    
    return ' '.join(filtered_words)

# Function to extract keywords using RAKE
def extract_keywords(text):
    r = Rake()
    r.extract_keywords_from_text(text)
    return r.get_ranked_phrases()

# Apply text cleaning and RAKE keyword extraction to each row
poster_df['cleaned_text'] = poster_df['combined_text'].apply(lambda x: clean_text(x, stopwords))
poster_df['rake_keywords'] = poster_df['cleaned_text'].apply(extract_keywords)

# Load the text2vec-large-chinese model and tokenizer
model_name = "shibing624/text2vec-large-chinese"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Function to get embeddings for the cleaned text
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        embedding = model(**inputs).last_hidden_state.mean(dim=1).squeeze().numpy()
    return embedding

# Generate embeddings for the cleaned text
poster_df['embedding'] = poster_df['cleaned_text'].apply(get_embedding)

# Stack the embeddings into a single numpy array for clustering
embeddings = np.stack(poster_df['embedding'].values)

# Determine the optimal number of clusters using the silhouette method
silhouette_scores = []
range_n_clusters = list(range(2, 11))  # Testing clusters from 2 to 10

for n_clusters in range_n_clusters:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(embeddings)
    silhouette_avg = silhouette_score(embeddings, cluster_labels)
    silhouette_scores.append(silhouette_avg)
    print(f"For n_clusters = {n_clusters}, the average silhouette score is {silhouette_avg}")

# Select the best number of clusters
best_n_clusters = range_n_clusters[np.argmax(silhouette_scores)]
print(f"The best number of clusters based on silhouette score is: {best_n_clusters}")

# Perform KMeans clustering with the best number of clusters
kmeans = KMeans(n_clusters=best_n_clusters, random_state=42)
poster_df['cluster'] = kmeans.fit_predict(embeddings)

# Save the clustered data to a new CSV file
poster_df[['poster_id', 'post_id', 'cleaned_text', 'rake_keywords', 'cluster']].to_csv('/home/disk1/red_disk1/Multimodal_MKT/test/poster_test_fashion_clustered.csv', index=False)

# Print a sample of the clustered data
print(poster_df[['poster_id', 'post_id', 'cleaned_text', 'rake_keywords', 'cluster']].head())


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/home/diandian/nltk_data'
    - '/opt/tljh/user/nltk_data'
    - '/opt/tljh/user/share/nltk_data'
    - '/opt/tljh/user/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


#### Text Visualization
1. Check Fashion Popularity among Clusters. The plot shows the number of posts in each cluster.
2. Top Keywords in Each Cluster

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the clustered data
poster_df = pd.read_csv('/home/disk1/red_disk1/Multimodal_MKT/test/poster_test_fashion_clustered.csv')

# Set up the visualization style
sns.set(style="whitegrid")

# Count the number of posts in each cluster
cluster_counts = poster_df['cluster'].value_counts().reset_index()
cluster_counts.columns = ['cluster', 'count']

# Visualize the popularity of each cluster
plt.figure(figsize=(12, 8))
sns.barplot(x='cluster', y='count', data=cluster_counts, palette="viridis")
plt.title('Popularity of Clusters (Number of Posts per Cluster)')
plt.xlabel('Cluster')
plt.ylabel('Number of Posts')
plt.show()

# Visualize the top keywords in each cluster
top_keywords_per_cluster = poster_df.groupby('cluster')['rake_keywords'].apply(lambda x: ', '.join(x).split(', ')).reset_index()
top_keywords_per_cluster['top_keywords'] = top_keywords_per_cluster['rake_keywords'].apply(lambda x: pd.Series(x).value_counts().head(5).index.tolist())

# Explode the top_keywords list for better visualization
top_keywords_per_cluster = top_keywords_per_cluster.explode('top_keywords')

plt.figure(figsize=(14, 10))
sns.countplot(y='top_keywords', hue='cluster', data=top_keywords_per_cluster, palette="tab20", order=top_keywords_per_cluster['top_keywords'].value_counts().index)
plt.title('Top Keywords in Each Cluster')
plt.xlabel('Frequency')
plt.ylabel('Keyword')
plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


#### Image visualization
The images associated with the top clusters are displayed, giving a visual representation of the content within those clusters.


In [3]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image

# Load the clustered data
poster_df = pd.read_csv('/home/disk1/red_disk1/Multimodal_MKT/test/poster_test_fashion_clustered.csv')

# Define the base path for the images
base_image_path = '/home/disk1/red_disk1/Multimodal_MKT/test/data/'

# Function to construct the image path
def get_image_path(row):
    poster_id = row['poster_id']
    post_id = row['post_id']
    return os.path.join(base_image_path, poster_id, post_id, 'post.png')

# Add the image path to the DataFrame
poster_df['image_path'] = poster_df.apply(get_image_path, axis=1)

# Check if the image file exists
poster_df['image_exists'] = poster_df['image_path'].apply(lambda x: os.path.exists(x))

# Filter out rows where the image does not exist
poster_df = poster_df[poster_df['image_exists']]

# Count the number of posts in each cluster
cluster_counts = poster_df['cluster'].value_counts().reset_index()
cluster_counts.columns = ['cluster', 'count']

# Visualize the images for the most popular clusters
top_n = 5  # Number of top clusters to visualize
for cluster in cluster_counts['cluster'].head(top_n):
    cluster_df = poster_df[poster_df['cluster'] == cluster]
    image_paths = cluster_df['image_path'].tolist()
    
    # Plot the images for this cluster
    plt.figure(figsize=(15, 10))
    plt.suptitle(f"Images for Cluster: {cluster}", fontsize=16)
    
    for i, image_path in enumerate(image_paths[:10]):  # Display up to 10 images per cluster
        try:
            img = Image.open(image_path)
            plt.subplot(2, 5, i + 1)
            plt.imshow(img)
            plt.axis('off')
        except Exception as e:
            print(f"Error loading image {image_path}: {e}")
    
    plt.tight_layout()
    plt.show()


FileNotFoundError: [Errno 2] No such file or directory: '/home/disk1/red_disk1/Multimodal_MKT/test/poster_test_fashion_clustered.csv'