# Basic TF-IDF and K-means clustering
 - Step 1: Preprocess the Text
 - Step 2: Apply K-means Clustering
 - Step 3: Evaluate and Visualize the Clusters
 - Step 4: Save the Results

### Summary
- TF-IDF Vectorization: Converts the `fashion_text` column into numerical feature vectors.
- K-means Clustering: Groups similar texts together.
- PCA Visualization: Optionally visualizes the clusters in 2D space.
- Saving Results: Saves the DataFrame with the cluster assignments.

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the data
df = pd.read_csv('/home/disk1/red_disk1/Multimodal_MKT/post_filtered.csv')

# Drop rows without text from the DataFrame itself (not only from the
# vectorizer input) so that X has exactly one row per DataFrame row.
# Otherwise assigning per-row cluster labels back onto df fails with a
# length mismatch whenever 'fashion_text' contains missing values.
df = df.dropna(subset=['fashion_text']).reset_index(drop=True)

# Convert the 'fashion_text' column into TF-IDF vectors
# (vocabulary capped at 1000 features, English stop words removed)
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(df['fashion_text'])


In [None]:
from sklearn.cluster import KMeans

# How many groups to partition the texts into (tune to your needs)
num_clusters = 5

# Fit K-means on the TF-IDF matrix with a fixed seed, then record the
# cluster label assigned to every row
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X)
df['cluster'] = kmeans.labels_


In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Reduce the dimensionality for visualization.
# random_state is pinned (same seed as K-means) so the 2D projection is
# reproducible between runs even if scikit-learn picks a randomized solver.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X.toarray())  # densify the sparse TF-IDF matrix for PCA

# Plot the clusters: one point per text, colored by its cluster label
plt.figure(figsize=(10, 7))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=df['cluster'], cmap='viridis')
plt.title('Clusters of Fashion-Related Texts')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.colorbar(label='Cluster')
plt.show()


In [None]:
# Write the DataFrame, including the 'cluster' column, to CSV without the index
output_path = '/home/disk1/red_disk1/Multimodal_MKT/post_filtered_with_clusters0.csv'
df.to_csv(output_path, index=False)


# Advanced clustering
- Step 1: Extract Keywords with RAKE
- Step 2: Generate Embeddings with SBERT
- Step 3: Apply UMAP for Dimensionality Reduction
- Step 4: Apply Clustering (K-means or HDBSCAN)
- Step 5: Visualize the Clusters
- Step 6: Save the Results

### Summary
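- RAKE extracts key phrases from each text and stores them in the `keywords` column (note: the SBERT embeddings below are computed from the full `fashion_text`, not from these keywords).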
- SBERT captures the semantic meaning of the text.
- UMAP reduces the dimensionality for better visualization and clustering.
- HDBSCAN can find clusters of varying densities, which can be more effective than K-means in some cases.

In [None]:
import pandas as pd
from rake_nltk import Rake

# Load the data
df = pd.read_csv('/home/disk1/red_disk1/Multimodal_MKT/post_filtered.csv')

# Initialize RAKE
rake = Rake()


def extract_keywords(text):
    """Return the RAKE key phrases of ``text`` joined by spaces.

    ``Rake.extract_keywords_from_text`` only updates the extractor's internal
    state and returns ``None``; the phrases have to be read back afterwards
    with ``get_ranked_phrases``. Non-string values (e.g. NaN for missing
    text) yield an empty string instead of raising.
    """
    if not isinstance(text, str):
        return ''
    rake.extract_keywords_from_text(text)
    return ' '.join(rake.get_ranked_phrases())


# Extract keywords for each text
df['keywords'] = df['fashion_text'].apply(extract_keywords)


In [None]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained SBERT model
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Generate embeddings for the text.
# All texts are encoded in a single call so the model can batch them
# internally, instead of running one forward pass per row.
# Missing values are encoded as the empty string rather than crashing.
texts = df['fashion_text'].fillna('').astype(str).tolist()
embeddings = model.encode(texts)

# Store one embedding vector per row (each cell holds a 1-D array)
df['embeddings'] = list(embeddings)


In [None]:
import umap

# Collect the per-row embedding vectors into a plain list for UMAP
embeddings_matrix = df['embeddings'].tolist()

# Project the embeddings down to 2 components with UMAP,
# using cosine distance and a neighborhood size of 15
umap_params = {'n_neighbors': 15, 'n_components': 2, 'metric': 'cosine'}
umap_model = umap.UMAP(**umap_params)
reduced_embeddings = umap_model.fit_transform(embeddings_matrix)


In [None]:
from sklearn.cluster import KMeans
import hdbscan

# Apply K-means clustering on the UMAP-reduced embeddings.
# The labels are kept in their own column so they are not lost when the
# HDBSCAN labels are written to 'cluster' below.
num_clusters = 5
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
df['cluster_kmeans'] = kmeans.fit_predict(reduced_embeddings)

# Apply HDBSCAN on the same reduced embeddings.
# 'cluster' holds the HDBSCAN labels (points HDBSCAN treats as noise are
# labeled -1, per the hdbscan documentation). To use K-means as the primary
# assignment instead, set df['cluster'] = df['cluster_kmeans'].
clusterer = hdbscan.HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom')
df['cluster'] = clusterer.fit_predict(reduced_embeddings)


In [None]:
import matplotlib.pyplot as plt

# Scatter the 2D UMAP projection, coloring each point by its cluster label
fig, ax = plt.subplots(figsize=(10, 7))
points = ax.scatter(
    reduced_embeddings[:, 0],
    reduced_embeddings[:, 1],
    c=df['cluster'],
    cmap='viridis',
)
ax.set_title('Clusters of Fashion-Related Texts')
ax.set_xlabel('UMAP Component 1')
ax.set_ylabel('UMAP Component 2')
fig.colorbar(points, ax=ax, label='Cluster')
plt.show()


In [None]:
# Write the DataFrame, including the 'cluster' column, to CSV without the index.
# NOTE(review): df also carries the per-row 'embeddings' vectors at this point,
# which to_csv will serialize as text — confirm that is intended.
output_path = '/home/disk1/red_disk1/Multimodal_MKT/post_filtered_with_clusters.csv'
df.to_csv(output_path, index=False)
