In [12]:
from sklearn.mixture import GaussianMixture
from sklearn.metrics import davies_bouldin_score
from scipy.spatial.distance import cdist
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.spatial import ConvexHull
import matplotlib.patches as patches

1) Find the optimal clustering.
2) Load the results into Label Studio and qualitatively identify the senses.
3) Attribute senses to the replies.
4) Get the thread content. (I'm not sure how big a deal this is, because I don't know how many instances are only 1-level replies, but we need it anyway.)

In [24]:
# Short captions for each embedding context. process_word_embeddings matches
# these keys as substrings of the CSV file name to pick a panel title.
# NOTE: the following cell rebinds `contextinfodict` to longer descriptions;
# whichever of the two cells was executed last is the one in effect.
contextinfodict = {
    "345": "Full 'dei' /pol/ threads, February - August 2024",
    "567embeds": "Full 'dei' /pol/ threads, June- October 2024",
    "allembeds": "Full threads from /pol/ with 'dei', September 2022 - October 2024",
    "dv_pol_": "All of /pol/ in Nov 2019",
    "DEIpol": "Text with 'dei' on /pol/, September 2022 - October 2024",
    "rando2embeds": "Third of randomly shuffled full threads, September 2022- October 2024",
    "28_twtembs": "U.S Election Tweets posted August 26-28 2024",
}

In [18]:
# Long-form descriptions for each embedding context. process_word_embeddings
# matches these keys as substrings of the CSV file name to pick a panel title.
# NOTE: the preceding cell binds `contextinfodict` to shorter captions for the
# same keys; whichever of the two cells was executed last is the one in effect.
contextinfodict = {
    "345": "Every parent post that contained 'dei', and children comments, from /pol/ between February and August 2024",
    "567embeds": "Every parent post that contained 'dei', and children comments, from /pol/ between June and October 2024",
    "allembeds": "Every parent post that contained 'dei', and children comments, from /pol/ between September 2022 and October 2024",
    "dv_pol_": "Random selection of posts/comments posted on all of /pol in November 2019; collected from Papasavva et al. 2020",
    "DEIpol": "Only posts/comments that contained 'dei', from September 2022 to October 2024",
    "rando2embeds": "A randomly shuffled third of the all parent+ children text corpus",
    "28_twtembs": "5.8million tweets related to the US 2024 Elections posted from August 26-28 2024, designated and collected by Balasubramanian et al. 2024",
}

In [67]:
def _format_metric(value):
    """Render a metric with two decimals, or 'N/A' when it was never computed."""
    return 'N/A' if value is None else f'{value:.2f}'


def _best_k(scores_by_k, pick, fallback):
    """Return the k chosen by ``pick`` (``max`` or ``min``) over ``scores_by_k``; ``fallback`` if empty."""
    if not scores_by_k:
        return fallback
    # Ties resolve to the smallest k because the dict is filled in increasing-k order.
    return pick(scores_by_k, key=scores_by_k.get)


def _evaluate_cluster_counts(points, min_clusters, max_clusters, n_references=10, seed=123):
    """Score every k in [min_clusters, max_clusters] on the 2-D ``points`` frame.

    Computes silhouette, Davies-Bouldin and a gap statistic from KMeans fits and
    BIC from GaussianMixture fits, then picks the best k per criterion.

    Returns a dict with keys ``k_gap``, ``k_silhouette``, ``k_davies_bouldin``,
    ``k_gmm`` (chosen cluster counts) and ``silhouette``, ``davies_bouldin``,
    ``bic`` (the winning scores, or None when a metric could not be computed).
    """
    silhouette_by_k, db_by_k, bic_by_k, gap_by_k = {}, {}, {}, {}

    # A local seeded generator keeps the gap-statistic reference sets reproducible
    # without touching NumPy's global random state.
    rng = np.random.default_rng(seed)
    lower = points.min().to_numpy()
    upper = points.max().to_numpy()

    for k in range(min_clusters, max_clusters + 1):
        kmeans = KMeans(n_clusters=k, random_state=seed)
        labels = kmeans.fit_predict(points)

        gmm = GaussianMixture(n_components=k, random_state=seed).fit(points)
        bic_by_k[k] = gmm.bic(points)

        if k < 2:
            # Silhouette, Davies-Bouldin and the gap statistic are only evaluated for k >= 2.
            continue

        silhouette_by_k[k] = silhouette_score(points, labels)
        db_by_k[k] = davies_bouldin_score(points, labels)

        # Gap statistic: mean of log(inertia) over uniform reference sets drawn in the
        # bounding box of the data, minus log(inertia) of the real data.
        reference_log_inertia = []
        for _ in range(n_references):
            reference = rng.uniform(low=lower, high=upper, size=points.shape)
            reference_fit = KMeans(n_clusters=k, random_state=seed).fit(reference)
            reference_log_inertia.append(np.log(reference_fit.inertia_))
        gap_by_k[k] = np.mean(reference_log_inertia) - np.log(kmeans.inertia_)

    return {
        'k_gap': _best_k(gap_by_k, max, min_clusters),
        'k_silhouette': _best_k(silhouette_by_k, max, min_clusters),
        'k_davies_bouldin': _best_k(db_by_k, min, min_clusters),
        'k_gmm': _best_k(bic_by_k, min, min_clusters),
        'silhouette': float(max(silhouette_by_k.values())) if silhouette_by_k else None,
        'davies_bouldin': float(min(db_by_k.values())) if db_by_k else None,
        'bic': float(min(bic_by_k.values())) if bic_by_k else None,
    }


def process_word_embeddings(parent_folder, contextinfodict, min_clusters=2, max_clusters=6, tsne_perplexity=30):
    """Cluster the 2-D embeddings of every word folder and plot one panel per context.

    For each sub-directory of ``parent_folder`` (one per word), every ``*.csv`` file
    that has ``Dim1`` and ``Dim2`` columns is treated as one context. The number of
    clusters is selected by four criteria (gap statistic, silhouette,
    Davies-Bouldin, GMM BIC); the resulting labels are written next to the input as
    ``<context>_clustering_results.csv`` and a figure with one scatter panel per
    context is saved as ``<word>_clustersv8.png`` in the word folder.

    Parameters
    ----------
    parent_folder : str
        Directory containing one sub-directory per word.
    contextinfodict : dict
        Maps a substring of the context file name to a human-readable panel title;
        the first key found in the file name wins, otherwise the file name is used.
    min_clusters, max_clusters : int
        Inclusive range of cluster counts to evaluate.
    tsne_perplexity : int
        Unused by this function; kept so existing callers keep working.

    Returns
    -------
    dict
        ``{word_folder: [stats, ...]}`` with one stats dict per processed context
        (keys: ``context``, ``n_points``, ``k_gap``, ``k_silhouette``,
        ``k_davies_bouldin``, ``k_gmm``, ``silhouette``, ``davies_bouldin``, ``bic``).
        Word folders without usable context files map to an empty list.
    """
    results_suffix = '_clustering_results.csv'
    word_stats = {}

    for word_folder in os.listdir(parent_folder):
        word_path = os.path.join(parent_folder, word_folder)
        if not os.path.isdir(word_path):
            continue
        word_stats[word_folder] = []

        # Skip the CSVs this function writes itself, otherwise a second run would
        # re-cluster its own output and add spurious panels.
        context_files = sorted(
            f for f in os.listdir(word_path)
            if f.endswith('.csv') and not f.endswith(results_suffix)
        )
        if not context_files:
            # plt.subplots cannot create a zero-column grid.
            continue

        # squeeze=False always yields a 2-D array, so a single context needs no special case.
        tsne_fig, axes_grid = plt.subplots(
            1, len(context_files), figsize=(6 * len(context_files), 6), squeeze=False
        )
        try:
            for ax, context_file in zip(axes_grid[0], context_files):
                context_name = os.path.splitext(context_file)[0]
                data = pd.read_csv(os.path.join(word_path, context_file))
                if 'Dim1' not in data.columns or 'Dim2' not in data.columns:
                    continue

                points = data[['Dim1', 'Dim2']]
                # Silhouette / Davies-Bouldin need at most n_samples - 1 clusters.
                upper_k = min(max_clusters, len(points) - 1)
                if upper_k < min_clusters:
                    # Too few rows to evaluate any requested k; leave the panel empty.
                    continue

                selection = _evaluate_cluster_counts(points, min_clusters, upper_k)
                optimal_k_gap = selection['k_gap']
                optimal_k_silhouette = selection['k_silhouette']
                optimal_k_davies_bouldin = selection['k_davies_bouldin']
                optimal_k_gmm = selection['k_gmm']

                # Add cluster assignments for each method to the DataFrame.
                data['gap_cluster'] = KMeans(n_clusters=optimal_k_gap, random_state=123).fit_predict(points)
                data['silhouette_cluster'] = KMeans(n_clusters=optimal_k_silhouette, random_state=123).fit_predict(points)
                data['davies_bouldin_cluster'] = KMeans(n_clusters=optimal_k_davies_bouldin, random_state=123).fit_predict(points)
                data['gmm_cluster'] = GaussianMixture(n_components=optimal_k_gmm, random_state=123).fit_predict(points)

                # Save to CSV.
                output_csv_path = os.path.join(word_path, f"{context_name}_clustering_results.csv")
                data.to_csv(output_csv_path, index=False)

                sns.scatterplot(
                    data=data, x='Dim1', y='Dim2', hue='gap_cluster', palette='icefire', ax=ax, s=50, alpha=1
                )
                handles, labels = ax.get_legend_handles_labels()
                ax.legend(handles=handles, labels=labels, title='gap clusters', fontsize=8)

                # Shade the convex hull of every cluster found by the other criteria,
                # using the labels stored above so the "GMM" hulls really come from the GMM.
                for column, optimal_k, color, label in [
                    ('silhouette_cluster', optimal_k_silhouette, 'lightgrey', 'Silhouette'),
                    ('davies_bouldin_cluster', optimal_k_davies_bouldin, 'slategray', 'Davies-Bouldin'),
                    ('gmm_cluster', optimal_k_gmm, 'dimgray', 'GMM'),
                ]:
                    method_labels = data[column].to_numpy()
                    for cluster in range(optimal_k):
                        cluster_points = points[method_labels == cluster].to_numpy()
                        if len(cluster_points) > 2:
                            hull = ConvexHull(cluster_points)
                            polygon = patches.Polygon(
                                cluster_points[hull.vertices],
                                edgecolor=color, facecolor=color,
                                alpha=0.1, label=f"{label} (k={optimal_k})", zorder=1,
                            )
                            ax.add_patch(polygon)

                description = next((v for k, v in contextinfodict.items() if k in context_name), context_name)
                ax.set_title(
                    f"{description} | "
                    f"n = {len(data)} \n ",
                    fontsize=10, ha='center', va='top'
                )
                ax.text(
                    0.5, -0.15,
                    f"clustering metrics: Gap = {optimal_k_gap}, Silhouette = {optimal_k_silhouette} (Score: {_format_metric(selection['silhouette'])}),"
                    f"Davies-Bouldin = {optimal_k_davies_bouldin} (Score: {_format_metric(selection['davies_bouldin'])}),\n "
                    f"GMM = {optimal_k_gmm} (BIC: {_format_metric(selection['bic'])})",
                    fontsize=8, ha='center', va='top', transform=ax.transAxes
                )

                word_stats[word_folder].append(
                    {'context': context_name, 'n_points': len(data), **selection}
                )

            tsne_fig.suptitle(f"tSNE visualization for {word_folder}", fontsize=16, fontweight='bold')
            # Operate on the figure object itself rather than pyplot's "current figure".
            tsne_fig.tight_layout(rect=[0, 0, 1, 0.95])
            tsne_fig.savefig(os.path.join(word_path, f"{word_folder}_clustersv8.png"))
        finally:
            # Always release the figure, even if a context fails part-way through.
            plt.close(tsne_fig)
    return word_stats

In [69]:
# Root directory that holds one sub-folder per word.
# NOTE(review): hard-coded absolute Windows path; adjust before running on another machine.
parent_folder = 'C:\\Users\\emzou\\Desktop\\luffy'

# Cluster every word folder, searching k over 2..6, and keep the returned per-word stats.
results = process_word_embeddings(
    parent_folder,
    contextinfodict,
    min_clusters=2,
    max_clusters=6,
    tsne_perplexity=30,
)

In [None]:
def add_cluster_legends(tsne_fig, ax):
    """Attach the two legends sketched for the cluster figures.

    Adds a figure-level legend naming the alternative clustering criteria and an
    axes-level legend for the gap-statistic clusters already drawn on ``ax``.

    The original scratch cell used ``tsne_fig`` and ``ax`` at module level, where
    they do not exist (both are locals of ``process_word_embeddings``), and its
    mixed indentation was a syntax error; taking them as parameters makes the
    snippet runnable.

    NOTE(review): the patch colours here (blue / red / yellow) differ from the hull
    colours used in ``process_word_embeddings`` (lightgrey / slategray / dimgray) —
    confirm which palette is intended before wiring this in.
    """
    method_handles = [
        patches.Patch(color='blue', alpha=0.2, label='Silhouette'),
        patches.Patch(color='red', alpha=0.2, label='Davies-Bouldin'),
        patches.Patch(color='yellow', alpha=0.2, label='GMM'),
    ]
    tsne_fig.legend(
        handles=method_handles, loc='upper left', bbox_to_anchor=(0.2, 0.95),
        ncol=3, title='other clustering methods', fontsize=8, title_fontsize=10
    )

    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles=handles, labels=labels, title='gap clusters', loc='upper right', fontsize=8)