In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import interact

# Load the survey results CSV
df = pd.read_csv('results-survey875738.csv')

# Keep only the columns whose name contains 'define'
text_columns = [column for column in df.columns if 'define' in column]

# Build one string per row: missing answers become empty strings, every value
# is cast to str, and the selected answers are joined with single spaces
text_data = (
    df[text_columns]
    .fillna('')
    .astype(str)
    .apply(' '.join, axis=1)
)

# Off-the-shelf sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every row's text into an embedding
embeddings = model.encode(text_data.tolist(), show_progress_bar=True)

# Elbow Method: calculate inertia for cluster counts from 2 up to max_clusters (default 5)
def plot_elbow_curve(embeddings, max_clusters=5):
    """Plot KMeans inertia against the number of clusters (elbow method).

    Fits one KMeans model (``random_state=42``) for every cluster count from
    2 through ``max_clusters`` inclusive and shows the resulting curve.

    Args:
        embeddings: Data handed to ``KMeans.fit``.
        max_clusters: Largest cluster count to try (inclusive).
    """
    candidate_counts = range(2, max_clusters + 1)

    # Fit one model per candidate count and keep only its inertia
    inertias = [
        KMeans(n_clusters=k, random_state=42).fit(embeddings).inertia_
        for k in candidate_counts
    ]

    # Draw the elbow curve
    _, ax = plt.subplots(figsize=(8, 5))
    ax.plot(candidate_counts, inertias, marker='o')
    ax.set_title('Elbow Curve for Optimal Number of Clusters')
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel('Inertia (Sum of Squared Distances)')
    ax.set_xticks(candidate_counts)
    ax.grid(True)
    plt.show()

# Plot the elbow curve (default range: 2 to 5 clusters) to help determine the
# optimal number of clusters before choosing one interactively below
plot_elbow_curve(embeddings)

# Interactive clustering and visualization function
def cluster_and_visualize(n_clusters):
    """Cluster the embeddings with KMeans, plot them in 2D, and print each cluster's texts.

    Reads the module-level ``embeddings``, ``df`` and ``text_columns``.
    As a side effect, the assigned labels are written to ``df['cluster']``.

    Args:
        n_clusters: Number of KMeans clusters to fit.
    """
    # Perform KMeans with the selected number of clusters
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(embeddings)
    df['cluster'] = labels

    # Reduce embeddings to 2D using PCA for visualization
    pca = PCA(n_components=2)
    embeddings_2d = pca.fit_transform(embeddings)

    # Create a DataFrame for plotting. The labels are assigned positionally
    # from the array (not via df['cluster']) so a non-default index on df
    # cannot misalign them with the PCA rows.
    plot_df = pd.DataFrame(embeddings_2d, columns=['PC1', 'PC2'])
    plot_df['cluster'] = labels

    # Visualize the clusters
    plt.figure(figsize=(10, 7))
    sns.scatterplot(x='PC1', y='PC2', hue='cluster', palette='viridis', data=plot_df, s=100, alpha=0.7)
    plt.title(f'KMeans Clustering with {n_clusters} Clusters')
    plt.show()

    # Display the text responses grouped by cluster
    for cluster_num in range(n_clusters):
        print(f"\n--- Texts in Cluster {cluster_num} ---")
        # Clean the answers the same way the embedded text was built
        # (missing -> '', everything cast to str); joining raw values raises
        # TypeError as soon as a row contains NaN or a non-string answer.
        cluster_rows = (
            df.loc[labels == cluster_num, text_columns]
            .fillna('')
            .astype(str)
        )
        # Plain tuples keep this safe for an empty selection as well
        for idx, answers in enumerate(cluster_rows.itertuples(index=False, name=None)):
            print(f"{idx + 1}. {' '.join(answers)}")

# Interact to select the number of clusters based on the elbow method: the
# (2, 5) tuple asks ipywidgets for an integer slider from 2 to 5, and
# cluster_and_visualize is re-run with the chosen value. The trailing ';' is
# presumably there to suppress the notebook's echo of interact's return value.
interact(cluster_and_visualize, n_clusters=(2, 5));
