In [None]:
import xml.etree.ElementTree as ET
import os, json, plotly, tqdm
import numpy as np
from sklearn.decomposition import PCA
import plotly.express as px
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def get_posts_or_topics(base_path=None,
                        topics=("apple", "astronomy", "ai", "3dprinting", "bioinformatics", "beer"),
                        max_posts=51):
    """Collect the leading post bodies from each Stack Exchange data dump.

    For every topic, ``<base_path>/<topic>.stackexchange.com/Posts.xml`` is
    read and the ``Body`` attribute of its first ``row`` elements (direct
    children of the document root) is collected in file order.

    Args:
        base_path: Directory that contains the ``<topic>.stackexchange.com``
            folders. When ``None`` (the default) the current user's
            ``Downloads`` folder is used; it is resolved at call time so that
            merely defining this function never touches the environment.
        topics: Iterable of site names to read.
        max_posts: Maximum number of bodies kept per topic. The default of 51
            matches the previous hard-coded cap (the loop used to stop once
            its counter exceeded 50).

    Returns:
        dict mapping each topic to a list of at most ``max_posts`` body
        strings. Rows that carry no ``Body`` attribute are skipped.

    Raises:
        OSError: if a ``Posts.xml`` file cannot be opened.
        xml.etree.ElementTree.ParseError: if malformed XML is encountered
            before enough rows have been read.
    """
    if base_path is None:
        base_path = os.path.join(os.path.expanduser("~"), "Downloads")

    posts = {}
    for topic in topics:
        print(f"WORKING ON TOPIC '{topic}'")
        xml_path = os.path.join(base_path, f"{topic}.stackexchange.com", "Posts.xml")

        bodies = []
        depth = 0  # number of currently open elements; the root sits at depth 1
        # Stream the file so parsing can stop as soon as enough rows are read,
        # instead of loading the complete dump into memory first.
        with open(xml_path, "rb") as xml_file:
            for event, elem in ET.iterparse(xml_file, events=("start", "end")):
                if len(bodies) >= max_posts:
                    break
                if event == "start":
                    depth += 1
                    continue
                depth -= 1
                # After closing a direct child of the root, depth is back to 1.
                if depth == 1 and elem.tag == "row":
                    body = elem.get("Body")
                    if body is not None:
                        bodies.append(body)
                    elem.clear()  # release the row's data once it has been read
        posts[topic] = bodies

    return posts


In [None]:
# One-off extraction step: set REBUILD_POSTS to True to re-read the Stack
# Exchange dumps via get_posts_or_topics() and overwrite posts.json.
# It is off by default, so a normal run leaves the existing posts.json untouched.
REBUILD_POSTS = False
if REBUILD_POSTS:
    posts = get_posts_or_topics()
    with open('posts.json', 'w') as file:
        json.dump(posts, file, indent=4)

In [None]:
# Read the posts back from posts.json into `posts`.
with open('posts.json', 'r') as posts_file:
    posts = json.load(posts_file)

In [None]:
# Instantiate the pre-trained sentence-embedding model by name.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

In [None]:
# Encode each topic's posts; j_embeddings maps topic -> result of model.encode.
j_embeddings = {
    topic_name: model.encode(topic_posts)
    for topic_name, topic_posts in posts.items()
}

In [None]:
# Stack the per-topic embeddings row-wise into one array, following the order
# of `topics`. A single vstack over the whole list replaces the earlier
# pairwise stacking, which re-copied the growing array on every iteration and
# required at least two topics.
# NOTE(review): "astronomy" is in get_posts_or_topics' default topics but is
# not listed here — confirm the omission is intentional.
topics = ["apple", "ai", "3dprinting", "bioinformatics", "beer"]
embeddings = np.vstack([j_embeddings[topic] for topic in topics])

In [None]:
# Build the per-row topic label vector used to colour the plots.
# Nominal per-topic cap: get_posts_or_topics stops once more than 50 rows
# have been read, i.e. after 51 posts.
num_elements_per_topic = 51
num_topics = len(topics)
colors = ["red", "blue", "green", "cyan", "magenta", "black", "yellow"][:num_topics]
# Repeat each topic once per embedding it actually contributed, so the labels
# stay aligned with the rows of `embeddings` even when a topic has fewer
# posts than the nominal cap.
col_topics = [
    topic
    for topic in topics
    for _ in range(len(j_embeddings[topic]))
]
assert len(col_topics) == embeddings.shape[0], "topic labels are not aligned with embeddings"

In [None]:
# Reduce the embeddings to their first three principal components.
pca = PCA(n_components=3)
reduced_data = pca.fit_transform(embeddings)

# One row per embedding: the three component scores plus its topic label.
df = pd.DataFrame(reduced_data, columns=['PC1', 'PC2', 'PC3']).assign(labels=col_topics)

# Interactive 3-D scatter coloured by topic, with bare axes and no legend.
fig = px.scatter_3d(df, x='PC1', y='PC2', z='PC3', color='labels')
fig.update_traces(marker=dict(size=5))

blank_axis = dict(showticklabels=False, title='')
fig.update_layout(
    scene=dict(
        xaxis=dict(blank_axis),
        yaxis=dict(blank_axis),
        zaxis=dict(blank_axis),
    ),
    showlegend=False,
)
fig.show()

In [None]:
# Project the embeddings to two dimensions with t-SNE (fixed random_state).
tsne_model = TSNE(n_components=2, random_state=42)
tsne_embeddings_values = tsne_model.fit_transform(embeddings)

# Scatter the two t-SNE coordinates, coloured by topic label.
tsne_x = tsne_embeddings_values[:, 0]
tsne_y = tsne_embeddings_values[:, 1]
fig = px.scatter(x=tsne_x, y=tsne_y, color=col_topics)

# Bare axes and no legend, matching the PCA figure.
blank_axis = dict(showticklabels=False, title='')
fig.update_layout(
    xaxis=dict(blank_axis),
    yaxis=dict(blank_axis),
    showlegend=False,
)
fig.show()

In [None]:
# Sweep the number of K-means clusters and record the cosine silhouette score
# of each clustering.
# NOTE(review): the clustering itself is fitted by KMeans while the score uses
# metric='cosine' — confirm that scoring with a different metric is intended.
silhouette_scores = []
kmeans_labels = {}  # k -> cluster label assigned to each row of `embeddings`
for k in tqdm.tqdm(range(2, 12)):
    kmeans = KMeans(n_clusters=k,
                    random_state=42,
                    n_init='auto').fit(embeddings)
    kmeans_labels[k] = kmeans.labels_
    silhouette_scores.append(
        {
            'k': k,
            'silhouette_score': silhouette_score(embeddings,
                                                 kmeans_labels[k],
                                                 metric='cosine'),
        }
    )

# Line chart of score versus k; the axis label spelling is corrected from
# 'silhoutte' to 'silhouette'.
fig = px.line(pd.DataFrame(silhouette_scores).set_index('k'),
              title='<b>Silhouette scores for K-means clustering</b>',
              labels={'value': 'silhouette score'},
              color_discrete_sequence=plotly.colors.qualitative.Alphabet)
# Kept as the cell's last expression so the notebook renders the figure.
fig.update_layout(showlegend=False)

