In [1]:
from dash import Dash, dcc, html
from dash.dependencies import Input, Output
from dash_bootstrap_components.themes import CERULEAN
import dash_bootstrap_components as dbc
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.manifold import MDS
import pandas as pd 
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # suppress annoying kmeans future warning

df = pd.read_csv("data/pan22_features.csv")
X = df.drop(columns=["author_id", "discourse_type"])

In [9]:
mds = MDS(n_components=2, random_state=42)
embedding = mds.fit_transform(X)

kmeans = KMeans(n_clusters=9, random_state=42)
kmeans.fit(embedding)

In [18]:


kmeans_df = pd.concat([pd.DataFrame({"Component 1":embedding[:,0],
                                     "Component 2":embedding[:,1]}), 
                       pd.DataFrame({"K Cluster":kmeans.labels_})], axis=1)

fig = px.scatter(kmeans_df, 
                 x="Component 1", 
                 y="Component 2", 
                 color="K Cluster")
fig.update_layout(title="Data MDS Plot",
                  title_x=0.5)


(1047,)

# K means experiment

From the plot generated below, it is difficult to see where the elbow is. Upon inspection of the points, I've determined that after `k=9` is when the points stop seeing significant change.

In [31]:
sum_squares = []
candidate_k_values = list(range(1,20))
for k in candidate_k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X)
    sum_squares.append(kmeans.inertia_)

fig = go.Figure()
fig.add_trace(
    go.Line(
        x=candidate_k_values, 
        y=sum_squares))

fig.add_shape(
        type="line",
        x0=1, y0=max(sum_squares), x1=19, y1=min(sum_squares),
        line={"color":"grey"}
    )


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.


