In [5]:
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.manifold import MDS
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import squareform
from scipy.spatial.distance import pdist
import pandas as pd 
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # suppress annoying kmeans future warning
warnings.simplefilter(action='ignore', category=DeprecationWarning)

# Load the feature table and keep only the numeric feature columns
# (the two identifier/label columns are dropped).
data_df = pd.read_csv("data/pan22_features.csv")
X = data_df.drop(columns=["author_id", "discourse_type"])
cols = list(X.columns)

# `scaler` stays fitted on X (rows of X as observations) so it can still be
# reused later for transform / inverse_transform on data shaped like X.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# The transposed matrix gets its own scaler instance; refitting the shared
# `scaler` here would silently leave it fitted on the wrong orientation.
# NOTE(review): StandardScaler standardises columns, so on X.T each original
# row of X (not each feature) is centred and scaled - confirm this is intended.
X_T_scaled = StandardScaler().fit_transform(X.values.T)

# Square feature-by-feature correlation-distance matrix, labelled with the
# feature names on both axes.
dis_matrix_df = pd.DataFrame(
    squareform(pdist(X_T_scaled, metric="correlation")),
    index=cols,
    columns=cols,
)

# Sanity check: confirm that this bigram feature column exists in the loaded table.
"('was', 'VERB')" in data_df.columns


True

In [2]:
# Two 2-D MDS projections:
#   * mds_data      - embeds the rows of the scaled feature matrix
#   * mds_variables - embeds the features themselves from the precomputed
#                     feature-by-feature distance matrix
mds_data = MDS(n_components=2, random_state=42)
mds_variables = MDS(n_components=2, random_state=42, dissimilarity="precomputed")
embedding_data = mds_data.fit_transform(X_scaled)
embedding_variables = mds_variables.fit_transform(dis_matrix_df.values)

# Cluster the scaled data. n_init is set explicitly because its default is
# version-dependent in scikit-learn (the source of the KMeans FutureWarning);
# pinning it keeps the clustering reproducible across library versions.
kmeans = KMeans(n_clusters=9, n_init=10, random_state=42)
kmeans.fit(X_scaled)

In [7]:
# Tabulate the 2-D MDS coordinates of every feature: start from the two
# embedding columns, then put the feature names in front.
variables_df = pd.DataFrame(
    embedding_variables[:, :2],
    columns=["Component 1", "Component 2"],
)
variables_df.insert(0, "Feature names", dis_matrix_df.columns)

variables_df

Unnamed: 0,Feature names,Component 1,Component 2
0,ADJ,0.229392,-0.866665
1,ADP,0.738875,-0.532222
2,ADV,-0.355174,-1.070422
3,AUX,-0.084888,-1.188446
4,CCONJ,-0.614177,0.125747
...,...,...,...
402,"('NOUN', 'was')",-0.121916,0.404940
403,"('NOUN', 'have')",0.060876,0.353798
404,"('an', 'NOUN')",0.061109,0.300067
405,"('ADV', 'the')",0.049445,0.243239


# K-means experiment

From the plot generated below, it is difficult to see where the elbow is. After inspecting the individual points, I determined that the sum of squares stops changing significantly after `k=9`.

In [6]:
# Elbow experiment: record the within-cluster sum of squares (inertia) for a
# range of candidate cluster counts.
candidate_k_values = list(range(1, 20))
sum_squares = []
for k in candidate_k_values:
    # Use a loop-local estimator so the `kmeans` model fitted in the earlier
    # cell is not overwritten by the last candidate of this sweep.
    candidate_model = KMeans(n_clusters=k, n_init=10, random_state=42)
    # Fit on the scaled matrix - the same input the final KMeans model uses -
    # so the elbow is read from the feature space that is actually clustered.
    # NOTE(review): this changes the curve relative to fitting on raw X;
    # re-check the chosen k against the regenerated plot.
    candidate_model.fit(X_scaled)
    sum_squares.append(candidate_model.inertia_)

fig = go.Figure()
fig.add_trace(
    # go.Line is deprecated; an explicit Scatter trace with lines and markers
    # draws the same kind of curve and keeps individual points inspectable.
    go.Scatter(
        x=candidate_k_values,
        y=sum_squares,
        mode="lines+markers"))

# Straight reference line from the largest to the smallest inertia across the
# swept k range, to help judge where the curve bends.
fig.add_shape(
        type="line",
        x0=candidate_k_values[0], y0=max(sum_squares),
        x1=candidate_k_values[-1], y1=min(sum_squares),
        line={"color":"grey"}
    )