In [1]:
import plotly.express as px
import plotly.graph_objects as go
from sklearn.cluster import KMeans
from sklearn.manifold import MDS
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import squareform
from scipy.spatial.distance import pdist
import pandas as pd 
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # suppress annoying kmeans future warning
warnings.simplefilter(action='ignore', category=DeprecationWarning)

scaler = StandardScaler()
data_df = pd.read_csv("data/pan22_features.csv")
X = data_df.drop(columns=["author_id", "discourse_type"])
cols = list(X.columns)

X_scaled = scaler.fit_transform(X)
X_T_scaled = scaler.fit_transform(X.values.T)

dis_matrix_df = pd.concat([
    pd.DataFrame(cols),
    pd.DataFrame(squareform(pdist(X_T_scaled, metric="correlation")))],
                       axis=1, ignore_index=True)

dis_matrix_df.set_index(0, inplace=True)
dis_matrix_df.columns = cols

dis_matrix_df.head()

Unnamed: 0_level_0,ADJ,ADP,ADV,AUX,CCONJ,DET,INTJ,NOUN,NUM,PART,...,"('was', 'VERB')","('NOUN', 'is')","('INTJ', 'I')","('VERB', 'your')","('can', 'VERB')","('NOUN', 'was')","('NOUN', 'have')","('an', 'NOUN')","('ADV', 'the')","('n’t', 'VERB')"
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ADJ,0.0,0.988814,0.771448,0.906078,1.132052,1.038133,0.835288,0.866748,1.098346,1.148486,...,1.179542,1.050419,0.97517,1.189326,1.207448,1.13716,1.199493,1.191803,1.107316,1.175719
ADP,0.988814,0.0,1.069209,1.075143,1.029357,0.793767,1.105653,0.726901,1.046739,1.168549,...,1.108848,1.092089,1.19037,1.072902,1.057144,1.103173,1.035565,1.077695,1.089118,1.207448
ADV,0.771448,1.069209,0.0,0.645924,1.018252,1.2421,0.627525,1.069976,1.292415,0.906844,...,1.391494,1.387928,1.13424,1.33621,1.277867,1.36037,1.479855,1.416454,1.328107,1.152493
AUX,0.906078,1.075143,0.645924,0.0,1.254923,1.068414,0.613003,0.908586,1.38929,0.770053,...,1.406633,1.405571,1.243355,1.417378,1.285829,1.421028,1.485825,1.464667,1.449802,1.19858
CCONJ,1.132052,1.029357,1.018252,1.254923,0.0,1.015406,1.192214,1.221713,0.940772,0.924507,...,0.713664,0.720283,0.806346,0.716796,0.682134,0.692152,0.699918,0.705725,0.691939,0.754711


In [2]:
# mds_data = MDS(n_components=2, random_state=42)
# mds_variables = MDS(n_components=2, random_state=42, dissimilarity="precomputed")
# embedding_data = mds_data.fit_transform(X_scaled)
# embedding_variables = mds_variables.fit_transform(dis_matrix_df.values, )

kmeans = KMeans(n_clusters=9, random_state=42)
kmeans.fit(X_scaled)

In [18]:

data_df_sample = data_df.sample(n=10, axis="columns")
kmeans_pcp_df = pd.concat([data_df_sample, 
                           pd.DataFrame({"K Cluster":kmeans.labels_})],
                          axis=1)
    

data_df_sample
# fig = px.parallel_coordinates(
#         kmeans_pcp_df,
#         dimensions=list(data_df_sample.columns),
#         color="K Cluster",
#     )
# fig.show()

Unnamed: 0,what,can,"('you', 'VERB')","('PRON', 'ADP')",myself,"('.', 'ADV')","('VERB', 'DET')",%,"('NOUN', 'have')",from
0,0.004098,0.012295,0.019499,0.009302,0.0,0.005571,0.020155,0.000000,0.000000,0.008197
1,0.010526,0.000000,0.013937,0.007767,0.0,0.010453,0.021359,0.000000,0.010453,0.000000
2,0.000000,0.000000,0.004695,0.005155,0.0,0.004695,0.036082,0.018868,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.015748,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.015873,0.000000,0.0,0.000000,0.017391,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
1042,0.006270,0.010972,0.000000,0.001034,0.0,0.012500,0.014470,0.003968,0.003125,0.009404
1043,0.000000,0.013514,0.017857,0.000000,0.0,0.000000,0.009302,0.000000,0.000000,0.006757
1044,0.007576,0.007576,0.009804,0.007282,0.0,0.004902,0.009709,0.000000,0.000000,0.007576
1045,0.000000,0.009259,0.010753,0.005602,0.0,0.005376,0.025210,0.000000,0.000000,0.000000


# K means experiment

From the plot generated below, it is difficult to see where the elbow is. Upon inspection of the points, I've determined that after `k=9` is when the points stop seeing significant change.

In [6]:
sum_squares = []
candidate_k_values = list(range(1,20))
for k in candidate_k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X)
    sum_squares.append(kmeans.inertia_)

fig = go.Figure()
fig.add_trace(
    go.Line(
        x=candidate_k_values, 
        y=sum_squares))

fig.add_shape(
        type="line",
        x0=1, y0=max(sum_squares), x1=19, y1=min(sum_squares),
        line={"color":"grey"}
    )