In [1]:
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd 

# Read the feature table; `author_id` and `discourse_type` are excluded from
# the model inputs, and `author_id` is kept separately as the target vector.
df = pd.read_csv("data/pan22_features.csv", index_col=None)

non_feature_columns = ["author_id", "discourse_type"]
Y = df["author_id"].values
X = df.drop(columns=non_feature_columns).values

# Standardise every input column with a freshly fitted StandardScaler so that
# the later PCA step is not dominated by columns with large raw scales.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# n_components experiments

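The following cell determines the number of PCA components (`n_components`) best suited to this data: it selects the smallest `n_components` for which the last retained component's explained variance falls below `THRESHOLD`. For this dataset, that number turns out to be 7.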

In [2]:
# Recompute the scaled matrix so this cell can be re-run on its own.
X_scaled = scaler.fit_transform(X)

# A component is "too weak" once its explained variance drops below this value.
THRESHOLD = 5.5
# Largest number of components the search will ever consider.
MAX_COMPONENTS = 99

# PCA cannot return more components than min(n_samples, n_features); asking
# for more raises a ValueError, so cap the search range up front.
max_components = min(MAX_COMPONENTS, *X_scaled.shape)

# The explained variance of the leading components does not depend on how
# many components are requested, so a single full fit replaces refitting PCA
# once per candidate n.
variance_spectrum = PCA().fit(X_scaled).explained_variance_[:max_components]
below_threshold = np.flatnonzero(variance_spectrum < THRESHOLD)

if below_threshold.size:
    # Smallest n whose last (n-th) component falls below THRESHOLD.
    n = int(below_threshold[0]) + 1
else:
    # No component is below THRESHOLD within the search range: keep the
    # largest allowed size and say so instead of failing or passing silently.
    n = max_components
    print(f"No component below THRESHOLD={THRESHOLD}; using n_components={n}")

# Final model and projection used by the cells that follow.
pca = PCA(n_components=n)
pca_features = pca.fit_transform(X_scaled)
    

In [16]:
import warnings
# Silence FutureWarning output. This filter is process-wide, so it also
# applies to every cell executed after this one.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Elbow method: fit KMeans on the PCA projection for each candidate K and
# record the within-cluster sum of squares (the model's `inertia_`).
within_cluster_sum_squares = []
candidate_k_values = range(1, 16)

for k in candidate_k_values:
    # n_init is pinned to 10 restarts. Relying on the default is what triggers
    # the KMeans FutureWarning, and the default differs between scikit-learn
    # releases, so leaving it implicit can silently change the curve.
    model = KMeans(n_clusters=k, n_init=10, random_state=1)
    model.fit(pca_features)
    within_cluster_sum_squares.append(model.inertia_)

# One row per K, ready for plotting.
kmeans_df = pd.DataFrame({"K": list(candidate_k_values),
                          "Within Cluster Sum of Squares (WCSS)": within_cluster_sum_squares})

fig = px.line(kmeans_df,
        x="K",
        y="Within Cluster Sum of Squares (WCSS)",
        markers=True,
        title="PCA WCSS Error per K cluster")
fig.update_layout(title_x=0.5)  # centre the title
fig.show()