In [1]:
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # suppress annoying kmeans future warning 

# Read the feature table; index_col=None keeps pandas' default integer index.
df = pd.read_csv("data/pan22_features.csv", index_col=None)

# The author ids are set aside as Y.
Y = df["author_id"].values

# Feature matrix: every column except the two dropped here.
X = df.drop(["author_id", "discourse_type"], axis=1).values

# Standardise the feature columns; the fitted scaler is kept for later cells.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# n_components experiments

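The following cell chooses the number of PCA components (`n_components`) for this data: components are added in order of decreasing explained variance until the most recently added one falls below a threshold of 5.5. For this data the result is 7 components.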

In [2]:
# Choose the number of principal components and project the scaled features.
#
# Selection rule (unchanged from the earlier loop): keep components in order of
# decreasing explained variance, up to AND including the first component whose
# explained variance is below THRESHOLD.
#
# X_scaled comes from the first cell; it is not recomputed here because
# re-fitting the scaler on the same X gives the same array.
THRESHOLD = 5.5
MAX_COMPONENTS = 99  # same upper bound as the former range(1, 100) search

# PCA cannot return more components than min(n_samples, n_features); asking for
# more raises ValueError, so the search bound is capped to what the data allows.
max_components = min(MAX_COMPONENTS, *X_scaled.shape)

# The explained variance of the leading components does not depend on how many
# components are requested, so one fit yields the whole spectrum. The exact
# ("full") solver is requested so the result is deterministic; the default
# solver may pick a randomized method that needs a seed to be reproducible.
variance_probe = PCA(n_components=max_components, svd_solver="full").fit(X_scaled)
below_threshold = np.flatnonzero(variance_probe.explained_variance_ < THRESHOLD)

if below_threshold.size:
    # +1 converts the 0-based index of the first sub-threshold component to a count.
    n = int(below_threshold[0]) + 1
else:
    # Previously this case fell through silently with the last n tried.
    warnings.warn(
        f"No component has explained variance below {THRESHOLD}; "
        f"using all {max_components} candidate components."
    )
    n = max_components

# Final model with exactly n components. `pca`, `n`, `pca_features` and
# `X_reduced` are all kept so later cells that use any of them keep working.
pca = PCA(n_components=n, svd_solver="full")
pca_features = pca.fit_transform(X_scaled)
X_reduced = pca_features

In [24]:
# Cluster the PCA-reduced features with K-means and plot the first two components.
#
# n_init is pinned explicitly: its default changed between scikit-learn releases
# (this is what the suppressed KMeans FutureWarning is about), so relying on the
# default makes the clustering depend on the installed version.
# NOTE(review): 10 is the historical default — confirm it matches the run that
# produced the saved figure.
model = KMeans(n_clusters=4, n_init=10, random_state=1)
model.fit(X_reduced)

# Name the component columns PC1..PCk from the actual width of X_reduced, so the
# labels stay correct if the number of selected components is not 7.
pc_names = [f"PC{i}" for i in range(1, X_reduced.shape[1] + 1)]
pca_data_df = pd.DataFrame(X_reduced, columns=pc_names)
kmeans_cluster_labels = pd.DataFrame({"K Cluster": model.labels_})

# Column-wise concat aligns on the row index; all three frames use the default
# integer index, so rows line up one-to-one.
kmeans_pca_df = pd.concat([df, pca_data_df, kmeans_cluster_labels], axis=1)
px.scatter(
    kmeans_pca_df,
    x="PC1",
    y="PC2",
    color="K Cluster",
    title="K-means clusters on the first two principal components",
)
