In [1]:
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # suppress annoying kmeans future warning 

# Read the feature table; index_col=None keeps pandas' default integer index.
df = pd.read_csv("data/pan22_features.csv", index_col=None)

# Author ids are kept separately as Y.
Y = df["author_id"].values

# Everything except the two non-feature columns becomes the raw matrix X.
X = df.drop(["author_id", "discourse_type"], axis="columns").values

# Standardise each column of X; the fitted scaler stays available as `scaler`.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# n_components experiments

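The following cell searches for the number of PCA components (`n_components`) best suited to this data: it adds components one at a time until the explained variance of the last component falls below a threshold. For this data, that number turns out to be 7.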

In [5]:
# Smallest explained variance a component may have before the search stops.
THRESHOLD = 5.5

# PCA cannot extract more components than min(n_samples, n_features), so the
# search is bounded by the data shape; the previous cap of 99 is kept as well.
max_components = min(99, *X_scaled.shape)

# Add one component at a time until the weakest (last) component explains less
# variance than THRESHOLD. After the loop `pca` is the model fitted with that
# number of components and `pca_features` is the matching projection.
for n in range(1, max_components + 1):
    pca = PCA(n_components=n)
    pca_features = pca.fit_transform(X_scaled)

    if pca.explained_variance_[-1] < THRESHOLD:
        break

# The last loop iteration already fitted `pca` on X_scaled, so its projection
# is reused rather than fitting the identical model a second time.
X_reduced = pca_features

In [6]:
# Explained variance of each component of the fitted `pca`, one row per component.
# A dedicated name is used so the feature table held in `df` is not overwritten.
explained_variance_df = pd.DataFrame({
    # Sized from the fitted attribute so both columns always have equal length.
    "Number of components": range(1, len(pca.explained_variance_) + 1),
    "Explained variance": pca.explained_variance_,
})

# Last expression of the cell: the line chart is rendered as the cell output.
px.line(explained_variance_df, x="Number of components", y="Explained variance")

In [2]:
# Scratch check: order the rows of a small matrix by their row sums.
test = np.array([
    [1, 2, 3, 4],
    [0.3, 0.5, 1, 2],
    [0.45, 0.2, 1, 6],
])

# Rows as plain Python lists, still in their original order at this point.
sorted_arr = [list(row) for row in test]

# Ascending by row total; as the last expression this is what the cell displays.
sorted(sorted_arr, key=sum)


[[0.3, 0.5, 1.0, 2.0], [0.45, 0.2, 1.0, 6.0], [1.0, 2.0, 3.0, 4.0]]