In [69]:
# external imports
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# internal imports
# NOTE(review): the star import hides which names come from config. This
# notebook uses at least MODEL_NAME and FIXED_POINT_TOL, presumably both
# defined there — confirm, and consider importing them explicitly.
from config import *
# Echo the model name; it selects the CSV files read and written below.
print(MODEL_NAME)

sin_2d_200_points_sgd_0.01_last


In [71]:
# load data
# The input file is selected by MODEL_NAME (presumably provided by config).
df = pd.read_csv(f'data/{MODEL_NAME}_fixed_300epochs.csv')
# Show (rows, columns) as a sanity check on the load; an assignment alone
# produces no cell output.
df.shape

(4493, 6)


In [76]:
# analyze data

# set random state
random_seed = 210
np.random.seed(random_seed)


def _rank_by_centroid_sum(centers, labels):
    """Renumber cluster labels so that 0 is the centroid with the largest x + y.

    Parameters
    ----------
    centers : ndarray of shape (n_clusters, 2)
        Cluster centres; row ``i`` belongs to raw label ``i``.
    labels : ndarray of int
        Raw cluster label of every point.

    Returns
    -------
    ndarray of int
        Labels remapped to the rank of their centre (descending coordinate sum).
    """
    order = np.argsort(centers.sum(axis=1))[::-1]
    lut = np.zeros_like(order)
    lut[order] = np.arange(len(order))
    return lut[labels]


def _label_fixed_points(features, tol, seed):
    """Return one integer 'fixed point' label per row of ``features``.

    Parameters
    ----------
    features : DataFrame with columns 'x' and 'y'
        Points of a single (epoch, batch) group.
    tol : float
        Groups whose summed x/y variance is below ``tol`` are a single fixed
        point; the same value is passed to KMeans as its ``tol``.
    seed : int
        Random state for KMeans and silhouette_score.

    Returns
    -------
    ndarray of int, length ``len(features)``
        Label 0 is the cluster whose centre has the largest x + y.
    """
    n_points = len(features)
    spread = features['x'].var() + features['y'].var()

    # A single row has NaN variance, so it would fail the tolerance test and
    # fall through to clustering; it is trivially one fixed point.
    if n_points < 2 or spread < tol:
        return np.zeros(n_points, dtype=np.int64)

    # The silhouette score needs 2 <= n_clusters <= n_points - 1, which has no
    # solution for two points. Two points that are further apart than the
    # tolerance are treated as two separate fixed points.
    if n_points < 3:
        return _rank_by_centroid_sum(features.to_numpy(dtype=float),
                                     np.arange(n_points))

    min_clusters = 2
    max_clusters = n_points - 1
    silhouette_scores = []
    fits = []
    for n_clusters in range(min_clusters, max_clusters + 1):
        kmeans = KMeans(n_clusters=n_clusters, random_state=seed, tol=tol)
        kmeans.fit(features)
        silhouette_scores.append(
            silhouette_score(features, kmeans.labels_, random_state=seed))
        # Keep the fit itself so the winner does not have to be refitted.
        fits.append((kmeans.cluster_centers_, kmeans.labels_))

    # First maximum wins, i.e. the smallest cluster count among ties.
    centers, labels = fits[silhouette_scores.index(max(silhouette_scores))]
    return _rank_by_centroid_sum(centers, labels)


grouped_df = df.groupby(['epoch', 'batch'])

# assign() returns a new frame, so the groupby slices are never written to
# (avoids SettingWithCopyWarning).
clustered_groups = [
    group_df.assign(**{'fixed point': _label_fixed_points(
        group_df[['x', 'y']], FIXED_POINT_TOL, random_seed)})
    for _, group_df in grouped_df
]

df_with_clusters = pd.concat(clustered_groups)
fixed_points = df_with_clusters.groupby(['epoch',
                                         'batch',
                                         'fixed point']).agg(
                                             {'x': ['mean', 'var'],
                                              'y': ['mean', 'var']}).reset_index()
# Flatten the two-level (column, statistic) header produced by agg().
fixed_points.columns = ['epoch', 'batch', 'fixed point', 'x_mean', 'x_var', 'y_mean', 'y_var']
fixed_points.to_csv(f'data/analyzed_{MODEL_NAME}_fixed_300epochs.csv')