In [26]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
import utils
import sklearn.cluster
import sklearn.mixture

In [2]:
# Read every "combined_*.csv" found in ../data (resolved against the current
# working directory) into a dict keyed by the file name without its extension.
# The first CSV column is used as the DataFrame index.
DATA_PATH = Path.cwd() / "../data"
data = {}
for csv_file in DATA_PATH.glob("combined_*.csv"):
    data[csv_file.stem] = pd.read_csv(csv_file, index_col=0)
print(list(data))

['combined_metrics_finished_edges', 'combined_metrics_finished_paths', 'combined_metrics_unfinished_edges', 'combined_metrics_unfinished_paths']


In [3]:
# Columns fed to the clustering step, as declared in the project's utils module.
clustering_cols = utils.FEATURES_COLS_USED_FOR_CLUSTERING

# Replace the index read from the CSVs with a fresh 0..n-1 range.
features_finished_paths = data["combined_metrics_finished_paths"].reset_index(drop=True)
features_unfinished_paths = data["combined_metrics_unfinished_paths"].reset_index(drop=True)

# normalize_features is called once per frame, i.e. finished and unfinished
# paths are normalized separately (the exact scheme lives in utils).
features_finished_paths_normalized = utils.normalize_features(
    features_finished_paths[clustering_cols]
)
features_unfinished_paths_normalized = utils.normalize_features(
    features_unfinished_paths[clustering_cols]
)

In [4]:
# Stack the finished and unfinished feature frames row-wise and take the raw
# values as the feature matrix.
combined_df = pd.concat(
    [features_finished_paths_normalized, features_unfinished_paths_normalized],
    axis=0,
)
X = combined_df.values

# Report how many NaNs each column holds, so the affected features are visible.
nan_mask = np.isnan(X)
nans = nan_mask.sum(axis=0)
print(nans)

# Keep only the rows that have no NaN in any column.
X = X[~nan_mask.any(axis=1)]

[   0    0    3 4131 4131    0    0    0    0    0    0    0    0    0
    0    0    0    0]


In [5]:
# Project the NaN-free feature matrix X onto its first 10 principal components.
from sklearn.decomposition import PCA

# random_state pins the projection: with svd_solver="auto" scikit-learn may
# choose a randomized SVD solver for this shape, which would otherwise make
# X_pca (and every clustering built on it) vary between runs.
pca = PCA(n_components=10, random_state=42)
pca.fit(X)
X_pca = pca.transform(X)

In [6]:
# Sanity check: (rows remaining after the NaN drop, number of principal components kept).
X_pca.shape

(66827, 10)

In [32]:
import plotly.express as px

# Draw a reproducible random subset of 1000 PCA-projected samples
# (sampling without replacement, seeded through the global NumPy RNG).
np.random.seed(42)
subset_indices = np.random.choice(X_pca.shape[0], size=1000, replace=False)
X_subset = X_pca[subset_indices]

# Agglomerative (hierarchical) clustering of the subset into 5 clusters.
model = sklearn.cluster.AgglomerativeClustering(n_clusters=5, compute_full_tree=True)
Z = model.fit_predict(X_subset)

# 3-D scatter of the first three principal components, coloured by cluster.
# Labels are passed as strings so plotly treats them as categories (discrete
# colours and a legend) rather than as a continuous numeric colour scale.
px.scatter_3d(
    x=X_subset[:, 0],
    y=X_subset[:, 1],
    z=X_subset[:, 2],
    color=Z.astype(str),
    opacity=0.8,
    labels={"color": "cluster"},
    title="Agglomerative clustering of 1000 samples",
)


In [33]:
import plotly.express as px

# Draw a reproducible random subset of 1000 PCA-projected samples
# (sampling without replacement, seeded through the global NumPy RNG).
np.random.seed(42)
subset_indices = np.random.choice(X_pca.shape[0], size=1000, replace=False)
X_subset = X_pca[subset_indices]

# Fit a 5-component Gaussian mixture on the subset and assign each sample to
# its most likely component. random_state makes the initialisation explicit
# instead of depending on whatever is left of the global NumPy RNG state.
model = sklearn.mixture.GaussianMixture(n_components=5, random_state=42)
Z = model.fit_predict(X_subset)

# 3-D scatter of the first three principal components, coloured by component.
# Labels are passed as strings so plotly treats them as categories (discrete
# colours and a legend) rather than as a continuous numeric colour scale.
px.scatter_3d(
    x=X_subset[:, 0],
    y=X_subset[:, 1],
    z=X_subset[:, 2],
    color=Z.astype(str),
    opacity=0.8,
    labels={"color": "cluster"},
    title="Gaussian Mixture clustering of 1000 samples",
)
