In [1]:
import numpy as np
import pandas as pd
import itertools
from sklearn.neural_network import BernoulliRBM
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [2]:
jasper_embeddings = np.loadtxt('../cache/embeddings.tsv', delimiter='\t')
print(f"shape of jasper_embeddings: {jasper_embeddings.shape}")
jasper_metadata = pd.read_csv('../cache/meta.tsv', sep='\t')
print(f"shape of jasper_metadata: {jasper_metadata.shape}")


shape of jasper_embeddings: (320, 768)
shape of jasper_metadata: (320, 5)


In [3]:
param_grid = {
    "n_components": [20, 30, 50],
    "n_iter": [50, 100],
    "batch_size": [32, 64],
    "learning_rate": [0.005, 0.01],
}

param_combinations = list(itertools.product(
    param_grid["n_components"],
    param_grid["n_iter"],
    param_grid["batch_size"],
    param_grid["learning_rate"]
))


In [4]:
def build_rbm_model(n_components=50, learning_rate=0.01, batch_size=10, n_iter=20):
    rbm = BernoulliRBM(
        n_components=n_components,
        learning_rate=learning_rate,
        batch_size=batch_size,
        n_iter=n_iter,
        verbose=1
    )
    return rbm

In [5]:
def evaluate_rbm_with_kmeans(data, n_components, n_iter, batch_size, learning_rate, k_clusters=7):
    scaler = MinMaxScaler()
    data_scaled = scaler.fit_transform(data)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    all_latents = []

    for train_idx, test_idx in kf.split(data_scaled):
        X_train, X_test = data_scaled[train_idx], data_scaled[test_idx]
        rbm = BernoulliRBM(
            n_components=n_components,
            learning_rate=learning_rate,
            batch_size=batch_size,
            n_iter=n_iter,
            verbose=0
        )
        rbm.fit(X_train)
        latent = rbm.transform(X_test)
        all_latents.append(latent)

    rbm_latent_space = np.vstack(all_latents)

    kmeans = KMeans(n_clusters=k_clusters, n_init='auto', random_state=42)
    labels = kmeans.fit_predict(rbm_latent_space)
    score = silhouette_score(rbm_latent_space, labels)

    return score

In [6]:
results = []
for (n_components, n_iter, batch_size, learning_rate) in param_combinations:
    score = evaluate_rbm_with_kmeans(
        jasper_embeddings,
        n_components=n_components,
        n_iter=n_iter,
        batch_size=batch_size,
        learning_rate=learning_rate
    )
    results.append({
        "n_components": n_components,
        "n_iter": n_iter,
        "batch_size": batch_size,
        "learning_rate": learning_rate,
        "silhouette_score": round(score, 4)
    })

In [7]:
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="silhouette_score", ascending=False).reset_index(drop=True)
print("top RBM params :")
display(results_df)

best_params = results_df.iloc[0]
best_params

top RBM params :


Unnamed: 0,n_components,n_iter,batch_size,learning_rate,silhouette_score
0,50,100,32,0.01,0.7006
1,30,100,32,0.01,0.6889
2,20,100,32,0.01,0.687
3,50,50,32,0.01,0.6526
4,30,50,32,0.01,0.6505
5,50,100,64,0.01,0.6267
6,20,50,32,0.01,0.6223
7,30,100,32,0.005,0.6133
8,50,100,32,0.005,0.6055
9,20,100,64,0.01,0.6043


n_components         50.0000
n_iter              100.0000
batch_size           32.0000
learning_rate         0.0100
silhouette_score      0.7006
Name: 0, dtype: float64

In [8]:
n_components = int(best_params['n_components'])
n_iter = int(best_params['n_iter'])
batch_size = int(best_params['batch_size'])
learning_rate = float(best_params['learning_rate'])

scaler = MinMaxScaler()
scaled_embeddings = scaler.fit_transform(jasper_embeddings)

rbm_final = BernoulliRBM(
    n_components=n_components,
    learning_rate=learning_rate,
    batch_size=batch_size,
    n_iter=n_iter,
    verbose=1
)
rbm_final.fit(scaled_embeddings)
rbm_latent_space_final = rbm_final.transform(scaled_embeddings)

print("final RBM latent space shape:", rbm_latent_space_final.shape)

[BernoulliRBM] Iteration 1, pseudo-likelihood = -530.24, time = 0.06s
[BernoulliRBM] Iteration 2, pseudo-likelihood = -528.78, time = 0.07s
[BernoulliRBM] Iteration 3, pseudo-likelihood = -528.98, time = 0.07s
[BernoulliRBM] Iteration 4, pseudo-likelihood = -526.31, time = 0.08s
[BernoulliRBM] Iteration 5, pseudo-likelihood = -527.48, time = 0.06s
[BernoulliRBM] Iteration 6, pseudo-likelihood = -528.39, time = 0.06s
[BernoulliRBM] Iteration 7, pseudo-likelihood = -528.54, time = 0.05s
[BernoulliRBM] Iteration 8, pseudo-likelihood = -525.85, time = 0.05s
[BernoulliRBM] Iteration 9, pseudo-likelihood = -530.19, time = 0.09s
[BernoulliRBM] Iteration 10, pseudo-likelihood = -525.79, time = 0.05s
[BernoulliRBM] Iteration 11, pseudo-likelihood = -527.10, time = 0.05s
[BernoulliRBM] Iteration 12, pseudo-likelihood = -528.17, time = 0.06s
[BernoulliRBM] Iteration 13, pseudo-likelihood = -528.16, time = 0.05s
[BernoulliRBM] Iteration 14, pseudo-likelihood = -528.21, time = 0.07s
[BernoulliRBM] 

In [9]:
def plot_3d(points, labels, title):
    points = np.concatenate((points, labels), axis=1)
    points = np.concatenate((points, title), axis=1)
    df = pd.DataFrame(points, columns=['x', 'y', 'z', 'labels', 'title'])

    fig = px.scatter_3d(df, x='x', y='y', z='z', color='labels', hover_data=['title'])
    fig.update_traces(marker_size=3)
    fig.update_layout(
        scene=dict(
            xaxis_title="1",
            yaxis_title="2",
            zaxis_title="3",
            aspectmode='data',
            camera=dict(
                eye=dict(x=0.7, y=0.7, z=0.7),
                center=dict(x=0, y=0, z=0)
            )
        )
    )
    fig.show()

In [18]:
from openTSNE.sklearn import TSNE
import plotly.express as px

tsne_latent = TSNE(n_components=3,random_state=42, perplexity=10, n_iter=2000, learning_rate=50).fit_transform(rbm_latent_space_final)
plot_3d(tsne_latent, jasper_metadata[['category']], jasper_metadata[['title']])

In [19]:
# Cluster the latent space with GMM
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=10, random_state=42)
gmm.fit(rbm_latent_space_final)
gmm_predictions = gmm.predict(rbm_latent_space_final).reshape(-1, 1)
print(f"shape of gmm_predictions: {gmm_predictions.shape}")
print(f"shape of latent_space: {rbm_latent_space_final.shape}")
plot_3d(tsne_latent, gmm_predictions, jasper_metadata[['title']])

shape of gmm_predictions: (320, 1)
shape of latent_space: (320, 50)
