In [1]:
!pip install pandas matplotlib scikit-learn umap-learn


Collecting umap-learn
  Downloading umap_learn-0.5.6-py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.12-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pynndescent, umap-learn
Successfully installed pynndescent-0.5.12 umap-learn-0.5.6


In [7]:
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA

# Load the caption embeddings and discard rows containing NaN values.
# dropna() is chained instead of using inplace=True so that the cell builds
# df_embeddings in a single, re-runnable expression.
df_embeddings = pd.read_csv('short_caption_embeddings-2.csv').dropna()

# Every column except the last is used as an embedding dimension; the hover
# text is taken from the 'prompt' column.
# NOTE(review): this presumes 'prompt' is the last column of the CSV -- confirm.
embeddings = df_embeddings.iloc[:, :-1].values
captions = df_embeddings['prompt'].values

# Project the embeddings onto their first three principal components.
# random_state fixes the result when scikit-learn selects a randomized or
# ARPACK solver (it is ignored by the exact solvers), using the same seed as
# the t-SNE cell so the whole notebook is reproducible.
pca = PCA(n_components=3, random_state=42)
pca_result = pca.fit_transform(embeddings)

# Build the plotting frame: three PCA axes plus the caption for hover text.
# `captions` is a plain array, so it is attached positionally (no index
# alignment against the rows removed by dropna()).
df_pca = pd.DataFrame(pca_result, columns=['PCA1', 'PCA2', 'PCA3'])
df_pca['caption'] = captions

# Interactive 3D scatter of the PCA projection
fig = px.scatter_3d(df_pca, x='PCA1', y='PCA2', z='PCA3', hover_data=['caption'], title='3D PCA of Embeddings')
fig.show()


In [8]:
import umap

# Load the caption embeddings and discard rows containing NaN values.
# dropna() is chained instead of using inplace=True so that the cell builds
# df_embeddings in a single, re-runnable expression.
df_embeddings = pd.read_csv('short_caption_embeddings-2.csv').dropna()

# Every column except the last is used as an embedding dimension; the hover
# text is taken from the 'prompt' column.
# NOTE(review): this presumes 'prompt' is the last column of the CSV -- confirm.
embeddings = df_embeddings.iloc[:, :-1].values
captions = df_embeddings['prompt'].values

# Reduce the embeddings to three dimensions with UMAP.
# UMAP is stochastic: without a seed every run produces a different layout.
# random_state=42 matches the seed used by the t-SNE cell. Per the UMAP
# documentation, fixing the seed trades some parallelism for reproducibility.
umap_model = umap.UMAP(n_components=3, random_state=42)
umap_result = umap_model.fit_transform(embeddings)

# Build the plotting frame: three UMAP axes plus the caption for hover text.
# `captions` is a plain array, so it is attached positionally (no index
# alignment against the rows removed by dropna()).
df_umap = pd.DataFrame(umap_result, columns=['UMAP1', 'UMAP2', 'UMAP3'])
df_umap['caption'] = captions

# Interactive 3D scatter of the UMAP projection
fig = px.scatter_3d(df_umap, x='UMAP1', y='UMAP2', z='UMAP3', hover_data=['caption'], title='3D UMAP of Embeddings')
fig.show()


In [9]:
from sklearn.manifold import TSNE

# Read the caption embeddings and keep only fully populated rows
df_embeddings = pd.read_csv('short_caption_embeddings-2.csv').dropna()

# Split the frame: caption text from the 'prompt' column, features from every
# column except the last one
captions = df_embeddings['prompt'].to_numpy()
embeddings = df_embeddings.iloc[:, :-1].to_numpy()

# Fit a seeded 3-component t-SNE and project the features
tsne = TSNE(n_components=3, random_state=42)
tsne_result = tsne.fit_transform(embeddings)

# Assemble the plotting frame: three t-SNE axes plus the caption for hover text
df_tsne = pd.DataFrame(
    tsne_result,
    columns=['TSNE1', 'TSNE2', 'TSNE3'],
).assign(caption=captions)

# Interactive 3D scatter of the t-SNE projection
fig = px.scatter_3d(
    df_tsne,
    x='TSNE1',
    y='TSNE2',
    z='TSNE3',
    hover_data=['caption'],
    title='3D t-SNE of Embeddings',
)
fig.show()
