In [None]:
!pip install umap-learn

Collecting umap-learn
  Downloading umap-learn-0.5.5.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.9/90.9 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.10.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: umap-learn, pynndescent
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Created wheel for umap-learn: filename=umap_learn-0.5.5-py3-none-any.whl size=86831 sha256=28a3528374e01bb8d6b25a2e4854a1d7f97534aa48b2021e23ad29bc96d391ea
  Stored in directory: /root/.cache/pip/wheels/3a/70/07/428d2b58660a1a3b431db59b806a10da736612ebbc66c1bcc5
  Building wheel for pynndescent (setup.py) ... [?25l[?25hdone
  Created wheel for py

In [None]:
import umap
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from os import path
from sklearn.preprocessing import StandardScaler


# Connect to Google Drive

In [None]:
# `google.colab` is provided by the Colab runtime — NOTE(review): this cell presumably
# cannot run outside Colab; confirm before reusing the notebook elsewhere.
from google.colab import drive
# Mount Google Drive at /content/gdrive; the dataset path defined further down is under this mount point.
drive.mount('/content/gdrive')

# This is where my dataset lives

In [None]:
# Root folder of the dataset on the mounted Drive; the features CSV path, the
# normalized-images folder and the output embedding CSV are all built from it.
dataset_location = '/content/gdrive/MyDrive/nln-dataset'

# Visualize features

In [None]:
# Project the feature table to 2-D with UMAP, draw every image thumbnail at its
# embedded position, and save the (id, class, x, y) table next to the dataset.
features_location = f"{dataset_location}/clip-features.csv"
dataset_location_normalized = f"{dataset_location}/normalized"

# Fixed seed so the UMAP layout (and therefore the CSV written below) is
# reproducible across Restart & Run All.
UMAP_SEED = 42

if not path.exists(features_location):
    print("Run NB01_extract_features first")
else:
    df = pd.read_csv(features_location)

    # Optional whitelist of values of the "label" column; leave empty to keep every row.
    filter_labels = []
    if filter_labels:
        df = df[df["label"].isin(filter_labels)]

    # Positional column layout used by this cell: the first two columns are kept as
    # id/class information and every remaining column is treated as a feature.
    features_df = df[df.columns[2:]]
    id_class_df = df[df.columns[0:2]]
    class_df = df[df.columns[1]]

    # Tick labels for the colorbar: the distinct parent folders of the image ids.
    # NOTE(review): the colorbar below assumes the class column holds the integers
    # 0..len(labels)-1 in this sorted order — confirm against NB01_extract_features.
    labels = sorted({path.dirname(image_id) for image_id in df["id"]})

    features = StandardScaler().fit_transform(features_df)

    reducer = umap.UMAP(random_state=UMAP_SEED)
    # Supervised embedding; not used further in this cell but kept available for later cells.
    embedding_with_classes = reducer.fit_transform(features, y=class_df)
    # Unsupervised embedding; this is the one that is plotted and saved.
    embedding = reducer.fit_transform(features)

    def get_image(path, zoom=1):
        """Load one image from the normalized-dataset folder as a thumbnail artist.

        Args:
            path: image location relative to ``dataset_location_normalized``
                (inside this function the name shadows ``os.path``, which is not used here).
            zoom: multiplier applied on top of the base 0.1 thumbnail scale.

        Returns:
            An ``OffsetImage`` wrapping the pixels read by ``plt.imread``.
        """
        return OffsetImage(plt.imread(f"{dataset_location_normalized}/{path}"), zoom=zoom*0.1)

    fig, ax = plt.subplots(1, 1, figsize=(20, 15))
    scatter = ax.scatter(*embedding.T, s=0.1, c=class_df, cmap='Spectral', alpha=1.0)

    x, y = embedding.T
    # The loop variable must not be called `path`: at cell scope that would rebind the
    # imported `os.path` and make the `path.exists` check above fail on a re-run.
    for x0, y0, image_id in zip(x, y, df["id"]):
        ax.add_artist(AnnotationBbox(get_image(image_id), (x0, y0), frameon=False))

    # Reuse df's index so the axis=1 concat pairs each coordinate with its own row;
    # after label filtering df no longer has a 0..n-1 index and a fresh RangeIndex
    # would misalign rows and introduce NaNs.
    embedding_df = pd.DataFrame({"x": x, "y": y}, index=df.index)
    embedding_df = pd.concat([id_class_df, embedding_df], axis=1)

    embedding_df.to_csv(f"{dataset_location}/clip-embedding_{'_'.join(str(x) for x in filter_labels)}.csv", index=False)

    # One discrete colorbar bin per label, centred on the integer class values.
    cbar = fig.colorbar(scatter, ax=ax, boundaries=np.arange(len(labels)+1)-0.5)
    cbar.set_ticks(np.arange(len(labels)))
    cbar.set_ticklabels(labels)
    ax.set_title("embedding")
    plt.show()


Run NB01_extract_features first
