<a href="https://colab.research.google.com/github/filipecalegario/ref-aulas-criacomp/blob/main/2023_1_CRIACOMP_UMAP%2BObservables%2Bwidget.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CRIACOMP: UMAP + Cluster Visualization

You can read more about the integration with Observable [here](https://observablehq.com/@radames/umap-jupyter-notebook-scattergl).

# Configuration

In [2]:
# Install the umap-learn package (it provides the `umap` module imported below).
# NOTE(review): the version is not pinned, so a fresh run may pull a different release.
%pip install umap-learn

Collecting umap-learn
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.10.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: umap-learn, pynndescent
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Created wheel for umap-learn: filename=umap_learn-0.5.3-py3-none-any.whl size=82808 sha256=ec2c8c6ec8e1314b7de812a390cbacaa6522b15f2f5a1584f650c11c3b05ae41
  Stored in directory: /root/.cache/pip/wheels/a0/e8/c6/a37ea663620bd5200ea1ba0907ab3c217042c1d035ef606acc
  Building wheel for pynndescent (setup.py) ... [?25l[?25hdone
  Created wheel for py

In [3]:
from __future__ import print_function

import ast
import codecs, json

import ipywidgets as widgets
import numpy as np
import pandas as pd
import seaborn as sns
import umap
from IPython.display import display, Javascript, HTML
from ipywidgets import interact, interactive, fixed, interact_manual
from sklearn.datasets import load_iris, load_digits
from sklearn.model_selection import train_test_split

In [4]:
# Global seaborn styling for every plot in this notebook: white background,
# "notebook" scaling context, and a 14x10 default figure size.
sns.set(
    context='notebook',
    style='white',
    rc={'figure.figsize': (14, 10)},
)

In [5]:
# Shared UMAP estimator that `run_umap` reconfigures and refits on each call.
# The layout is stochastic (and `init='random'` starts it from random
# coordinates), so a fixed `random_state` is set to make the embedding
# reproducible across Restart & Run All.
reducer = umap.UMAP(init='random', random_state=42)

In [6]:
# Display the estimator's repr to inspect its current configuration.
reducer

## Function Definitions

In [7]:
def run_umap(data, n_neighbors, min_dis, n_components, metric, spread):
  """Reconfigure the shared `reducer` and embed `data` with it.

  The hyperparameters are written onto the module-level `reducer`
  (replacing whatever values it held before), which is then fitted on
  `data`.

  Args:
    data: Samples passed unchanged to `reducer.fit_transform`.
    n_neighbors: Value assigned to `reducer.n_neighbors`.
    min_dis: Value assigned to `reducer.min_dist`.
    n_components: Value assigned to `reducer.n_components`.
    metric: Value assigned to `reducer.metric`.
    spread: Value assigned to `reducer.spread`.

  Returns:
    The result of `reducer.fit_transform(data)`.
  """
  settings = {
      'n_neighbors': n_neighbors,
      'min_dist': min_dis,
      'n_components': n_components,
      'metric': metric,
      'spread': spread,
  }
  # Dicts keep insertion order, so attributes are set in the order listed above.
  for attribute, value in settings.items():
    setattr(reducer, attribute, value)
  return reducer.fit_transform(data)

In [8]:
def make_viz_embed(data, color = (), labels = ()):
  """Build the HTML/JS snippet that feeds data into the Observable notebook.

  The snippet loads the Observable runtime and the
  `@radames/umap-jupyter-notebook-scattergl` notebook, mounts its
  `viewof containerEl` cell into a `<div>`, and redefines the notebook's
  `points`, `colors` and `labels` cells with the values given here.

  Args:
    data: JSON-serialisable value injected as the notebook's `points`.
    color: JSON-serialisable value injected as the notebook's `colors`.
      Defaults to an empty sequence (serialised as `[]`).
    labels: JSON-serialisable value injected as the notebook's `labels`.
      Defaults to an empty sequence (serialised as `[]`).

  Returns:
    The HTML string, ready to be wrapped in `IPython.display.HTML`.
  """
  def to_js(value):
    # Compact JSON. "</" is escaped as "<\/" (still valid JSON/JS) so that a
    # string such as "</script>" inside the data cannot terminate the
    # surrounding <script> element early.
    return json.dumps(value, separators=(',', ':')).replace('</', '<\\/')

  points_js = to_js(data)
  # Use the `color` argument; the previous body read an unrelated global
  # named `colors`, silently ignoring what the caller passed in.
  colors_js = to_js(color)
  labels_js = to_js(labels)

  embed = f"""
    <div id="observablehq-viewof-containerEl-96fe8cff"></div>
    <script type="module">
    import {{Runtime, Inspector}} from "https://cdn.jsdelivr.net/npm/@observablehq/runtime@4/dist/runtime.js";
    import define from "https://api.observablehq.com/@radames/umap-jupyter-notebook-scattergl.js?v=3";
    const inspect = new Inspector(document.querySelector("#observablehq-viewof-containerEl-96fe8cff"));
    const notebook = (new Runtime).module(define, name => {{
    if(name === "viewof containerEl") return inspect;
        return ["init"].includes(name);
    }})
    notebook.redefine('points', {points_js})
    notebook.redefine('colors', {colors_js})
    notebook.redefine('labels', {labels_js})
    </script>

  """
  return embed

In [9]:
def render(data, colors, labels, n_neighbors=100, min_dis=0.5, n_components=3, metric='euclidean', spread = 1.0):
  """Embed `data` with UMAP and display it through the Observable snippet.

  Args:
    data: Samples forwarded to `run_umap`.
    colors: Per-point colors forwarded to `make_viz_embed`.
    labels: Per-point labels forwarded to `make_viz_embed`.
    n_neighbors, min_dis, n_components, metric, spread: UMAP settings
      forwarded to `run_umap`.

  Returns:
    None. The generated HTML is shown with `display`.
  """
  # `.tolist()` turns the embedding into nested lists for `make_viz_embed`.
  points = run_umap(data, n_neighbors, min_dis, n_components, metric, spread).tolist()
  display(HTML(make_viz_embed(points, colors, labels)))


# Loading Data

In [10]:
# Load the items table from a CSV file, resolved relative to the working directory.
casos_uso_df = pd.read_csv('word_embeddings.csv')

In [12]:
# Parse each stored embedding (text such as "[-0.009, -0.021, ...]") into a
# NumPy array. `ast.literal_eval` accepts only Python literals, whereas `eval`
# would execute any code found in the CSV. Values that are no longer strings
# are passed through, so re-running this cell does not raise.
casos_uso_df['embedding'] = casos_uso_df['embedding'].apply(
    lambda raw: np.array(ast.literal_eval(raw) if isinstance(raw, str) else raw)
)

In [23]:
# Display the DataFrame to check the loaded columns and the `embedding` values.
casos_uso_df

Unnamed: 0.1,Unnamed: 0,Itens,Categoria,embedding
0,0,Midjouney,"""aplicacoes""","[-0.009065642952919006, -0.021264266222715378,..."
1,1,Openjourney,"""aplicacoes""","[-0.0028128710109740496, -0.002215629443526268..."
2,2,DALL E,"""aplicacoes""","[-0.011047448962926865, -0.021905938163399696,..."
3,3,Tome.app,"""aplicacoes""","[0.003620806382969022, 0.006396056618541479, 0..."
4,4,Stable diffusion,"""aplicacoes""","[-0.01985820196568966, 0.016730502247810364, 0..."
...,...,...,...,...
203,203,Identificação de falhas de segurança,"""casos_uso""","[-0.015269014053046703, 0.009737345390021801, ..."
204,204,Fake news para manipular eleições,"""casos_uso""","[-0.030033115297555923, 0.02663939818739891, -..."
205,205,Geração de planos de crime,"""casos_uso""","[-0.008206862024962902, -0.015600252896547318,..."
206,206,Nova trending do tiktok,"""casos_uso""","[-0.03929344564676285, -0.008771595545113087, ..."


In [None]:
# Inspect only the `embedding` column.
casos_uso_df.embedding

In [None]:
# Convert the column of embedding vectors into a plain list of lists, the
# structure later passed to `render`.

output_list = [list(vector) for vector in casos_uso_df.embedding]

# Show only the first row as a quick sanity check.
output_list[:1]

In [None]:
# Every point gets the same color: the first entry of seaborn's current
# palette. Look it up once instead of rebuilding the palette for every row.
base_color = sns.color_palette()[0]
colors = [base_color] * len(output_list)

# Preview a few entries rather than dumping the whole list.
colors[:5]

# Visualization

In [None]:
# Embed the vectors into 3 components using cosine distance and 3 neighbors,
# then display the result with the `Itens` column as point labels.
render(
    output_list,
    colors,
    casos_uso_df['Itens'].to_list(),
    n_neighbors=3,
    min_dis=0.5,
    n_components=3,
    metric='cosine',
)