<a href="https://colab.research.google.com/github/marcpadro/eines_colab/blob/main/BAU_DwD_6_Text_UMAP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sorting texts using artificial intelligence


# Setup


In [None]:
#@title ▶ Install the required tools

# Each line is a shell command (leading "!") that installs packages with pip;
# "-q" keeps pip's output short.

# sentence_transformers: provides the SentenceTransformer embedding model loaded in the next cell.
!pip install -q sentence_transformers
# umap-learn: provides UMAP, used later to reduce the embeddings to 2 dimensions.
# NOTE(review): hdbscan is not imported by any cell visible in this notebook — confirm it is still needed.
!pip install -q umap-learn hdbscan
# datasets: Hugging Face loader used by the "Load a dataset" cell.
!pip install -q datasets

In [None]:
#@title ▶ Download the embedding model

from sentence_transformers import SentenceTransformer, util

# Identifier of the pretrained sentence-embedding model to load.
# Alternative (presumably multilingual, judging by its name):
#   "paraphrase-multilingual-MiniLM-L12-v2"
model_name = "all-MiniLM-L6-v2"

# The model object is reused by the processing cell to encode the texts.
embedding_model = SentenceTransformer(model_name)

In [None]:
#@title ▶ Load a dataset (from [Huggingface](https://huggingface.co/datasets)) and extract the first 1000 texts

from datasets import load_dataset

# Downloads (or reuses the cached copy of) the dataset from the Hugging Face Hub.
dataset = load_dataset("DReAMy-lib/DreamBank-dreams-en")

# Slice the rows first and select the column afterwards: this reads only the
# first 1000 rows instead of materialising the whole 'dreams' column and then
# discarding most of it. The result is a plain list of strings.
texts = dataset['train'][0:1000]['dreams']

In [None]:
#@title ▶ Load a plain text file (.txt)

from google.colab import files

split_on = 'line break'#@param ['line break', 'stop', 'space']

# Separator used to cut the uploaded file into individual texts.
# `None` makes str.split() cut on any run of whitespace.
split_strings = { 'line break': '\n', 'stop': '.', 'space': None}

split_str = split_strings[split_on]

uploaded = files.upload()

# Nothing was uploaded (e.g. the dialog was cancelled): fail with a clear
# message instead of a bare StopIteration from next(iter(...)).
if not uploaded:
  raise ValueError("No file was uploaded. Run this cell again and choose a .txt file.")

# Only the first uploaded file is used.
filename = next(iter(uploaded.keys()))

# 'utf-8-sig' reads plain UTF-8 and also drops a leading byte-order mark,
# which would otherwise end up glued to the first text.
with open(filename, encoding='utf-8-sig') as f:
  content = f.read()

# Text mode normalises line endings to '\n', so splitting on split_str covers
# the 'line break' option as well as 'stop' and 'space'.
# Surrounding whitespace is trimmed and empty pieces are discarded.
texts = [piece.strip() for piece in content.split(split_str) if piece.strip()]

# Processing

In [None]:
#@title ▶ Calculate the position of the texts in the latent space (calculate the embedding)

# Encode every text into one vector; the result is returned as a single
# tensor (convert_to_tensor=True) and a progress bar is shown while encoding.
embeddings = embedding_model.encode(
    texts,
    show_progress_bar=True,
    convert_to_tensor=True,
)

# Sanity check: print the shape of the resulting tensor.
print(embeddings.shape)

In [None]:
#@title ▶ Convert from 384 dimensions to 2 dimensions

from umap import UMAP

# The embeddings were requested as a tensor. On a GPU runtime that tensor lives
# on the GPU and cannot be turned into a NumPy array implicitly, which makes
# fit_transform fail. Copy it to host memory as a NumPy array first; anything
# that is not a tensor is passed through unchanged.
if hasattr(embeddings, "cpu"):
  embeddings_cpu = embeddings.detach().cpu().numpy()
else:
  embeddings_cpu = embeddings

# UMAP is stochastic: fixing random_state makes the 2D layout reproducible
# between runs, so the chart does not change every time the cell is executed.
model = UMAP(
    n_components=2,
    metric='cosine',
    random_state=42)
embeddings_2d = model.fit_transform(embeddings_cpu)

# Sanity check: one 2D point per text.
print(embeddings_2d.shape)

# Plotting

In [None]:
#@title ▶ Create an interactive chart

import textwrap
import plotly.express as px

# Maximum number of characters per line in the hover label.
width = 48

# Break each text into lines of at most `width` characters joined with <br>,
# never cutting a word in the middle (break_long_words=False).
wrapper = textwrap.TextWrapper(width=width, break_long_words=False)
wrapped_texts = []
for text in texts:
  wrapped_texts.append("<br>".join(wrapper.wrap(text)))

# One point per text: first UMAP component on x, second on y,
# with the wrapped text shown when hovering over the point.
x_coords = embeddings_2d[:,0]
y_coords = embeddings_2d[:,1]

fig = px.scatter(x=x_coords, y=y_coords, hover_name=wrapped_texts)
fig.show()

# Credits

Taller Estampa https://tallerestampa.com / https://github.com/estampa
