## Get UMAP Embeddings

In [1]:
import pandas as pd
import chromadb
import numpy as np
import umap
import matplotlib.pyplot as plt
import unicodedata
from datetime import datetime


In [2]:
import nltk
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.corpus import stopwords

# Fetch the 'punkt' and 'punkt_tab' NLTK resources; quiet=True suppresses the download log.
# NOTE(review): presumably these back `sent_tokenize` imported above - confirm they are still needed here.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
from nltk.tokenize import sent_tokenize

import re, os, sys, json
import tiktoken
from openai import OpenAI

from chromadb.config import Settings
import chromadb
# from app.utils.embeddings import OpenAIEmbedding

from tqdm.notebook import trange, tqdm

In [3]:
from dotenv import load_dotenv

# Load environment variables via python-dotenv using its default lookup (returned True in the recorded run).
load_dotenv()


True

In [4]:
# Resolve the directory that sits one level above the current working directory.
working_dir = os.getcwd()
parent_directory = os.path.abspath(os.path.join(working_dir, os.pardir))
print(parent_directory)

# Register it on the import path exactly once so re-running the cell adds no duplicates.
if sys.path.count(parent_directory) == 0:
    sys.path.append(parent_directory)

/home/user/NOVA/_PhD/arquivo25/cosmos_cravos


In [5]:
# Load settings from the .env file located two directories above the working directory.
env_path = os.path.abspath(os.path.join("./../../", ".env"))

load_dotenv(dotenv_path=env_path)


def _required_int_env(name):
    """Return environment variable `name` parsed as an int.

    Raises:
        RuntimeError: if the variable is not set (instead of the opaque
            ``TypeError`` that ``int(None)`` would produce).
        ValueError: if the value is set but is not a valid integer.
    """
    raw = os.environ.get(name)
    if raw is None:
        raise RuntimeError(
            f"Environment variable {name!r} is not set (expected in {env_path})"
        )
    try:
        return int(raw)
    except ValueError as exc:
        raise ValueError(
            f"Environment variable {name!r} must be an integer, got {raw!r}"
        ) from exc


# The API key is read from the environment only; it is never hardcoded in the notebook.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
client_gpt = OpenAI(api_key=OPENAI_API_KEY)

CHROMADB_PATH = os.environ.get("CHROMADB_PATH")
EMBEDDING_MODEL = os.environ.get("DEFAULT_EMBEDDING_MODEL")
COMPLETION_MODEL = os.environ.get("DEFAULT_COMPLETION_MODEL")
CHUNK_SIZE = _required_int_env("DEFAULT_CHUNK_SIZE")
CHUNK_OVERLAP = _required_int_env("DEFAULT_CHUNK_OVERLAP")




In [6]:
# Open the on-disk Chroma store with anonymized telemetry switched off.
db_path = "./../data/chroma_cravo"
chroma_settings = Settings(anonymized_telemetry=False)
client = chromadb.PersistentClient(path=db_path, settings=chroma_settings)

In [7]:
collection = client.get_collection("cravo")

# Fetch every record in one call, requesting vectors, raw text and metadata together.
results = collection.get(
    include=["embeddings", "documents", "metadatas"]
)

# Coerce to an ndarray so `.shape` below works in case the client hands back a
# plain list of lists; an existing ndarray is passed through unchanged.
embeddings = np.asarray(results['embeddings'])

# Quick sanity report on what was retrieved.
print(f"Retrieved {len(embeddings)} embeddings")
print(f"Dimension of embeddings: {embeddings.shape}")

# Documents and metadata come from the same `get` call as the embeddings.
documents = results['documents']
metadatas = results['metadatas']

Retrieved 900 embeddings
Dimension of embeddings: (900, 1536)


In [8]:
# Inspect the per-record metadata returned by the collection
# (each entry carries a 'link' and an 'm_id'); swap in `documents` to view the raw text instead.
metadatas

[{'link': 'https://arquivo.pt/noFrame/replay/20080316100800/http://pt.wikipedia.org/wiki/Guerra_Colonial_Portuguesa',
  'm_id': 65},
 {'m_id': 66,
  'link': 'https://arquivo.pt/noFrame/replay/20080215045248/http://pt.wikipedia.org/wiki/Guerra_Colonial_Portuguesa'},
 {'m_id': 32,
  'link': 'https://arquivo.pt/noFrame/replay/20240612193309/https://pt.wikipedia.org/wiki/Guerra_Colonial_Portuguesa'},
 {'m_id': 20,
  'link': 'https://arquivo.pt/noFrame/replay/20151005193344/https://pt.wikipedia.org/wiki/Guerra_colonial_portuguesa'},
 {'link': 'https://arquivo.pt/noFrame/replay/20230122023529/https://pt.wikipedia.org/wiki/Guerra_Colonial_Portuguesa',
  'm_id': 6},
 {'m_id': 30,
  'link': 'https://arquivo.pt/noFrame/replay/20120121233316/http://pt.wikipedia.org/wiki/Guerra_Colonial_Portuguesa'},
 {'m_id': 31,
  'link': 'https://arquivo.pt/noFrame/replay/20240829115608/https://pt.wikipedia.org/wiki/Guerra_Colonial_Portuguesa'},
 {'link': 'https://arquivo.pt/noFrame/replay/20221011150609/https:

In [9]:
embeddings_path = "./../data/embeddings/"
meta_path = os.path.join(embeddings_path, "metadata.json")

# Name the encoding explicitly so the read does not depend on the platform's
# locale default; JSON text is expected to be UTF-8.
with open(meta_path, 'r', encoding="utf-8") as file:
    meta_concat = json.load(file)



In [10]:
# Peek at the embedding matrix fetched from the collection (the array repr is truncated automatically).
embeddings

array([[ 0.00492407,  0.05813501,  0.00876155, ...,  0.03319753,
         0.01546793,  0.0016053 ],
       [ 0.00437642,  0.05011618,  0.00270458, ...,  0.03575006,
         0.01862347,  0.0047312 ],
       [ 0.00738478,  0.06554307,  0.00825896, ...,  0.04370893,
         0.02756712,  0.00020917],
       ...,
       [-0.01279808,  0.0113901 ,  0.01278837, ...,  0.01614811,
         0.02013901, -0.00289608],
       [-0.00602193, -0.00652547,  0.02490983, ...,  0.01957641,
         0.03054125,  0.00596027],
       [-0.01780719, -0.01026209,  0.02217142, ...,  0.01773146,
         0.02001297,  0.01999404]], shape=(900, 1536))

In [11]:
# Project the high-dimensional embeddings onto a plane with UMAP
# (provided by the `umap-learn` package).
umap_params = dict(
    n_components=2,   # target dimensionality: 2-D here, 3 would give a 3-D layout
    n_neighbors=15,   # local vs. global emphasis of the projection (typical range 5-50)
    min_dist=0.1,     # how tightly points may be packed together (0.0-1.0)
    random_state=42,  # fixed seed for reproducibility
)
reducer = umap.UMAP(**umap_params)

# Learn the projection and map the embeddings through it in a single step.
reduced_embeddings = reducer.fit_transform(embeddings)




  warn(


In [12]:
# Wrap the 2-D projection in a frame with one named column per axis.
axis_names = ["x", "y"]
umap_df = pd.DataFrame(data=reduced_embeddings, columns=axis_names)


In [13]:
# Expose the row position as an explicit "index" column.
# Guarded and non-mutating: a repeated in-place reset_index would insert an
# extra "level_0" column every time this cell is re-run.
if "index" not in umap_df.columns:
    umap_df = umap_df.reset_index()

In [14]:
# Each row position points at the matching entry of `metadatas`; pull out its 'm_id'.
umap_df['meta_id'] = [metadatas[pos]['m_id'] for pos in umap_df['index']]

In [15]:

# `meta_concat` is keyed by record id, so the frame is built with ids as columns
# and then flipped to get one row per id.
meta_by_column = pd.DataFrame(meta_concat)
meta_df = meta_by_column.transpose()
meta_df

Unnamed: 0,tstamp,title,originalURL,linkToArchive,linkToNoFrame,linkToScreenshot,filepath,source_name
1,20231121143131,https://pt.wikipedia.org/wiki/Guerra_Colonial_...,https://pt.wikipedia.org/wiki/Guerra_Colonial_...,https://arquivo.pt/wayback/20231121143131/http...,https://arquivo.pt/noFrame/replay/202311211431...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_guerra_colonial_portuguesa/...,Wikipedia PT
2,20231019004141,https://pt.wikipedia.org/wiki/Guerra_Colonial_...,https://pt.wikipedia.org/wiki/Guerra_Colonial_...,https://arquivo.pt/wayback/20231019004141/http...,https://arquivo.pt/noFrame/replay/202310190041...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_guerra_colonial_portuguesa/...,Wikipedia PT
3,20230902083232,https://pt.wikipedia.org/wiki/Guerra_Colonial_...,https://pt.wikipedia.org/wiki/Guerra_Colonial_...,https://arquivo.pt/wayback/20230902083232/http...,https://arquivo.pt/noFrame/replay/202309020832...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_guerra_colonial_portuguesa/...,Wikipedia PT
4,20230626205149,https://pt.wikipedia.org/wiki/Guerra_Colonial_...,https://pt.wikipedia.org/wiki/Guerra_Colonial_...,https://arquivo.pt/wayback/20230626205149/http...,https://arquivo.pt/noFrame/replay/202306262051...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_guerra_colonial_portuguesa/...,Wikipedia PT
5,20230624194157,https://pt.wikipedia.org/wiki/Guerra_Colonial_...,https://pt.wikipedia.org/wiki/Guerra_Colonial_...,https://arquivo.pt/wayback/20230624194157/http...,https://arquivo.pt/noFrame/replay/202306241941...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_guerra_colonial_portuguesa/...,Wikipedia PT
...,...,...,...,...,...,...,...,...
2080,20110704075116,http://pt.wikipedia.org/wiki/Junta_de_Salva%C3...,http://pt.wikipedia.org/wiki/Junta_de_Salva%C3...,https://arquivo.pt/wayback/20110704075116/http...,https://arquivo.pt/noFrame/replay/201107040751...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_junta_de_salvacao_nacional/...,Wikipedia PT
2081,20110615193844,http://pt.wikipedia.org/wiki/Junta_de_Salva%C3...,http://pt.wikipedia.org/wiki/Junta_de_Salva%C3...,https://arquivo.pt/wayback/20110615193844/http...,https://arquivo.pt/noFrame/replay/201106151938...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_junta_de_salvacao_nacional/...,Wikipedia PT
2082,20110523184300,http://pt.wikipedia.org/wiki/Junta_de_Salva%C3...,http://pt.wikipedia.org/wiki/Junta_de_Salva%C3...,https://arquivo.pt/wayback/20110523184300/http...,https://arquivo.pt/noFrame/replay/201105231843...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_junta_de_salvacao_nacional/...,Wikipedia PT
2083,20231121151225,https://pt.wikipedia.org/wiki/Junta_de_Salva%C...,https://pt.wikipedia.org/wiki/Junta_de_Salva%C...,https://arquivo.pt/wayback/20231121151225/http...,https://arquivo.pt/noFrame/replay/202311211512...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_junta_de_salvacao_nacional/...,Wikipedia PT


In [16]:
# Preview the metadata table with its id moved into a regular "index" column.
# (A bare `umap_df['meta_id']` expression used to precede this line; its value was
# discarded because only a cell's last expression is displayed, so it was removed.)
meta_df.reset_index()

Unnamed: 0,index,tstamp,title,originalURL,linkToArchive,linkToNoFrame,linkToScreenshot,filepath,source_name
0,1,20231121143131,https://pt.wikipedia.org/wiki/Guerra_Colonial_...,https://pt.wikipedia.org/wiki/Guerra_Colonial_...,https://arquivo.pt/wayback/20231121143131/http...,https://arquivo.pt/noFrame/replay/202311211431...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_guerra_colonial_portuguesa/...,Wikipedia PT
1,2,20231019004141,https://pt.wikipedia.org/wiki/Guerra_Colonial_...,https://pt.wikipedia.org/wiki/Guerra_Colonial_...,https://arquivo.pt/wayback/20231019004141/http...,https://arquivo.pt/noFrame/replay/202310190041...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_guerra_colonial_portuguesa/...,Wikipedia PT
2,3,20230902083232,https://pt.wikipedia.org/wiki/Guerra_Colonial_...,https://pt.wikipedia.org/wiki/Guerra_Colonial_...,https://arquivo.pt/wayback/20230902083232/http...,https://arquivo.pt/noFrame/replay/202309020832...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_guerra_colonial_portuguesa/...,Wikipedia PT
3,4,20230626205149,https://pt.wikipedia.org/wiki/Guerra_Colonial_...,https://pt.wikipedia.org/wiki/Guerra_Colonial_...,https://arquivo.pt/wayback/20230626205149/http...,https://arquivo.pt/noFrame/replay/202306262051...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_guerra_colonial_portuguesa/...,Wikipedia PT
4,5,20230624194157,https://pt.wikipedia.org/wiki/Guerra_Colonial_...,https://pt.wikipedia.org/wiki/Guerra_Colonial_...,https://arquivo.pt/wayback/20230624194157/http...,https://arquivo.pt/noFrame/replay/202306241941...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_guerra_colonial_portuguesa/...,Wikipedia PT
...,...,...,...,...,...,...,...,...,...
2079,2080,20110704075116,http://pt.wikipedia.org/wiki/Junta_de_Salva%C3...,http://pt.wikipedia.org/wiki/Junta_de_Salva%C3...,https://arquivo.pt/wayback/20110704075116/http...,https://arquivo.pt/noFrame/replay/201107040751...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_junta_de_salvacao_nacional/...,Wikipedia PT
2080,2081,20110615193844,http://pt.wikipedia.org/wiki/Junta_de_Salva%C3...,http://pt.wikipedia.org/wiki/Junta_de_Salva%C3...,https://arquivo.pt/wayback/20110615193844/http...,https://arquivo.pt/noFrame/replay/201106151938...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_junta_de_salvacao_nacional/...,Wikipedia PT
2081,2082,20110523184300,http://pt.wikipedia.org/wiki/Junta_de_Salva%C3...,http://pt.wikipedia.org/wiki/Junta_de_Salva%C3...,https://arquivo.pt/wayback/20110523184300/http...,https://arquivo.pt/noFrame/replay/201105231843...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_junta_de_salvacao_nacional/...,Wikipedia PT
2082,2083,20231121151225,https://pt.wikipedia.org/wiki/Junta_de_Salva%C...,https://pt.wikipedia.org/wiki/Junta_de_Salva%C...,https://arquivo.pt/wayback/20231121151225/http...,https://arquivo.pt/noFrame/replay/202311211512...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_junta_de_salvacao_nacional/...,Wikipedia PT


In [17]:
# Metadata fields copied onto every projected point.
meta_cols = [
    'tstamp',
    'title',
    'originalURL',
    'linkToArchive',
    'linkToNoFrame',
    'linkToScreenshot',
    'filepath',
    'source_name',
]

In [18]:
# Copy every metadata field onto the projection rows; `meta_concat` is keyed by
# the stringified meta_id, hence the str() conversion.
meta_ids = umap_df['meta_id']
for col in meta_cols:
    umap_df[col] = [meta_concat[str(mid)][col] for mid in meta_ids]

In [19]:
# Review the 2-D coordinates together with the attached page metadata.
umap_df

Unnamed: 0,index,x,y,meta_id,tstamp,title,originalURL,linkToArchive,linkToNoFrame,linkToScreenshot,filepath,source_name
0,0,-2.369146,1.766376,65,20080316100800,http://pt.wikipedia.org/wiki/Guerra_Colonial_P...,http://pt.wikipedia.org/wiki/Guerra_Colonial_P...,https://arquivo.pt/wayback/20080316100800/http...,https://arquivo.pt/noFrame/replay/200803161008...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_guerra_colonial_portuguesa/...,Wikipedia PT
1,1,-2.376286,1.768186,66,20080215045248,http://pt.wikipedia.org/wiki/Guerra_Colonial_P...,http://pt.wikipedia.org/wiki/Guerra_Colonial_P...,https://arquivo.pt/wayback/20080215045248/http...,https://arquivo.pt/noFrame/replay/200802150452...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_guerra_colonial_portuguesa/...,Wikipedia PT
2,2,-2.642831,2.042382,32,20240612193309,https://pt.wikipedia.org/wiki/Guerra_Colonial_...,https://pt.wikipedia.org/wiki/Guerra_Colonial_...,https://arquivo.pt/wayback/20240612193309/http...,https://arquivo.pt/noFrame/replay/202406121933...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_guerra_colonial_portuguesa/...,Wikipedia PT
3,3,-2.592063,1.994415,20,20151005193344,https://pt.wikipedia.org/wiki/Guerra_colonial_...,https://pt.wikipedia.org/wiki/Guerra_colonial_...,https://arquivo.pt/wayback/20151005193344/http...,https://arquivo.pt/noFrame/replay/201510051933...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_guerra_colonial_portuguesa/...,Wikipedia PT
4,4,-2.667109,2.031749,6,20230122023529,https://pt.wikipedia.org/wiki/Guerra_Colonial_...,https://pt.wikipedia.org/wiki/Guerra_Colonial_...,https://arquivo.pt/wayback/20230122023529/http...,https://arquivo.pt/noFrame/replay/202301220235...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_guerra_colonial_portuguesa/...,Wikipedia PT
...,...,...,...,...,...,...,...,...,...,...,...,...
895,895,7.795035,2.654285,1949,20151005230006,http://pt.wikipedia.org/wiki/Junta_de_Salva%C3...,http://pt.wikipedia.org/wiki/Junta_de_Salva%C3...,https://arquivo.pt/wayback/20151005230006/http...,https://arquivo.pt/noFrame/replay/201510052300...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_junta_de_salvacao_nacional/...,Wikipedia PT
896,896,7.749103,2.635786,2081,20110615193844,http://pt.wikipedia.org/wiki/Junta_de_Salva%C3...,http://pt.wikipedia.org/wiki/Junta_de_Salva%C3...,https://arquivo.pt/wayback/20110615193844/http...,https://arquivo.pt/noFrame/replay/201106151938...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_junta_de_salvacao_nacional/...,Wikipedia PT
897,897,7.680693,2.548033,2080,20110704075116,http://pt.wikipedia.org/wiki/Junta_de_Salva%C3...,http://pt.wikipedia.org/wiki/Junta_de_Salva%C3...,https://arquivo.pt/wayback/20110704075116/http...,https://arquivo.pt/noFrame/replay/201107040751...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_junta_de_salvacao_nacional/...,Wikipedia PT
898,898,7.793026,2.694399,1944,20131106082031,https://pt.wikipedia.org/wiki/Junta_de_Salva%C...,https://pt.wikipedia.org/wiki/Junta_de_Salva%C...,https://arquivo.pt/wayback/20131106082031/http...,https://arquivo.pt/noFrame/replay/201311060820...,https://arquivo.pt/screenshot?url=https%3A%2F%...,./../data/res/wiki_junta_de_salvacao_nacional/...,Wikipedia PT


In [20]:
# Persist the projection plus metadata next to the other embedding artefacts,
# leaving the DataFrame's own index out of the file.
csv_name = "umap_metadata.csv"
umap_path = os.path.join(embeddings_path, csv_name)
umap_df.to_csv(path_or_buf=umap_path, index=False)

In [24]:
# List the distinct sources present in the projection, in order of first appearance.
distinct_sources = umap_df['source_name'].unique()
distinct_sources.tolist()

['Wikipedia PT', 'Publico', 'Web', 'Expresso']