# Cosmos de Cravos

## ETL 

In [1]:
import pandas as pd
import chromadb
import numpy as np
import umap
import matplotlib.pyplot as plt
import unicodedata
from datetime import datetime


In [2]:
import nltk
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.corpus import stopwords

nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
from nltk.tokenize import sent_tokenize

import re, os, sys, json
import tiktoken
from openai import OpenAI

from chromadb.config import Settings
import chromadb
# from app.utils.embeddings import OpenAIEmbedding

from tqdm.notebook import trange, tqdm

In [3]:
from dotenv import load_dotenv

# Load variables from a .env file found via python-dotenv's default lookup.
# A second, explicit-path load_dotenv(dotenv_path=...) call is made further down.
load_dotenv()


True

In [4]:
# Resolve the directory one level above the current working directory.
parent_directory = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

print(parent_directory)
# Append it to sys.path (only once) so modules located there become importable.
if parent_directory not in sys.path:
    sys.path.append(parent_directory)

/home/user/NOVA/_PhD/arquivo25/cosmos_cravos


In [5]:
# Absolute path of the .env file two directories above the working directory;
# it is loaded explicitly, in addition to the earlier bare load_dotenv() call.
env_path = os.path.abspath(os.path.join("./../../",".env" ))

load_dotenv(dotenv_path=env_path)

# The API key is read from the environment; nothing is hard-coded here.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
client_gpt = OpenAI(api_key=OPENAI_API_KEY)

# Pipeline settings; os.environ.get returns None for names that are not set.
CHROMADB_PATH = os.environ.get("CHROMADB_PATH")
EMBEDDING_MODEL = os.environ.get("DEFAULT_EMBEDDING_MODEL")
COMPLETION_MODEL = os.environ.get("DEFAULT_COMPLETION_MODEL")
# int(None) raises TypeError, so these two lines fail when the variables are unset.
# NOTE(review): CHUNK_SIZE / CHUNK_OVERLAP are not referenced by any cell shown here.
CHUNK_SIZE = int(os.environ.get("DEFAULT_CHUNK_SIZE"))
CHUNK_OVERLAP = int(os.environ.get("DEFAULT_CHUNK_OVERLAP"))




In [6]:
# from app.etl.preprocess import JsonDataProcessor

In [7]:

# Root folder whose sub-directories are later joined with each source name and scanned for JSON files.
data_dir = os.path.join("./../data", "res")


In [8]:

def get_arq_meta(arq_item, meta_id=0, arq_keys=None):
    """
    Copy the archive-level metadata fields of one record and advance the id counter.

    Args:
        arq_item (dict): Record to read from; it must contain every requested key.
        meta_id (int): Last metadata id handed out so far.
        arq_keys (sequence of str, optional): Keys to copy from ``arq_item``.
            When omitted, ``None`` or empty, the default field set below is used.

    Returns:
        tuple: ``(arq_meta, meta_id)`` where ``arq_meta`` is a new dict holding the
        selected fields and ``meta_id`` is the incoming counter plus one.

    Raises:
        KeyError: If ``arq_item`` lacks one of the requested keys.
    """
    # A None default avoids sharing one mutable list between calls; any falsy
    # value (None, [], ()) selects the default field set.
    if not arq_keys:
        arq_keys = ['tstamp', 'title', 'originalURL', 'linkToArchive', 'linkToNoFrame', 'linkToScreenshot']

    arq_meta = {k: arq_item[k] for k in arq_keys}

    # The id itself is not stored inside arq_meta; callers key their own mapping with it.
    return arq_meta, meta_id + 1

In [9]:
def preprocess_text(text):
    """
    Clean raw text taken from the JSON files.

    Steps, in order: drop ``<...>`` tags and the literal ``{html}`` marker,
    convert CR/CRLF to LF, collapse runs of three or more newlines into two,
    strip every line, and discard blank lines at the very start and end.

    Args:
        text (str): Raw text; any falsy value (``None``, ``""``) is accepted.

    Returns:
        str: The cleaned text, or ``""`` for falsy input.
    """
    if not text:
        return ""

    # Markup removal first, then line-ending normalisation.
    cleaned = re.sub(r"<[^>]+>", "", text).replace('{html}', "")
    cleaned = cleaned.replace("\r\n", "\n").replace("\r", "\n")

    # Newline runs are collapsed before the per-line strip, as in the original order.
    cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
    stripped = [piece.strip() for piece in cleaned.split("\n")]

    # Find the window between the first and last non-empty line.
    first = 0
    last = len(stripped)
    while first < last and not stripped[first]:
        first += 1
    while last > first and not stripped[last - 1]:
        last -= 1

    return "\n".join(stripped[first:last])

In [10]:
# Tokenizer shared by the chunking helpers below; used only to count and slice tokens.
encoding = tiktoken.get_encoding("cl100k_base")


def _split_oversized_sentence(token_ids, encoding, max_tokens):
    """Cut one sentence's token ids into consecutive windows of at most ``max_tokens``, as (text, count) tuples."""
    pieces = []
    for start in range(0, len(token_ids), max_tokens):
        window = token_ids[start:start + max_tokens]
        # NOTE(review): a window edge may fall inside a multi-byte character;
        # confirm how encoding.decode renders such a partial sequence.
        pieces.append((encoding.decode(window), len(window)))
    return pieces


def chunk_text(text, encoding=encoding, max_tokens= 8000):
    """
    Split a text into sentence-based chunks whose token count stays within ``max_tokens``.

    Sentences come from NLTK's ``sent_tokenize`` and are joined with a single
    space inside a chunk. A sentence that alone exceeds ``max_tokens`` is cut
    into token windows instead of being emitted as one oversized chunk.

    Args:
        text (str): Input text to chunk.
        encoding: Tokenizer exposing ``encode`` and ``decode`` (defaults to the
            module-level ``cl100k_base`` encoding).
        max_tokens (int): Upper bound of tokens per chunk. Defaults to 8000,
            which leaves headroom below the 8191-token embedding input limit.

    Returns:
        list[tuple[str, int]]: ``(chunk_text, token_count)`` pairs. For
        sentence-built chunks the count is the sum of the per-sentence counts,
        which may differ slightly from tokenizing the joined text.

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be a positive integer")

    chunks = []
    current_sentences = []
    current_token_count = 0

    def flush():
        # Emit the sentences gathered so far (if any) as one chunk and reset the buffer.
        nonlocal current_sentences, current_token_count
        if current_sentences:
            chunks.append((" ".join(current_sentences), current_token_count))
        current_sentences = []
        current_token_count = 0

    for sentence in sent_tokenize(text):
        sentence_tokens = encoding.encode(sentence)
        sentence_token_count = len(sentence_tokens)

        if sentence_token_count > max_tokens:
            # Too long to fit any chunk: close the running chunk, then window it.
            flush()
            chunks.extend(_split_oversized_sentence(sentence_tokens, encoding, max_tokens))
            continue

        # Close the running chunk when this sentence would push it over the limit.
        if current_token_count + sentence_token_count > max_tokens:
            flush()

        current_sentences.append(sentence)
        current_token_count += sentence_token_count

    # Add final chunk
    flush()

    return chunks


In [11]:
def get_arq_files(input_path):
    """
    Load the non-empty JSON files found at ``input_path``.

    Args:
        input_path (str): Either a directory (every ``*.json`` file directly
            inside it is read) or the path of a single ``.json`` file.

    Returns:
        list[tuple]: One ``(parsed_json, file_path)`` pair per file whose parsed
        content has at least one element. The list is empty when nothing
        qualifies or when ``input_path`` is neither a directory nor a
        ``.json`` file (a message is printed in that case).
    """
    files_to_process = []
    if os.path.isdir(input_path):
        # Process all JSON files in the directory (order is whatever os.listdir yields).
        for filename in os.listdir(input_path):
            if not filename.endswith(".json"):
                continue
            file_path = os.path.join(input_path, filename)
            with open(file_path, "r", encoding="utf-8") as f:
                data_ = json.load(f)
            # Files that parse to an empty container are skipped silently.
            if len(data_) >= 1:
                files_to_process.append((data_, file_path))
        print(
            f"Found {len(files_to_process)} JSON files to process in {input_path}"
        )
    elif os.path.isfile(input_path) and input_path.endswith(".json"):
        # Process a single JSON file
        with open(input_path, "r", encoding="utf-8") as f:
            data_ = json.load(f)
        # Same emptiness rule and same (data, path) tuple shape as the directory branch.
        if len(data_) >= 1:
            files_to_process.append((data_, input_path))
            print(f"Will process single file: {input_path}", len(data_))
        else:
            print("Found empty file at", input_path)
    else:
        print(f"Error: {input_path} is not a valid JSON file or directory")

    return files_to_process

In [12]:

def process_data_dir(data_dir, source_name, last_meta_id=-1):
    """
    Turn every JSON file of one source directory into chunk records plus metadata.

    Each file is iterated as a mapping of timestamp -> document. For every
    document a metadata entry is created (keyed by a running id) and the text of
    each entry in ``doc['children']`` is cleaned, chunked and emitted as one
    record per chunk.

    Note: every chunk of a child is emitted. Row counts can therefore differ
    from a run that kept a single chunk per child, so embeddings cached on disk
    by row index must be regenerated after this change.

    Args:
        data_dir (str): Directory (or single JSON file) handed to ``get_arq_files``.
        source_name (str): Label stored on every record and metadata entry.
        last_meta_id (int): Last metadata id already in use; ``-1`` is treated as 0.

    Returns:
        tuple: ``(child_items, all_metadata, last_meta_id)`` — the list of chunk
        dicts, the ``{metadata_id: metadata}`` dict, and the updated counter.

    Raises:
        KeyError: If a document lacks ``'children'`` or a child lacks
            ``'text'`` / ``'link'``.
    """
    if last_meta_id == -1:
        last_meta_id = 0

    files_to_process = get_arq_files(os.fspath(data_dir))
    child_items = []
    all_metadata = {}

    for file_, file_path in files_to_process:

        for tstamp in file_:
            doc_ = file_[tstamp]
            arq_meta, last_meta_id = get_arq_meta(doc_, last_meta_id)
            arq_meta["filepath"] = file_path
            arq_meta["source_name"] = source_name

            all_metadata[last_meta_id] = arq_meta

            for child_ in doc_['children']:
                sent_ = preprocess_text(child_['text'])

                # Append inside the chunk loop: one record per chunk, and nothing
                # at all when a child yields no chunks (e.g. empty text).
                for chunk_body, chunk_tokens in chunk_text(sent_):
                    child_items.append({
                        "text": chunk_body,
                        "tokens": chunk_tokens,
                        "link": child_['link'],
                        "tstamp": tstamp,
                        "metadata_id": last_meta_id,
                        "source": source_name
                    })
    return child_items, all_metadata, last_meta_id


In [13]:
def normalize_chunk(sentence):
    """
    Reduce a text to a lowercase, letters-and-whitespace-only comparison key.

    Removes the ``{html}`` marker and ``<...>`` tags, turns line breaks into
    spaces, strips diacritics through NFKD decomposition, deletes ``http...``
    tokens and finally every character that is not an ASCII letter or
    whitespace. The result is lowercased and stripped at both ends.

    Args:
        sentence (str): Text to normalise.

    Returns:
        str: The normalised key.
    """
    # Tags are removed before newlines are flattened, so a tag spanning lines survives this step.
    text = re.sub(r'<.*?>', '', sentence.replace('{html}', ""))
    text = text.replace('\n', ' ').replace('\r', ' ')

    # Decompose accented characters and keep only the base code points.
    decomposed = unicodedata.normalize('NFKD', text)
    base_chars = ''.join(filter(lambda ch: not unicodedata.combining(ch), decomposed))

    without_urls = re.sub(r'http\S+', '', base_chars)
    letters_only = re.sub(r'[^A-Za-z\s]', '', without_urls)
    return letters_only.lower().strip()


In [14]:
def remove_duplicates(df):
    """
    Drop rows whose normalised text is identical, keeping the latest capture.

    The input frame is left untouched. Rows are keyed by
    ``normalize_chunk(text)``, ordered by that key and then by ``tstamp``, and
    only the last row of each key group survives. The helper column is removed
    and the index is renumbered from zero.

    Args:
        df (pandas.DataFrame): Frame with at least ``text`` and ``tstamp`` columns.

    Returns:
        pandas.DataFrame: De-duplicated copy, ordered by normalised text.
    """
    keyed = df.copy()
    keyed['normalized'] = keyed['text'].apply(normalize_chunk)

    ordered = keyed.sort_values(by=["normalized", "tstamp"])
    latest_only = ordered.drop_duplicates(subset=["normalized"], keep="last")

    return latest_only.drop(columns=['normalized']).reset_index(drop=True)

## Retrieve Embeddings

In [15]:
# Empty placeholder dict; no cell shown in this notebook adds entries to it.
metadata_map = {}


In [16]:

def get_embeddings(docs, 
                   api_key=OPENAI_API_KEY, 
                   model=EMBEDDING_MODEL):
    """
    Request an embedding for every entry of ``docs`` from the OpenAI API.

    One API call is made per entry and the returned vectors are stacked in
    input order.

    Args:
        docs (iterable): Inputs passed one at a time as ``input=`` to the
            embeddings endpoint.
        api_key (str): OpenAI API key (defaults to the notebook-level setting).
        model (str): Embedding model name (defaults to the notebook-level setting).

    Returns:
        numpy.ndarray: Per-call result arrays concatenated along the first axis.

    Raises:
        ValueError: If ``docs`` is empty (there is nothing to concatenate).
    """
    client = OpenAI(api_key=api_key)

    embeddings = []
    for doc_i in tqdm(docs, desc="Processing: get_embeddings"):
        resp_i = client.embeddings.create(input=doc_i, model=model)
        embeddings.append(np.array([item.embedding for item in resp_i.data]))

    # np.concatenate exists in every NumPy release; the np.concat alias is NumPy 2.0+ only.
    return np.concatenate(embeddings)




def store_embeddings(docs, embeddings, metadatas, client, collection_name):
    """
    Add documents with their embeddings and metadata to a named collection.

    The collection is fetched from ``client``; if fetching raises, a new one is
    created instead. Ids are generated as ``doc_0 .. doc_{n-1}`` from the
    position of each document. Note that a function with this same name is
    defined again later in the notebook and replaces this one once that cell runs.

    Args:
        docs (list): Document texts.
        embeddings: Embedding vectors, one per document.
        metadatas (list[dict]): Metadata dicts, one per document.
        client: Object exposing ``get_collection`` and ``create_collection``.
        collection_name (str): Name of the target collection.

    Returns:
        None

    Raises:
        SystemExit: With code 1 when adding to the collection fails.
    """
    # Any failure to fetch the collection is treated as "does not exist yet".
    try:
        target = client.get_collection(name=collection_name)
    except Exception:
        print(f"Creating new collection '{collection_name}'.")
        target = client.create_collection(name=collection_name)
    else:
        print(f"Collection '{collection_name}' already exists. Adding documents...")

    try:
        payload = dict(
            documents=docs,
            embeddings=embeddings,
            ids=list(map("doc_{}".format, range(len(docs)))),
            metadatas=metadatas,
        )
        target.add(**payload)
        print(f"Added {len(docs)} documents to the collection.")
    except Exception as add_error:
        print(f"Error adding documents to collection: {add_error}")
        sys.exit(1)

    return None



In [17]:
# def store_csv_embeddings(docs, embeddings, metadatas, dest):
#     print(dest)
#     print(len(docs))
#     print(embeddings.shape)
#     print(metadatas)

    

        
#     return 

In [18]:
# On-disk Chroma database location and a persistent client for it, created with
# anonymized telemetry switched off. The same client is built again in a later cell.
db_path = "./../data/chroma_cravo"

client = chromadb.PersistentClient(path=db_path, 
                                   settings=Settings(anonymized_telemetry=False)
                                   )

In [19]:

# List the entry names found directly under data_dir and display them.
data_dirs = os.listdir(data_dir)
data_dirs



['wiki_guerra_colonial_portuguesa',
 'all',
 'wiki_rev',
 'wiki_estado_novo',
 'publico',
 '50anos25abril',
 'wiki_processo',
 'wiki_constituicao',
 'wiki_movimento_das_forcas_armadas',
 'expresso',
 'wiki_junta_de_salvacao_nacional']

In [20]:
# Maps each source sub-directory of data_dir to the label stored with its chunks.
# The 'all' directory from the listing above has no entry here, so the processing
# loop (which iterates this dict) never reads it.
source_categs = {
    'wiki_guerra_colonial_portuguesa' : "Wikipedia PT",
    'wiki_rev' : "Wikipedia PT",
    'wiki_estado_novo' : "Wikipedia PT",
    'publico' : "Publico",
    '50anos25abril' : "Web",
    'wiki_processo' : "Wikipedia PT",
    'wiki_constituicao' : "Wikipedia PT",
    'wiki_movimento_das_forcas_armadas' : "Wikipedia PT",
    'expresso' : "Expresso",
    'wiki_junta_de_salvacao_nacional' : "Wikipedia PT",

}

In [21]:
# Running metadata-id counter shared by all sources, plus the per-source results.
last_meta_id = 0
metadata_list = []
df_list = []

# For every mapped source folder: parse its JSON files into chunk records,
# build a DataFrame, de-duplicate it, and keep both the metadata and the frame.
for source_dir, source_name in source_categs.items():
    print(f"Starting: {last_meta_id} {source_name}")

    source_data_dir = os.path.join(data_dir, source_dir)
    # last_meta_id is threaded through so ids keep increasing across sources.
    source_data, source_meta, last_meta_id = process_data_dir(source_data_dir, source_name, last_meta_id)
    source_df = pd.DataFrame(source_data)
    # De-duplication happens within each source, not across sources.
    source_df = remove_duplicates(source_df)
    
    metadata_list.append(source_meta)
    df_list.append(source_df)
    

Starting: 0 Wikipedia PT
Found 17 JSON files to process in ./../data/res/wiki_guerra_colonial_portuguesa
Starting: 75 Wikipedia PT
Found 11 JSON files to process in ./../data/res/wiki_rev
Starting: 186 Wikipedia PT
Found 17 JSON files to process in ./../data/res/wiki_estado_novo
Starting: 309 Publico
Found 11 JSON files to process in ./../data/res/publico
Starting: 907 Web
Found 1 JSON files to process in ./../data/res/50anos25abril
Starting: 951 Wikipedia PT
Found 12 JSON files to process in ./../data/res/wiki_processo
Starting: 982 Wikipedia PT
Found 13 JSON files to process in ./../data/res/wiki_constituicao
Starting: 1027 Wikipedia PT
Found 17 JSON files to process in ./../data/res/wiki_movimento_das_forcas_armadas
Starting: 1081 Expresso
Found 9 JSON files to process in ./../data/res/expresso
Starting: 1872 Wikipedia PT
Found 13 JSON files to process in ./../data/res/wiki_junta_de_salvacao_nacional


In [None]:
# metadata_list[0]
# df_list[0]



In [22]:
# for source_i, source_name in enumerate(source_categs.items()):
#     print(source_i, source_name)

In [23]:
# from time import sleep


In [24]:
# count_docs = 0
# for df_ in df_list:
#     print(df_.shape)
#     count_docs += df_.shape[0]

# print(count_docs)

In [25]:
# Stack the per-source frames into one table with a fresh 0..n-1 index.
# That index is what later cells use to name the per-row embedding files.
df_concat = pd.concat(df_list, ignore_index=True)
df_concat

Unnamed: 0,text,tokens,link,tstamp,metadata_id,source
0,A primeira das celebrações realizou-se em 1963...,1733,https://arquivo.pt/noFrame/replay/200803161008...,20080316100800,65,Wikipedia PT
1,A primeira das celebrações realizou-se em 1963...,1756,https://arquivo.pt/noFrame/replay/200802150452...,20080215045248,66,Wikipedia PT
2,"Após a independência, a grande maioria desses ...",3447,https://arquivo.pt/noFrame/replay/202406121933...,20240612193309,32,Wikipedia PT
3,Cada redefinição do processo representava uma ...,4175,https://arquivo.pt/noFrame/replay/201510051933...,20151005193344,20,Wikipedia PT
4,"Contudo, grupos organizados, num movimento con...",6347,https://arquivo.pt/noFrame/replay/202301220235...,20230122023529,6,Wikipedia PT
...,...,...,...,...,...,...
895,A Junta de Salvação Nacional (JSN) foi um grup...,3026,https://arquivo.pt/noFrame/replay/201510052300...,20151005230006,1949,Wikipedia PT
896,A Junta de Salvação Nacional (JSN) foi um grup...,607,https://arquivo.pt/noFrame/replay/201106151938...,20110615193844,2081,Wikipedia PT
897,A Junta de Salvação Nacional (JSN) foi um grup...,615,https://arquivo.pt/noFrame/replay/201107040751...,20110704075116,2080,Wikipedia PT
898,A Junta de Salvação Nacional (JSN) foi um grup...,1835,https://arquivo.pt/noFrame/replay/201311060820...,20131106082031,1944,Wikipedia PT


In [26]:

def get_embedding(doc, 
                   api_key=OPENAI_API_KEY, 
                   model=EMBEDDING_MODEL):
    """
    Request the embedding of a single input from the OpenAI API.

    A new API client is created on every call.

    Args:
        doc: Value passed as ``input=`` to the embeddings endpoint.
        api_key (str): OpenAI API key (defaults to the notebook-level setting).
        model (str): Embedding model name (defaults to the notebook-level setting).

    Returns:
        numpy.ndarray: Array built from the ``embedding`` of every item in the
        response data, one row per item.
    """
    response = OpenAI(api_key=api_key).embeddings.create(input=doc, model=model)

    # Collect the vectors in response order before converting to an array.
    vectors = []
    for item in response.data:
        vectors.append(item.embedding)

    return np.array(vectors)



In [27]:
# Folder used for the per-row embedding CSVs, documents.csv and metadata.json.
embeddings_path = "./../data/embeddings/"

In [28]:
# Display metadata_map; it is still the empty dict created earlier.
metadata_map


{}

In [29]:
def embeddings_tocsv(embeddings_path, df_concat):
    """
    Compute the embedding of every row's text and write it to ``<index>.csv``.

    One API request is made per row through ``get_embedding``; each result is
    saved without header or index under ``embeddings_path``, named after the
    row's index label. Existing files are overwritten.

    Args:
        embeddings_path (str): Destination directory; created when missing.
        df_concat (pandas.DataFrame): Frame with a ``text`` column.

    Returns:
        None
    """
    # Create the folder up front so the first write cannot fail after an API
    # call has already been made.
    if embeddings_path:
        os.makedirs(embeddings_path, exist_ok=True)

    # The context manager closes the bar; no explicit close() is needed afterwards.
    with tqdm(total=df_concat.shape[0]) as df_bar:
        df_bar.set_description("Processing embeddings")

        for i, row_i in df_concat.iterrows():
            em_i = get_embedding(row_i['text'])
            path_i = os.path.join(embeddings_path, f"{i}.csv")
            pd.DataFrame(em_i).to_csv(path_i, index=False, header=False)

            df_bar.update(1)

# embeddings_tocsv(embeddings_path, df_concat)

In [30]:
# df_concat[['text']].to_csv(os.path.join(embeddings_path, "documents.csv"), index_label="id", )


In [31]:
# Reload documents.csv from the embeddings folder (the commented-out cell above
# shows how it was exported from df_concat[['text']]). docs_df is not used by
# any later cell shown here.
docs_df = pd.read_csv(os.path.join(embeddings_path, "documents.csv"))


In [33]:
def save_metadata(embeddings_path, metadata_list):
    """
    Merge the per-source metadata dicts and write them to ``metadata.json``.

    Later dicts win on duplicate keys. JSON serialisation turns non-string keys
    (such as integer ids) into strings in the written file.

    Args:
        embeddings_path (str): Directory that receives ``metadata.json``.
        metadata_list (list[dict]): Dicts to merge, in order.

    Returns:
        None
    """
    merged = {}
    for part in metadata_list:
        merged = merged | part

    target = os.path.join(embeddings_path, "metadata.json")
    with open(target, "w") as outfile:
        json.dump(merged, outfile, indent=2)

def load_metadata(meta_path):
    """
    Read a JSON file and return its parsed content.

    Args:
        meta_path (str): Path of the JSON file.

    Returns:
        The decoded JSON value; for a JSON object this is a dict whose keys are strings.
    """
    with open(meta_path, 'r') as handle:
        raw_json = handle.read()
    return json.loads(raw_json)

In [36]:
# Load metadata.json from the embeddings folder (presumably the file written by
# save_metadata — confirm). Keys of the loaded dict are strings, as JSON requires.
meta_concat = load_metadata(os.path.join(embeddings_path,"metadata.json"))


In [37]:
def store_embeddings(docs, embeddings, metadatas, client, collection_name):
    """
    Add documents with their embeddings and metadata to a named collection.

    The collection is fetched from ``client``; if fetching raises, a new one is
    created instead. Ids are generated as ``doc_0 .. doc_{n-1}`` from the
    position of each document. This definition repeats an earlier one with the
    same name and is the one in effect once this cell has run.

    Args:
        docs (list): Document texts.
        embeddings: Embedding vectors, one per document.
        metadatas (list[dict]): Metadata dicts, one per document.
        client: Object exposing ``get_collection`` and ``create_collection``.
        collection_name (str): Name of the target collection.

    Returns:
        None

    Raises:
        SystemExit: With code 1 when adding to the collection fails.
    """
    # Any failure to fetch the collection is treated as "does not exist yet".
    try:
        target = client.get_collection(name=collection_name)
    except Exception:
        print(f"Creating new collection '{collection_name}'.")
        target = client.create_collection(name=collection_name)
    else:
        print(f"Collection '{collection_name}' already exists. Adding documents...")

    try:
        payload = dict(
            documents=docs,
            embeddings=embeddings,
            ids=list(map("doc_{}".format, range(len(docs)))),
            metadatas=metadatas,
        )
        target.add(**payload)
        print(f"Added {len(docs)} documents to the collection.")
    except Exception as add_error:
        print(f"Error adding documents to collection: {add_error}")
        sys.exit(1)

    return None


In [38]:
# Display the combined chunk table again before loading its cached embeddings.
df_concat

Unnamed: 0,text,tokens,link,tstamp,metadata_id,source
0,A primeira das celebrações realizou-se em 1963...,1733,https://arquivo.pt/noFrame/replay/200803161008...,20080316100800,65,Wikipedia PT
1,A primeira das celebrações realizou-se em 1963...,1756,https://arquivo.pt/noFrame/replay/200802150452...,20080215045248,66,Wikipedia PT
2,"Após a independência, a grande maioria desses ...",3447,https://arquivo.pt/noFrame/replay/202406121933...,20240612193309,32,Wikipedia PT
3,Cada redefinição do processo representava uma ...,4175,https://arquivo.pt/noFrame/replay/201510051933...,20151005193344,20,Wikipedia PT
4,"Contudo, grupos organizados, num movimento con...",6347,https://arquivo.pt/noFrame/replay/202301220235...,20230122023529,6,Wikipedia PT
...,...,...,...,...,...,...
895,A Junta de Salvação Nacional (JSN) foi um grup...,3026,https://arquivo.pt/noFrame/replay/201510052300...,20151005230006,1949,Wikipedia PT
896,A Junta de Salvação Nacional (JSN) foi um grup...,607,https://arquivo.pt/noFrame/replay/201106151938...,20110615193844,2081,Wikipedia PT
897,A Junta de Salvação Nacional (JSN) foi um grup...,615,https://arquivo.pt/noFrame/replay/201107040751...,20110704075116,2080,Wikipedia PT
898,A Junta de Salvação Nacional (JSN) foi um grup...,1835,https://arquivo.pt/noFrame/replay/201311060820...,20131106082031,1944,Wikipedia PT


In [39]:
def get_precomputed_embeddings(embeddings_path, df_concat):
    """
    Load the cached embedding CSV of every row and stack them into one frame.

    For each index label ``i`` of ``df_concat`` the file
    ``<embeddings_path>/<i>.csv`` is read without a header. All files are
    collected first and concatenated once, in row order.

    Args:
        embeddings_path (str): Directory holding the per-row CSV files.
        df_concat (pandas.DataFrame): Frame whose index labels name the files.

    Returns:
        pandas.DataFrame or None: Stacked embeddings with a fresh integer index,
        or ``None`` when ``df_concat`` has no rows.

    Raises:
        FileNotFoundError: If the CSV for some row does not exist.
    """
    frames = []
    for i in tqdm(df_concat.index, total=df_concat.shape[0], desc="Processing embeddings"):
        path_i = os.path.join(embeddings_path, f"{i}.csv")
        frames.append(pd.read_csv(path_i, header=None))

    if not frames:
        return None

    # A single concat avoids re-copying the accumulated frame on every row, and
    # the result no longer depends on label 0 being the first row.
    return pd.concat(frames, axis=0, ignore_index=True)


In [41]:
# Same value as assigned earlier in the notebook.
embeddings_path = "./../data/embeddings/"

# Read one cached CSV per row of df_concat and stack them into a single frame.
df_embeddings = get_precomputed_embeddings(embeddings_path, df_concat)

# Show (rows, columns) of the stacked embeddings.
df_embeddings.shape

  0%|          | 0/900 [00:00<?, ?it/s]

(900, 1536)

In [42]:
# Re-create the persistent Chroma client for the same path used in the earlier
# cell, again with anonymized telemetry switched off.
db_path = "./../data/chroma_cravo"

client = chromadb.PersistentClient(path=db_path, 
                                   settings=Settings(anonymized_telemetry=False)
                                   )

In [43]:
# Texts and embedding matrix in df_concat row order (df_embeddings was built by
# iterating the same rows).
docs_ = df_concat['text'].values
embeddings = df_embeddings.values
# One metadata dict per row: the chunk's link and its metadata_id (stored as "m_id").
metadatas = [{"link": m[0], "m_id": m[1]} for m in df_concat[['link', 'metadata_id']].values]
docs = docs_.tolist()

# Write everything into the "cravo" collection of the persistent client.
store_embeddings(docs, embeddings, metadatas, client, "cravo")


Creating new collection 'cravo'.
Added 900 documents to the collection.
