In [1]:
from .util import *
import chromadb

ImportError: attempted relative import with no known parent package

In [13]:
def parse_multiline_key_value_string(text, to_json=False):
    """
    Parse newline-separated 'Key: Value' pairs into a dictionary.

    Each line is split on its FIRST colon only, so values may themselves
    contain colons. Keys and values are stripped of surrounding whitespace.
    Lines without a colon are ignored. If a key occurs more than once, the
    last occurrence wins.

    Args:
        text (str): String with format 'Key: Value\\nKey: Value\\n...'
        to_json (bool): If True, returns a JSON string (indent=2) instead of a dict.

    Returns:
        dict or str: Parsed data as a dict, or its JSON serialisation.
    """
    # Imported locally so the function does not depend on another notebook
    # cell having imported json first (cells can be run in any order).
    import json

    data = {}
    for line in text.strip().split('\n'):
        # partition() splits on the first ':' and reports whether one was found.
        key, separator, value = line.partition(':')
        if separator:
            data[key.strip()] = value.strip()

    return json.dumps(data, indent=2) if to_json else data

In [2]:
# Open the on-disk Chroma store located at the relative path database/myDB.
client = chromadb.PersistentClient(path="database/myDB")
# Fetch the collection named "documents" (get-or-create: made on first use).
collection = client.get_or_create_collection(name="documents")

In [3]:
# Request only the document texts; the printed result below shows the
# embeddings and metadatas fields coming back as None for this call.
results = collection.get(include=['documents'])
print(results)

{'ids': ['Book2.xlsx_0', 'daniel.csv_0', 'daniel.csv_1', 'daniel.csv_2'], 'embeddings': None, 'documents': ['', 'Document Number: 1\nContent: My name is Pythagoras\nCategory: Beauty', 'Document Number: 2\nContent: Trust and obey\nCategory: Finance', 'Document Number: 3\nContent: For there is no other way\nCategory: Love'], 'uris': None, 'data': None, 'metadatas': None, 'included': [<IncludeEnum.documents: 'documents'>]}


In [9]:
# Index 1 is the first non-empty document in the output above (index 0 is '').
results['documents'][1]

'Document Number: 1\nContent: My name is Pythagoras\nCategory: Beauty'

In [15]:
import json 
# Parse the stored 'Key: Value' text into a dict and pull out its "Content" field.
parse_multiline_key_value_string(results['documents'][1])["Content"]

'My name is Pythagoras'

In [9]:
import pandas as pd
import spacy
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en.stop_words import STOP_WORDS
import re

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
def preprocess_and_embed(
    df: pd.DataFrame,
    text_columns: list,
    sbert_model: str = "paraphrase-distilroberta-base-v2",
    tfidf_threshold: float = 3,
    batch_size: int = 32
) -> pd.DataFrame:
    """
    Preprocess texts and compute embeddings for database insertion.

    Rows with a missing value in any of ``text_columns`` are dropped. The
    remaining text columns are joined with spaces, lemmatised with spaCy,
    filtered against an extended stop-word list, and embedded with a
    SentenceTransformer model. The caller's dataframe is not modified.

    NOTE: a later cell in this notebook defines ``preprocess_and_embed`` again
    (with a ``min_tokens`` fallback). Once both cells have run, that later
    definition is the one in effect.

    Args:
        df (pd.DataFrame): Input dataframe containing text columns.
        text_columns (List[str]): List of columns to be concatenated and preprocessed.
        sbert_model (str): SentenceTransformer model name.
        tfidf_threshold (float): Terms whose IDF (``TfidfVectorizer.idf_``, fitted
            on the concatenated texts) is <= this value are added to the stop
            words. The comparison is against IDF, not a tf-idf score.
            NOTE(review): IDF depends on corpus size, so on small corpora every
            vocabulary term may fall under the default threshold - confirm the
            default suits your data.
        batch_size (int): Batch size for embedding calculation.

    Returns:
        pd.DataFrame: A new dataframe (index reset) with three added columns:
        'calculate_on' (concatenated raw text), 'processed_text' (cleaned text
        that was embedded) and 'embedding' (list of floats per row). If no rows
        remain after dropping missing text, an empty frame with those columns
        is returned and no models are loaded.
    """
    print("🚩 Preprocessing started...")

    # Remove rows with missing text. The explicit copy guarantees that the
    # column assignments below never write into a view of the caller's frame.
    df = df.dropna(subset=text_columns, how="any").copy()

    # Concatenate selected columns. Built from the text columns only, row by
    # row, so it also works when the frame is empty.
    texts = [
        ' '.join(str(value) for value in row)
        for row in df[text_columns].itertuples(index=False, name=None)
    ]
    df["calculate_on"] = texts

    # Nothing to process: TF-IDF cannot be fitted on zero documents, so return
    # the (empty) frame with the expected output columns instead of crashing.
    if not texts:
        df["processed_text"] = []
        df["embedding"] = []
        print("✅ Preprocessing completed!")
        return df.reset_index(drop=True)

    # Setup NLP
    nlp = spacy.load("en_core_web_sm")
    # NOTE(review): sentence boundaries are not used anywhere below; this
    # component may be unnecessary - confirm before removing.
    nlp.add_pipe("sentencizer")

    # TF-IDF to extend stopwords: low-IDF terms (frequent across documents)
    # are treated as additional stop words.
    vectorizer = TfidfVectorizer()
    vectorizer.fit(texts)
    tfidf_stopwords = {
        word
        for word, idf in zip(vectorizer.get_feature_names_out(), vectorizer.idf_)
        if idf <= tfidf_threshold
    }
    stop_words = STOP_WORDS.union(tfidf_stopwords)

    # Tokenization & Cleaning (docs are streamed rather than held in a list)
    processed_texts = []
    for doc in nlp.pipe(texts):
        # NOTE(review): the pattern is case-sensitive, so tokens made up only of
        # upper-case letters are dropped - confirm this is intended.
        tokens = [token.lemma_.lower() for token in doc if
                  re.search('[a-z0-9]+', token.text) and
                  len(token.text) > 1 and
                  not token.is_digit and
                  not token.is_space and
                  token.lemma_.lower() not in stop_words]
        # Keep alphabetic characters only, then discard tokens left empty.
        cleaned_tokens = [''.join(char for char in tok if char.isalpha()) for tok in tokens]
        cleaned_tokens = [tok for tok in cleaned_tokens if tok]
        processed_texts.append(' '.join(cleaned_tokens))

    # Embedding
    print("🚩 Calculating embeddings...")
    model = SentenceTransformer(sbert_model)
    embeddings = model.encode(processed_texts, batch_size=batch_size, show_progress_bar=True)

    # Add columns for processed content and embeddings
    df["processed_text"] = processed_texts
    df["embedding"] = [embedding.tolist() for embedding in embeddings]  # convert np.ndarray to list for JSON/DB

    print("✅ Preprocessing completed!")
    return df.reset_index(drop=True)


In [16]:
def preprocess_and_embed(
    df: pd.DataFrame,
    text_columns: list,
    sbert_model: str = "paraphrase-distilroberta-base-v2",
    tfidf_threshold: float = 3,
    batch_size: int = 32,
    min_tokens: int = 20  # threshold for short preprocessed text
) -> pd.DataFrame:
    """
    Preprocess texts and compute embeddings for database insertion.

    Rows with a missing value in any of ``text_columns`` are dropped. The
    remaining text columns are joined with spaces, lemmatised with spaCy and
    filtered against an extended stop-word list. If fewer than ``min_tokens``
    tokens survive the cleaning, the original concatenated text is embedded
    instead of the cleaned text. The caller's dataframe is not modified.

    Args:
        df (pd.DataFrame): Input dataframe containing text columns.
        text_columns (List[str]): List of columns to be concatenated and preprocessed.
        sbert_model (str): SentenceTransformer model name.
        tfidf_threshold (float): Terms whose IDF (``TfidfVectorizer.idf_``, fitted
            on the concatenated texts) is <= this value are added to the stop
            words. The comparison is against IDF, not a tf-idf score.
            NOTE(review): IDF depends on corpus size, so on small corpora every
            vocabulary term may fall under the default threshold - confirm the
            default suits your data.
        batch_size (int): Batch size for embedding calculation.
        min_tokens (int): Minimum number of cleaned tokens required to accept
            the preprocessed text; below this the raw text is used.

    Returns:
        pd.DataFrame: A new dataframe (index reset) with three added columns:
        'calculate_on' (concatenated raw text), 'processed_text' (the text that
        was embedded: cleaned, or raw on fallback) and 'embedding' (list of
        floats per row). If no rows remain after dropping missing text, an
        empty frame with those columns is returned and no models are loaded.
    """
    print("🚩 Preprocessing started...")

    # Remove rows with missing text. The explicit copy guarantees that the
    # column assignments below never write into a view of the caller's frame.
    df = df.dropna(subset=text_columns, how="any").copy()

    # Concatenate selected columns. Built from the text columns only, row by
    # row, so it also works when the frame is empty.
    texts = [
        ' '.join(str(value) for value in row)
        for row in df[text_columns].itertuples(index=False, name=None)
    ]
    df["calculate_on"] = texts

    # Nothing to process: TF-IDF cannot be fitted on zero documents, so return
    # the (empty) frame with the expected output columns instead of crashing.
    if not texts:
        df["processed_text"] = []
        df["embedding"] = []
        print("✅ Preprocessing completed!")
        return df.reset_index(drop=True)

    # Setup NLP
    nlp = spacy.load("en_core_web_sm")
    # NOTE(review): sentence boundaries are not used anywhere below; this
    # component may be unnecessary - confirm before removing.
    nlp.add_pipe("sentencizer")

    # TF-IDF to extend stopwords: low-IDF terms (frequent across documents)
    # are treated as additional stop words.
    vectorizer = TfidfVectorizer()
    vectorizer.fit(texts)
    tfidf_stopwords = {
        word
        for word, idf in zip(vectorizer.get_feature_names_out(), vectorizer.idf_)
        if idf <= tfidf_threshold
    }
    stop_words = STOP_WORDS.union(tfidf_stopwords)

    # Tokenization & Cleaning with fallback (docs are streamed alongside the
    # raw text they were built from, so no positional lookups are needed)
    processed_texts = []
    for raw_text, doc in zip(texts, nlp.pipe(texts)):
        # NOTE(review): the pattern is case-sensitive, so tokens made up only of
        # upper-case letters are dropped - confirm this is intended.
        tokens = [token.lemma_.lower() for token in doc if
                  re.search('[a-z0-9]+', token.text) and
                  len(token.text) > 1 and
                  not token.is_digit and
                  not token.is_space and
                  token.lemma_.lower() not in stop_words]
        # Keep alphabetic characters only, then discard tokens left empty.
        cleaned_tokens = [''.join(char for char in tok if char.isalpha()) for tok in tokens]
        cleaned_tokens = [tok for tok in cleaned_tokens if tok]

        # Too little signal left after cleaning: embed the raw text instead.
        if len(cleaned_tokens) < min_tokens:
            processed_texts.append(raw_text)
        else:
            processed_texts.append(' '.join(cleaned_tokens))

    # Embedding
    print("🚩 Calculating embeddings...")
    model = SentenceTransformer(sbert_model)
    embeddings = model.encode(processed_texts, batch_size=batch_size, show_progress_bar=True)

    # Add columns for processed content and embeddings
    df["processed_text"] = processed_texts
    df["embedding"] = [embedding.tolist() for embedding in embeddings]

    print("✅ Preprocessing completed!")
    return df.reset_index(drop=True)


In [17]:
# Run the pipeline on data/betty.csv using only the "Content" column as text;
# every other argument is left at its default.
preprocessed = preprocess_and_embed(pd.read_csv("data/betty.csv"), ["Content"])

🚩 Preprocessing started...
🚩 Calculating embeddings...


Batches: 100%|██████████| 1/1 [00:00<00:00,  8.24it/s]

✅ Preprocessing completed!





In [18]:
# Show the result: input columns plus calculate_on, processed_text and embedding.
preprocessed

Unnamed: 0,Document Number,Content,Category,calculate_on,processed_text,embedding
0,1,My name is Betty,General,My name is Betty,My name is Betty,"[0.38925281167030334, 0.9264926910400391, -0.2..."
1,2,Found only in the heart cardiac muscle is resp...,Finance,Found only in the heart cardiac muscle is resp...,find pump control hormone signal stimulate sti...,"[0.26081937551498413, 0.1862926036119461, -0.0..."
2,3,Move for me everything will be alright,Health,Move for me everything will be alright,Move for me everything will be alright,"[0.1871119886636734, 0.5584659576416016, 0.219..."


In [15]:
# len() of a string counts characters, not tokens, so this number is not
# directly comparable with the min_tokens threshold (which counts tokens).
len(preprocessed["processed_text"][1])

201