# Install Packages

In [None]:
!pip install -q chromadb pypdf umap-learn langchain sentence-transformers

# Load the ChromaDB Collection

In [None]:
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from pypdf import PdfReader

# Text Preprocessing Functions

In [None]:
def _read_pdf(filename):
    """Return the text of each page of a PDF, skipping pages with no text.

    Args:
        filename: Location of the PDF, passed straight to ``PdfReader``.

    Returns:
        A list of whitespace-stripped page texts in page order. Pages whose
        extracted text is empty after stripping are left out.
    """
    page_texts = []
    for page in PdfReader(filename).pages:
        stripped = page.extract_text().strip()
        if stripped:
            page_texts.append(stripped)
    return page_texts

def _chunk_texts(texts, *, chunk_size=1000, tokens_per_chunk=256):
    """Split page texts into chunks in two passes: by characters, then tokens.

    The page texts are joined with blank lines and split with a recursive
    character splitter that tries paragraph, line, sentence and word
    separators in turn. Each resulting piece is then split again by a
    sentence-transformers token splitter. Neither pass uses any overlap.

    Args:
        texts: Iterable of strings (one per page) to join and split.
        chunk_size: Target chunk size, in characters, for the first
            (character-based) pass. Defaults to 1000.
        tokens_per_chunk: Token limit per chunk for the second
            (token-based) pass. Defaults to 256.

    Returns:
        A flat list of chunk strings, in document order.
    """
    character_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=0,
    )
    character_split_texts = character_splitter.split_text('\n\n'.join(texts))

    token_splitter = SentenceTransformersTokenTextSplitter(
        chunk_overlap=0,
        tokens_per_chunk=tokens_per_chunk,
    )

    token_split_texts = []
    for text in character_split_texts:
        # One character-level piece may become several token-level chunks.
        token_split_texts.extend(token_splitter.split_text(text))

    return token_split_texts

def load_chroma(filename, collection_name, embedding_function):
    """Create a Chroma collection holding the chunked text of a PDF.

    Args:
        filename: Location of the PDF to read.
        collection_name: Name given to the newly created collection.
        embedding_function: Embedding function attached to the collection.

    Returns:
        The new collection, with one document per text chunk.
    """
    chunks = _chunk_texts(_read_pdf(filename))

    client = chromadb.Client()
    collection = client.create_collection(
        name=collection_name,
        embedding_function=embedding_function,
    )

    # Each chunk's ID is its position in the chunk list: "0", "1", "2", ...
    collection.add(
        ids=[str(position) for position, _ in enumerate(chunks)],
        documents=chunks,
    )

    return collection

# Define Filepath, Embedding Function and Chroma Collection

In [None]:
# PDF to index. NOTE(review): the path looks like a mounted Google Drive
# folder in Colab -- confirm Drive is mounted before running.
filename='/content/drive/MyDrive/A Machine Learning/PDF and Data/Measuring Economic Uncertainty.pdf'
# Name of the Chroma collection that will hold the PDF's chunks.
name = 'Economic_Uncertainity_Index'

In [None]:
# No model name is passed, so the embedding function uses its default model.
embedding_function = SentenceTransformerEmbeddingFunction()
# Read, chunk and store the PDF in a newly created collection.
chroma_collection = load_chroma(filename=filename, collection_name=name, embedding_function=embedding_function)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Write a Query

In [None]:
# Question used both to query the collection and to plot the query's embedding.
query = "What world events cause a spike in economic uncertainty?"

# Embedding and Visualization Section

In [None]:
import umap
import plotly.graph_objs as go

### UMAP Transformer

In [None]:
# Fetch the stored embedding vectors of every document in the collection.
embeddings = chroma_collection.get(include=['embeddings'])['embeddings']

# Seeded 3-component UMAP reducer, fitted on the full set of embeddings.
umap_transformer = umap.UMAP(n_components=3, random_state=42, transform_seed=0)
umap_transformer.fit(embeddings)

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


### Retrieved Query Embeddings

In [None]:
# Embed the query on its own so it can be plotted next to the documents.
query_embedding = embedding_function([query])

# Retrieve the five best matches, including their embedding vectors.
results = chroma_collection.query(
    query_texts=query,
    n_results=5,
    include=['documents', 'embeddings'],
)
# Only one query was sent, so its matches are the first (only) result row.
retrieved_embeddings = results['embeddings'][0]

### Get 3D Embedding for Visualization

In [None]:
# The reducer was already fitted on `embeddings` when it was created, so reuse
# the projection it stored for the training data instead of fitting again.
projected_dataset_embeddings_3d = umap_transformer.embedding_
# Map the retrieved chunks and the query into the same fitted 3D space.
projected_retrieved_embeddings_3d = umap_transformer.transform(retrieved_embeddings)
# transform returns one row per input; the single query is row 0.
projected_query_embedding_3d = umap_transformer.transform(query_embedding)[0]

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


# Create the Vector DB Visualization
### This plot shows where the query and its retrieved chunks lie relative to all of the document's chunks, which helps judge retrieval relevance in RAG.

In [None]:
fig = go.Figure()

# Dataset embeddings: every chunk in the collection, drawn as small blue points.
fig.add_trace(go.Scatter3d(
    x=projected_dataset_embeddings_3d[:, 0],
    y=projected_dataset_embeddings_3d[:, 1],
    z=projected_dataset_embeddings_3d[:, 2],
    mode='markers',
    marker=dict(size=3, color='blue'),
    name='Dataset Embeddings'
))

# Retrieved embeddings: the chunks returned for the query, as open red circles.
fig.add_trace(go.Scatter3d(
    x=projected_retrieved_embeddings_3d[:, 0],
    y=projected_retrieved_embeddings_3d[:, 1],
    z=projected_retrieved_embeddings_3d[:, 2],
    mode='markers',
    marker=dict(size=8, color='red', symbol='circle-open'),
    name='Retrieved Embeddings'
))

# Original query embedding. Marker only: the label is drawn by the next trace.
fig.add_trace(go.Scatter3d(
    x=[projected_query_embedding_3d[0]],
    y=[projected_query_embedding_3d[1]],
    z=[projected_query_embedding_3d[2]],
    mode='markers',
    marker=dict(size=10, color='red', symbol='x'),
    name='Rag Query'
))

# Text label nudged slightly along x so it sits to the right of the X marker.
# It is an annotation rather than data, so keep it out of the legend and hover.
fig.add_trace(go.Scatter3d(
    x=[projected_query_embedding_3d[0] + 0.01],
    y=[projected_query_embedding_3d[1]],
    z=[projected_query_embedding_3d[2]],
    mode='text',
    text=["RAG Query"],
    textposition="middle right",
    showlegend=False,
    hoverinfo='skip'
))

# Single layout update: margins, axis titles and the figure title.
fig.update_layout(
    margin=dict(l=0, r=0, b=0, t=0),
    scene=dict(
        xaxis=dict(title='UMAP-1'),
        yaxis=dict(title='UMAP-2'),
        zaxis=dict(title='UMAP-3'),
    ),
    title=dict(
        text='Vector DB Dimension Reduction for RAG Performance Testing',
        x=0.45,
        xanchor='center',
        y=0.95,
        yanchor='top'
    )
)

fig.show()