In [1]:
from copy import deepcopy
from domino_data.vectordb import domino_pinecone3x_init_params, domino_pinecone3x_index_params
from itertools import islice
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import MlflowEmbeddings
from pinecone import Pinecone

import csv
import ntpath
import os
import re
import uuid

In [2]:
# The "./../embed_gen/sample_files" directory has an example of how the metadata (pages.csv) and page pdfs (in pdfs dir) should be formatted
metadata_file_path = "./../embed_gen/sample_files/pages.csv" # Replace with the path to your metadata file, which can be from a Domino Dataset
page_pdfs_dir_path = "./../embed_gen/sample_files/pdfs" # Replace with the path to your PDFs directory, which can be from a Domino Dataset

In [3]:
# Obtain documentation metadata used for VectorDB tagging
article_metadatas = {}
url_idx = 0
category_idx = 1
version_idx = 2
title_idx = 3
pdf_name_idx = 4
with open(metadata_file_path, "r") as metadata_file:
    reader = csv.reader(metadata_file)
    # Format per row: "url", "category", "version", "title", "pdf"
    # Skip row containing column titles
    next(reader, None)
    for row in reader:
        article_metadatas[row[pdf_name_idx]] = {
            "url": row[url_idx],
            "category": row[category_idx],
            "version": row[version_idx],
            "title": row[title_idx]
        }

# Sample to check quality
dict(islice(article_metadatas.items(), 0, 5))

{'index.pdf': {'url': 'https://docs.dominodatalab.com/',
  'category': '',
  'version': '',
  'title': 'Domino Documentation'},
 'release_notes_5-7-1.pdf': {'url': 'https://docs.dominodatalab.com/release_notes/5-7-1/',
  'category': 'release_notes',
  'version': '5-7-1',
  'title': 'Domino 5.7.1 (August 2023)'},
 'release_notes_5-7-2.pdf': {'url': 'https://docs.dominodatalab.com/release_notes/5-7-2/',
  'category': 'release_notes',
  'version': '5-7-2',
  'title': 'Domino 5.7.2 (September 2023)'},
 'en_5.7_admin_guide_053e1f_external-data-volumes.pdf': {'url': 'https://docs.dominodatalab.com/en/5.7/admin_guide/053e1f/external-data-volumes/',
  'category': 'admin_guide',
  'version': '5.7',
  'title': 'External data volumes'},
 'en_5.7_admin_guide_f5934f_data-source-audit-logs.pdf': {'url': 'https://docs.dominodatalab.com/en/5.7/admin_guide/f5934f/data-source-audit-logs/',
  'category': 'admin_guide',
  'version': '5.7',
  'title': 'Data Source audit logs'}}

In [4]:
# Link page content to metadata and insert into dataframe
article_texts = []
chunk_size = 1000
chunk_overlap = 0
strip_whitespace = True
for filename in os.listdir(page_pdfs_dir_path):
    if not filename.endswith(".pdf"):
        continue
    loader = PyPDFLoader(f"{page_pdfs_dir_path}/{filename}")
    article_text = loader.load_and_split(
        RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, 
            chunk_overlap=chunk_overlap, 
            strip_whitespace=strip_whitespace
        )
    )
        
    article_texts.append(article_text)

In [5]:
# Sample text
article_texts[0][0].page_content

'>User guide>Work with data>Access external data>External Data Volumes (EDVs)>Add EDVs to Projects\nAdd EDVs to Projects\nIf a Domino admin has registered an external data volume (EDV) and shared it with you, you can add it to any of your projects. When you\nremove an EDV from a project, it remains mounted and available to other projects in Domino.\n\x00. In your project, go to Data and click External Data Volumes.\n\x00. Click Add External Volume.\n\x00. Select an EDV from the list.\n\x00. Click Add.\nIf you cannot view, search for, or select external volumes, they are not registered in your deployment or you do not have access\nprivileges. Contact your Domino administrator for assistance.\nIf your volume is successfully mounted, it is listed in your project at Data > External Data Volumes. The table indicates which data\nplanes have access to this EDV:\n\x00. In your project, go to Data > External Volumes..'

In [6]:
# Replaces special characters, the information in the pdf header, footer, and Domino copyright-related text
# This yields a refined text with only relevant content to create vector embeddings from
# Add/modify to suite the needs of your text
for i in range(len(article_texts)):
    for j in range(len(article_texts[i])):
        article_texts[i][j].page_content = article_texts[i][j].page_content.replace("\r\n", " ")
        article_texts[i][j].page_content = article_texts[i][j].page_content.replace("\n", " ")
        article_texts[i][j].page_content = article_texts[i][j].page_content.replace("\x00", " ")
        article_texts[i][j].page_content = article_texts[i][j].page_content.replace("Domino Data Lab Knowledge Base Data Science Blog Training Copyright ©", " ")
        article_texts[i][j].page_content = article_texts[i][j].page_content.replace("Domino Data Lab. All rights reserved.", " ")
        article_texts[i][j].page_content = article_texts[i][j].page_content.replace("User Guide Admin Guide API Guide Release Notes Light Dark Search", " ")
        article_texts[i][j].page_content = article_texts[i][j].page_content.replace("\t", " ")
        article_texts[i][j].page_content = (re.sub(' +', ' ', article_texts[i][j].page_content))

In [7]:
# Print sample to test quality
article_texts[0][2].page_content

'By default, external volumes that are mounted to your project (and that you have access to) are also automatically mounted in supported executions. To access a volume in an execution, reference the mount path of the volumes.Use an EDV in a Project View all mounted volumes in a Project In your project, go to Data > External Data Volumes. Each volume’s properties are shown in the table. Name – An alias for the volume, set by your Domino administrator. Type – The type of volume. Domino supports NFS, AWS EFS, and Windows Share (SMB). Description – A description of the volume, set by your Domino administrator. Mount Path – The mount path of the volume: /domino/edv/name-of-volume. Use this mount path when using the volume in a Job, Workspace, or other supported Domino execution. Data Plane - In Domino Nexus deployments, the data plane where the volume is mounted. Important'

### Insertion into Pinecone vector database

In [8]:
# Replace index_name with the index name you want to use
# Ensure the index dimension is 1536 if using OpenAI
index_name = "pippy-test"

In [9]:
embeddings = MlflowEmbeddings(
    target_uri=os.environ["DOMINO_MLFLOW_DEPLOYMENTS"],
    endpoint="embeddings",
)

In [10]:
# Get vector database indices
datasource_name = "pinecone-pippy"
pc = Pinecone(**domino_pinecone3x_init_params(datasource_name))
print(pc.list_indexes())

index = pc.Index(**domino_pinecone3x_index_params(datasource_name, index_name))
# Get target index statistics
print(index.describe_index_stats())

{'indexes': [{'dimension': 1536,
              'host': 'pippy-test-u48kgg5.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'pippy-test',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 43}},
 'total_vector_count': 43}


In [11]:
texts_to_insert = []
metadatas_to_insert = []

# i represents article index
# j represents chunk index
for i in range(len(article_texts)):
    # Get the current article's name by checking the source of the first chunk
    article_name = ntpath.basename(article_texts[i][0].metadata['source'])
    for j in range(len(article_texts[i])):
        # deepcopy required to prevent overwriting
        matching_metadata = deepcopy(article_metadatas[article_name])
        texts_to_insert.append(article_texts[i][j].page_content)
        # Add text content as metadata 
        matching_metadata["text"] = article_texts[i][j].page_content
        metadatas_to_insert.append(matching_metadata)

In [12]:
# Check data to insert
print("Sample of texts_to_insert")
print(texts_to_insert[:2])
print()
print("Sample of metadatas_to_insert")
print(metadatas_to_insert[:2])

Sample of texts_to_insert
['>User guide>Work with data>Access external data>External Data Volumes (EDVs)>Add EDVs to Projects Add EDVs to Projects If a Domino admin has registered an external data volume (EDV) and shared it with you, you can add it to any of your projects. When you remove an EDV from a project, it remains mounted and available to other projects in Domino. . In your project, go to Data and click External Data Volumes. . Click Add External Volume. . Select an EDV from the list. . Click Add. If you cannot view, search for, or select external volumes, they are not registered in your deployment or you do not have access privileges. Contact your Domino administrator for assistance. If your volume is successfully mounted, it is listed in your project at Data > External Data Volumes. The table indicates which data planes have access to this EDV: . In your project, go to Data > External Volumes..', ' . Click the three vertical dots at the end of the row for the volume in the ta

In [13]:
ids = [str(uuid.uuid4()) for i in range(len(texts_to_insert))]
embedded_texts = [embeddings.embed_query(text) for text in texts_to_insert]
vectors = [{"id": id, "values": embedded_text, "metadata": metadata} for id, embedded_text, metadata in zip(ids, embedded_texts, metadatas_to_insert)]
index.upsert(vectors)

{'upserted_count': 43}

### Test vector embedding search

In [14]:
query = "How can enable a Data Plane for a Workspace?"
embedded_query = embeddings.embed_query(query)
# Get the vector closest to the embedded query
relevant_vectors = index.query(
    vector=embedded_query,
    top_k=1,
    include_metadata=True
)
# Get text in the metadata of the embedding
relevant_vectors["matches"]

[{'id': 'bad634f4-3bee-412d-906b-1388a4607ea6',
  'metadata': {'category': 'admin_guide',
               'text': '>Admin guide>Data Planes>Enable a Data Plane for '
                       'Workspaces Enable a Data Plane for Workspaces To '
                       'support workspaces, the data plane must be configured '
                       'so that users can connect directly to the data plane to '
                       'access interactive workloads, as described below. The '
                       'data plane must be served from a subdomain of the '
                       'domain used for the control plane. In other words, if '
                       'users connect to Domino at example.com, then data '
                       'planes must be served from data-plane.example.com. The '
                       'hostname above should resolve to a load balancer which '
                       'routes traffic to port on Pods with the following label '
                       'selector: app.kube

### Utility Cells

In [15]:
# WARNING!!! DELETES ALL VECTORS IN NAMESPACE
# Uncomment below to use
# index.delete(delete_all=True)