In [2]:
from copy import deepcopy
from domino_data.vectordb import DominoPineconeConfiguration
from itertools import islice
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFLoader
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from langchain_community.embeddings import MlflowEmbeddings

import csv
import ntpath
import os
import pinecone
import re

In [331]:
# Replace with path to your .csv metadata file + pdfs directory
# The "embed_gen/sample_files" directory has an example of how the metadata (pages.csv) and page pdfs (in pdfs dir) should be formatted
metadata_file_path = "/domino/datasets/local/Dataset_source_pdfs/pages.csv"
page_pdfs_dir_path = "/domino/datasets/local/Dataset_source_pdfs/pdfs"

In [332]:
# Obtain documentation metadata used for VectorDB tagging
article_metadatas = {}
url_idx = 0
category_idx = 1
version_idx = 2
title_idx = 3
pdf_name_idx = 4
with open(metadata_file_path, "r") as metadata_file:
    reader = csv.reader(metadata_file)
    # Format per row: "url", "category", "version", "title", "pdf"
    # Skip row containing column titles
    next(reader, None)
    for row in reader:
        article_metadatas[row[pdf_name_idx]] = {
            "url": row[url_idx],
            "category": row[category_idx],
            "version": row[version_idx],
            "title": row[title_idx]
        }

# Sample to check quality
dict(islice(article_metadatas.items(), 0, 5))

{'index.pdf': {'url': 'https://docs.dominodatalab.com/',
  'category': '',
  'version': '',
  'title': 'Domino Documentation'},
 'release_notes_5-7-1.pdf': {'url': 'https://docs.dominodatalab.com/release_notes/5-7-1/',
  'category': 'release_notes',
  'version': '5-7-1',
  'title': 'Domino 5.7.1 (August 2023)'},
 'release_notes_5-7-2.pdf': {'url': 'https://docs.dominodatalab.com/release_notes/5-7-2/',
  'category': 'release_notes',
  'version': '5-7-2',
  'title': 'Domino 5.7.2 (September 2023)'},
 'en_5.7_admin_guide_053e1f_external-data-volumes.pdf': {'url': 'https://docs.dominodatalab.com/en/5.7/admin_guide/053e1f/external-data-volumes/',
  'category': 'admin_guide',
  'version': '5.7',
  'title': 'External data volumes'},
 'en_5.7_admin_guide_f5934f_data-source-audit-logs.pdf': {'url': 'https://docs.dominodatalab.com/en/5.7/admin_guide/f5934f/data-source-audit-logs/',
  'category': 'admin_guide',
  'version': '5.7',
  'title': 'Data Source audit logs'}}

In [333]:
# Link page content to metadata and insert into dataframe
article_texts = []
chunk_size = 1000
chunk_overlap = 0
strip_whitespace = True
for filename in os.listdir(page_pdfs_dir_path):
    if not filename.endswith(".pdf"):
        continue
    loader = PyPDFLoader(f"{page_pdfs_dir_path}/{filename}")
    article_text = loader.load_and_split(
        RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, 
            chunk_overlap=chunk_overlap, 
            strip_whitespace=strip_whitespace
        )
    )
        
    article_texts.append(article_text)

In [334]:
# Sample text
article_texts[0][0].page_content

'Connect to Google Cloud Storage\nThis topic describes how to connect to Google Cloud Storage (GCS) from Domino. You must have network connectivity between GCS\nand your Domino deployment.\nThe easiest way to connect to a GCS instance from Domino is to create a Domino data source as described below.\n\x00. From the navigation pane, click Data.\n\x00. Click Create a Data Source.\n\x00. In the New Data Source window, from Select Data Store, select Google Cloud Storage.\n\x00. Enter the name of the Bucket.\n\x00. Enter the Data Source Name.\nNote\n\x00. Optional: Enter a Description to explain the purpose of the data source to others.\n\x00. Click Next.\n\x00. Copy the Private Key (JSON format). See creating a service account for instructions about creating a service account and\ndownloading the JSON credentials file. You must copy the entire content of the file. The Domino secret store backed by HashiCorp'

In [335]:
# Replace any special characters in the text. 
# Also remove the information in the pdf header, footer and the Domino copyright related text. 
# This gives a clean text with just the docs content to vectorize
for i in range(len(article_texts)):
    for j in range(len(article_texts[i])):
        article_texts[i][j].page_content = article_texts[i][j].page_content.replace("\r\n", " ")
        article_texts[i][j].page_content = article_texts[i][j].page_content.replace("\n", " ")
        article_texts[i][j].page_content = article_texts[i][j].page_content.replace("\x00", " ")
        article_texts[i][j].page_content = article_texts[i][j].page_content.replace("Domino Data Lab Knowledge Base Data Science Blog Training Copyright ©", " ")
        article_texts[i][j].page_content = article_texts[i][j].page_content.replace("Domino Data Lab. All rights reserved.", " ")
        article_texts[i][j].page_content = article_texts[i][j].page_content.replace("User Guide Admin Guide API Guide Release Notes Light Dark Search", " ")
        article_texts[i][j].page_content = article_texts[i][j].page_content.replace("\t", " ")
        article_texts[i][j].page_content = (re.sub(' +', ' ', article_texts[i][j].page_content))

Connect to Google Cloud Storage
This topic describes how to connect to Google Cloud Storage (GCS) from Domino. You must have network connectivity between GCS
and your Domino deployment.
The easiest way to connect to a GCS instance from Domino is to create a Domino data source as described below.
 . From the navigation pane, click Data.
 . Click Create a Data Source.
 . In the New Data Source window, from Select Data Store, select Google Cloud Storage.
 . Enter the name of the Bucket.
 . Enter the Data Source Name.
Note
 . Optional: Enter a Description to explain the purpose of the data source to others.
 . Click Next.
 . Copy the Private Key (JSON format). See creating a service account for instructions about creating a service account and
downloading the JSON credentials file. You must copy the entire content of the file. The Domino secret store backed by HashiCorp
------ 
 
Connect to Google Cloud Storage This topic describes how to connect to Google Cloud Storage (GCS) from Domino. 

In [336]:
# Print sample to test quality
article_texts[0][2].page_content

'After connecting to your Data Source, learn how to Use Data Sources. Share this Data Source with your collaborators.Next steps '

### Insertion into Pinecone vector database

In [338]:
index_name = "hacktest"

In [339]:
embeddings = MlflowEmbeddings(
    target_uri=os.environ["DOMINO_MLFLOW_DEPLOYMENTS"],
    endpoint="embeddings",
)

In [340]:
datasource_name = "PineconeHackathon"
conf = DominoPineconeConfiguration(datasource=datasource_name)
# The pinecone API key should be provided when creating the Domino data source and persisted securely.
# This api_key variable here is only used for satisfying the native pinecone python client initialization where
# api_key is a mandatory non-empty field.
api_key = os.environ.get("DOMINO_VECTOR_DB_METADATA", datasource_name)

pinecone.init(
    api_key=api_key,
    environment="domino",
    openapi_config=conf)

print(pinecone.list_indexes())

# Use the appropriate index_name based on the index you want to use
index = pinecone.Index(index_name)
index.describe_index_stats()

['hacktest']


{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [341]:
index_name = "hacktest"
texts_to_insert = []
metadatas_to_insert = []

# i represents article index
# j represents chunk index
for i in range(len(article_texts)):
    # Get the current article's name by checking the source of the first chunk
    article_name = ntpath.basename(article_texts[i][0].metadata['source'])
    for j in range(len(article_texts[i])):
        # deepcopy required to prevent overwriting
        matching_metadata = deepcopy(article_metadatas[article_name])
        texts_to_insert.append(article_texts[i][j].page_content)
        metadatas_to_insert.append(matching_metadata)

In [342]:
# Check data to insert
print("Sample of texts_to_insert")
print(texts_to_insert[:2])
print()
print("Sample of metadatas_to_insert")
print(metadatas_to_insert[:2])

Sample of texts_to_insert
['Connect to Google Cloud Storage This topic describes how to connect to Google Cloud Storage (GCS) from Domino. You must have network connectivity between GCS and your Domino deployment. The easiest way to connect to a GCS instance from Domino is to create a Domino data source as described below. . From the navigation pane, click Data. . Click Create a Data Source. . In the New Data Source window, from Select Data Store, select Google Cloud Storage. . Enter the name of the Bucket. . Enter the Data Source Name. Note . Optional: Enter a Description to explain the purpose of the data source to others. . Click Next. . Copy the Private Key (JSON format). See creating a service account for instructions about creating a service account and downloading the JSON credentials file. You must copy the entire content of the file. The Domino secret store backed by HashiCorp', ' . Click Test Credentials. . If the data source authenticates, click Next. . Select who can view a

In [343]:
docsearch = Pinecone.from_texts(
    texts_to_insert, 
    embeddings, 
    metadatas=metadatas_to_insert, 
    index_name=index_name
)

### Test vector embedding search

In [346]:
query = "How can I use a Data Source in my Domino project?"
docs = docsearch.similarity_search(query)
docs[0].page_content

'After connecting to your Data Source, learn how to Use Data Sources. Share this Data Source with your collaborators.Next steps '

### Test OpenAI RAG using vector embeddings

In [283]:
llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)
chain = load_qa_chain(llm, chain_type="stuff")

### Utility Cells

In [329]:
# WARNING!!! DELETES ALL VECTORS IN NAMESPACE
# Uncomment below to use
# index = pinecone.Index(index_name)
# index.delete(delete_all=True)

{}