In [1]:
# ! pip install pinecone-client==2.2.4 # restart kernel after running this cell

In [3]:
import csv
import hashlib
import os
import sys

import pandas as pd
import pinecone
from domino_data.vectordb import DominoPineconeConfiguration
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from langchain_community.document_loaders.csv_loader import CSVLoader
from mlflow.deployments import get_deploy_client

# from langchain.vectorstores import Qdrant

# Build the MLflow deployments client from the target that the
# DOMINO_MLFLOW_DEPLOYMENTS environment variable provides (KeyError if unset).
deployments_target = os.environ['DOMINO_MLFLOW_DEPLOYMENTS']
client = get_deploy_client(deployments_target)

In [15]:
# Load the help-article CSV and report its shape.
df = pd.read_csv('/mnt/code/data/help_markdown.csv')
print(df.shape)

# Keep only the first row for each distinct `markdown` value.
df_drop_duplicates = df.drop_duplicates(subset='markdown')
print(df_drop_duplicates.shape)

# index=False: do not persist the pandas row index, which would otherwise be
# written as an extra unnamed first column alongside the real columns.
df_drop_duplicates.to_csv('/mnt/code/data/help_markdown_dd.csv', index=False)

(1050, 2)
(664, 2)


In [4]:
# Raise the csv module's per-field size limit to sys.maxsize before loading
# (presumably because some article bodies exceed the default limit -- TODO confirm).
csv.field_size_limit(sys.maxsize)

# The cleaned help CSV lives under the current project's directory in /mnt/data.
project_name = os.environ['DOMINO_PROJECT_NAME']
help_csv_path = "/mnt/data/{}/domino_help_clean.csv".format(project_name)

# The "url" column is passed as source_column for the loaded documents.
loader = CSVLoader(file_path=help_csv_path, source_column="url")

data = loader.load()

In [5]:
# Chunking parameters handed to the recursive character splitter.
chunk_size = 1000
chunk_overlap = 200
strip_whitespace = True

splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    strip_whitespace=strip_whitespace,
)

# Load the CSV through the loader and split the documents into chunks.
article_text = loader.load_and_split(splitter)

# Refine texts: drop everything up to and including the first newline of each
# chunk (intended to remove the documentation-path line). A chunk without any
# newline is kept unchanged.
# NOTE(review): this is applied to every chunk, so chunks that do not start
# with the path line also lose their first line -- confirm this is intended.
for chunk in article_text:
    first_line, newline, remainder = chunk.page_content.partition("\n")
    chunk.page_content = remainder if newline else first_line

# Downstream cells read the chunks as article_texts[0].
article_texts = [article_text]

In [6]:
# Spot-check chunk quality by displaying the text of one chunk (index 20).
sample_chunk = article_texts[0][20]
sample_chunk.page_content

'\nTo resume your upload use the following syntax:\n\n**% domino upload-dataset user\\_id/project\\_name/upload\\_dataset Users/myuser/data aed75b88-cb1c-4ce3-a06b-05b675c296cf**\n\n### Managing Path Collision\n\nUse the`--fileUploadSetting`option to handle path collisions as follows:\n\n1.  `overwrite`: If a file already exists in the Dataset, the new file overwrites the existing file.\n    \n2.  `rename`: If a file already exists in the Dataset, the new file is uploaded and renamed with`_1`appended to the filename. For example,`/Users/myUser/data/file.txt`becomes`/Users/myUser/data/file_1.txt`\n    \n3.  `ignore`: If a file already exists in the Dataset, the new file is ignored.\n    \n\nTo use this option use the following syntax:'

In [8]:
# Load the embedding model.
embedding_model_name = "BAAI/bge-small-en"

# Point SENTENCE_TRANSFORMERS_HOME at a model_cache folder under the project's
# /mnt/data directory before the embedding model is constructed
# (presumably so downloaded weights are cached there -- TODO confirm).
model_cache_dir = '/mnt/data/{}/model_cache/'.format(os.environ['DOMINO_PROJECT_NAME'])
os.environ['SENTENCE_TRANSFORMERS_HOME'] = model_cache_dir

# Run on CPU and request normalized embeddings from the encoder.
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=True)

embeddings = HuggingFaceBgeEmbeddings(
    model_name=embedding_model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

In [9]:
# Configure the Pinecone client through the named Domino data source.
datasource_name = "pinecone-domino-support"
conf = DominoPineconeConfiguration(datasource=datasource_name)

print(pinecone.__version__)

# The real Pinecone API key is supplied when the Domino Data Source is created
# and is stored securely there. The native pinecone client still requires a
# non-empty api_key at initialization, so this value only satisfies that check:
# DOMINO_VECTOR_DB_METADATA if set, otherwise the data source name.
api_key = os.getenv("DOMINO_VECTOR_DB_METADATA", datasource_name)

init_options = {
    "api_key": api_key,
    "environment": "domino",
    "openapi_config": conf,
}
pinecone.init(**init_options)

2.2.4


In [10]:
# Show which indexes exist, then describe 'domino-support' only when it is
# present: describing an index that has not been created yet fails, and the
# index is only created (if missing) in a later cell.
existing_indexes = pinecone.list_indexes()
print(existing_indexes)
if 'domino-support' in existing_indexes:
    print(pinecone.describe_index('domino-support'))

['domino-support']
IndexDescription(name='domino-support', metric='cosine', replicas=1, dimension=384.0, shards=1, pods=1, pod_type='starter', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')


In [12]:
# check if index already exists, if not we create it
index_name = "domino-support"
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        dimension=384,  # 384 for bge-small-en
        metric='cosine'
    )

# Derive a stable ID for every chunk from its position, source and text.
# Without explicit IDs each run inserts the chunks under fresh random IDs, so
# re-running this cell would duplicate every vector in the namespace; with
# stable IDs an unchanged corpus is upserted in place instead.
chunk_ids = [
    hashlib.sha256(
        "{}:{}:{}".format(
            position, doc.metadata.get("source", ""), doc.page_content
        ).encode("utf-8")
    ).hexdigest()
    for position, doc in enumerate(article_texts[0])
]

# insert the embeddings
vector_store = Pinecone.from_documents(
    article_texts[0],
    embeddings,
    index_name=index_name,
    namespace='domino-help',
    ids=chunk_ids,
)

In [None]:
# Uncomment the lines below if you want to use Qdrant without the AI gateway
# qdrant_key = os.environ.get("QDRANT_KEY")
# qdrant_url = "https://59f8f159-fb60-44e8-bfc4-9f35c77ca8d4.us-east4-0.gcp.cloud.qdrant.io:6333"

# doc_store = Qdrant.from_documents(article_texts[0],
#                               embedding=embeddings,
#                               url = qdrant_url,
#                               api_key= qdrant_key,
#                               collection_name=f"domino-help")