In [1]:
! pip install pinecone-client==2.2.4 # restart kernel after running this cell

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting pinecone-client==2.2.4
  Obtaining dependency information for pinecone-client==2.2.4 from https://files.pythonhosted.org/packages/df/d4/cffbb61236c6c1d7510e835c1ff843e4e7d705ed59d21c0e5b6dc1cb4fd8/pinecone_client-2.2.4-py3-none-any.whl.metadata
  Downloading pinecone_client-2.2.4-py3-none-any.whl.metadata (7.8 kB)
Collecting dnspython>=2.0.0 (from pinecone-client==2.2.4)
  Obtaining dependency information for dnspython>=2.0.0 from https://files.pythonhosted.org/packages/87/a1/8c5287991ddb8d3e4662f71356d9656d91ab3a36618c3dd11b280df0d255/dnspython-2.6.1-py3-none-any.whl.metadata
  Downloading dnspython-2.6.1-py3-none-any.whl.metadata (5.8 kB)
Downloading pinecone_client-2.2.4-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.4/179.4 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDow

In [1]:
from domino_data.vectordb import DominoPineconeConfiguration

from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Qdrant

import csv
import os
import pinecone
import sys

from mlflow.deployments import get_deploy_client
import os

client = get_deploy_client(os.environ['DOMINO_MLFLOW_DEPLOYMENTS'])

  from tqdm.autonotebook import tqdm
* 'schema_extra' has been renamed to 'json_schema_extra'


In [15]:
import pandas as pd
df = pd.read_csv('/mnt/code/data/help_markdown.csv')
print(df.shape)

df_drop_duplicates = df.drop_duplicates(subset='markdown')
print(df_drop_duplicates.shape)

df_drop_duplicates.to_csv('/mnt/code/data/help_markdown_dd.csv')

(1050, 2)
(664, 2)


In [16]:
csv.field_size_limit(sys.maxsize)

loader = CSVLoader(file_path="/mnt/code/data/help_markdown_dd.csv", source_column="url")

data = loader.load()

In [17]:
article_texts = []
chunk_size=1000
chunk_overlap=200
strip_whitespace = True

article_text = loader.load_and_split(
        RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, 
            chunk_overlap=chunk_overlap, 
            strip_whitespace=strip_whitespace
        )
    )
    
# refine texts
for chunk in article_text:
    # index of documentation path 
    path_idx = chunk.page_content.find("\n")
    chunk.page_content = chunk.page_content[path_idx + 1:]
        
article_texts.append(article_text)

In [18]:
# Print sample to test quality
article_texts[0][20].page_content

'*   [Access project files during docker build using Dockerfile Instructions](https://tickets.dominodatalab.com/hc/en-us/articles/19693528289044-Access-project-files-during-docker-build-using-Dockerfile-Instructions)'

In [19]:
# Load the embedding model
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
embedding_model_name = "BAAI/bge-small-en"
# os.environ['SENTENCE_TRANSFORMERS_HOME'] = './model_cache/'
embeddings = HuggingFaceBgeEmbeddings(model_name=embedding_model_name,
                                      model_kwargs=model_kwargs,
                                      encode_kwargs=encode_kwargs
                                     )

In [20]:
datasource_name = "pinecone-domino-support"
conf = DominoPineconeConfiguration(datasource=datasource_name)

print(pinecone.__version__)

# The pinecone API key should be provided when creating the Domino Data Source and persisted securely.
# This api_key variable here is only used for satisfying the native pinecone python client initialization where
# api_key is a mandatory non-empty field.

api_key = os.environ.get("DOMINO_VECTOR_DB_METADATA", datasource_name)

pinecone.init(
    api_key=api_key,
    environment="domino",
    openapi_config=conf)

2.2.4


In [21]:
print(pinecone.list_indexes())
print(pinecone.describe_index('domino-support'))

['domino-support']
IndexDescription(name='domino-support', metric='cosine', replicas=1, dimension=384.0, shards=1, pods=1, pod_type='starter', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')


In [22]:
#check if index already exists, if not we create it
index_name = "domino-support"
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        dimension=384, ## 384 for bge-small-en 
        metric='cosine'
    )

#insert the embeddings
from langchain.vectorstores import Pinecone

vector_store = Pinecone.from_documents(
    article_texts[0],
    embeddings,
    index_name=index_name,
    namespace='domino-help'
)

In [None]:
# Uncomment below if you want to use Qdrant without the AI g/w
# qdrant_key = os.environ.get("QDRANT_KEY")
# qdrant_url = "https://59f8f159-fb60-44e8-bfc4-9f35c77ca8d4.us-east4-0.gcp.cloud.qdrant.io:6333"

# doc_store = Qdrant.from_documents(article_texts[0],
#                               embedding=embeddings,
#                               url = qdrant_url,
#                               api_key= qdrant_key,
#                               collection_name=f"domino-help")