In [None]:
! pip install pinecone-client==2.2.4 

In [None]:
from domino_data.vectordb import DominoPineconeConfiguration

from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Qdrant

import csv
import os
import pinecone
import sys

from mlflow.deployments import get_deploy_client
import os

# Read the deployments target from the environment (KeyError if the variable is
# unset) and build the MLflow deployments client for it.
deployments_target = os.environ['DOMINO_MLFLOW_DEPLOYMENTS']
client = get_deploy_client(deployments_target)

In [None]:
# Raise the csv module's per-field size limit to sys.maxsize so that very large
# cells in the input file are not rejected by the parser.
csv.field_size_limit(sys.maxsize)

# Build the CSV loader; the "url" column is passed as the source column.
loader = CSVLoader(
    source_column="url",
    file_path="/mnt/code/data/clean_help.csv",
)

# Load the file once so the raw documents are available as `data`.
data = loader.load()

In [None]:
# Chunking parameters handed to the text splitter.
article_texts = []
chunk_size = 5000
chunk_overlap = 0
strip_whitespace = True

splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    strip_whitespace=strip_whitespace,
)

# Remove the leading documentation-path line from every document BEFORE
# splitting. Doing this per chunk after the split would also cut the first line
# of every continuation chunk, which is article text rather than the path.
documents = loader.load()
for document in documents:
    _path_line, newline, body = document.page_content.partition("\n")
    # No newline means there is no separate path line; leave the content as is.
    if newline:
        document.page_content = body

article_text = splitter.split_documents(documents)

# Later cells read the chunks as article_texts[0], so keep the nested list shape.
article_texts.append(article_text)

In [None]:
# Spot-check chunk quality by displaying the text of one chunk (index 20 of the
# first chunk list); raises IndexError if fewer than 21 chunks were produced.
chunk_list = article_texts[0]
chunk_list[20].page_content

In [None]:
# Embedding model configuration: run on CPU and ask for normalized embeddings.
embedding_model_name = "BAAI/bge-small-en"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=True)

# Set before the model object is constructed.
# NOTE(review): presumably this is where sentence-transformers caches the
# downloaded weights — confirm against the installed library version.
os.environ['SENTENCE_TRANSFORMERS_HOME'] = '/mnt/code/model_cache/'

# Instantiate the BGE embedding wrapper with the settings above.
embeddings = HuggingFaceBgeEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=embedding_model_name,
)

In [None]:
# Build the Domino-provided OpenAPI configuration for the named data source.
datasource_name = "Rakuten"
conf = DominoPineconeConfiguration(datasource=datasource_name)

# The real Pinecone API key is supplied when the Domino Data Source is created
# and is stored securely there. The value below exists only because the native
# Pinecone Python client requires a non-empty api_key at initialization; it falls
# back to the data source name when DOMINO_VECTOR_DB_METADATA is not set.
api_key = os.environ.get("DOMINO_VECTOR_DB_METADATA", datasource_name)

# Initialize the Pinecone client using the Domino configuration.
pinecone.init(openapi_config=conf, environment="domino", api_key=api_key)

In [None]:
from langchain.vectorstores import Pinecone

# Create the index only if it is not already listed.
index_name = "rakuten"
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    pinecone.create_index(
        name=index_name,
        metric='cosine',
        dimension=384,  # 384 for bge-small-en
    )

# Embed the chunks and insert them into the index.
vector_store = Pinecone.from_documents(article_texts[0], embeddings, index_name=index_name)

In [None]:
# Uncomment the lines below to use Qdrant directly, without going through the AI gateway
# qdrant_key = os.environ.get("QDRANT_KEY")
# qdrant_url = "https://59f8f159-fb60-44e8-bfc4-9f35c77ca8d4.us-east4-0.gcp.cloud.qdrant.io:6333"

# doc_store = Qdrant.from_documents(article_texts[0],
#                               embedding=embeddings,
#                               url = qdrant_url,
#                               api_key= qdrant_key,
#                               collection_name=f"rakuten")