In [None]:
import numpy as np
import os
import shutil
import time
import glob
import os
import base64
import subprocess
from tqdm import tqdm
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain, ConversationChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers.txt import TextParser
from langchain.memory import VectorStoreRetrieverMemory
from langchain.prompts import PromptTemplate
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
from langchain.vectorstores.tiledb import TileDB
from minio import Minio

In [None]:
client = Minio("minio-service.kubeflow.svc.cluster.local:9000",
    access_key="minio",
    secret_key="minio123",
    secure=False,           
)

In [None]:
type(client)

In [None]:
bucket_name = "newtiledb"

In [None]:
def upload_files(bucket_name, file_location, client):
    found = False  # Initialize 'found' before the try block
    try:
        found = client.bucket_exists(bucket_name)
    except Exception as e:
        print("error trying to search for MinIO Bucket:", e)
        return  # Return early since we cannot proceed without knowing if the bucket exists

    if not found:
        try:
            client.make_bucket(bucket_name)
            print("Created bucket", bucket_name)
        except Exception as e:
            print("Failed to create bucket:", e)
            return  # Return early since we cannot proceed if the bucket cannot be created
    else:
        print("Bucket", bucket_name, "exists, we won't attempt to create one")
        
    # Ensure the directory exists
    if not os.path.isdir(file_location):
        print(f"The directory {file_location} does not exist.")
        return

    # Iterate through all files in the directory
    for file_name in os.listdir(file_location):
        # Construct the full file path
        source_file = os.path.join(file_location, file_name)
        # Check if it's a file and not a directory
        if os.path.isfile(source_file):
            try:
                # Upload the file
                client.fput_object(bucket_name, file_name, source_file)
                print(f"Successfully uploaded {file_name} to bucket {bucket_name}.")
            except Exception as e:
                print(f"Failed to upload {file_name}: {e}")


In [None]:
upload_files(bucket_name,"documentation",client)

In [None]:
def load_docs(source_dir: str) -> list:
    """Load all documents in a the given directory."""
    fns = glob.glob(os.path.join(source_dir, "*.txt"))    
    docs = []
    for i, fn in enumerate(tqdm(fns, desc="Loading documents...")):
        docs.extend(load_doc(fn))

    return docs

In [None]:
def process_docs(docs: list, chunk_size: int, chunk_overlap: int) -> list:
    """Load the documents and split them into chunks."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(docs)
    return texts


In [None]:
documents = process_docs(docs, chunk_size=500, chunk_overlap=0)

In [None]:
docs = load_docs("tmp_docs")

In [308]:
import kserve

ModuleNotFoundError: No module named 'kserve'

In [300]:
db = TileDB.from_documents(
    documents, embeddings, index_uri=index_name, index_type="FLAT")

TileDBError: [TileDB::StorageManager] Error: Cannot create group; Group 'file:///home/jovyan/tiledb_index' already exists

In [305]:
import os
from minio import Minio

# Initialize the Minio client (assuming it's already done elsewhere in your code)
# client = Minio("YOUR_MINIO_ENDPOINT", access_key="YOUR_ACCESS_KEY", secret_key="YOUR_SECRET_KEY", secure=True)

def download_files(bucket_name):
    # Ensure the documentation directory exists
    target_directory = "tmp_docs"
    if not os.path.exists(target_directory):
        os.makedirs(target_directory)

    # List all objects in the bucket
    objects = client.list_objects(bucket_name, recursive=True)
    for obj in objects:
        # Check if the object is a .txt file
        if obj.object_name.endswith('.txt'):
            # Construct the full path for the file to be downloaded
            destination_path = os.path.join(target_directory, obj.object_name)
            
            # Ensure the subdirectory exists
            os.makedirs(os.path.dirname(destination_path), exist_ok=True)
            
            # Download the object
            client.fget_object(bucket_name, obj.object_name, destination_path)
            print(f"Downloaded {obj.object_name} to {destination_path}")
        else:
            print(f"Skipping non-txt file {obj.object_name}")

# Example usage
# download_files('your-bucket-name')


In [307]:
download_files(bucket_name)

Skipping non-txt file <minio.datatypes.Object object at 0x7c37000622d0>
Skipping non-txt file <minio.datatypes.Object object at 0x7c37000627d0>
Skipping non-txt file <minio.datatypes.Object object at 0x7c37002249d0>
Skipping non-txt file <minio.datatypes.Object object at 0x7c3700225590>
Skipping non-txt file <minio.datatypes.Object object at 0x7c37002256d0>
Skipping non-txt file <minio.datatypes.Object object at 0x7c3700225990>
Downloaded LLM.txt to tmp_docs/LLM.txt
Skipping non-txt file __tiledb_group.tdb
Downloaded array_db.txt to tmp_docs/array_db.txt
Downloaded feature_store.txt to tmp_docs/feature_store.txt
Downloaded human_in_loop.txt to tmp_docs/human_in_loop.txt
Skipping non-txt file index
Downloaded tiledb.txt to tmp_docs/tiledb.txt
Downloaded vector_database.txt to tmp_docs/vector_database.txt


In [None]:
embeddings_model = "all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embeddings_model)

In [None]:
query = "tell me about vector dbs?"
docs = db.similarity_search(query)
docs[0].page_content

In [None]:
embedding_vector = embeddings.embed_query(query)
docs = db.similarity_search_by_vector(embedding_vector)
docs[0].page_content

In [None]:
docs_and_scores = db.similarity_search_with_score(query)
docs_and_scores[0]

In [None]:
retriever = db.as_retriever(search_type="mmr")
retriever.get_relevant_documents(query)

In [None]:
db.max_marginal_relevance_search(query, k=2, fetch_k=10)

In [None]:
predictor_image = (input("Enter the name of the predictor image (default: dpoulopoulos/qna-vectorstore:v0.1.0): ")
                   or "dpoulopoulos/qna-vectorstore:v0.1.0")

In [None]:
def encode_base64(message: str):
    encoded_bytes = base64.b64encode(message.encode('ASCII'))
    return encoded_bytes.decode('ASCII'

In [None]:
isvc = """
apiVersion: v1
kind: Secret
metadata:
  name: minio-secret
type: Opaque
data:
  MINIO_ACCESS_KEY: {0}
  MINIO_SECRET_KEY: {1}

---
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: vectorstore
spec:
  predictor:
    containers:
    - name: kserve-container
      image: {2}
      imagePullPolicy: Always
      resources:
        requests:
          memory: "2Gi"
          cpu: "500m"
        limits:
          memory: "2Gi"
          cpu: "500m"
      args:
      - --persist-uri
      - {3}
      env:
      # If you are running behind a proxy, uncomment the following lines and replace the values with your proxy URLs.
      # - name: HTTP_PROXY
      #   value: <your http proxy URL>
      # - name: HTTPS_PROXY
      #   value: <your https proxy URL>
      # - name: NO_PROXY
      #   value: .local
      - name: MLFLOW_S3_ENDPOINT_URL
        value: {4}
      - name: TRANSFORMERS_CACHE
        value: /src
      - name: SENTENCE_TRANSFORMERS_HOME
        value: /src
      - name: MINIO_ACCESS_KEY
        valueFrom:
          secretKeyRef:
            key: MINIO_ACCESS_KEY
            name: minio-secret
      - name: MINIO_SECRET_KEY
        valueFrom:
          secretKeyRef:
            key: MINIO_SECRET_KEY
            name: minio-secret
""".format(encode_base64(os.environ["AWS_ACCESS_KEY_ID"]),
           encode_base64(os.environ["AWS_SECRET_ACCESS_KEY"]),
           predictor_image, uri, os.environ["MLFLOW_S3_ENDPOINT_URL"])

with open("vectorstore-isvc.yaml", "w") as f:
    f.write(isvc)

In [None]:
subprocess.run(["kubectl", "apply", "-f", "vectorstore-isvc.yaml"])