### Upgrade Python to 3.11 first

Follow docs at https://medium.com/google-cloud/upgrade-google-vertex-ai-workbench-notebook-python-version-64ee1d8b4a9 

In a terminal, run the following commands:

```
conda deactivate
conda create -n python311 python=3.11
conda activate python311
conda install ipykernel
ipython kernel install --name "python311" --user
```

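Then, back in the launcher, create a new notebook that uses the new `python311` kernel. Remember to install packages with `%pip install ...` (rather than `!pip install ...`) so that they are installed into the Python environment of the kernel that is currently running.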


In [5]:
# Sanity check: report the Python version of the kernel running this notebook.
# It should read 3.11.x when the notebook uses the newly created kernel.
from platform import python_version

python_version()

'3.11.4'

In [None]:
%pip install langchain chromadb google-cloud-aiplatform 

In [12]:
from typing import cast
import math
import os

import chromadb
from chromadb.api import Collection
from chromadb.types import Metadata

from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document

import vertexai
from vertexai.preview.language_models import TextGenerationModel

In [13]:
class bcolors:
    """ANSI escape sequences used to colour the notebook's printed output.

    Typical use is ``f"{bcolors.OKGREEN}text{bcolors.ENDC}"``: a colour code
    switches the foreground colour and ``ENDC`` resets the styling again.
    """

    # Bright foreground colours (SGR codes 91-96).
    HEADER = "\033[95m"   # bright magenta
    OKBLUE = "\033[94m"   # bright blue
    OKCYAN = "\033[96m"   # bright cyan
    OKGREEN = "\033[92m"  # bright green
    WARNING = "\033[93m"  # bright yellow
    FAIL = "\033[91m"     # bright red

    # SGR 0: reset all attributes back to the terminal default.
    ENDC = "\033[0m"

def get_docos(path:str) -> list[str]:
    """Return the paths of the ``.txt`` files located directly inside *path*.

    Only regular files whose name ends with the ``.txt`` extension are
    returned; sub-directories are not searched. The result is sorted by file
    name so that callers which number the documents sequentially get the same
    numbering on every run (``os.listdir`` itself returns entries in
    arbitrary order).

    Args:
        path: Directory to scan.

    Returns:
        ``os.path.join(path, name)`` for every matching file, ordered by name.

    Raises:
        OSError: If *path* cannot be listed (for example, it does not exist).
    """
    candidates = (
        os.path.join(path, filename)
        for filename in sorted(os.listdir(path))
        # Match the extension including the dot, so a name that merely ends
        # with the letters "txt" (e.g. "footxt") is not picked up.
        if filename.endswith(".txt")
    )
    # Skip directories that happen to be named "*.txt"; they cannot be read
    # as documents.
    return [candidate for candidate in candidates if os.path.isfile(candidate)]


def split_doc(text_body:str, source:str, chunk_size:int) -> list[Document]:
    """Split *text_body* into overlapping chunks labelled with their source.

    The text is split on newlines by a ``CharacterTextSplitter`` configured
    with *chunk_size* (measured with ``len``) and an overlap of one tenth of
    the chunk size, rounded down.

    Args:
        text_body: Full text of the document.
        source: Identifier stored under the ``"source"`` metadata key.
        chunk_size: Chunk size passed to the splitter.

    Returns:
        The documents produced by ``create_documents`` for the single input
        text.
    """
    # The 10 % overlap is a rule of thumb, not a tuned value.
    chunk_overlap = math.floor(chunk_size / 10)

    return CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    ).create_documents([text_body], metadatas=[{"source": source}])


def insert_into_vector_db(chromaCollection:Collection, pathToDocs:str, chunk_size:int) -> None:
    """Chunk every document under *pathToDocs* and add the chunks to Chroma.

    Each file returned by ``get_docos`` is read, split with ``split_doc`` and
    added to *chromaCollection* in one batch per file. Chunk IDs are the
    strings ``"1"``, ``"2"``, ... assigned consecutively across all files, so
    numerically adjacent IDs within a file are adjacent pieces of text.
    Numbering restarts at 1 on every call.

    Args:
        chromaCollection: Collection that receives the chunks.
        pathToDocs: Directory containing the source documents.
        chunk_size: Chunk size forwarded to ``split_doc``.
    """
    next_id = 1

    for doc_path in get_docos(pathToDocs):
        # The context manager closes the file even if read() raises.
        with open(doc_path) as f:
            content = f.read()

        chunked_docs = split_doc(content, doc_path, chunk_size)
        if not chunked_docs:
            # Nothing to store for this file (e.g. it is empty). Skip it
            # rather than send an empty batch, which Chroma's add() is
            # expected to reject.
            continue

        text_chunks = [chunk.page_content for chunk in chunked_docs]
        meta = [cast(Metadata, chunk.metadata) for chunk in chunked_docs]
        ids = [str(n) for n in range(next_id, next_id + len(text_chunks))]

        chromaCollection.add(ids=ids, documents=text_chunks, metadatas=meta)

        next_id += len(text_chunks)


def ask_question(question:str, chromaCollection:Collection, model:TextGenerationModel, info=False):
    """Answer *question* with the model, using the closest stored chunks as context.

    The collection is queried for the 3 chunks most similar to the question.
    They are joined with spaces, the question is placed after them in the
    prompt, and the model's reply is printed. Nothing is returned.

    Args:
        question: Question to ask.
        chromaCollection: Collection queried with ``query(query_texts=...,
            n_results=3)``.
        model: Model whose ``predict(prompt, **parameters)`` result exposes
            a ``text`` attribute.
        info: When truthy, also print the generation parameters and the raw
            query result.
    """
    print(f"{bcolors.OKCYAN}Asking...{bcolors.ENDC}\n")

    query_result = chromaCollection.query(query_texts=[question], n_results=3)

    # Generation settings forwarded unchanged to model.predict().
    parameters = dict(
        temperature=0.4,
        max_output_tokens=256,
        top_p=0.79,
        top_k=40,
    )

    if info:
        summary = " | ".join((
            f"Using temp: {parameters['temperature']}",
            f"top_p: {parameters['top_p']}",
            f"top_k: {parameters['top_k']}",
            f"max_out_tokens: {parameters['max_output_tokens']}",
        ))
        print(f"{summary}\n")
        print("Vector DB similarity results:\n")
        print(query_result)
        print('\n')

    documents = query_result['documents']
    if documents is None:
        return

    # query() nests its results per query text; index 0 belongs to the only
    # query text that was sent.
    context_string = " ".join(documents[0])

    # Prompt layout: a leading newline, then the context and the question
    # separated by a whitespace-only line. Every line is indented by eight
    # spaces and both values are followed by a single space.
    pad = " " * 8
    prompt = f"\n{pad}{context_string} \n{pad}\n{pad}{question} \n{pad}"

    response = model.predict(prompt, **parameters)
    print(f'{bcolors.OKCYAN}Response:{bcolors.ENDC}\n\n{response.text}')


def ask_question_chronological_context(question:str, chromaCollection:Collection, model:TextGenerationModel, info=False, n_results=3, surrounding=1) -> None:
    """Answer *question* using the best-matching chunks plus their neighbours.

    Similar to ``ask_question``, but after getting the most relevant chunks
    from ChromaDB it takes advantage of the sequential nature of the IDs and
    also fetches the surrounding chunks. E.g. if the question is about
    "steam", the chunks immediately before and after the one containing the
    word "steam" may be highly relevant.

    This is highly coupled to ChromaDB (lookup by ID) and to the chunks having
    been stored under consecutive integer IDs starting at 1, as
    ``insert_into_vector_db`` does. It is unclear whether other vector DBs
    allow being queried by ID.

    Args:
        question: Question to ask.
        chromaCollection: Collection to search.
        model: Model whose ``predict(prompt, **parameters)`` result exposes
            a ``text`` attribute.
        info: When truthy, also print the generation parameters and the raw
            result of the by-ID lookup.
        n_results: Number of similarity hits to request.
        surrounding: Number of chunks to include before and after each hit.
    """

    print(f"{bcolors.OKCYAN}Asking...{bcolors.ENDC}\n")

    query_result = chromaCollection.query(query_texts=[question], n_results=n_results)
    hit_ids = [int(x) for x in query_result['ids'][0]]

    # Expand every hit into the window [hit - surrounding, hit + surrounding].
    # A set is used because windows of nearby hits overlap and the by-ID
    # lookup should not be given the same ID twice. IDs below 1 are dropped
    # because chunk numbering starts at 1.
    wanted_ids: set[int] = set()
    for hit in hit_ids:
        wanted_ids.update(range(max(1, hit - surrounding), hit + surrounding + 1))
    all_ids = [str(chunk_id) for chunk_id in sorted(wanted_ids)]

    query_result = chromaCollection.get(ids=all_ids)

    parameters = {
        "temperature": 0.4,
        "max_output_tokens": 256,
        "top_p": 0.79,
        "top_k": 40
    }

    if info:
        print(f"Using temp: {parameters['temperature']} | top_p: {parameters['top_p']} | top_k: {parameters['top_k']} | max_out_tokens: {parameters['max_output_tokens']}\n")
        print("Vector DB similarity results:\n")
        print(query_result)
        print('\n')

    documents = query_result['documents']
    if documents is not None:
        # Unlike query(), get() returns flat 'ids' and 'documents' lists that
        # line up index by index, so the documents are joined directly
        # (indexing [0] would pick a single chunk and join its characters).
        # get() is not documented to return rows in the requested order, so
        # sort by numeric ID to restore the original reading order.
        in_reading_order = sorted(
            zip(query_result['ids'], documents),
            key=lambda pair: int(pair[0]),
        )
        context_string = " ".join(text for _, text in in_reading_order)

        # Prompt layout: a leading newline, then the context and the question
        # separated by a whitespace-only line. Every line is indented by
        # eight spaces and both values are followed by a single space.
        pad = " " * 8
        prompt = f"\n{pad}{context_string} \n{pad}\n{pad}{question} \n{pad}"

        response = model.predict(prompt, **parameters)
        print(f'{bcolors.OKCYAN}Response:{bcolors.ENDC}\n\n{response.text}')

In [15]:
# Initialise the Vertex AI SDK and load the text generation model that the
# question-answering helpers receive as their `model` argument.
print("Initializing vertexai...")
# Google Cloud project passed to vertexai.init() below.
PROJECT_ID = "fryan-crdemo-1"
vertexai.init(project=PROJECT_ID, location="us-central1")
# cast() only tells the type checker the result is a TextGenerationModel; it
# performs no conversion at runtime.
generation_model = cast(TextGenerationModel, TextGenerationModel.from_pretrained("text-bison@001"))
print(f"Initializing vertexai... {bcolors.OKGREEN}DONE{bcolors.ENDC}\n")

Initializing vertexai...
Initializing vertexai... [92mDONE[0m



In [16]:
# Build the vector store: create a Chroma collection and fill it with the
# chunked documents found under ./docs.
print(f'\n{bcolors.WARNING}If running for the first time, ChromaDB will download a small text model to extract embeddings, this may take a minute.{bcolors.ENDC}\n')

# NOTE(review): chromadb.Client() is presumably an in-memory client, so the
# collection has to be rebuilt in every kernel session - confirm.
chroma_client = chromadb.Client()
# NOTE(review): create_collection presumably fails when this cell is re-run in
# the same kernel because the name already exists - confirm.
collection = chroma_client.create_collection(name="collection_ancient")
print("Parsing docs into Vector DB...")
# Documents are read from ./docs and split with a chunk size of 300.
insert_into_vector_db(collection, "./docs", 300)
print(f"Parsing docs into Vector DB... {bcolors.OKGREEN}DONE{bcolors.ENDC}\n")

Parsing docs into Vector DB...


/home/jupyter/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100% 79.3M/79.3M [00:07<00:00, 10.8MiB/s]


Parsing docs into Vector DB... [92mDONE[0m



In [20]:
# Ask a sample question; the final positional argument is `info`, and False
# suppresses the diagnostic output (parameters and raw query result).
question = "Which ancient civilization was the most militarized?"
print(f"{bcolors.OKCYAN}Question:{bcolors.ENDC} {question}\n")
ask_question(question, collection, generation_model, False)

[96mQuestion:[0m Which ancient civilization was the most militarized?

[96mAsking...[0m

[96mResponse:[0m

The ancient Roman civilization was the most militarized civilization in the world. The Romans had a highly developed military system that was based on a strong infantry force. The Roman army was also well-equipped with weapons and armor. The Romans were able to conquer a large empire because of their military strength.
