# Retriever.Elser

## Conventions

- `@notebook`: run the cell in the Jupyter notebook
- `@console`: run the cell in the static console

## References

Source of Truth for Elser files: 

- https://github.com/c3-e/c3generativeAi/blob/2982c27aa26360a03b5fa622c0677a96f668e821/genai/genAiBase/src/retriever/Genai.Retriever.Elser.c3typ

- https://github.com/c3-e/c3generativeAi/blob/860f8d5ef8c45c85487b10c4e20753a846e64f23/genai/genAiBase/src/retriever/Genai.Retriever.Elser.py

## HW Profile and App Setup

In [None]:
// Run from $cluster_url/$env_name/c3/static/console/
// Sets the hardware profile and JVM spec of the "singlenode" node pool of the
// current app, then calls update() on it.
var cpus = 8;
var memory = 64000; // presumably megabytes — TODO confirm the unit setHardwareProfile expects
var jvmRatio = 0.5; // passed to setJvmSpec; presumably the share of memory given to the JVM — confirm
// NOTE(review): the meaning of the third setHardwareProfile argument (0) is not visible here — confirm.
C3.app().nodePool("singlenode").setHardwareProfile(cpus, memory, 0).setJvmSpec(jvmRatio).update();

In [4]:
!pip install deepdiff

Collecting deepdiff
  Downloading deepdiff-6.7.1-py3-none-any.whl (76 kB)
[2K     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 76.6/76.6 kB 6.3 MB/s eta 0:00:00
[?25hCollecting ordered-set<4.2.0,>=4.0.2
  Downloading ordered_set-4.1.0-py3-none-any.whl (7.6 kB)
Installing collected packages: ordered-set, deepdiff
Successfully installed deepdiff-6.7.1 ordered-set-4.1.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [5]:
import os
import tempfile
import shutil
import zipfile
import re
from typing import TYPE_CHECKING, Final
import json
from deepdiff import DeepDiff

if TYPE_CHECKING:
    c3 = {}  # Make pylance stop complaining about c3

In [6]:
c3

NameError: name 'c3' is not defined

In [7]:
test = "ALLCAPS"

In [8]:
test.lower()

'allcaps'

In [2]:
# @global
# Number of leading entries of the local data directory that the file-name
# clean-up pass looks at.
# NOTE: `Final` comes from the `typing` import cell; run that cell first,
# otherwise this cell fails with a NameError.
TEST_FILE_COUNT: Final[int] = 5

NameError: name 'Final' is not defined

In [None]:
# @notebook
# Presumably switches the package into development mode — confirm against c3.Pkg.
c3.Pkg.setDevMode(True)

In [None]:
# @notebook
# Sets the UI config key "infrastructure.webpackMode" to "development".
c3.UiSdlConfig.setConfigValue("infrastructure.webpackMode", "development")

In [None]:
# @notebook
# NOTE(review): removeAll(confirm=True) presumably deletes every
# EnableAclPrivilege record — run only in a disposable test environment.
c3.EnableAclPrivilege.removeAll(confirm=True)

In [None]:
# Placeholder: fill in the Mapbox access token locally; do not commit the value.
mapbox_secret = ''

In [None]:
# @notebook
# Stores `mapbox_secret` as the 'accessToken' secret of UiSdlMapboxConfig.
c3.UiSdlMapboxConfig.setSecretValue('accessToken', mapbox_secret)

In [None]:
# Placeholder: fill in the OpenAI API key locally; do not commit the value.
open_ai_secret = ''

In [None]:
# @notebook
# Stores `open_ai_secret` as the "apiKey" secret of the OpenAI LLM config.
c3.Genai.Llm.OpenAI.Config.make().setSecretValue("apiKey", open_ai_secret)

In [None]:
import logging
import sys

# Name used to recognise the handler installed by this cell on later re-runs.
_STDOUT_HANDLER_NAME = "retriever-elser-stdout"

logger = logging.getLogger()

# Re-running this cell must not stack a second stdout handler on the root
# logger (every record would then be printed once per run), so reuse the
# handler installed by an earlier run when there is one.
handler = next(
    (h for h in logger.handlers if h.get_name() == _STDOUT_HANDLER_NAME),
    None,
)
if handler is None:
    handler = logging.StreamHandler(sys.stdout)
    handler.set_name(_STDOUT_HANDLER_NAME)
    logger.addHandler(handler)


def isDebugEnabled(self):
    """Report that debug logging is enabled, unconditionally."""
    return True


# The standard logging.Logger has no isDebugEnabled(); bind the function to
# the root logger instance so that `logger.isDebugEnabled()` can be called.
# (Presumably needed by platform code that expects that method — confirm.)
logger.isDebugEnabled = isDebugEnabled.__get__(logger)

In [None]:
# Let DEBUG (and higher) records through both the root logger and the handler.
logger.setLevel(logging.DEBUG)
handler.setLevel(logging.DEBUG)

## Create Test Context

In [None]:
# NOTE: Ignore for now
# TODO: Determine whether we need to import any functions
# Loads the Elser retriever implementation through Genai.PyUtil.importResourceFile.
# `retrieverSourceFile` is not referenced by any of the cells visible in this notebook.
retrieverSourceFile = c3.Genai.PyUtil.importResourceFile(
    "genai/genAiBase/src/retriever/Genai.Retriever.Elser.py"
)

In [None]:
# The notebook file name doubles as the name/id of the test objects created below.
filename = "Retriever.Elser.ipynb"
# Strip the final extension: "Retriever.Elser.ipynb" -> "Retriever.Elser".
baseLocation = re.sub(r"\.[^.]+$", "", filename) 
# Test context; passed to c3.TestApi.waitForJob further down.
ctx = c3.TestApi.createContext(filename)

In [None]:
# Used as the source collection's rootUrl.
rootLocation = baseLocation + '/root'

In [None]:
# Used as the source collection's targetUrl.
targetLocation = baseLocation + '/target'

In [None]:
# NOTE(review): not referenced by the cells visible here; the retriever is
# created with `filename` directly.
elserName = filename

In [None]:
# Credentials must not be committed with the notebook: read the Elser API key
# from the ELSER_API_KEY environment variable (empty when unset, like the
# other secret placeholders in this notebook).
# NOTE(review): the key that used to be hard-coded on this line should be
# treated as leaked and rotated.
elserApiKey = os.environ.get("ELSER_API_KEY", "")

In [None]:
# TODO: define all fields for object
# Build the Elser retriever under test, upsert it, and keep what get('this')
# returns for the upserted object.
# NOTE(review): other configs in this notebook spell the key "apiKey"; confirm
# that "apikey" is the field name declared on Genai.Retriever.Elser.
vectorStore = c3.Genai.Retriever.Elser.make({
     "name": filename,
     "apikey": elserApiKey
    # "baseUrl" is left at its default value; add it here to override
    # "modelId" is left at its default value; add it here to override
}).upsert().get('this')

In [None]:
c3.Genai.Retriever.ColBERT.DataConfig(name=filename).setConfig()

In [None]:
c3.Genai.Retriever.ColBERT.DataConfig(name=filename).getConfig()

In [None]:
# Source collection whose id is the notebook name; documents are read from
# rootLocation, and targetLocation is passed as targetUrl.
sourceCollection = c3.Genai.SourceCollection(id=filename,rootUrl=rootLocation,
      targetUrl=targetLocation).upsert().get('this')

In [None]:
# Upload documents to the data folder
# @notebook
# TODO: Ticket against CORNEA project to get access to documents
local_dir = 'data/'

# Characters removed from PDF file names before upload.
_strip_chars = str.maketrans('', '', ' -[]')

# Clean up the names of the PDFs among the first TEST_FILE_COUNT directory
# entries. The loop variable must NOT be called `filename`: that global holds
# the notebook name that other cells use as the collection/retriever id, and
# overwriting it here makes every later forId(filename) look up the wrong id.
for pdf_name in os.listdir(local_dir)[:TEST_FILE_COUNT]:
    if not pdf_name.endswith(".pdf"):
        continue
    clean_name = pdf_name.translate(_strip_chars)
    if clean_name != pdf_name:
        os.rename(
            os.path.join(local_dir, pdf_name),
            os.path.join(local_dir, clean_name),
        )

# Upload every PDF found in the local directory to the collection root.
# NOTE(review): unlike the clean-up pass above, the upload is not limited to
# TEST_FILE_COUNT files — confirm that this is intended.
for fname in os.listdir(local_dir):
    if not fname.endswith(".pdf"):
        continue
    local_fullname = os.path.join(local_dir, fname)
    remote_fullname = os.path.join(sourceCollection.rootUrl, fname)
    if os.path.isfile(local_fullname):  # skip entries that are not regular files
        with open(local_fullname, "rb") as f:
            c3.Client.uploadFile(f, remote_fullname)
    else:
        print(f"File {local_fullname} does not exist.")

In [None]:
# List entries under the collection root (limit=300).
# `files` is not referenced by the cells visible here.
files = c3.FileSystem.inst().listFiles(sourceCollection.rootUrl, limit=300)

In [None]:
# Number of Genai.SourceFile objects currently stored.
nSourceFiles = c3.Genai.SourceFile.fetchCount()

In [None]:
# TODO: Change to asynchronous wait logic
# Keep calling removeAll until fetchCount() reports zero, so the test starts
# without leftover source files. This loop polls without any delay.
# NOTE(review): the second removeAll argument (True) is presumably the confirm flag — confirm.
while nSourceFiles > 0: 
    c3.Genai.SourceFile.removeAll({}, True)
    nSourceFiles = c3.Genai.SourceFile.fetchCount()

In [None]:
# nSourceFiles is the numeric count returned by fetchCount() (it is compared
# with `> 0` above), so compare it directly; len() of a number raises TypeError.
assert nSourceFiles == 0

In [None]:
# Notebook
def applyMetadata(file):
    """Derive basic metadata for a source file and store it on the file.

    The classification marking is inferred from the text of the first two
    passages of ``file``; the file name and file type are taken from the URL
    of the file's original file. The metadata is written to the file's
    ``metadata`` field and upserted.

    Args:
        file: object providing ``readPassages()``, ``originalFile.url`` and
            ``withField(...).upsert()`` (presumably a Genai.SourceFile —
            confirm against the caller).

    Returns:
        The ``c3.Genai.SourceFile.Metadata`` instance stored on the file.
    """
    # Imported here so the function stays self-contained when it is turned
    # into a c3.Lambda.
    import re

    # Only the first two passages are inspected.
    passages = file.readPassages()
    head_text = '\n'.join(
        passage.contentStr for _, passage in zip(range(2), passages)
    ).lower()

    # The abbreviations are matched as whole words only: a plain substring
    # test for "ts" or "cui" also fires on ordinary words such as "documents"
    # or "circuit", which marked almost every file as TS.
    if "top secret" in head_text or re.search(r"\bts\b", head_text):
        marking = "TS"
    elif "secret" in head_text:
        marking = "S"
    elif "controlled unclassified information" in head_text or re.search(r"\bcui\b", head_text):
        marking = 'CUI'
    else:
        marking = 'unknown'

    file_name = file.originalFile.url.split('/')[-1]
    metadata = c3.Genai.SourceFile.Metadata(
        classificationMarking=marking,
        fileName=file_name,
        # Text after the last dot of the file name.
        fileType=file_name.split('.')[-1],
    )
    file.withField('metadata', metadata).upsert()

    return metadata

In [None]:
# Notebook
# Set "embedMetadata" to True in the collection's metadata config, then
# display the collection's config.
c3.Genai.SourceCollection.forId(filename).setMetadataConfig({"embedMetadata": True})
c3.Genai.SourceCollection.forId(filename).config()

In [None]:
# Notebook
# Wrap applyMetadata as a c3.Lambda and store it on the collection as its
# sync-metadata lambda.
func = c3.Lambda.fromPyFunc(applyMetadata)
c3.Genai.SourceCollection.forId(filename).withSyncMetadataLambda(func).upsert()

In [None]:

# Start the collection sync and wait for its file-system job.
syncJob = c3.Genai.SourceCollection.forId(filename).sync()
# This may take a long time because installRuntime must execute on the first run
# NOTE(review): the trailing arguments (1, 600) are presumably the poll
# interval and the timeout in seconds — confirm against TestApi.waitForJob.
c3.TestApi.waitForJob(ctx, syncJob.fileSystemJob, 1, 600)

In [None]:
// @console
// run in single batches to avoid r/w corruption
var batchSize = 1;
// Select only source files that do not yet have a passagesFile.
var spec = BatchFetchSpec.builder().batchSize(batchSize).include("id, this").filter("!exists(passagesFile)").build();
 
// NOTE(review): `job`, `txs`, `pdfChunkerSpec`, `fileExtMap` and `chunkerSpec`
// are assigned without var/let/const and therefore become implicit globals —
// confirm that this is intended for code executed inside the batch callback.
job = ObjBatchMapReduceJob.forId(Genai.SourceFile.eachObjBatch(spec, (objs, ctx) => {
    // Text-splitter spec shared by both chunker specs below.
    txs = Genai.SourceFile.TextSplitter.Spec.make({'textSplitter': 'TokenTextSplitter'});
    pdfChunkerSpec = Genai.SourceFile.Chunker.UniversalChunker.Spec.make({'chunker': Genai.SourceFile.Chunker.PyPdf, 'textSplitters': [txs]})

 
    // Per-extension overrides: '.pdf' files use the PyPdf chunker spec.
    fileExtMap = {
      '.pdf': pdfChunkerSpec,
    }
 
    // Universal chunker spec carrying the per-extension map; it chunks the batch.
    chunkerSpec = Genai.SourceFile.Chunker.UniversalChunker.Spec.make({'chunker': Genai.SourceFile.Chunker.Universal, 'textSplitters': [txs], 'fileExtToChunkerSpecMap': fileExtMap})
    chunkerSpec.chunker.chunkFilesBatch(objs, chunkerSpec);
}))

In [None]:
# Set "useRemote" to False in this retriever's data config.
c3.Genai.Retriever.Elser.DataConfig(name=filename).setConfigValue("useRemote", False)

In [None]:
# Display the retriever's data config to check the value set above.
c3.Genai.Retriever.Elser.DataConfig(name=filename).getConfig()

In [None]:
# Notebook
# NOTE(review): the meaning of the two positional True arguments is not
# visible here — confirm against SourceCollection.mergedPassages.
mergedPassages = sourceCollection.mergedPassages(True, True)

In [None]:
# Notebook check: the result should NOT be empty.
sourceCollectionPassages = sourceCollection.readPassages()

In [None]:
# Fixture for the retriever tests: the passages read from the collection and
# how many there are.
TEST_PASSAGE_COUNT: Final[int] = len(sourceCollectionPassages)
# NOTE(review): `type(c3.Genai.SourcePassage)` evaluates to the class of the
# SourcePassage type object, not to "a collection of SourcePassage"; the
# annotation probably does not express what was intended — confirm.
TEST_PASSAGES: Final[type(c3.Genai.SourcePassage)] = sourceCollectionPassages

In [None]:
# calling indexCollections raises errors due to arrayBuilder issues
# Read the passages directly instead (same call as for sourceCollectionPassages)
# and display how many were read.
passages = sourceCollection.readPassages()
len(passages)

In [None]:
// @console
// NOTE: used if call above fails
/**
 * Build a new array by passing every element of typeArray through typeName.make.
 *
 * @param typeName  object exposing a make(value) function
 * @param typeArray indexable collection with a length property
 * @returns a new array holding typeName.make(element) for each element, in order
 */
function serializeArray(typeName, typeArray) {
    var retVal = [];
    // Declare the index locally: the undeclared `i` leaked into the global
    // scope (and is a ReferenceError in strict mode).
    for (var i = 0; i < typeArray.length; i++) {
        retVal.push(typeName.make(typeArray[i]));
    }
    return retVal;
}
// NOTE(review): `customerName` is not defined in any cell visible in this
// notebook; it presumably has to hold the source collection id — confirm
// before running.
var gsc = Genai.SourceCollection.forId(customerName).get("this");
var passages = gsc.readPassages()
// Re-create every passage through Genai.SourcePassage.make.
passages = serializeArray(Genai.SourcePassage, passages)

In [None]:
# NOTE(review): passages are handed to indexFiles here, while the tests below
# call indexPassages — confirm which method is meant.
objList = vectorStore.indexFiles(passages)
objList.count()

In [None]:
# NOTE: The following will be updated as implementation changes

In [None]:
# TODO: Sample string from each of the sourcePDFS and construct query

In [None]:
# Placeholder for the test query; still empty (see the TODO above).
testQuestion = ''

In [None]:
# assert filename correct

In [None]:
# assert baseUrl correct

In [None]:
# assert modelId correct

In [None]:
# assert apiKey correct

In [None]:
# Test initialize
# Capture either the value returned by initialize() or the exception it
# raised, so that the assertion below reports what went wrong.
result = None
try:
    result = vectorStore.initialize(failIfMissing=False)
except Exception as e:  # also covers NotImplementedError and RuntimeError
    result = e

# `type(result) != type(SomeError)` compared against `type` itself and was
# always true; check the captured value with isinstance instead.
assert not isinstance(result, Exception), repr(result)
assert len(result) == TEST_PASSAGE_COUNT

In [None]:
# TODO: Add mock JSON response
# Expected createIndex() response. Annotated as dict: `json` is a module, not
# a type, and `Final[json]` is rejected by typing versions that validate the
# argument.
TEST_CREATE_INDEX_RESPONSE: Final[dict] = {}

In [None]:
# TODO: Finish test
# Capture the createIndex() result, or the exception it raised.
result = None
try:
    result = vectorStore.createIndex()
except Exception as e:
    result = e
# `type(result) != type(Exception)` compared against `type` and was always
# true; isinstance actually detects a captured exception.
assert not isinstance(result, Exception), repr(result)
assert type(result) is type(TEST_CREATE_INDEX_RESPONSE)
assert not DeepDiff(result, TEST_CREATE_INDEX_RESPONSE)

In [None]:
# TODO: Add mock JSON response
# Expected _createingest() response. Annotated as dict: `json` is a module,
# not a type, and `Final[json]` is rejected by typing versions that validate
# the argument.
TEST_CREATE_INGEST_RESPONSE: Final[dict] = {}

In [None]:
# TODO: Finish test
# Capture the _createingest() result, or the exception it raised.
result = None
try:
    result = vectorStore._createingest()
except Exception as e:
    result = e
# `type(result) != type(Exception)` compared against `type` and was always
# true; isinstance actually detects a captured exception.
assert not isinstance(result, Exception), repr(result)
assert type(result) is type(TEST_CREATE_INGEST_RESPONSE)
assert not DeepDiff(result, TEST_CREATE_INGEST_RESPONSE)

In [None]:
# TODO: Finish test
# NOTE(review): this cell repeats the _createingest() check of the previous
# cell — confirm whether a different method was meant to be tested here.
result = None
try:
    result = vectorStore._createingest()
except Exception as e:
    result = e
# `type(result) != type(Exception)` compared against `type` and was always
# true; isinstance actually detects a captured exception.
assert not isinstance(result, Exception), repr(result)
assert type(result) is type(TEST_CREATE_INGEST_RESPONSE)
assert not DeepDiff(result, TEST_CREATE_INGEST_RESPONSE)

In [None]:
# TODO: Finish test
# Capture the indexPassages() result, or the exception it raised.
result = None
try:
    result = vectorStore.indexPassages(TEST_PASSAGES)
except Exception as e:
    result = e

# `type(result) != type(Exception)` compared against `type` and was always
# true; isinstance actually detects a captured exception.
assert not isinstance(result, Exception), repr(result)
# The result is expected to have the same type as an ObjList of the passages.
assert type(result) is type(getattr(c3, "ObjList<Genai.SourcePassage>")(objs=TEST_PASSAGES))
assert not DeepDiff(result, TEST_PASSAGES)

In [None]:
# TODO: Finish test
# Unfinished: similaritySearch is called without arguments and nothing is
# asserted on `results` yet.
results = vectorStore.similaritySearch(
    
)


In [None]:
# TODO: Finish test
# Capture the unindexPassages() result, or the exception it raised.
result = None
try:
    result = vectorStore.unindexPassages(TEST_PASSAGES)
except Exception as e:
    result = e

# `type(result) != type(Exception)` compared against `type` and was always
# true; isinstance actually detects a captured exception.
assert not isinstance(result, Exception), repr(result)
# The result is expected to have the same type as an ObjList of the passages.
assert type(result) is type(getattr(c3, "ObjList<Genai.SourcePassage>")(objs=TEST_PASSAGES))
assert not DeepDiff(result, TEST_PASSAGES)

In [None]:
# should return NotImplementedError
# TODO: Finish test
# NOTE(review): the line above expects NotImplementedError, yet the assertions
# below expect a successful ObjList result — decide which one is intended.
result = None
try:
    result = vectorStore.indexPassagesFile(TEST_PASSAGES)
except Exception as e:
    result = e

# `type(result) != type(Exception)` compared against `type` and was always
# true; isinstance actually detects a captured exception.
assert not isinstance(result, Exception), repr(result)
assert type(result) is type(getattr(c3, "ObjList<Genai.SourcePassage>")(objs=TEST_PASSAGES))
assert not DeepDiff(result, TEST_PASSAGES)

In [None]:
# TODO: Finish test
# Unfinished: looks the retriever up by name; nothing is asserted on `result` yet.
result = c3.Genai.Retriever.Elser.forName(filename)

In [None]:
# should return NotImplementedError
# Capture the exception (if any) in `result`; the name bound by `except ... as`
# is deleted when the except clause ends, so it cannot be used in the
# assertion afterwards.
result = None
try:
    result = vectorStore.indexedSourceFiles()
except NotImplementedError as nie:
    result = nie

# The call is expected to raise NotImplementedError.
assert isinstance(result, NotImplementedError), repr(result)

In [None]:
# should return NotImplementedError
# NOTE: this cell only prints the error when one is raised; it does not fail
# when the call unexpectedly succeeds.
try:
    vectorStore.indexedSourcePassages()
except NotImplementedError as nie:
    print(nie)

In [None]:
# should return NotImplementedError
# NOTE: this cell only prints the error when one is raised; it does not fail
# when the call unexpectedly succeeds.
try:
    vectorStore.passageCount()
except NotImplementedError as nie:
    print(nie)

In [None]:
# should return NotImplementedError
# NOTE: this cell only prints the error when one is raised; it does not fail
# when the call unexpectedly succeeds.
try:
    vectorStore.getNativeRetriever()
except NotImplementedError as nie:
    print(nie)

In [None]:
# should return NotImplementedError
# NOTE: this cell only prints the error when one is raised; it does not fail
# when the call unexpectedly succeeds.
try:
    vectorStore._safe_this()
except NotImplementedError as nie:
    print(nie)

In [None]:
# should return NotImplementedError
# NOTE: this cell only prints the error when one is raised; it does not fail
# when the call unexpectedly succeeds.
try:
    vectorStore._local_root()
except NotImplementedError as nie:
    print(nie)

In [None]:
# should return NotImplementedError
# NOTE: this cell only prints the error when one is raised; it does not fail
# when the call unexpectedly succeeds.
try:
    vectorStore._data_root()
except NotImplementedError as nie:
    print(nie)

In [None]:
# should return NotImplementedError
# NOTE: this cell only prints the error when one is raised; it does not fail
# when the call unexpectedly succeeds.
try:
    vectorStore._corpus_path()
except NotImplementedError as nie:
    print(nie)

In [None]:
# should return NotImplementedError
# NOTE: this cell only prints the error when one is raised; it does not fail
# when the call unexpectedly succeeds.
try:
    vectorStore._zip_and_upload()
except NotImplementedError as nie:
    print(nie)

In [None]:
# should return NotImplementedError
# NOTE: this cell only prints the error when one is raised; it does not fail
# when the call unexpectedly succeeds.
try:
    vectorStore._download_and_unzip()
except NotImplementedError as nie:
    print(nie)

In [None]:
# should return NotImplementedError
# NOTE: this cell only prints the error when one is raised; it does not fail
# when the call unexpectedly succeeds.
try:
    vectorStore._do_index()
except NotImplementedError as nie:
    print(nie)