In [31]:
!pip install langchain-chroma

Collecting langchain-chroma
  Downloading langchain_chroma-0.2.4-py3-none-any.whl.metadata (1.1 kB)
Collecting chromadb>=1.0.9 (from langchain-chroma)
  Downloading chromadb-1.0.12-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting build>=1.0.3 (from chromadb>=1.0.9->langchain-chroma)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting fastapi==0.115.9 (from chromadb>=1.0.9->langchain-chroma)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting posthog>=2.4.0 (from chromadb>=1.0.9->langchain-chroma)
  Downloading posthog-4.8.0-py3-none-any.whl.metadata (5.9 kB)
Collecting onnxruntime>=1.14.1 (from chromadb>=1.0.9->langchain-chroma)
  Downloading onnxruntime-1.22.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting pypika>=0.48.9 (from chromadb>=1.0.9->langchain-chroma)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [15]:
import pandas as pd
import time
import json, os
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from azure.keyvault.secrets import SecretClient
from azure.identity import DefaultAzureCredential
from langchain_chroma import Chroma

In [16]:
# Azure Key Vault holding the Azure OpenAI embedding endpoint and key.
KVUri = "https://dipanjans-kv.vault.azure.net"
credential = DefaultAzureCredential()
client = SecretClient(vault_url=KVUri, credential=credential)

# Secrets are fetched at runtime so that no endpoint or key is stored in the notebook.
oai_endpoint_embed = client.get_secret("azure-openai-endpoint-embed")
oai_key_embed = client.get_secret("azure-openai-key-embed")

# Embedding client shared by the cells below.
embeds = AzureOpenAIEmbeddings(
    azure_deployment="dipanjan_ada_embed_150k",
    model='text-embedding-ada-002',
    azure_endpoint=oai_endpoint_embed.value,
    api_key=oai_key_embed.value,
)

# Optional smoke test for the embedding model and the chat model.
# NOTE: `llm` is not defined in the cells shown here; create it before enabling this.
# out = (embeds.embed_query("who is this?"), llm.invoke("who is this?"))
# print(out[0])
# print("****")
# print(out[1])

In [17]:
# course recommendation RAG assignment
def _count_words(text):
    """Return the number of whitespace-separated words in ``text``; 0 for non-strings (e.g. NaN)."""
    return len(text.split()) if isinstance(text, str) else 0


def get_df_from_git(github_url):
    """Load a CSV or JSON dataset and add a per-row word-count column.

    The format is chosen from the (case-insensitive) extension of ``github_url``:

    * ``.csv``  -> read with ``pd.read_csv``; adds ``desc_word_len`` computed
      from the ``description`` column.
    * ``.json`` -> read with ``pd.read_json``; adds ``answer_snippet_len_words``
      computed from the ``answer_snippet`` column.

    Parameters
    ----------
    github_url : str
        URL or path of the dataset; anything the pandas readers accept.

    Returns
    -------
    tuple
        ``(df, df.shape)`` where ``df`` includes the added word-count column.

    Raises
    ------
    ValueError
        If the extension is neither ``csv`` nor ``json``. Previously the function
        fell through and returned ``None``, which only failed later at the caller.
    """
    extension = github_url.rsplit(".", 1)[-1].lower()

    if extension == 'csv':
        df = pd.read_csv(github_url)
        # Missing descriptions are read as NaN; count them as zero words instead of crashing.
        df['desc_word_len'] = df['description'].apply(_count_words)
    elif extension == 'json':
        df = pd.read_json(github_url)
        df['answer_snippet_len_words'] = df['answer_snippet'].apply(_count_words)
    else:
        raise ValueError(
            f"Unsupported file type {extension!r} in {github_url!r}; expected a .csv or .json file"
        )

    return df, df.shape
        
# github_url = "https://raw.githubusercontent.com/Bluedata-Consulting/GAAPB01-training-code-base/refs/heads/main/Assignments/assignment2dataset.csv"
# # get_df_from_git(github_url)[0]

# github_url = 'https://raw.githubusercontent.com/Bluedata-Consulting/GAAPB01-training-code-base/refs/heads/main/Assignments/self_critique_loop_dataset.json'
# get_df_from_git(github_url)[0]

# Split the DataFrame into smaller batches to avoid token / rate-limit errors while generating embeddings


In [18]:
def _word_count_batches(lengths, max_words):
    """Return half-open ``(start, stop)`` row ranges whose word totals stay below ``max_words``.

    A row that on its own reaches ``max_words`` still gets a batch of its own,
    so no range is ever empty. An empty ``lengths`` list yields no ranges.
    """
    bounds = []
    start, running = 0, 0
    for pos, n_words in enumerate(lengths):
        # Close the current batch before this row would push it to the limit;
        # `pos > start` guarantees every batch keeps at least one row.
        if pos > start and running + n_words >= max_words:
            bounds.append((start, pos))
            start, running = pos, 0
        running += n_words
    if lengths:
        bounds.append((start, len(lengths)))
    return bounds


def divide_and_embed(df, embed_batch_max_word_count, embed_column_name, len_column,
                     embedder=None, sleep_seconds=2):
    """Embed one text column of ``df`` in word-count-limited batches.

    Rows are grouped, in order, into consecutive batches whose summed
    ``len_column`` values stay below ``embed_batch_max_word_count``. Each row of
    a batch is embedded with ``embedder.embed_query`` and the function pauses
    between batches to stay under the embedding service's rate limits.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; not modified.
    embed_batch_max_word_count : int
        Upper bound (exclusive) on the total word count of one batch. A single
        row at or above the bound is embedded alone.
    embed_column_name : str
        Column holding the text to embed.
    len_column : str
        Column holding the word count of each row's text.
    embedder : object, optional
        Anything exposing ``embed_query(text)``. Defaults to the notebook-level
        ``embeds`` client.
    sleep_seconds : float, optional
        Pause between consecutive batches (default 2). No pause follows the
        last batch.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` (same rows, order and index) with an extra
        ``<embed_column_name>_embed`` column. An empty ``df`` returns an empty
        frame that still carries that column.
    """
    if embedder is None:
        embedder = embeds  # notebook-level embedding client

    embed_col = embed_column_name + '_embed'
    bounds = _word_count_batches(df[len_column].tolist(), embed_batch_max_word_count)

    embedded_batches = []
    last_batch = len(bounds) - 1
    for batch_no, (start, stop) in enumerate(bounds):
        batch = df.iloc[start:stop].copy()
        batch[embed_col] = batch[embed_column_name].apply(embedder.embed_query)
        embedded_batches.append(batch)
        # Throttle only between batches; sleeping after the final one is wasted time.
        if sleep_seconds > 0 and batch_no < last_batch:
            time.sleep(sleep_seconds)

    if not embedded_batches:
        # pd.concat([]) raises, so build the empty result explicitly.
        empty = df.copy()
        empty[embed_col] = pd.Series(index=empty.index, dtype=object)
        return empty

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(embedded_batches)

########## Assignment 2 ##
# github_url = "https://raw.githubusercontent.com/Bluedata-Consulting/GAAPB01-training-code-base/refs/heads/main/Assignments/assignment2dataset.csv"
# df = get_df_from_git(github_url)[0]


#### Assignment 4 ##
# Assignment 4 knowledge-base dataset (JSON).
github_url = 'https://raw.githubusercontent.com/Bluedata-Consulting/GAAPB01-training-code-base/refs/heads/main/Assignments/self_critique_loop_dataset.json'
df = get_df_from_git(github_url)[0]

# Text column to embed and the word-count column that drives the batching.
embed_column_name = 'answer_snippet'
len_column = 'answer_snippet_len_words'

df_embeds = divide_and_embed(
    df=df,
    embed_batch_max_word_count=500,
    embed_column_name=embed_column_name,
    len_column=len_column,
)

In [23]:
# doc_id is used as the record id in the vector store, so every value must be distinct.
doc_ids = df['doc_id']
print(len(doc_ids) == doc_ids.nunique())

# Confirm that the embedding column was added.
print(df_embeds.columns)

True
Index(['doc_id', 'question', 'answer_snippet', 'source',
       'confidence_indicator', 'last_updated', 'answer_snippet_len_words',
       'answer_snippet_embed'],
      dtype='object')


# Push to the vector DB (using a local Chroma store for now; more scalable options exist)

In [26]:
# Open (or create) the on-disk Chroma collection. No embedding function is
# attached because the vectors were already computed and are passed in explicitly.
chroma_db = Chroma(collection_name= 'assignment_4_vec_index',
                   embedding_function= None, persist_directory= 'assignment_4_vec_index')

# Everything that is not the id, the vector or the document text is stored as metadata.
record_metadata = (
    df_embeds
    .drop(columns=['doc_id', 'answer_snippet_embed', 'answer_snippet'])
    .to_dict(orient='records')
)

# upsert (rather than add) keeps this cell idempotent: re-running it updates
# records whose ids already exist, so the store always reflects df_embeds.
chroma_db._collection.upsert(
    ids=df_embeds['doc_id'].tolist(),
    embeddings=df_embeds['answer_snippet_embed'].tolist(),
    documents=df_embeds['answer_snippet'].tolist(),
    metadatas=record_metadata,
)


# Check that the records were written successfully

In [27]:
# Re-open the persisted collection and print every stored record for a visual check.
chroma_db = Chroma(
    collection_name='assignment_4_vec_index',
    embedding_function=None,
    persist_directory='assignment_4_vec_index',
)
all_docs = chroma_db._collection.get()

stored_records = zip(all_docs['ids'], all_docs['metadatas'], all_docs['documents'])
for record_id, record_metadata, record_text in stored_records:
    print('id ---- ', record_id)
    print('metadata ###', record_metadata)
    print('answer_snippets ***', record_text)

id ----  KB001
metadata ### {'question': 'What are best practices for debugging?', 'last_updated': '2024-01-10', 'answer_snippet_len_words': 9, 'source': 'debugging_guide.md', 'confidence_indicator': 'moderate'}
answer_snippets *** When addressing debugging, it's important to follow well-defined patterns...
id ----  KB002
metadata ### {'source': 'performance tuning_guide.md', 'confidence_indicator': 'moderate', 'answer_snippet_len_words': 10, 'last_updated': '2024-02-10', 'question': 'What are best practices for performance tuning?'}
answer_snippets *** When addressing performance tuning, it's important to follow well-defined patterns...
id ----  KB003
metadata ### {'question': 'What are best practices for caching?', 'answer_snippet_len_words': 9, 'confidence_indicator': 'moderate', 'source': 'caching_guide.md', 'last_updated': '2024-03-10'}
answer_snippets *** When addressing caching, it's important to follow well-defined patterns...
id ----  KB004
metadata ### {'last_updated': '2024-