In [31]:
!pip install langchain-chroma

Collecting langchain-chroma
  Downloading langchain_chroma-0.2.4-py3-none-any.whl.metadata (1.1 kB)
Collecting chromadb>=1.0.9 (from langchain-chroma)
  Downloading chromadb-1.0.12-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting build>=1.0.3 (from chromadb>=1.0.9->langchain-chroma)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting fastapi==0.115.9 (from chromadb>=1.0.9->langchain-chroma)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting posthog>=2.4.0 (from chromadb>=1.0.9->langchain-chroma)
  Downloading posthog-4.8.0-py3-none-any.whl.metadata (5.9 kB)
Collecting onnxruntime>=1.14.1 (from chromadb>=1.0.9->langchain-chroma)
  Downloading onnxruntime-1.22.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting pypika>=0.48.9 (from chromadb>=1.0.9->langchain-chroma)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
import time
import json, os
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from azure.keyvault.secrets import SecretClient
from azure.identity import DefaultAzureCredential
from langchain_chroma import Chroma

In [None]:
# Pull the Azure OpenAI embedding endpoint and key out of Key Vault so that no
# secret value is written into the notebook itself.
KVUri = "https://dipanjans-kv.vault.azure.net"
credential = DefaultAzureCredential()
client = SecretClient(vault_url=KVUri, credential=credential)

oai_endpoint_embed = client.get_secret("azure-openai-endpoint-embed")
oai_key_embed = client.get_secret("azure-openai-key-embed")

# Embedding client used by the cells below to vectorise course descriptions.
embeds = AzureOpenAIEmbeddings(
    azure_deployment="dipanjan_ada_embed_150k",
    model="text-embedding-ada-002",
    azure_endpoint=oai_endpoint_embed.value,
    api_key=oai_key_embed.value,
)

####### Testing llm and embed model ######
# out = (embeds.embed_query("who is this?"), llm.invoke("who is this?"))
# print(out[0])
# print("****")
# print(out[1])

In [38]:
# course recommendation RAG assignment
def get_df_from_git(github_url):
    """Load the CSV at ``github_url`` and add a per-row description word count.

    Parameters
    ----------
    github_url : anything ``pd.read_csv`` accepts (URL, path, file-like object).

    Returns
    -------
    tuple
        ``(dataframe, shape)``. The dataframe gains a ``desc_word_len`` column
        holding the number of whitespace-separated tokens in ``description``;
        ``shape`` is the dataframe's shape after that column has been added.
    """
    def _word_count(text):
        # Whitespace tokenisation: consecutive blanks count as one separator.
        return len(text.split())

    courses = pd.read_csv(github_url)
    courses['desc_word_len'] = courses['description'].apply(_word_count)
    return courses, courses.shape

# github_url = "https://raw.githubusercontent.com/Bluedata-Consulting/GAAPB01-training-code-base/refs/heads/main/Assignments/assignment2dataset.csv"
# # get_df_from_git(github_url)[0]

# Split the dataframe into several smaller dataframes to avoid token / rate-limit errors while generating embeddings


In [None]:
def _batch_bounds(word_counts, max_words):
    """Return contiguous ``(start, stop)`` slices whose word totals fit ``max_words``.

    Rows are packed greedily in order. A row that is larger than ``max_words``
    on its own still gets a batch (containing only that row) so nothing is
    dropped. An empty input yields an empty list.
    """
    bounds = []
    start, running_total = 0, 0
    for pos, words in enumerate(word_counts):
        # Close the current batch only if it already holds at least one row;
        # otherwise an oversized row would produce an empty slice.
        if pos > start and running_total + words > max_words:
            bounds.append((start, pos))
            start, running_total = pos, 0
        running_total += words
    if start < len(word_counts):
        bounds.append((start, len(word_counts)))
    return bounds


def divide_and_embed(df, embed_batch_max_word_count, embed_fn=None, pause_seconds=2):
    """Embed every row's ``description``, pausing between word-bounded batches.

    The frame is cut into consecutive batches whose summed ``desc_word_len``
    does not exceed ``embed_batch_max_word_count``; each description is embedded
    with one ``embed_fn`` call, and the function sleeps between batches to stay
    clear of token / rate limits.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``description`` and ``desc_word_len`` columns. Not mutated.
    embed_batch_max_word_count : int
        Upper bound on the total word count of one batch.
    embed_fn : callable, optional
        Maps one description string to its embedding. Defaults to the
        notebook-level ``embeds.embed_query``.
    pause_seconds : float, optional
        Sleep inserted between consecutive batches (not after the last one).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (same rows, same order, same index) with an added
        ``desc_embed`` column. An empty ``df`` returns an empty copy that
        still has the ``desc_embed`` column.
    """
    if embed_fn is None:
        # Looked up at call time so the notebook's global client is the default.
        embed_fn = embeds.embed_query

    bounds = _batch_bounds(df['desc_word_len'].tolist(), embed_batch_max_word_count)

    embedded_parts = []
    for batch_no, (start, stop) in enumerate(bounds):
        # Pause *between* batches only; sleeping after the final one is wasted time.
        if batch_no and pause_seconds > 0:
            time.sleep(pause_seconds)
        part = df.iloc[start:stop].copy()
        part['desc_embed'] = part['description'].apply(embed_fn)
        embedded_parts.append(part)

    if not embedded_parts:
        # pd.concat([]) raises, so build the empty result explicitly.
        empty = df.copy()
        empty['desc_embed'] = pd.Series(index=empty.index, dtype=object)
        return empty

    # Single concat instead of growing a frame inside the loop.
    return pd.concat(embedded_parts)

# Source dataset for the course-recommendation assignment.
github_url = (
    "https://raw.githubusercontent.com/Bluedata-Consulting/GAAPB01-training-code-base/refs/heads/main/Assignments/assignment2dataset.csv"
)

# get_df_from_git returns (dataframe, shape); only the dataframe is needed here.
df = get_df_from_git(github_url)[0]

# Embed all descriptions, batching at 500 words per batch.
df_embeds = divide_and_embed(df, embed_batch_max_word_count=500)

# Push to the vector DB (using a local vector DB for now, though more scalable options exist)

In [53]:
# Open (or create) the persisted local collection. No embedding function is
# attached because the vectors were already computed and live in df_embeds.
chroma_db = Chroma(
    collection_name='course_description_vi',
    embedding_function=None,
    persist_directory='chroma_store_as2',
)

# upsert instead of add: the store is persisted on disk, so re-running this cell
# must refresh records whose ids already exist rather than leave stale copies.
# The private _collection handle is used because the pre-computed vectors have
# to be written directly.
chroma_db._collection.upsert(
    ids=[str(row_id) for row_id in df_embeds.index],
    embeddings=df_embeds['desc_embed'].tolist(),
    documents=df_embeds['description'].tolist(),
    # Everything except the text and its vector is stored as per-record metadata.
    metadatas=df_embeds.drop(columns=['description', 'desc_embed']).to_dict(orient='records'),
)


# Check whether the upsert succeeded

In [56]:
# Re-open the persisted collection and print every stored record so the write
# can be checked by eye.
chroma_db = Chroma(
    collection_name='course_description_vi',
    embedding_function=None,
    persist_directory='chroma_store_as2',
)
all_docs = chroma_db._collection.get()

for idx, doc_id in enumerate(all_docs['ids']):
    print('id ---- ', doc_id)
    print('metadata ###', all_docs['metadatas'][idx])
    print('desc ***', all_docs['documents'][idx])

id ----  0
metadata ### {'title': 'Foundations of Machine Learning', 'desc_word_len': 50, 'course_id': 'C001'}
desc *** Understand foundational machine learning algorithms including regression, classification, clustering, and dimensionality reduction. This course covers data pre-processing, feature engineering, model selection, hyperparameter tuning, and evaluation metrics. Hands-on labs use scikit-learn and Python to implement end-to-end workflows on real-world datasets, preparing learners for practical machine learning applications with interactive engaging exercises.
id ----  1
metadata ### {'course_id': 'C002', 'desc_word_len': 51, 'title': 'Deep Learning with TensorFlow and Keras'}
desc *** Explore neural network architectures using TensorFlow and Keras frameworks. This course covers feedforward networks, convolutional neural networks, recurrent neural networks, and transfer learning. Learn to build, train, evaluate, and optimize deep learning models for image classification, sequ