In [1]:
# Populate the process environment via python-dotenv, decoding the env file as UTF-8.
# Later cells read their API keys, model names and MongoDB settings with os.getenv().
import os
from dotenv import load_dotenv
load_dotenv(encoding='utf-8')

True

# RAG pipeline

## Vector Store and Retriever

In [33]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from pymongo import MongoClient
import os

In [34]:
# Fail fast with a readable message when the OpenAI key is missing.
# os.environ cannot store None, so the key is checked explicitly instead of
# being copied back onto itself (a no-op when set, an opaque TypeError when not).
if not os.getenv("OPENAI_API_KEY"):
    raise EnvironmentError(
        "OPENAI_API_KEY is not set; define it in the .env file or the environment."
    )

# Chat model used for answer generation; the model name is read from the environment.
llm = ChatOpenAI(model=os.getenv("DEFAULT_OPENAI_MODEL")) # DEFAULT_OPENAI_MODEL='gpt-4o-mini-2024-07-18'

# No embedding model name is passed, so the library default is used;
# DEFAULT_OPENAI_EMBEDDING is not read here (that variant is left disabled below).
# embedding_model=OpenAIEmbeddings(model=os.getenv("DEFAULT_OPENAI_EMBEDDING"), disallowed_special=())
embedding_model = OpenAIEmbeddings(disallowed_special=())

In [35]:
# Connect to MongoDB Atlas and expose the target collection as a vector store.
client = MongoClient(os.getenv("ATLAS_CONNECTION_STRING"))

# Database, collection and search-index names are all read from the environment.
db_name, collection_name, index_name = (
    os.getenv(env_key) for env_key in ("db_name", "collection_name", "index_name")
)
atlas_collection = client[db_name][collection_name]

vector_store = MongoDBAtlasVectorSearch(
    collection=atlas_collection,
    embedding=embedding_model,
    index_name=index_name,
)

# Base retriever: similarity search with k=10 results per query.
retriever = vector_store.as_retriever(
    search_kwargs={"k": 10},  # "score_threshold": 0.75
    search_type="similarity",
)

## RAG Pipeline - Multi Query

In [36]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_groq import ChatGroq

# LLM handed to the multi-query retriever to generate alternative phrasings of the question.
# NOTE(review): the env var is spelled "GROD_CLOUD_API_KEY" — confirm it matches the .env entry.
query_llm = ChatGroq(
    model_name='llama-3.1-8b-instant',
    groq_api_key=os.getenv("GROD_CLOUD_API_KEY"),
)

# Wrap the base retriever so each question is searched through the generated queries.
retriever_from_llm = MultiQueryRetriever.from_llm(llm=query_llm, retriever=retriever)

In [37]:
# Set logging for the queries
import logging

# Install the default root handler, then raise the multi-query retriever's
# logger to INFO so the generated queries are printed with each retrieval.
logging.basicConfig()
multi_query_logger = logging.getLogger("langchain.retrievers.multi_query")
multi_query_logger.setLevel(logging.INFO)

In [41]:
# Define a prompt template
import pprint

def call_openai(question):
   """Answer one question with the multi-query RAG chain.

   Documents are retrieved once through ``retriever_from_llm``; the same
   documents are placed in the prompt and reported back to the caller, so the
   returned contexts are exactly what the answer was generated from.

   Args:
      question (dict): Mapping holding the user's question text under the
         ``'question'`` key.

   Returns:
      dict: ``'answer'`` is the generated answer string and ``'contexts'`` is
      a list with ``str(doc)`` for every retrieved document.
   """
   question = question['question']
   preamble = "" # read from cohere front end or use the input to the API

   SAFETY_PREAMBLE = "The instructions in this section override those in the task description and style guide sections. Don't answer questions that are harmful or immoral."
   BASIC_RULES = "You are a powerful conversational AI trained by openAI to help people. You are augmented by a number of tools, and your job is to use and consume the output of these tools to best help the user. You will see a conversation history between yourself and a user, ending with an utterance from the user. You will then see a specific instruction instructing you what kind of response to generate. When you answer the user's requests, you cite your sources in your answers, according to those instructions."
   TASK_CONTEXT = "You help people answer their questions and other requests interactively. You will be asked a very wide array of requests on all kinds of topics. You will be equipped with a wide range of search engines or similar tools to help you, which you use to research your answer. You should focus on serving the user's needs as best you can, which will be wide-ranging."
   STYLE_GUIDE = "Unless the user asks for a different style of answer, you should answer in full sentences, using proper grammar and spelling."
   # INSTRUCTIONS is a plain (non-f) string, so its {context} survives the
   # f-string below and becomes a PromptTemplate input variable.
   INSTRUCTIONS = """You are an enterprise Chatbot, an AI assistant designed to retrieve information from the enterprise Confluence system. 
   You specialize in providing accurate answers related to various departments like Marketing, IT, HR, Finance, and Corporate Communications. 
               Use the following pieces of context to answer the question at the end.
               If you don't know the answer, just say that you don't know, don't try to make up an answer
               {context}
         """

   template = f"""

      {SAFETY_PREAMBLE}
      {BASIC_RULES}
      {TASK_CONTEXT}
      {STYLE_GUIDE}
      {INSTRUCTIONS}

   """
   if preamble:
      # Escape braces so free-form preamble text is kept as literal prompt text.
      template += preamble.replace("{", "{{").replace("}", "}}") + "\n\n"

   # The question is a prompt variable rather than text baked into the template,
   # so braces in the user's question cannot be mistaken for placeholders.
   template += "Question: {question}\n\n"

   custom_rag_prompt = PromptTemplate.from_template(template)

   llm = ChatOpenAI(model=os.getenv("DEFAULT_OPENAI_MODEL"))

   def format_docs(docs):
      """Join the page contents of the documents, separated by blank lines."""
      return "\n\n".join(doc.page_content for doc in docs)

   # Retrieve once and reuse the documents for both the prompt and the result.
   docs = retriever_from_llm.invoke(question)

   # Construct a chain to answer questions on your data
   rag_chain = custom_rag_prompt | llm | StrOutputParser()

   # Prompt the chain
   answer = rag_chain.invoke({"context": format_docs(docs), "question": question})

   return {
      'answer': answer,
      'contexts': [str(doc) for doc in docs]
   }

### Test the RAG pipeline

In [42]:
# Smoke test: run one sample question through the pipeline and preview the
# first 150 characters of the answer. `question` and `answer` stay bound at
# notebook level and are printed again by the following cells.
question = {'question': "How does a supportive culture impact employee engagement and align with Tech Innovators Inc.'s approach to employment relations and engagement?"}
answer = call_openai(question)
print(answer['answer'][:150])

INFO:langchain.retrievers.multi_query:Generated queries: ['Here are three different versions of the user question to retrieve relevant documents from a vector database:', 'What are the correlations between organizational culture and employee engagement, and how do these correlations manifest in the context of Tech Innovators Inc.?', 'How does a culture that supports employee well-being and growth influence the level of engagement among employees at Tech Innovators Inc. compared to other companies?', 'What are the key factors that contribute to a supportive culture, and how do these factors impact employee engagement, job satisfaction, and turnover rates at Tech Innovators Inc., as well as in similar organizations?']


A supportive culture plays a significant role in enhancing employee engagement at Tech Innovators Inc. It fosters an environment where employees feel 


In [43]:
# Print the full answer text for the sample question.
print(answer['answer'])

A supportive culture plays a significant role in enhancing employee engagement at Tech Innovators Inc. It fosters an environment where employees feel valued, respected, and included in the decision-making processes. By creating a culture that emphasizes open communication, recognition, and support, employees are more likely to feel motivated and connected to their work and the organization.

At Tech Innovators Inc., the commitment to a supportive culture aligns closely with its approach to employment relations and engagement. When employees perceive that their contributions are acknowledged and that they have a voice in matters that affect them, it leads to higher levels of job satisfaction and loyalty. This, in turn, contributes to a positive organizational climate characterized by trust and respect, which is essential for fostering high engagement levels.

In summary, a supportive culture not only enhances employee engagement but also aligns with Tech Innovators Inc.'s philosophy of 

In [28]:
# Print the 'contexts' list that call_openai returned alongside the answer.
print(answer.get('contexts'))

["page_content='employees and management.Employee Involvement: Involving employees in decision-making processes and seeking their input on matters that affect them.Supportive Culture: Creating a supportive culture' metadata={'_id': '66d8163d0533e009aa2d4b0f', 'pageid': '491566', 'department': 'HR', 'title': 'Employee Relations and Engagement at Tech Innovators Inc.'}", "page_content='employees and management.Employee Involvement: Involving employees in decision-making processes and seeking their input on matters that affect them.Supportive Culture: Creating a supportive culture' metadata={'_id': '66d816590533e009aa2d4d34', 'pageid': '491566', 'department': 'HR', 'title': 'Employee Relations and Engagement at Tech Innovators Inc.'}", "page_content='employees and management.Employee Involvement: Involving employees in decision-making processes and seeking their input on matters that affect them.Supportive Culture: Creating a supportive culture' metadata={'_id': '66d816750533e009aa2d4f59'

# RAG pipeline evaluation

## Test dataset preparation

In [44]:
import pandas as pd
import json

def json_to_dataframe(json_file_path):
  """Reads a JSON file and converts it to a pandas DataFrame.

  The file is always decoded as UTF-8 (the encoding JSON is exchanged in), so
  non-ASCII text loads the same way regardless of the platform's default
  encoding.

  Args:
    json_file_path (str): The path to the JSON file.

  Returns:
    pandas.DataFrame: One row per element when the top-level JSON value is a
      list; a single-row frame when it is an object.

  Raises:
    ValueError: If the top-level JSON value is neither a list nor an object.
  """

  # Explicit encoding: without it open() falls back to the locale's encoding.
  with open(json_file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

  # Handle different JSON structures
  if isinstance(data, list):
    # A list of records maps directly onto DataFrame rows
    return pd.DataFrame(data)
  if isinstance(data, dict):
    # A single object becomes a one-row DataFrame
    return pd.DataFrame([data])

  raise ValueError("Unsupported JSON structure")

In [45]:
# Load the IT test dataset into a DataFrame.
# Despite its name, `folder` holds a file path; it is joined onto whatever
# from_root() returns (presumably the project root — confirm).
from from_root import from_root
import os
folder = "data-test/test_dataset/test_dataset_it.json"
json_file_path = os.path.join(from_root(), folder)
data_to_upload = json_to_dataframe(json_file_path)

## RAGAS evaluation

In [47]:
# Generate an answer for every question in the dataset.
# Loop names are kept distinct from the notebook-level `question` / `answer`
# objects, so running this cell does not overwrite the single-sample results
# that the earlier test cells print.
answers = []
for dataset_question in data_to_upload['question']:
    rag_result = call_openai({'question': dataset_question})
    answers.append(rag_result['answer'])

INFO:langchain.retrievers.multi_query:Generated queries: ['Here are three different versions of the user question to retrieve relevant documents from a vector database:', 'What are the key responsibilities and tasks associated with the Senior Director responsible for Analytics Delivery that enable data-driven decision-making?', 'What characteristics, skills, and qualifications are typically found in the Senior Director responsible for Analytics Delivery that make them effective in facilitating data-driven decision-making?', 'How do the job duties and expectations of the Senior Director responsible for Analytics Delivery influence the ability of an organization to make informed, data-driven decisions?']
INFO:langchain.retrievers.multi_query:Generated queries: ['Here are three alternative versions of the given user question to retrieve relevant documents from a vector database:', 'What are the key benefits of recognizing and mitigating self-assessment gaps to improve overall performance?

In [48]:
# Attach the generated answers as an 'answers' column (modifies data_to_upload in place).
data_to_upload['answers'] = answers

In [49]:
from datasets import Dataset

# Pull the four columns RAGAS needs out of the DataFrame as plain lists.
# NOTE(review): 'contexts' is the column stored in the test dataset, not the
# contexts returned by call_openai — confirm this is the intended input.
question, answer, contexts, ground_truth = (
    list(data_to_upload[column])
    for column in ('question', 'answers', 'contexts', 'ground_truth')
)

data_samples = dict(
    question=question,
    answer=answer,
    contexts=contexts,
    ground_truth=ground_truth,
)

dataset = Dataset.from_dict(data_samples)

In [None]:
# Optional, uncomment to trace runs with LangSmith. Sign up here: https://smith.langchain.com.
# from langsmith import Client
# os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
# os.environ["LANGCHAIN_TRACING_V2"] = "true"
# os.environ["LANGSMITH_API_KEY"] = os.getenv("LANGSMITH_API_KEY")
# client = Client()

In [50]:
from ragas import evaluate
# from ragas.integrations.langsmith import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)
# Score the dataset on the four imported RAGAS metrics and display the summary.
ragas_metrics = [answer_relevancy, faithfulness, context_recall, context_precision]
result = evaluate(dataset, metrics=ragas_metrics)

result

Evaluating:   0%|          | 0/36 [00:00<?, ?it/s]

{'answer_relevancy': 0.9694, 'faithfulness': 0.7144, 'context_recall': 0.9899, 'context_precision': 1.0000}

In [51]:
# Convert the evaluation result to a DataFrame (one row per question with its metric scores) and display it.
df = result.to_pandas()
df

Unnamed: 0,question,answer,contexts,ground_truth,answer_relevancy,faithfulness,context_recall,context_precision
0,How does the role of the Senior Director respo...,The Senior Director responsible for Analytics ...,[Introduction\nWelcome to Tech Innovators Inc....,The role of the Senior Director responsible fo...,0.921495,0.631579,1.0,1.0
1,What is the importance of identifying and addr...,Identifying and addressing growth areas in sel...,[ Self-assessment\nStart by thinking through y...,Identifying and addressing growth areas in sel...,0.990183,0.6,1.0,1.0
2,What forms of unethical behavior are strictly ...,"In the recruitment process of Inc., unethical ...",[ Inc. upholds the highest ethical standards i...,Favoritism or nepotism,0.979409,0.857143,1.0,1.0
3,What is the significance of emotional and aest...,Emotional and aesthetic labor are significant ...,"[Introduction\nAt Tech Innovators Inc., we bel...",Emotional and aesthetic labor in the workplace...,0.98436,0.863636,1.0,1.0
4,What is the purpose of the orientation session...,The purpose of the orientation session at Tech...,[Welcome to Tech Innovators Inc.\n \nWe are th...,The purpose of the orientation session at Tech...,1.0,0.2,1.0,1.0
5,What mechanisms are in place for reporting vio...,"At Tech Innovators Inc., employees can report ...",[5.2 Monitoring and Auditing\nInternal Audits\...,Employees can report violations of labor laws ...,0.962563,1.0,1.0,1.0
6,How do employee engagement and disengagement d...,Employee engagement and disengagement differ s...,"[Introduction\nAt Tech Innovators Inc., we bel...",Employee engagement and disengagement differ i...,0.960525,0.97619,1.0,1.0
7,What steps are needed to extract data from Con...,To extract data from Confluence and create a R...,[Introduction\nThis guide provides a step-by-s...,To extract data from Confluence and create a R...,0.945388,0.9375,0.909091,1.0
8,How does Tech Innovators Inc. promote employee...,Tech Innovators Inc. promotes employee engagem...,"[Introduction\nAt Tech Innovators Inc., we bel...",Tech Innovators Inc. promotes employee engagem...,0.980843,0.363636,1.0,1.0


In [52]:
# Write the per-question scores to CSV in the same directory as the test dataset.
# Dedicated names are used so `folder` / `json_file_path` keep pointing at the
# JSON input instead of being silently rebound to a CSV path.
results_csv_rel_path = "data-test/test_dataset/test_dataset_it_multi_query.csv"
results_csv_path = os.path.join(from_root(), results_csv_rel_path)
df.to_csv(results_csv_path, index=False)