In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/rag-pdf/Investment Case For Disruptive Innovation.pdf
/kaggle/input/questions/Evaluation_Questions.txt
/kaggle/input/ark-data/ARK.md


# Strategy for RAG System

* Given PDF Document is very complex unstructured data, so I will use Llama Index, Llama Parser libraries and packages to convert this PDF document into markdown document. 
* I can parse each section of the markdown document for each and every question but it's time consuming and not the objective of this task, I will use this markdown document to build a good RAG system. 
* From langchain unstructured markdown document loader, load the document, split document in smaller chunks, convert these chunks in embeddings and store their embeddings into a vector database. 
* There are many algorithms and tools available to retrieve related information from vector batabse, but for time, processing power, memory and cost constraints, I will use a simple but effective maximum margin relevance algorithm. 
* I will google Flan t5 LLM model to generate the response. 
* There are many methods and techniques available to get accurate, more related answers from LLM. I am planning to use Stuff, Refine, Map Reduce, Map Rerank to feed LLM to generate better answers. 
* For evaluation purpose I am planning to use RAGAs library, so we can get deeper insight about model performance and accuracy, we will get 4 matrix score (faithfulness, answer_relevancy, context_recall, context_precision) for each answer, also this approach use another LLM to evaluate this so no human bias. 
* If time permits, I would like to try few models such as Llama3, ChatGPT 3.5 Turbo, Phi3  and few more models from hugging face, to build leaderboard so we can choose very best model 
* Last but not least, I would like to build a research agent who can get information from the internet and feed into LLM using React, COT and Plan and Execute concepts. 


# Document Ingestion

Given PDF document is very complex document, Simple PDF loader will not give good results so I am using LlamaParse service to handle complex PDF document and convert to Markdown, so we can get most of text data, there are ways to navigate through each and every section of markdown and get exact information, but because of time constrain, I will just use markdown text.  

LlamaParse: Proprietary parsing for complex documents with embedded objects such as tables and figures. LlamaParse directly integrates with LlamaIndex ingestion and retrieval to let you build retrieval over complex, semi-structured documents. It is promised to be able to answer complex questions that simply weren’t possible previously.

LlamaParse is the world's first genAI-native document parsing platform - built with LLMs and for LLM use cases.

Your LLM application performance is only as good as your data. The main goal of LlamaParse is to parse and clean your data, ensuring that it's good quality before passing to any downstream LLM use case such as advanced RAG.

To Execute below code, you need API Key from Llama Index. 
You get 1k free pages a day. If you sign up for the paid plan, you get 7k free pages a week, and then $0.003 for each page.

In [2]:
!pip install -q llama-index llama-parse

from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core import Settings
import nest_asyncio; nest_asyncio.apply()

embed_model = OpenAIEmbedding(model = "text-embedding-3-small")
llm = OpenAI(model="gpt-3.5-turbo-0125")
Settings.llm = llm

from llama_parse import LlamaParse
document = LlamaParse(api_key = "XXXXX", result_type="markdown").load_data("/kaggle/input/rag-pdf/Investment Case For Disruptive Innovation.pdf")

print(document[0].text[0:1000])
print(document[0].text)  ##our markdown document from complex PDF

## Sample Code through navigate through Markdown Structure and get exact answer for given question 
"""
from llama_index.core.node_parser import MarkdownElementNodeParser
node_parser = MarkdownElementNodeParser(llm = OpenAI(model="gpt-3.5-turbo-0125"),num_workers=8)
nodes = node_parser.get_nodes_from_documents(document)
base_nodes ,objects = node_parser.get_nodes_and_objects(nodes)
recursive_index = VectorStoreIndex(nodes=base_nodes+objects)
queryengine = recursive_index.as_query_engine(similarity_top_k=25)
"""

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
libpysal 4.9.2 requires packaging>=22, but you have packaging 21.3 which is incompatible.
libpysal 4.9.2 requires shapely>=2.0.1, but you have shapely 1.8.5.post1 which is incompatible.
momepy 0.7.0 requires shapely>=2, but you have shapely 1.8.5.post1 which is incompatible.
preprocessing 0.1.13 requires nltk==3.2.4, but you have nltk 3.8.1 which is incompatible.
spopt 0.6.0 requires shapely>=2.0.1, but you have shapely 1.8.5.post1 which is incompatible.[0m[31m
[0mError while parsing the file '/kaggle/input/rag-pdf/Investment Case For Disruptive Innovation.pdf': Failed to parse the file: {"detail":"Invalid authentication token"}


IndexError: list index out of range

In [None]:
## Get Questions in List so we can loop through it and prepare Evaluation dataset
my_file = open("/kaggle/input/questions/Evaluation_Questions.txt", "r") 
data = my_file.read() 
data_into_list = data.split("\n") 
data_into_list = list(filter(None, data_into_list))
my_file.close() 
questions = [sub[5:] for sub in data_into_list]
print(questions) 

In [None]:
!pip install -q langchain
!pip install -q tiktoken
!pip install -q chromadb
#!pip install -q faiss-cpu 
#!pip install -q openai
!pip install -q sentence_transformers
!pip install -q ragas
!pip install -q evaluate
!pip install -q unstructured langchain_community langchain-text-splitters

import os
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
#from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import Document
#from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.vectorstores import Chroma
import evaluate 

import tempfile
import json

from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_text_splitters import MarkdownHeaderTextSplitter

In [None]:
# loading Markdown document
loader = UnstructuredMarkdownLoader("/kaggle/input/ark-data/ARK.md")
documents = loader.load()

# using local model, we can use ChatGPT 3.5 Turbo, 4, Llama but because of API cost $, Memory Constrain
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "Fill"
cache_dir = "./cache"
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=model_name, cache_folder=cache_dir) 

text_splitter = CharacterTextSplitter(
chunk_size=1000, chunk_overlap=100, separator="\n")
#chunk_size=2000, chunk_overlap=200, separator="\n")
docs = text_splitter.split_documents(documents=documents)
vectordb = Chroma.from_documents(docs, embeddings, persist_directory=cache_dir)

query = "What are the significant risks associated with investing in innovation as highlighted by ARK?"
results_with_scores = vectordb.similarity_search_with_score(query,k=5)
for doc, score in results_with_scores: print(f"Content: {doc.page_content}\nMetadata: {doc.metadata}\nScore: {score}\n\n")

# Information Retrieval

* Hierarchical Chunking
* FlagEmbeddingReranker
* RankGPTRerank
* Maximal Marginal Relevance: MMR tries to reduce the redundancy of results while at the same time maintaining query relevance of results for already ranked documents/phrases etc. 

There are many ways and algorithms are available to retirve information from vector databse, I am using MMR for example here, for professional work we can try different algorithms and evaluate performance. 

In [None]:
from langchain.llms import HuggingFacePipeline

# We want to make this a retriever, so we need to convert our index.  
# This will create a wrapper around the functionality of our vector database 
# so we can search for similar documents/chunks in the vectorstore and retrieve the results:
retriever = vectordb.as_retriever(search_type="mmr",search_kwargs={"k": 4, "lambda_mult": 0.1})
#retriever = vectordb.as_retriever()
# This chain will be used to do QA on the document. We will need
# 1 - A LLM to do the language interpretation
# 2 - A vector database that can perform document retrieval
# 3 - Specification on how to deal with this data

hf_llm = HuggingFacePipeline.from_model_id(model_id="google/flan-t5-large",task="text2text-generation",model_kwargs={"do_sample":True,"max_length": 2048,"cache_dir": cache_dir,},)

# Answer Synthesis

* Create and Refine strategy
* Hierarchical Summarization Strategy
* LangChain Types 
    1.     STUFF:  We just put all of it into one prompt and send that to the language model and get back one response.
    1.     MAP REDUCE: This takes all the chunks, passes them along with the query to a language model, gets back a response, and then uses another language model call to summarize all of the individual responses into a final answer.
    1.     REFINE: “Refine”, which is another method, is again used to loop over many documents. But it does it iteratively.It builds upon the response from the previous document
    1.     MAP_RERANK: “Map_rerank” is a pretty interesting and a bit more experimental one where we do a single call to the language model for each document. And we also ask it to return a score.And then we select the highest score.

There are many algorithms, agents available from Self Reflection, React, Chain of Thought, Plan and Execute, I will use 4 various Chain Types to improve coherence, faithfulness, and relevance of answers. 

In [None]:
from langchain.chains import RetrievalQA
#llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
qa = RetrievalQA.from_chain_type(llm=hf_llm, chain_type="refine", retriever=retriever)

In [None]:
qa.invoke(questions[0])

In [None]:
qa.invoke(questions[1])

In [None]:
qa.invoke(questions[2])

In [None]:
qa.invoke(questions[3])

In [None]:
qa.invoke(questions[4])

In [None]:
qa.invoke(questions[5])

In [None]:
qa.invoke(questions[6])

In [None]:
qa.invoke(questions[7])

In [None]:
qa.invoke(questions[8])

In [None]:
qa.invoke(questions[9])

In [None]:
qa.invoke(questions[10])

In [None]:
qa.invoke(questions[11])

In [None]:
qa.invoke(questions[12])

In [None]:
qa.invoke(questions[13])

In [None]:
qa.invoke(questions[14])

In [None]:
qa.invoke(questions[15])

In [None]:
qa.invoke(questions[16])

In [None]:
qa.invoke(questions[17])

In [None]:
qa.invoke(questions[18])

In [None]:
qa.invoke(questions[19])

In [None]:
def prepareData(qa): 
    from datasets import Dataset
        #ground_truths = [["GT1"],["GT2"],["GT3"]]
    answers = []
    contexts = []

    # Inference
    for query in questions:
        answers.append(qa.invoke(query))
        contexts.append([docs.page_content for docs in retriever.invoke(query)])

    # To dict
    data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    #"ground_truths": ground_truths
    }
    # Convert dict to dataset
    dataset = Dataset.from_dict(data)
    return dataset

In [None]:
qa = RetrievalQA.from_chain_type(llm=hf_llm, chain_type="refine", retriever=retriever)
dataset_refine = prepareData(qa)

qa = RetrievalQA.from_chain_type(llm=hf_llm, chain_type="stuff", retriever=retriever)
dataset_stuff = prepareData(qa)

qa = RetrievalQA.from_chain_type(llm=hf_llm, chain_type="map_reduce", retriever=retriever)
dataset_map_reduce = prepareData(qa)

qa = RetrievalQA.from_chain_type(llm=hf_llm, chain_type="map_rerank", retriever=retriever)
dataset_map_rerank = prepareData(qa)

### we can just change the model to ChatGPT 4,3.5 Turbo, Llamma 3, Llamma2, .... and create mode evaluation data sets for leader board. 

# Evaluation Metrics

**RAGAs (Retrieval-Augmented Generation Assessment):** RAGAs is that it started out as a framework for “reference-free” evaluation, That means, instead of having to rely on human-annotated ground truth labels in the evaluation dataset, RAGAs leverages LLMs under the hood to conduct the evaluations.

* faithfulness
* answer_relevancy 
* context_recall
* context_precision

In [None]:
from ragas import evaluate
from ragas.metrics import (faithfulness, answer_relevancy, context_recall, context_precision,)

result = evaluate(
    dataset = dataset, 
    metrics=[faithfulness, answer_relevancy],
)

df = result.to_pandas()

# Benchmarking

**Leader Board**

In [None]:
import itertools
import numpy as np
import pandas as pd

param_grid = {  
    'Model_Name': ["HF_LLM","Llama3","GPT 3.5 Turbo"],
    'Chain_Type': ["stuff","refine","map reduce","map rerank"],
    }

# Generate all combinations of parameters
all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]

LeaderBoard = pd.DataFrame(all_params)
LeaderBoard[['Faithfulness','Answer_Relevancy','Context_Recall','Context_Precision']] = ""
LeaderBoard

**Research Agents**

LangChain Agents are very powerful tool to get latest information from external sources and augment with static documents, it can perform various tasks, just one example shown below, how to get infor from internet. 



In [None]:
from langchain.agents import load_tools
from langchain.agents import initialize_agent
from langchain.agents import AgentType
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline

In [None]:
model_checkpoint = hf_llm
tokenizer = AutoTokenizer.from_pretrained(hf_llm)
model = AutoModelForCausalLM.from_pretrained(hf_llm)
pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
llm = HuggingFacePipeline(pipeline = pipeline)
tools = load_tools(["serpapi"],llm=hf_llm)
agent = initialize_agent(tools,llm,agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose= True)
agent.run(""" How do neural networks serve as a catalyst for other technologies? """)