# RAG + Graph Knowledge + LlamaIndex
https://medium.aiplanet.com/implement-rag-with-knowledge-graph-and-llama-index-6a3370e93cdd

In [1]:
import logging
import sys
import textwrap

import torch
from langchain.chains import LLMChain
from langchain.embeddings import (HuggingFaceEmbeddings,
                                  HuggingFaceInferenceAPIEmbeddings)
from langchain_community.embeddings import LlamaCppEmbeddings
from langchain_community.llms import LlamaCpp
from llama_cpp import Llama
# from llama_index.llm_predictor import LLMPredictor
# https://docs.llamaindex.ai/en/stable/changes/deprecated_terms.html#llmpredictor
from llama_index.core import (KnowledgeGraphIndex, ServiceContext,
                              SimpleDirectoryReader)
from llama_index.core.graph_stores import SimpleGraphStore
from llama_index.core.storage.storage_context import StorageContext
from llama_index.embeddings.langchain import LangchainEmbedding
# from llama_index.embeddings.langchain import LangchainEmbedding
from llama_index.llms.huggingface import HuggingFaceInferenceAPI
from pyvis.network import Network
from sentence_transformers import SentenceTransformer

# SimpleDirectoryReader to read unstructured data.
# LLMPredictor : Utilized for generating predictions using large language models(LLM)
# ServiceContext : Supples contextual data vital for orchestrating various services
# KnowledgeGraphIndex : Required for both construction and manipulation of Knowledge Graphs.
# SimpleGraphStore : Serves as a straightforward repoistory for storing graph data.

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [3]:
prompt = "Some text is provided below. Given the text, extract up to 3 knowledge triplets in the form of (subject, predicate, object). Avoid stopwords.\n---------------------\nExample:Text: Alice is Bob's mother.Triplets:\n(Alice, is mother of, Bob)\nText: Philz is a coffee shop founded in Berkeley in 1982.\nTriplets:\n(Philz, is, coffee shop)\n(Philz, founded in, Berkeley)\n(Philz, founded in, 1982)\n---------------------\nText: page_label: 2\nfile_path: data/Employee-Stock-Option-Plans-ESOP-Best-Practices-2.pdf\n\nTable of Contents  \nPart I: Intro to Options Plans  \n•What is an ESOP?  \n•What is an Option?  \n•Lifecycle of a Startup ESOP  \n•Common Terms in an Options Package  \n•Why Issue Options to Employees ? \n–A Defining Characteristic of Startup Culture  \n–A Necessary Part of the Capital Structure  \n•When to Create an ESOP ? \n•Communicating Options to Employees: % versus $ \n \nPart II: How Much to Grant  \n•Two Approaches  \n•The Top -Down Process  \n–1. How Much Equity to Set Aside in the ESOP ? \n–2. A Typical Distribution Schedule  \n•The Bottom -Up Process  \n–1. Segment Your Human Resources  \n–2. Establish Pay Multipliers for Each Role \n–3. Determine the Dollar Value of the Options Grant  \n–4. Determine the Current Share Price  \n–5. Calculate the Options Grant  \n–An Example: Hiring a CTO  \n•Important Takeaways  Part III : The Fine Print – Terms  \n•Strike Price  \n•Vesting Schedule  \n•The Cliff  \n–Example: Standard Vesting w/ a Cliff \n•Vesting in a Liquidity Event  \n•Exercising Options  \n•Tax Considerations  \n•Legal Advice  \n \nPart IV: ESOPs for the Long Term  \n•Retention Grants  \n•Discretionary Grants  \n•Social Impact Considerations  \n•Options Modeling – Overview  \n•Options Modeling – A Detailed Example  \n \nPart V: Resources & Further Reading\nTriplets:\n"

print(textwrap.fill(prompt,90))

Some text is provided below. Given the text, extract up to 3 knowledge triplets in the
form of (subject, predicate, object). Avoid stopwords. --------------------- Example:Text:
Alice is Bob's mother.Triplets: (Alice, is mother of, Bob) Text: Philz is a coffee shop
founded in Berkeley in 1982. Triplets: (Philz, is, coffee shop) (Philz, founded in,
Berkeley) (Philz, founded in, 1982) --------------------- Text: page_label: 2 file_path:
data/Employee-Stock-Option-Plans-ESOP-Best-Practices-2.pdf  Table of Contents   Part I:
Intro to Options Plans   •What is an ESOP?   •What is an Option?   •Lifecycle of a Startup
ESOP   •Common Terms in an Options Package   •Why Issue Options to Employees ?  –A
Defining Characteristic of Startup Culture   –A Necessary Part of the Capital Structure
•When to Create an ESOP ?  •Communicating Options to Employees: % versus $    Part II: How
Much to Grant   •Two Approaches   •The Top -Down Process   –1. How Much Equity to Set
Aside in the ESOP ?  –2. A Typical

In [4]:
DATA_DIR = "data"
MODELS_DIR = "/home/ehsan/Documents/LLMs/text-generation-webui/models"
EMBEDDING_MODEL_ID = "./models/sentence-transformers/all-mpnet-base-v2"
LLM_MODEL_ID = "/zephyr-7b-beta.Q5_K_M.gguf"
# https://content.accion.org/wp-content/uploads/2018/08/Employee-Stock-Option-Plans-ESOP-Best-Practices-2.pdf

In [5]:
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [6]:
# HF_TOKEN = "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"
# llm = HuggingFaceInferenceAPI(
#     model_name="HuggingFaceH4/zephyr-7b-beta", token=HF_TOKEN
# )

In [7]:
# embed_model = LangchainEmbedding(
#   HuggingFaceInferenceAPIEmbeddings(api_key=HF_TOKEN,model_name="thenlper/gte-large")
# )

In [8]:
lc_embed_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_ID
)
embed_model = LangchainEmbedding(lc_embed_model)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: ./models/sentence-transformers/all-mpnet-base-v2
Load pretrained SentenceTransformer: ./models/sentence-transformers/all-mpnet-base-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cuda
Use pytorch device_name: cuda


In [9]:
test_prompt = """
Question: A rap battle between Stephen Colbert and John Oliver
"""

In [10]:
# test embedding
test_embeddings = lc_embed_model.embed_documents(test_prompt)
test_embeddings

[[-0.012503332458436489,
  0.06143876165151596,
  -0.0067345211282372475,
  0.025237778201699257,
  0.014757907949388027,
  0.03317747265100479,
  -0.01709829270839691,
  -0.0052545578218996525,
  -0.035173073410987854,
  0.026807188987731934,
  0.027776483446359634,
  0.0016695831436663866,
  -0.03590533137321472,
  0.08371192216873169,
  0.0020817057229578495,
  -0.0036475248634815216,
  -0.008081009611487389,
  0.007572973147034645,
  -0.04329582303762436,
  -0.02622479572892189,
  -0.021265167742967606,
  0.013262772932648659,
  -0.005542005877941847,
  -0.031179292127490044,
  0.005881240125745535,
  -0.05413942411541939,
  0.04023878648877144,
  -0.007921067997813225,
  0.01748610846698284,
  -0.0415988564491272,
  0.01805800385773182,
  -0.00015742408868391067,
  -0.0014502581907436252,
  -0.034991998225450516,
  3.953376563003985e-06,
  -0.0013550113653764129,
  0.006736463401466608,
  0.03578318655490875,
  -0.04817281290888786,
  0.04840704798698425,
  0.00781356729567051,
  

In [11]:
n_gpu_layers = 20
n_batch = 512
n_ctx = 4098
llm = LlamaCpp(
  model_path=MODELS_DIR + LLM_MODEL_ID,
  temperature=0.1,
  top_p=1,
  n_gpu_layers=n_gpu_layers,
  device=device,
  n_ctx=n_ctx)

                device was transferred to model_kwargs.
                Please confirm that device is what you intended.
ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 1 CUDA devices:
  Device 0: Quadro RTX 4000, compute capability 7.5, VMM: yes
llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from /home/ehsan/Documents/LLMs/text-generation-webui/models/zephyr-7b-beta.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = huggingfaceh4_zephyr-7b-beta
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32     

In [12]:
# test loaded model
llm.invoke(test_prompt)



llama_print_timings:        load time =     302.41 ms
llama_print_timings:      sample time =      76.51 ms /   210 runs   (    0.36 ms per token,  2744.70 tokens per second)
llama_print_timings: prompt eval time =     597.42 ms /    16 tokens (   37.34 ms per token,    26.78 tokens per second)
llama_print_timings:        eval time =   18425.18 ms /   209 runs   (   88.16 ms per token,    11.34 tokens per second)
llama_print_timings:       total time =   19480.36 ms /   225 tokens


'Answer: "I\'m the king of late night, you\'re just a prince-ipal" - Stephen Colbert\n\nExplanation: In this fictional rap battle between late night talk show hosts Stephen Colbert and John Oliver, Colbert takes a jab at Oliver\'s role as host of The Daily Show spinoff series Last Week Tonight. The line "I\'m the king of late night, you\'re just a prince-ipal" is a playful reference to Oliver\'s previous role as a correspondent on The Daily Show with Jon Stewart, where he was known as "Prince George" due to his British heritage. The term "prince-ipal" is a humorous twist on the term "principal," which is often used to describe someone in a leadership role. The line highlights Colbert\'s perceived superiority as the host of The Late Show with Stephen Colbert, while also poking fun at Oliver\'s background as a former correspondent.'

In [13]:
sample_docs = 5
documents = SimpleDirectoryReader(DATA_DIR).load_data()[:sample_docs]
print(len(documents))


5


In [14]:
#setup the service context
service_context = ServiceContext.from_defaults(
    chunk_size=256,
    llm=llm,
    embed_model=embed_model,
)

  service_context = ServiceContext.from_defaults(


In [15]:
#setup the storage context
graph_store = SimpleGraphStore()
storage_context = StorageContext.from_defaults(graph_store=graph_store)

In [16]:
#Construct the Knowlege Graph Undex
index = KnowledgeGraphIndex.from_documents(
  documents=documents,
  max_triplets_per_chunk=3,
  service_context=service_context,
  storage_context=storage_context,
  include_embeddings=True,
  show_progress = True,
  device = device)

Parsing nodes:   0%|          | 0/5 [00:00<?, ?it/s]

Processing nodes:   0%|          | 0/7 [00:00<?, ?it/s]

  warn_deprecated(
Llama.generate: prefix-match hit

llama_print_timings:        load time =     302.41 ms
llama_print_timings:      sample time =      92.22 ms /   256 runs   (    0.36 ms per token,  2775.94 tokens per second)
llama_print_timings: prompt eval time =    7694.85 ms /   197 tokens (   39.06 ms per token,    25.60 tokens per second)
llama_print_timings:        eval time =   22648.01 ms /   255 runs   (   88.82 ms per token,    11.26 tokens per second)
llama_print_timings:       total time =   30943.39 ms /   452 tokens


Generating embeddings:   0%|          | 0/5 [00:00<?, ?it/s]

Llama.generate: prefix-match hit

llama_print_timings:        load time =     302.41 ms
llama_print_timings:      sample time =      92.87 ms /   256 runs   (    0.36 ms per token,  2756.42 tokens per second)
llama_print_timings: prompt eval time =   10907.37 ms /   269 tokens (   40.55 ms per token,    24.66 tokens per second)
llama_print_timings:        eval time =   23013.34 ms /   255 runs   (   90.25 ms per token,    11.08 tokens per second)
llama_print_timings:       total time =   34545.91 ms /   524 tokens


Generating embeddings:   0%|          | 0/14 [00:00<?, ?it/s]

Llama.generate: prefix-match hit

llama_print_timings:        load time =     302.41 ms
llama_print_timings:      sample time =      95.54 ms /   256 runs   (    0.37 ms per token,  2679.53 tokens per second)
llama_print_timings: prompt eval time =    7380.01 ms /   173 tokens (   42.66 ms per token,    23.44 tokens per second)
llama_print_timings:        eval time =   23332.03 ms /   255 runs   (   91.50 ms per token,    10.93 tokens per second)
llama_print_timings:       total time =   31328.62 ms /   428 tokens


Generating embeddings:   0%|          | 0/20 [00:00<?, ?it/s]

Llama.generate: prefix-match hit

llama_print_timings:        load time =     302.41 ms
llama_print_timings:      sample time =      92.72 ms /   256 runs   (    0.36 ms per token,  2761.00 tokens per second)
llama_print_timings: prompt eval time =    2129.68 ms /    51 tokens (   41.76 ms per token,    23.95 tokens per second)
llama_print_timings:        eval time =   22644.02 ms /   255 runs   (   88.80 ms per token,    11.26 tokens per second)
llama_print_timings:       total time =   25352.08 ms /   306 tokens


Generating embeddings:   0%|          | 0/4 [00:00<?, ?it/s]

Llama.generate: prefix-match hit

llama_print_timings:        load time =     302.41 ms
llama_print_timings:      sample time =      92.75 ms /   256 runs   (    0.36 ms per token,  2760.14 tokens per second)
llama_print_timings: prompt eval time =    6162.53 ms /   148 tokens (   41.64 ms per token,    24.02 tokens per second)
llama_print_timings:        eval time =   22750.40 ms /   255 runs   (   89.22 ms per token,    11.21 tokens per second)
llama_print_timings:       total time =   29514.28 ms /   403 tokens


Generating embeddings:   0%|          | 0/3 [00:00<?, ?it/s]

Llama.generate: prefix-match hit

llama_print_timings:        load time =     302.41 ms
llama_print_timings:      sample time =      95.12 ms /   256 runs   (    0.37 ms per token,  2691.20 tokens per second)
llama_print_timings: prompt eval time =   11641.94 ms /   277 tokens (   42.03 ms per token,    23.79 tokens per second)
llama_print_timings:        eval time =   23657.19 ms /   255 runs   (   92.77 ms per token,    10.78 tokens per second)
llama_print_timings:       total time =   35939.15 ms /   532 tokens


Generating embeddings:   0%|          | 0/3 [00:00<?, ?it/s]

Llama.generate: prefix-match hit

llama_print_timings:        load time =     302.41 ms
llama_print_timings:      sample time =      94.12 ms /   256 runs   (    0.37 ms per token,  2719.87 tokens per second)
llama_print_timings: prompt eval time =    1833.77 ms /    40 tokens (   45.84 ms per token,    21.81 tokens per second)
llama_print_timings:        eval time =   23122.44 ms /   255 runs   (   90.68 ms per token,    11.03 tokens per second)
llama_print_timings:       total time =   25550.31 ms /   295 tokens


Generating embeddings:   0%|          | 0/6 [00:00<?, ?it/s]

In [None]:
# import pickle
# with open(DATA_DIR+'/knowledge_graph_index.pkl', 'wb') as f:
#     pickle.dump(index, f)

In [None]:
# with open(DATA_DIR+'/knowledge_graph_index.pkl', 'rb') as f:
#     loaded_index = pickle.load(f)

In [17]:
query = "What is ESOP?"
query_engine = index.as_query_engine(include_text=True,
                                     response_mode ="tree_summarize",
                                     embedding_mode="hybrid",
                                     similarity_top_k=5,)
# #
# message_template =f"""<|system|>Check if the following pieces of context has any mention of the keywords provided in the Question.If not then don't know the answer, just say that you don't know.Stop there. Do not try to make up an answer.</s>
# <|user|>
# Question: {query}
# Helpful Answer:
# </s>"""
message_template = f"""<|system|>Question: {query}
Helpful Answer: 
Stop there.
<|user|>"""
response = query_engine.query(message_template)



Llama.generate: prefix-match hit

llama_print_timings:        load time =     302.41 ms
llama_print_timings:      sample time =      16.90 ms /    46 runs   (    0.37 ms per token,  2721.73 tokens per second)
llama_print_timings: prompt eval time =    4014.66 ms /   102 tokens (   39.36 ms per token,    25.41 tokens per second)
llama_print_timings:        eval time =    3969.27 ms /    45 runs   (   88.21 ms per token,    11.34 tokens per second)
llama_print_timings:       total time =    8094.20 ms /   147 tokens


INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: 25cc6433-6130-4200-a0ba-81b159ce4aec: Table of Contents  
Part I: Intro to Options Plans  
•What is an ESOP?  
•Wha...
> Querying with idx: 25cc6433-6130-4200-a0ba-81b159ce4aec: Table of Contents  
Part I: Intro to Options Plans  
•What is an ESOP?  
•Wha...
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: 9d9e8547-b0a8-42d3-a360-6e585e619386: Table of Contents  
Part I: Intro to Options Plans  
•What is an ESOP?  
•Wha...
> Querying with idx: 9d9e8547-b0a8-42d3-a360-6e585e619386: Table of Contents  
Part I: Intro to Options Plans  
•What is an ESOP?  
•Wha...
INFO:llama_index.core.indices.knowledge_graph.retrievers:> Querying with idx: ea22caf8-661f-4702-8c7c-935c8b72ac5c: What is an ESOP?  
•An Employee Stock Options Plan (ESOP)  
 
•An allocation ...
> Querying with idx: ea22caf8-661f-4702-8c7c-935c8b72ac5c: What is an ESOP?  
•An Employee Stock Options Plan (ESOP)  
 
•An allo

Llama.generate: prefix-match hit

llama_print_timings:        load time =     302.41 ms
llama_print_timings:      sample time =      30.64 ms /    83 runs   (    0.37 ms per token,  2708.44 tokens per second)
llama_print_timings: prompt eval time =   70205.81 ms /  1629 tokens (   43.10 ms per token,    23.20 tokens per second)
llama_print_timings:        eval time =    7859.95 ms /    82 runs   (   95.85 ms per token,    10.43 tokens per second)
llama_print_timings:       total time =   78521.47 ms /  1711 tokens


In [22]:
print(textwrap.fill(response.response.strip(),90))

An ESOP, or Employee Stock Ownership Plan, is a retirement plan that gives employees
ownership interest in the company through company stock or stock options. It can be used
as a tool for employee retention, motivation, and compensation, as well as a way for the
company to provide retirement benefits to its employees. ESOPs can also have tax
advantages for both the company and the employees.


In [23]:
from pyvis.network import Network
from IPython.display import display
g = index.get_networkx_graph()
net = Network(notebook=True,cdn_resources="in_line",directed=True)
net.from_nx(g)
net.show("graph.html")
net.save_graph("Knowledge_graph.html")
#
import IPython
IPython.display.HTML(filename="Knowledge_graph.html")

graph.html


to solve the problem of not detecting CMAKE in llamacpp+GPU:<br>
Add these line to your ~/.bashrc and reload the terminal. After that cmake will find the nvcc

export CUDA_HOME=/usr/local/cuda
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64
export PATH=$PATH:$CUDA_HOME/bin