#### Setup

In [20]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
from langchain.graphs import Neo4jGraph
import os

# Neo4j connection settings are read from the environment in one pass.
# NOTE(review): `USERNAME` is also exported by some operating systems/shells; if
# load_dotenv() does not override an existing value, the OS login name would be
# used as the Neo4j user — confirm, or consider a dedicated variable name.
url, username, password = (
    os.getenv(setting) for setting in ("URL", "USERNAME", "PASSWORD")
)

# Graph handle used by the knowledge-graph cells further down.
graph = Neo4jGraph(url=url, username=username, password=password)

In [21]:
from langchain_community.document_loaders import PyPDFLoader

# Source RFP document, addressed relative to the notebook's working directory.
PDF_PATH = "files/中華郵政股份有限公司_資訊整體架構健檢及優化服務專案_建議書徵求說明書0420.pdf"

# `pages` is the list of Document objects consumed by the RAG and KG sections.
loader = PyPDFLoader(PDF_PATH)
pages = loader.load_and_split()

In [22]:
pages

[Document(page_content='資訊整體架構健檢及優化服務\n專案 \n建議書徵求說明書  \n(Request for Proposal )  \n \n \n \n \n \n \n \n108年11月25日\n交通部中華郵政股份有限公司', metadata={'source': 'files/中華郵政股份有限公司_資訊整體架構健檢及優化服務專案_建議書徵求說明書0420.pdf', 'page': 0}),
 Document(page_content='「資訊整體架構健檢及優化服務 專案」建議書徵求說明書   \n \nI \n交通部中華郵政股份有限公司  \n目錄  \n壹、 概述  ................................ ................................ ................................ ....................  1 \n一、  專案緣起  ................................ ................................ ................................ ...............................  1 \n二、  專案目標  ................................ ................................ ................................ ...............................  1 \n貳、 文件詞彙與用語  ................................ ................................ ................................  2 \n一、  名詞定義  ................................ ................................ ................................ ...............................  2 \n二、  文件用語  .

In [23]:
len(pages)

35

#### RAG

Import packages

In [68]:
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import OpenAI, OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter

ImportError: cannot import name 'ValuePassthrough' from 'langchain.runnables' (/home/cch1006/langchain-knowledge-graph/venv/lib/python3.10/site-packages/langchain/runnables/__init__.py)

Split the pages into smaller chunks using RecursiveCharacterTextSplitter

In [25]:
# Embedding model handed to the Pinecone vector store below.
embeddings = OpenAIEmbeddings()

# Re-split the loaded pages into small overlapping chunks for retrieval.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=20, chunk_size=200)
documents = text_splitter.split_documents(pages)

# Display the resulting chunks.
documents

[Document(page_content='資訊整體架構健檢及優化服務\n專案 \n建議書徵求說明書  \n(Request for Proposal )  \n \n \n \n \n \n \n \n108年11月25日\n交通部中華郵政股份有限公司', metadata={'source': 'files/中華郵政股份有限公司_資訊整體架構健檢及優化服務專案_建議書徵求說明書0420.pdf', 'page': 0}),
 Document(page_content='「資訊整體架構健檢及優化服務 專案」建議書徵求說明書   \n \nI \n交通部中華郵政股份有限公司  \n目錄  \n壹、 概述  ................................ ................................ ................................ ....................  1', metadata={'source': 'files/中華郵政股份有限公司_資訊整體架構健檢及優化服務專案_建議書徵求說明書0420.pdf', 'page': 1}),
 Document(page_content='一、  專案緣起  ................................ ................................ ................................ ...............................  1', metadata={'source': 'files/中華郵政股份有限公司_資訊整體架構健檢及優化服務專案_建議書徵求說明書0420.pdf', 'page': 1}),
 Document(page_content='二、  專案目標  ................................ ................................ ................................ ...............................  1', metadata={'source': 'files/中華郵政股份有限公司_資訊整體架構健檢及優化服務專案

The PDF file has 35 pages and is split into 160 documents.

In [26]:
len(documents)

160

Use Pinecone as the vector database

In [27]:
import hashlib

index_name = os.getenv("PINECONE_INDEX_NAME")
# Not passed explicitly below; kept so the notebook-level name stays available.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Give every chunk a deterministic id so that re-running this cell overwrites the
# same vectors instead of inserting a second copy. Without explicit ids each run
# adds the chunks again, which is why the search output recorded below lists every
# hit twice. The id is a hash of the source path and the chunk position, so it is
# stable as long as the splitter settings are unchanged, and the hex digest keeps
# it ASCII even though the source path is not.
# Copies already stored by earlier runs are not removed by this change.
chunk_ids = [
    hashlib.sha256(
        f"{doc.metadata.get('source')}|{position}".encode("utf-8")
    ).hexdigest()
    for position, doc in enumerate(documents)
]

docsearch = PineconeVectorStore.from_documents(
    documents,
    embeddings,
    index_name=index_name,
    ids=chunk_ids,
)

Search for the ten most similar documents; `docs[0]` contains 143 characters.

In [98]:
# Question reused by the retrieval and generation cells that follow.
query = "專案範圍包括什麼項目?"

# Fetch the ten nearest chunks from the Pinecone-backed store.
docs = docsearch.similarity_search(query=query, k=10)

# Character count of the best-matching chunk.
len(docs[0].page_content)

143

In [99]:
docs

[Document(page_content='二、  專案範圍  ................................ ................................ ................................ ...............................  5', metadata={'page': 1.0, 'source': 'files/中華郵政股份有限公司_資訊整體架構健檢及優化服務專案_建議書徵求說明書0420.pdf'}),
 Document(page_content='二、  專案範圍  ................................ ................................ ................................ ...............................  5', metadata={'page': 1.0, 'source': 'files/中華郵政股份有限公司_資訊整體架構健檢及優化服務專案_建議書徵求說明書0420.pdf'}),
 Document(page_content='專案管理運作方式、 執行之工作 項目、時程規劃及檢查時間點', metadata={'page': 24.0, 'source': 'files/中華郵政股份有限公司_資訊整體架構健檢及優化服務專案_建議書徵求說明書0420.pdf'}),
 Document(page_content='專案管理運作方式、 執行之工作 項目、時程規劃及檢查時間點', metadata={'page': 24.0, 'source': 'files/中華郵政股份有限公司_資訊整體架構健檢及優化服務專案_建議書徵求說明書0420.pdf'}),
 Document(page_content='投標廠商針對本專案所建議 範圍，列表說明最近 5年(含)內\n曾承包過 相類似工作經驗 之專案，並提供書面證明 (契約書\n或完工證明文件 等；所附文件須可供審查 履約績效 之內\n容)。 \n(十) 專案管理  \n投標廠商須說明針對 本專案之專案管理 (包含專案工作規\n劃、專案風險管制及專案狀態報告 )運作方式 、執行工作項\n目、

Use LangChain to build a RAG chain: `prompt | model | StrOutputParser()`

In [44]:
# Prompt that restricts the model to the retrieved context.
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

model = OpenAI()

# prompt -> completion model -> plain string
chain = prompt | model | StrOutputParser()

# Build the context from the retrieved chunks:
# - drop repeated chunks (the search result above contains each hit twice) while
#   keeping the ranking order, so the prompt is not padded with duplicates;
# - separate chunks with a blank line, since joining with '' glued the end of one
#   chunk onto the start of the next.
unique_chunks = list(dict.fromkeys(doc.page_content for doc in docs))
context = "\n\n".join(unique_chunks)

chain.invoke({"context": context, "question": query})

'Answer: 交通部中華郵政股份有限公司的重點是進行資訊整體架構健檢及優化服務專案，並徵求建議書 (Request for Proposal)，期望投標廠商能提出具有國內金融業經驗的資訊策略、組織及系統架構健檢與規劃專業能力，並要求專案成員為正式員工。此外，公司也設定了驗收準則，包含文件驗證準則和程式驗收準則。'

Instead of Pinecone, try FAISS as the vector database

In [101]:
# Build an in-memory FAISS index over the same chunks.
vectorstore = FAISS.from_documents(documents, OpenAIEmbeddings())

# The number of results has to be supplied through `search_kwargs`; a bare `k=10`
# keyword on as_retriever() is not a search setting, so the retriever fell back to
# its default result count instead of the intended ten.
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

# Prompt that restricts the model to the retrieved context.
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

model = OpenAI()

# The incoming question is sent both to the retriever (to fill `context`) and,
# unchanged, to the `question` slot of the prompt.
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

chain.invoke("專案範圍包括什麼項目?")

'\nAnswer: 專案範圍包括專案管理運作方式、執行的工作項目、時程規劃及檢查時間點。'

#### KG

In [4]:
from dotenv import load_dotenv
load_dotenv()

True

In [5]:
from langchain.graphs import Neo4jGraph
import os

# Same Neo4j connection setup as the Setup section, repeated so the KG section can
# be run on its own.
# NOTE(review): `USERNAME` is also exported by some operating systems/shells; if
# load_dotenv() does not override an existing value, the OS login name would be
# used as the Neo4j user — confirm, or consider a dedicated variable name.
url, username, password = (
    os.getenv(setting) for setting in ("URL", "USERNAME", "PASSWORD")
)

graph = Neo4jGraph(url=url, username=username, password=password)

In [4]:
from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel

# Key/value pair used to express node and relationship attributes as a list of
# entries; props_to_dict() later folds such a list back into a plain dict.
# NOTE(review): the class docstring and the Field descriptions are presumably
# exported with the schema given to the LLM, so they are deliberately left
# unchanged — confirm before rewording them.
class Property(BaseModel):
  """A single property consisting of key and value"""
  key: str = Field(..., description="key")
  value: str = Field(..., description="value")

# Extraction-side node: a BaseNode whose `properties` field is declared as an
# optional list of Property entries (default None). map_to_base_node() converts
# instances back to a BaseNode with dict properties before they are stored.
# NOTE(review): no docstring is added here because class docstrings may be
# picked up as schema descriptions — confirm.
class Node(BaseNode):
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

# Extraction-side relationship: a BaseRelationship whose `properties` field is
# declared as an optional list of Property entries (default None).
# map_to_base_relationship() converts instances back to a BaseRelationship.
# NOTE(review): no docstring is added here because class docstrings may be
# picked up as schema descriptions — confirm.
class Relationship(BaseRelationship):
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )

# Top-level structured-output schema passed to create_structured_output_chain()
# in get_extraction_chain(). Both fields are required (`...`).
# `nodes` and `rels` are the attribute names read by extract_and_store_graph().
class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )


In [None]:
def format_property_key(s: str) -> str:
    """Turn a whitespace-separated phrase into a camelCase key.

    The first token is lower-cased, every following token is capitalized
    (first letter upper, rest lower), and the tokens are concatenated.
    A string with no tokens (empty or whitespace only) is returned unchanged.
    """
    tokens = s.split()
    if len(tokens) == 0:
        return s
    head, *rest = tokens
    return head.lower() + "".join(token.capitalize() for token in rest)

def props_to_dict(props) -> dict:
    """Fold a sequence of key/value property objects into a dictionary.

    Each item's `key` is normalised with format_property_key() and mapped to
    the item's `value`; when two items normalise to the same key the later
    one wins. A missing or empty `props` yields an empty dict.
    """
    if not props:
        return {}
    return {format_property_key(item.key): item.value for item in props}

def map_to_base_node(node: Node) -> BaseNode:
    """Convert an extracted Node into a BaseNode.

    The property list becomes a dict, the title-cased id is used both as the
    node id and as a `name` property, and the type is capitalized.
    """
    attributes = props_to_dict(node.properties) if node.properties else {}
    title_cased_id = node.id.title()
    # A `name` property gives Cypher generation a readable field to match on.
    attributes["name"] = title_cased_id
    return BaseNode(
        id=title_cased_id,
        type=node.type.capitalize(),
        properties=attributes,
    )


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Convert an extracted Relationship into a BaseRelationship.

    Both endpoints go through map_to_base_node(), the relationship type is
    passed on as-is, and the property list becomes a dict.
    """
    start_node = map_to_base_node(rel.source)
    end_node = map_to_base_node(rel.target)
    attributes = props_to_dict(rel.properties) if rel.properties else {}
    return BaseRelationship(
        source=start_node,
        target=end_node,
        type=rel.type,
        properties=attributes,
    )

In [7]:
import os
from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
)
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# Read the key once and fail with a clear message when it is missing.
# Previously a missing key reached `os.environ[...] = None`, which raises
# "TypeError: str expected, not NoneType" and hides the real cause.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise EnvironmentError(
        "OPENAI_API_KEY is not set; define it in the environment or in the .env file."
    )
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Chat model used by get_extraction_chain(); temperature 0 for repeatable extraction.
llm = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0)

def get_extraction_chain(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
    ):
    """Build the structured-output chain that extracts a KnowledgeGraph from text.

    Args:
        allowed_nodes: Optional node labels; when given, they are listed in the
            system prompt as the allowed node labels.
        allowed_rels: Optional relationship types; when given, they are listed in
            the system prompt as the allowed relationship types.

    Returns:
        The chain produced by create_structured_output_chain() for the
        KnowledgeGraph schema, the module-level `llm` and the prompt built here.
    """
    # The system message is an f-string, so the optional allow-lists are rendered
    # when the chain is built; only `{input}` in the first human message is left
    # for the prompt template to fill in at invoke time.
    # NOTE(review): the prompt heading mentions GPT-4 while `llm` in this cell is
    # configured as gpt-3.5-turbo-16k.
    prompt = ChatPromptTemplate.from_messages(
        [(
          "system",
          f"""# Knowledge Graph Instructions for GPT-4
## 1. Overview
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
- **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
  - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
{'- **Allowed Node Labels:**' + ", ".join(allowed_nodes) if allowed_nodes else ""}
{'- **Allowed Relationship Types**:' + ", ".join(allowed_rels) if allowed_rels else ""}
## 3. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
## 4. Coreference Resolution
- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.
## 5. Strict Compliance
Adhere to the rules strictly. Non-compliance will result in termination.
          """),
            ("human", "Use the given format to extract information from the following input: {input}"),
            ("human", "Tip: Make sure to answer in the correct format"),
        ])
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)

In [None]:
# NOTE(review): despite its name this list collects the extraction *chains*
# created per document, not the extracted nodes/relationships — confirm intent.
nodes_and_relations = []
def extract_and_store_graph(
    document: Document,
    nodes:Optional[List[str]] = None,
    rels:Optional[List[str]]=None) -> None:
    """Extract a knowledge graph from one document and write it to Neo4j.

    Args:
        document: Document whose `page_content` is sent to the extraction chain.
        nodes: Optional allowed node labels forwarded to get_extraction_chain().
        rels: Optional allowed relationship types forwarded to get_extraction_chain().
    """
    # A new chain is built for every call and remembered in the module-level list.
    chain = get_extraction_chain(nodes, rels)
    nodes_and_relations.append(chain)

    # The parsed KnowledgeGraph is read from the 'function' key of the chain output.
    extracted = chain.invoke(document.page_content)['function']

    # Convert to base node/relationship objects and persist, keeping the
    # originating document as the source.
    base_nodes = list(map(map_to_base_node, extracted.nodes))
    base_rels = list(map(map_to_base_relationship, extracted.rels))
    graph.add_graph_documents(
        [GraphDocument(nodes=base_nodes, relationships=base_rels, source=document)]
    )

In [None]:
from tqdm import tqdm

# Run graph extraction over every loaded page, with a progress bar.
for pdf_page in tqdm(pages, total=len(pages)):
    extract_and_store_graph(pdf_page)

In [None]:
# Query the knowledge graph in a RAG application
from langchain.chains import GraphCypherQAChain

# Refresh the cached schema so it reflects the graph documents added above.
graph.refresh_schema()

# One model generates the Cypher statement, the other phrases the final answer.
cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=ChatOpenAI(model="gpt-4", temperature=0),
    qa_llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
    verbose=True,
    validate_cypher=True,  # Validate relationship directions
)

result = cypher_chain.invoke({"query": "專案範圍包括什麼項目?"})

#### RAG+KG

In [95]:
# Query the knowledge graph in a RAG application
from langchain.chains import GraphCypherQAChain
from langchain_openai import ChatOpenAI

# Refresh the cached schema before asking the model to write Cypher against it.
graph.refresh_schema()

query = "專案範圍包括什麼項目?"

# One model generates the Cypher statement, the other phrases the final answer.
# Intermediate steps are returned so the retry loop below can inspect the rows
# that came back from the graph.
cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=ChatOpenAI(model="gpt-4", temperature=0),
    qa_llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
    verbose=True,
    validate_cypher=True,  # Validate relationship directions
    return_intermediate_steps=True,
)
result = cypher_chain.invoke({"query": query})

# Small chain that rewords the question while keeping its meaning and language.
template = """Generate a new query in other words, the meaning of the following query should stay the same, you should not miss any information.
THE LANGUAGE SHOULD STAY THE SAME ,IF THE QUERY IS IN CHINESE, THE NEW QUERY SHOULD ALSO BE IN CHINESE.
{query}
"""
prompt = ChatPromptTemplate.from_template(template)
model = OpenAI()
chain = prompt | model | StrOutputParser()

# While the graph lookup returns no rows, reword the question and ask again,
# giving up after three rewordings. `query` ends up holding the last wording tried.
count = 0
while True:
    if result["intermediate_steps"][1]["context"] != [] or count >= 3:
        break
    count += 1
    query = chain.invoke({"query": query})
    result = cypher_chain.invoke({"query": query})
result



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Project)-[:INCLUDES]->(i) RETURN p.name, i.name[0m
Full Context:
[32;1m[1;3m[{'p.name': '資訊整體架構健檢及優化服務 專案', 'i.name': '硬體架構'}, {'p.name': '資訊整體架構健檢及優化服務 專案', 'i.name': '軟體架構'}, {'p.name': '資訊整體架構健檢及優化服務 專案', 'i.name': '備援架構'}, {'p.name': '資訊整體架構健檢及優化服務 專案', 'i.name': '能量規劃'}, {'p.name': '資訊整體架構健檢及優化服務 專案', 'i.name': '監控規劃'}, {'p.name': '資訊整體架構健檢及優化服務 專案', 'i.name': '管理與維運'}, {'p.name': '資訊整體架構健檢及優化服務 專案', 'i.name': '同一區域之防禦機制'}, {'p.name': '資訊整體架構健檢及優化服務 專案', 'i.name': '端點設備管控機制'}, {'p.name': '資訊整體架構健檢及優化服務 專案', 'i.name': '檔案分享資料安全控管架構與程序'}, {'p.name': '資訊整體架構健檢及優化服務 專案', 'i.name': '人員配置'}][0m

[1m> Finished chain.[0m


{'query': '專案範圍包括什麼項目?',
 'result': '專案範圍包括硬體架構, 軟體架構, 備援架構, 能量規劃, 監控規劃, 管理與維運, 同一區域之防禦機制, 端點設備管控機制, 檔案分享資料安全控管架構與程序, 以及人員配置。',
 'intermediate_steps': [{'query': 'MATCH (p:Project)-[:INCLUDES]->(i) RETURN p.name, i.name'},
  {'context': [{'p.name': '資訊整體架構健檢及優化服務 專案', 'i.name': '硬體架構'},
    {'p.name': '資訊整體架構健檢及優化服務 專案', 'i.name': '軟體架構'},
    {'p.name': '資訊整體架構健檢及優化服務 專案', 'i.name': '備援架構'},
    {'p.name': '資訊整體架構健檢及優化服務 專案', 'i.name': '能量規劃'},
    {'p.name': '資訊整體架構健檢及優化服務 專案', 'i.name': '監控規劃'},
    {'p.name': '資訊整體架構健檢及優化服務 專案', 'i.name': '管理與維運'},
    {'p.name': '資訊整體架構健檢及優化服務 專案', 'i.name': '同一區域之防禦機制'},
    {'p.name': '資訊整體架構健檢及優化服務 專案', 'i.name': '端點設備管控機制'},
    {'p.name': '資訊整體架構健檢及優化服務 專案', 'i.name': '檔案分享資料安全控管架構與程序'},
    {'p.name': '資訊整體架構健檢及優化服務 專案', 'i.name': '人員配置'}]}]}

In [102]:
from langchain_core.runnables import Runnable

class ValueRunnable(Runnable):
    """Runnable that ignores its input and always returns a fixed value.

    It lets a value computed earlier in the notebook (here the knowledge-graph
    answer) be placed in a chain's input mapping next to real runnables.
    """

    def __init__(self, value):
        """Store `value`, the object every invoke() call will return."""
        super().__init__()
        self.value = value

    def invoke(self, input, config=None, **kwargs):
        """Return the stored value.

        The signature mirrors Runnable.invoke (input, optional config, extra
        keyword arguments) so the object can be called the same way as any
        other runnable; all arguments are ignored.
        """
        return self.value


# Build an in-memory FAISS index over the document chunks.
vectorstore = FAISS.from_documents(documents, OpenAIEmbeddings())

# The number of results has to be supplied through `search_kwargs`; a bare `k=10`
# keyword on as_retriever() is not a search setting, so the retriever fell back to
# its default result count instead of the intended ten.
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

# Prompt combining the retrieved chunks with the knowledge-graph answer.
template = """Answer the question based on the following context:
{context}
Here is some additional information to answer the question:
{context_neo4j}
The context is much more important to context_neo4j.The context_neo4j is just a reference to help you answer the question.

Question: {question}
"""
prompt1 = ChatPromptTemplate.from_template(template)

model = OpenAI()

# Wrap the answer produced by the Cypher chain so it can be used as a chain input.
context_neo4j = ValueRunnable(result["result"])

# The question goes to the retriever (for `context`) and unchanged to `question`;
# `context_neo4j` always yields the stored graph answer.
chain = (
    {"context": retriever, "context_neo4j": context_neo4j, "question": RunnablePassthrough()}
    | prompt1
    | model
    | StrOutputParser()
)

chain.invoke(query)

'\nAnswer: 專案範圍包括硬體架構, 軟體架構, 備援架構, 能量規劃, 監控規劃, 管理與維運, 同一區域之防禦機制, 端點設備管控機制, 檔案分享資料安全控管架構與程序, 以及人員配置。'