In [2]:
# Run this cell and restart the kernel if the libraries are not present in the environment
! pip install --upgrade --quiet  langchain==0.1.12 langchain-community==0.0.29 langchain-openai==0.1.1 langchain-experimental==0.0.54 neo4j==5.18.0 

In [1]:
# Load all the libraries
from langchain_community.document_loaders import PubMedLoader
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain.text_splitter import TokenTextSplitter
from langchain_community.graphs import Neo4jGraph
from langchain_openai import ChatOpenAI
from langchain_community.chat_models import ChatMlflow
from langchain.chains.openai_functions import (
    create_openai_fn_chain,
    create_structured_output_chain,
)
from langchain_community.document_loaders import PyPDFLoader
from langchain.prompts import ChatPromptTemplate
from yfiles_jupyter_graphs import GraphWidget
from neo4j import GraphDatabase

import os
from tqdm import tqdm

In [2]:
# Load connection settings from the environment so no credentials live in the notebook.
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USERNAME = os.getenv('NEO4J_USERNAME')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')
# Not referenced again in the visible cells; the OpenAI client presumably reads the
# same environment variable on its own - TODO confirm.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# Fall back to the database name 'neo4j' when NEO4J_DATABASE is unset or empty.
NEO4J_DATABASE = os.getenv('NEO4J_DATABASE') or 'neo4j'

# Pass every loaded setting explicitly (the username used to be read but never handed
# over), so the connection does not depend on implicit fallbacks inside Neo4jGraph.
graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

In [None]:
from langchain_community.graphs.graph_document import (
    Node as BaseNode,
    Relationship as BaseRelationship,
    GraphDocument,
)
from langchain.schema import Document
from typing import List, Dict, Any, Optional
from langchain.pydantic_v1 import Field, BaseModel

# Key/value pair model; Node and Relationship below carry lists of these.
# Both fields are required strings (Field(...) declares no default).
# NOTE(review): the docstring and the Field descriptions are presumably exported in the
# structured-output schema handed to the LLM - confirm before rewording them.
class Property(BaseModel):
  """A single property consisting of key and value"""
  key: str = Field(..., description="key")
  value: str = Field(..., description="value")

# Subclass of the imported base Node that declares `properties` as an optional list of
# Property items; props_to_dict() later turns that list into a plain dict.
class Node(BaseNode):
    # Defaults to None when no properties are supplied.
    properties: Optional[List[Property]] = Field(
        None, description="List of node properties")

# Subclass of the imported base Relationship that declares `properties` as an optional
# list of Property items, mirroring the Node subclass above it in this cell.
class Relationship(BaseRelationship):
    # Defaults to None when no properties are supplied.
    properties: Optional[List[Property]] = Field(
        None, description="List of relationship properties"
    )

# Output model passed to create_structured_output_chain() in get_extraction_chain().
# NOTE(review): the docstring and Field descriptions are presumably part of the schema
# shown to the LLM - treat them as prompt text, not as ordinary documentation.
class KnowledgeGraph(BaseModel):
    """Generate a knowledge graph with entities and relationships."""
    # Required: the extracted nodes.
    nodes: List[Node] = Field(
        ..., description="List of nodes in the knowledge graph")
    # Required: the extracted relationships.
    rels: List[Relationship] = Field(
        ..., description="List of relationships in the knowledge graph"
    )

In [None]:
def format_property_key(s: str) -> str:
    """Normalise a property key to camelCase.

    The key is split on whitespace; the first word starts lower-case, every later
    word starts upper-case, and the words are joined without separators.

    Words written entirely in one case (``"blood"``, ``"REE"``) are fully re-cased,
    e.g. ``"REE value" -> "reeValue"``. Words that already mix cases
    (``"dateOfBirth"``, ``"BloodPressure"``) only have their first character
    adjusted, so keys that arrive in camelCase - which is what the extraction
    prompt asks for - are no longer flattened to ``"dateofbirth"``.

    Args:
        s: Raw property key.

    Returns:
        The camelCase key, or ``s`` unchanged when it contains no words
        (empty or whitespace-only input).
    """
    def _is_mixed_case(word: str) -> bool:
        # Neither all-lower nor all-upper: the word carries its own capitalisation.
        return not word.islower() and not word.isupper()

    words = s.split()
    if not words:
        return s

    parts = []
    for index, word in enumerate(words):
        is_first = index == 0
        if _is_mixed_case(word):
            # Touch only the leading character and keep interior capitals intact.
            head = word[:1].lower() if is_first else word[:1].upper()
            parts.append(head + word[1:])
        else:
            parts.append(word.lower() if is_first else word.capitalize())
    return "".join(parts)

def props_to_dict(props) -> dict:
    """Turn a sequence of key/value property objects into a plain dict.

    Each item's ``key`` is normalised with ``format_property_key`` and mapped to
    the item's ``value``; when two items normalise to the same key, the later
    one wins. A falsy ``props`` (``None``, empty list, ...) yields an empty dict.
    """
    return {
        format_property_key(prop.key): prop.value
        for prop in (props or [])
    }

def map_to_base_node(node: Node) -> BaseNode:
    """Convert an extracted KnowledgeGraph ``Node`` into a ``BaseNode``.

    The id is title-cased and the type is capitalised. The node's property list
    becomes a dict (empty when there are no properties), and the title-cased id
    is also stored under ``"name"``, overwriting any extracted ``name`` property.
    """
    # props_to_dict already returns a fresh {} for a missing or empty property list.
    node_props = props_to_dict(node.properties)
    display_id = node.id.title()
    # Add name property for better Cypher statement generation
    node_props["name"] = display_id
    return BaseNode(
        id=display_id,
        type=node.type.capitalize(),
        properties=node_props,
    )


def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Convert an extracted KnowledgeGraph ``Relationship`` into a ``BaseRelationship``.

    Both endpoints go through ``map_to_base_node`` (source first, then target),
    the relationship type is passed through unchanged, and the property list
    becomes a dict (empty when there are no properties).
    """
    base_source, base_target = (
        map_to_base_node(endpoint) for endpoint in (rel.source, rel.target)
    )
    # props_to_dict already returns a fresh {} for a missing or empty property list.
    rel_props = props_to_dict(rel.properties)
    return BaseRelationship(
        source=base_source,
        target=base_target,
        type=rel.type,
        properties=rel_props,
    )

In [None]:
# Optional: send chat completions through the AI gateway instead of OpenAI directly.
# Uncomment the block below and change the endpoint name as required.
# llm = ChatMlflow(
#     target_uri=os.environ["DOMINO_MLFLOW_DEPLOYMENTS"],
#     endpoint="chat-gpt35turbo-sm",
# )

# Default chat model used by get_extraction_chain() below.
llm = ChatOpenAI(
    temperature=0,
    model="gpt-3.5-turbo-16k",
)

def get_extraction_chain(
    allowed_nodes: Optional[List[str]] = None,
    allowed_rels: Optional[List[str]] = None
    ):
    """Build the structured-output chain that extracts a ``KnowledgeGraph`` from text.

    The chain combines the module-level ``llm`` with a system prompt describing
    the extraction rules and asks for output in the ``KnowledgeGraph`` format.

    Args:
        allowed_nodes: Optional node labels to restrict extraction to. When
            given, they are listed in the "Labeling Nodes" section of the prompt.
        allowed_rels: Optional relationship types to restrict extraction to,
            listed in the same section.

    Returns:
        The chain produced by ``create_structured_output_chain``; it expects the
        text to analyse as its ``input`` variable.
    """
    def _restriction_rule(title: str, values: Optional[List[str]]) -> str:
        # Render one prompt bullet for a restriction list, or nothing when the list
        # is empty so the prompt stays identical to the unrestricted version.
        if not values:
            return ""
        # The system message is itself a template: escape braces so a label can
        # never be mistaken for a prompt variable.
        joined = ", ".join(values).replace("{", "{{").replace("}", "}}")
        return f"- **{title}**: {joined}\n"

    node_label_rule = _restriction_rule("Allowed Node Labels", allowed_nodes)
    rel_type_rule = _restriction_rule("Allowed Relationship Types", allowed_rels)

    prompt = ChatPromptTemplate.from_messages(
        [(
          "system",
          f"""# Knowledge Graph Instructions
## 1. Overview
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
- **Nodes** represent entities and concepts similar to Wikipedia entries. Restrict the total number of unique entities and concepts to 10 to maintain focus and clarity.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible to a broad audience. The structure of the graph should be straightforward.

## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
  - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "researcher" or "participant" in the context of medical studies.
- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text, such as the name of a person like "Joe" or computer science terms like "algorithm".
{node_label_rule}{rel_type_rule}
## 3. Handling Numerical Data and Dates
- Numerical data, like measurements of REE or blood pressure, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format, with keys in camelCase
- **Quotation Marks**: Never use escaped single or double quotes within property values.

## 4. Coreference Resolution
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"),
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.
Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial.

## 5. Strict Compliance
Adherence to these guidelines is mandatory. Non-compliance will result in the discontinuation of the algorithm's application. Ensuring accuracy, consistency, and clarity in the knowledge graph is paramount for it to be a reliable and valuable medical resource.

          """),
            ("human", "Use the given format to extract information from the following input: {input}"),
            ("human", "Tip: Make sure to answer in the correct format"),
        ])
    return create_structured_output_chain(KnowledgeGraph, llm, prompt, verbose=False)

In [None]:
def extract_and_store_graph(
    document: Document,
    nodes:Optional[List[str]] = None,
    rels:Optional[List[str]]=None) -> None:
    """Extract a knowledge graph from one document and write it to the graph store.

    Args:
        document: Document whose ``page_content`` is sent to the extraction chain;
            it is also attached to the stored graph document as its source.
        nodes: Optional node labels forwarded to ``get_extraction_chain``.
        rels: Optional relationship types forwarded to ``get_extraction_chain``.
    """
    chain = get_extraction_chain(nodes, rels)
    # The structured KnowledgeGraph result is read from the 'function' key of the output.
    extracted = chain.invoke(document.page_content)['function']

    base_nodes = [map_to_base_node(item) for item in extracted.nodes]
    base_rels = [map_to_base_relationship(item) for item in extracted.rels]

    # Persist nodes and relationships through the module-level `graph` connection.
    graph.add_graph_documents(
        [GraphDocument(nodes=base_nodes, relationships=base_rels, source=document)]
    )

In [100]:
# Load the PDF to ingest and split it into documents.
PDF_PATH = "/mnt/code/data/activision.pdf"
loader = PyPDFLoader(PDF_PATH)
pages = loader.load_and_split()

In [None]:
# Node labels handed to the extraction step for every page.
allowed_nodes = [
    "Person",
    "Organization",
    "Risk",
    "Investor",
    "Location",
    "Game",
    "Franchise",
    "League",
]
# Ingest the first 20 pages only; call extract_and_store_graph(page) instead to run
# without the label list.
first_pages = pages[:20]
for page in tqdm(first_pages, total=len(first_pages)):
    extract_and_store_graph(page, allowed_nodes)

In [135]:
# Build the question-answering chain that queries the knowledge graph (RAG).
from langchain.chains import GraphCypherQAChain

# Refresh the graph's schema before the chain is constructed.
graph.refresh_schema()

# To use the AI gateway, uncomment the block below, change the endpoint name as
# required, and use chat_qa_llm as the qa_llm.
# chat_qa_llm = ChatMlflow(
#     target_uri=os.environ["DOMINO_MLFLOW_DEPLOYMENTS"],
#     endpoint="chat-gpt35turbo-sm",
# )

cypher_llm = ChatOpenAI(temperature=0.3, model="gpt-4")
qa_llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k")

cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=cypher_llm,
    qa_llm=qa_llm,
    validate_cypher=True,  # Validate relationship directions
    verbose=False,
)

In [136]:
# Ask the graph QA chain a question; the dict shown after the call is the recorded
# output, echoing the query alongside the generated answer under 'result'.
cypher_chain.invoke({"query": "Where is Activision Blizzard, Inc. incorporated?"})

{'query': 'Where is Activision Blizzard, Inc. incorporated?',
 'result': 'Activision Blizzard, Inc. is incorporated in Delaware.'}

In [137]:
# NOTE(review): the recorded answer credits King Digital Entertainment with World Of
# Warcraft, while the "What does Blizzard offer?" answer further down lists the
# Warcraft Franchise under Blizzard - check the extracted relationships.
cypher_chain.invoke({"query": "What does King Digital Entertainment offer?"})

{'query': 'What does King Digital Entertainment offer?',
 'result': 'King Digital Entertainment offers World Of Warcraft and Candy Crush.'}

In [138]:
# Question about who decides compensation changes; recorded answer follows the call.
cypher_chain.invoke({"query": "Who determines changes to compensation?"})

{'query': 'Who determines changes to compensation?',
 'result': "The Board Of Directors' Workplace Responsibility Committee determines changes to compensation."}

In [139]:
# Reverse of the previous question: what the board of directors determines.
cypher_chain.invoke({"query": "What does the board of directors determine?"})

{'query': 'What does the board of directors determine?',
 'result': 'The board of directors determines Annual Compensation, Long-Term Compensation, and Workplace Excellence Objectives.'}

In [140]:
# Open-ended listing question; the query names no specific entity.
cypher_chain.invoke({"query": "Give me a list of competitors"})

{'query': 'Give me a list of competitors',
 'result': 'Microsoft, Sony, Nintendo, Apple, and Google are competitors.'}

In [141]:
# Same "offer" question as asked earlier for King Digital Entertainment, now for Blizzard.
cypher_chain.invoke({"query": "What does Blizzard offer?"})

{'query': 'What does Blizzard offer?',
 'result': 'Blizzard offers the Warcraft Franchise, Diablo, and Overwatch.'}

In [142]:
# Question about the contents of the Form 10-K; recorded answer follows the call.
cypher_chain.invoke({"query": "What does the Form 10-K contain?"})

{'query': 'What does the Form 10-K contain?',
 'result': 'The Form 10-K contains information about Forward-Looking Statements.'}

In [144]:
# Question about the entities Activision Blizzard conducts business through.
cypher_chain.invoke({"query": "Who does Activision Blizzard conduct business through?"})

{'query': 'Who does Activision Blizzard conduct business through?',
 'result': 'Activision Blizzard conducts business through Activision Publishing and Blizzard Entertainment.'}

In [156]:
# The query spells "Strategy And Vision" in title case, the casing map_to_base_node
# applies to node ids.
cypher_chain.invoke({"query": "What does the Strategy And Vision focus on?"})

{'query': 'What does the Strategy And Vision focus on?',
 'result': 'The Strategy And Vision focuses on Expanding Audience Reach, Deepening Consumer Engagement, and Increasing Player Investment.'}

In [176]:
# Follow-up on the "Forward-Looking Statements" item named in the Form 10-K answer.
cypher_chain.invoke({"query": "What is included in the Forward-Looking Statements?"})

{'query': 'What is included in the Forward-Looking Statements?',
 'result': 'The Forward-Looking Statements include Revenues, Expenses, Income, Loss, Earnings Per Share, Cash Flow, Financial Items, Plans, Objectives, and Transaction.'}

In [174]:
# NOTE(review): this cell's execution count (174) is lower than the previous cell's
# (176), so the recorded outputs were not produced in top-to-bottom order.
cypher_chain.invoke({"query": "What allegations are mentioned"})

{'query': 'What allegations are mentioned',
 'result': 'The allegations mentioned are violation of Section 5 of the FTC Act and violation of Section 7 of the Clayton Act.'}