## Create Graph using text

In [None]:
!pip install langchain langchain-community langchain-openai langchain-experimental neo4j

In [135]:
# Natural-language question that the graph, vector-search and LLM cells read via `query`.
query = (
    "companies that Elon Musk co-founded and subsidiary other company"
)

In [142]:
import getpass
import os

# --- OpenAI / Azure OpenAI -------------------------------------------------
# Placeholders: fill in the empty values before running the cells that need them.
# The misspelled names AZURE_OPENAI_EMBEDING_MODEL and NEO4J_DATABSE are kept
# as-is because other cells read the variables under exactly these names.
os.environ["OPENAI_API_KEY"] = ""
os.environ["AZURE_OPENAI_API_KEY"] = ""
os.environ["AZURE_OPENAI_ENDPOINT"] = ""
os.environ["AZURE_OPENAI_API_VERSION"] = "2024-02-01"
os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] = "gpt4"
os.environ["AZURE_OPENAI_EMBEDING_MODEL"] = "text-embedding-ada-002"

# --- Azure AI Search -------------------------------------------------------
# Every value must be a plain str: a trailing comma after the literal turns it
# into a 1-tuple, and os.environ rejects non-str values with TypeError.
os.environ["AZURE_SEARCH_API_KEY"] = ""
os.environ["AZURE_SEARCH_ENDPOINT"] = ""
os.environ["AZURE_SEARCH_INDEX"] = ""

# --- Neo4j -----------------------------------------------------------------
# NOTE(review): host, user name and password are committed in the notebook;
# confirm these are throw-away demo credentials before sharing the file.
os.environ["NEO4J_URI"] = "bolt://54.146.136.117:7687"
os.environ["NEO4J_USERNAME"] = "test"
os.environ["NEO4J_PASSWORD"] = "password"
os.environ["NEO4J_DATABSE"] = "neo4j"

In [24]:
from langchain_community.graphs import Neo4jGraph
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import AzureChatOpenAI

# Graph handle; the database name to connect to comes from the NEO4J_DATABSE variable.
graph = Neo4jGraph(database=os.environ["NEO4J_DATABSE"])

# Azure-hosted chat model; API version and deployment name are read from the environment.
llm = AzureChatOpenAI(
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"],
)

# Transformer built on `llm`; later cells call its convert_to_graph_documents().
llm_transformer = LLMGraphTransformer(llm=llm)

### File to Graph

### Wiki to Graph

In [None]:
from langchain_community.document_loaders import WikipediaLoader

# Keep the Wikipedia page title in its own variable. `query` already holds the
# question that the graph, search and LLM cells read; overwriting it here would
# make those cells ask about "Elon Musk" on a top-to-bottom run.
wiki_query = "Elon Musk"
raw_documents = WikipediaLoader(query=wiki_query).load()

In [None]:
# DiffbotGraphTransformer calls Diffbot Natural Language API to extract entities and relationships in the article
from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
import os

# Read the Diffbot token from the DIFFBOT_API_KEY environment variable instead of
# pasting it into the notebook; when the variable is unset this falls back to the
# empty placeholder the cell used before.
diffbot_api_key = os.environ.get("DIFFBOT_API_KEY", "")

diffbot_nlp = DiffbotGraphTransformer(diffbot_api_key=diffbot_api_key)

# Diffbot's Natural Language API converts unstructured text data into knowledge graphs
graph_documents = diffbot_nlp.convert_to_graph_documents(raw_documents)

# add knowledge graph data to the neo4j database
graph.add_graph_documents(graph_documents)

### Text to Graph example

In [4]:
# Wrap a short biography in a Document and let the LLM transformer turn it into graph documents.

from langchain_core.documents import Document

text = """
Marie Curie, born in 1867, was a Polish and naturalised-French physicist and chemist who conducted pioneering research on radioactivity.
She was the first woman to win a Nobel Prize, the first person to win a Nobel Prize twice, and the only person to win a Nobel Prize in two scientific fields.
Her husband, Pierre Curie, was a co-winner of her first Nobel Prize, making them the first-ever married couple to win the Nobel Prize and launching the Curie family legacy of five Nobel Prizes.
She was, in 1906, the first woman to become a professor at the University of Paris.
"""
documents = [Document(page_content=text)]
graph_documents = llm_transformer.convert_to_graph_documents(documents)

# Show the nodes and relationships extracted from the single input document.
print(
    f"Nodes:{graph_documents[0].nodes}",
    f"Relationships:{graph_documents[0].relationships}",
    sep="\n",
)

Nodes:[Node(id='Marie Curie', type='Person'), Node(id='1867', type='Year'), Node(id='Polish', type='Nationality'), Node(id='Naturalised-French', type='Nationality'), Node(id='Physicist', type='Occupation'), Node(id='Chemist', type='Occupation'), Node(id='Radioactivity', type='Fieldofstudy'), Node(id='Nobel Prize', type='Award'), Node(id='Pierre Curie', type='Person'), Node(id='University Of Paris', type='Organization'), Node(id='1906', type='Year'), Node(id='Curie Family Legacy', type='Legacy'), Node(id='Five Nobel Prizes', type='Achievement')]
Relationships:[Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='1867', type='Year'), type='BORN_IN'), Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='Polish', type='Nationality'), type='NATIONALITY'), Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='Naturalised-French', type='Nationality'), type='NATIONALITY'), Relationship(source=Node(id='Marie Curie', type='Person')

In [5]:
# Same extraction, but with an explicit list of allowed node labels and relationship types.
llm_transformer_filtered = LLMGraphTransformer(
    llm=llm,
    allowed_nodes=["Person", "Country", "Organization"],
    allowed_relationships=["NATIONALITY", "LOCATED_IN", "WORKED_AT", "SPOUSE"],
)
graph_documents_filtered = llm_transformer_filtered.convert_to_graph_documents(documents)

# Show the nodes and relationships of the first (only) graph document.
print(
    f"Nodes:{graph_documents_filtered[0].nodes}",
    f"Relationships:{graph_documents_filtered[0].relationships}",
    sep="\n",
)

Nodes:[Node(id='Marie Curie', type='Person'), Node(id='Pierre Curie', type='Person'), Node(id='Poland', type='Country'), Node(id='France', type='Country'), Node(id='University Of Paris', type='Organization')]
Relationships:[Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='Poland', type='Country'), type='NATIONALITY'), Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='France', type='Country'), type='NATIONALITY'), Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='Pierre Curie', type='Person'), type='SPOUSE'), Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='University Of Paris', type='Organization'), type='WORKED_AT')]


In [6]:
# Same allowed labels and relationship types as the filtered example, plus a
# `born_year` entry in node_properties.
llm_transformer_props = LLMGraphTransformer(
    llm=llm,
    allowed_nodes=["Person", "Country", "Organization"],
    allowed_relationships=["NATIONALITY", "LOCATED_IN", "WORKED_AT", "SPOUSE"],
    node_properties=["born_year"],
)
graph_documents_props = llm_transformer_props.convert_to_graph_documents(documents)

# Show the nodes and relationships of the first (only) graph document.
print(
    f"Nodes:{graph_documents_props[0].nodes}",
    f"Relationships:{graph_documents_props[0].relationships}",
    sep="\n",
)

Nodes:[Node(id='Marie Curie', type='Person', properties={'born_year': '1867'}), Node(id='Pierre Curie', type='Person'), Node(id='University Of Paris', type='Organization'), Node(id='Poland', type='Country'), Node(id='France', type='Country'), Node(id='Nobel Prize', type='Organization')]
Relationships:[Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='Poland', type='Country'), type='NATIONALITY'), Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='France', type='Country'), type='NATIONALITY'), Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='University Of Paris', type='Organization'), type='WORKED_AT'), Relationship(source=Node(id='Marie Curie', type='Person'), target=Node(id='Pierre Curie', type='Person'), type='SPOUSE'), Relationship(source=Node(id='Pierre Curie', type='Person'), target=Node(id='Nobel Prize', type='Organization'), type='WORKED_AT'), Relationship(source=Node(id='Marie Curie', type='Person'), tar

In [11]:
# Create the graph: hand the property-enriched graph documents to the Neo4jGraph handle.
graph.add_graph_documents(graph_documents_props)

## Query Graph

In [143]:
# Display the schema text exposed by the graph handle (node and relationship properties).
graph.schema



'Node properties:\nPerson {positionHeld: STRING, academicDegree: STRING, id: STRING, name: STRING, age: STRING, dateOfBirth: STRING, numberOfChildren: STRING, causeOfDeath: STRING}\nOrganization {id: STRING, name: STRING, foundingDate: STRING, productType: STRING}\nSkill {name: STRING, id: STRING}\nLocation {id: STRING, name: STRING}\nAward {name: STRING, id: STRING}\nRelationship properties:\nEMPLOYEE_OR_MEMBER_OF {evidence: STRING, isCurrent: STRING, isNotCurrent: STRING, startTime: STRING, endTime: STRING, positionHeld: STRING}\nHAS_CHILD {evidence: STRING}\nFAMILY_MEMBER {evidence: STRING, isNotCurrent: STRING, startTime: STRING}\nSOCIAL_RELATIONSHIP {evidence: STRING, isNotCurrent: STRING, startTime: STRING}\nFOUNDED_BY {evidence: STRING}\nACQUIRED_BY {evidence: STRING, pointInTime: STRING}\nINDUSTRY {evidence: STRING}\nGEOGRAPHIC_HERITAGE {evidence: STRING}\nHAS_PARENT {evidence: STRING}\nCHIEF_EXECUTIVE_OFFICER {evidence: STRING, isNotCurrent: STRING, isCurrent: STRING, endTime:

In [46]:
from langchain.chains import GraphCypherQAChain

# Question-answering chain over the graph. The same `llm` is passed for both the
# Cypher-writing role (cypher_llm) and the answer-writing role (qa_llm).
initial_context_from_knowledge_graph = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=llm,
    qa_llm=llm,
    validate_cypher=True,
    verbose=True,
)

In [136]:
# Knowledge Graph Instructions and Query
# `rules` is a plain (non-f) string appended to the question. Two template
# fragments that referenced `allowed_nodes` / `allowed_rels` used to follow the
# "Node IDs" rule; neither name is defined in the visible notebook and the string
# was never interpolated, so the raw `{...}` expressions were sent to the model
# verbatim. They are dropped here.
rules = """
# Knowledge Graph Instructions:
## 1. Overview
You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph.
- **Nodes** represent entities and concepts. They're akin to Wikipedia nodes.
- The aim is to achieve simplicity and clarity in the knowledge graph, making it accessible for a vast audience.
## 2. Labeling Nodes
- **Consistency**: Ensure you use basic or elementary types for node labels.
  - For example, when you identify an entity representing a person, always label it as **"person"**. Avoid using more specific terms like "mathematician" or "scientist".
- **Node IDs**: Never utilize integers as node IDs. Node IDs should be names or human-readable identifiers found in the text.
## 3. Handling Numerical Data and Dates
- Numerical data, like age or other related information, should be incorporated as attributes or properties of the respective nodes.
- **No Separate Nodes for Dates/Numbers**: Do not create separate nodes for dates or numerical values. Always attach them as attributes or properties of nodes.
- **Property Format**: Properties must be in a key-value format.
- **Quotation Marks**: Never use escaped single or double quotes within property values.
- **Naming Convention**: Use camelCase for property keys, e.g., `birthDate`.
## 4. Coreference Resolution
- **Maintain Entity Consistency**: When extracting entities, it's vital to ensure consistency.
If an entity, such as "John Doe", is mentioned multiple times in the text but is referred to by different names or pronouns (e.g., "Joe", "he"), 
always use the most complete identifier for that entity throughout the knowledge graph. In this example, use "John Doe" as the entity ID.  
Remember, the knowledge graph should be coherent and easily understandable, so maintaining consistency in entity references is crucial. 
"""
# The chain input is the question held in `query`, followed by the rules text.
question_neuralink_industry = f"{query}, {rules}"
kg_context = initial_context_from_knowledge_graph.run(question_neuralink_industry)
# Bare expression so the notebook displays the answer.
kg_context




> Entering new GraphCypherQAChain chain...
Generated Cypher:
cypher
MATCH (p:Person {name: "Elon Musk"})-[:FOUNDED_BY]-(c:Organization)-[:SUBSIDIARY]->(s:Organization)
RETURN c.name AS CompanyName, s.name AS SubsidiaryName

Full Context:
[{'CompanyName': 'SpaceX', 'SubsidiaryName': 'TBC - THE BORING COMPANY'}, {'CompanyName': 'OpenAI', 'SubsidiaryName': 'OpenAI Global, LLC'}]

> Finished chain.


'SpaceX and OpenAI are companies that Elon Musk co-founded which have subsidiary companies. TBC - The Boring Company is a subsidiary of SpaceX, and OpenAI Global, LLC is a subsidiary of OpenAI.'

## Wikipedia: collect and embed with Chroma and DSPy (when not using Azure Search)

In [53]:
from langchain_community.document_loaders import WikipediaLoader

# Keep the Wikipedia page title in its own variable. `query` holds the question
# that the search and LLM cells read; overwriting it here would make those cells
# search for "Elon Musk" on a top-to-bottom run.
wiki_query = "Elon Musk"
raw_documents = WikipediaLoader(query=wiki_query).load()



  lis = BeautifulSoup(html).find_all('li')


In [74]:

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import AzureOpenAIEmbeddings

# Split the Wikipedia documents into chunks sized with the tiktoken encoder.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=512, chunk_overlap=100
)
all_splits = text_splitter.split_documents(raw_documents)

import chromadb.utils.embedding_functions as embedding_functions

# Endpoint, key and API version are read from the environment variables set in the
# configuration cell instead of being pasted into this cell.
# NOTE(review): an API key was previously hard-coded on this line; it remains in the
# file history and should be treated as compromised and rotated.
embeddings = AzureOpenAIEmbeddings(
    model="text-embedding-ada-002",
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version=os.environ["AZURE_OPENAI_API_VERSION"],
)

CHROMA_COLLECTION_NAME = "dspy-rag-chroma"
CHROMADB_DIR = "dspy_rag_chroma/"

# Index: embed every chunk into the named collection, stored under CHROMADB_DIR.
vectorstore = Chroma.from_documents(
    documents=all_splits,
    collection_name=CHROMA_COLLECTION_NAME,
    embedding=embeddings,
    persist_directory=CHROMADB_DIR
)
retriever = vectorstore.as_retriever()

In [75]:
# Ask the vector store (created with persist_directory=CHROMADB_DIR) to persist itself.
# NOTE(review): the recorded output of this cell is a deprecation warning — presumably
# newer Chroma releases persist automatically and this call can be dropped; confirm.
vectorstore.persist()

  warn_deprecated(


In [58]:
import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from dspy.retrieve.chromadb_rm import ChromadbRM
import os

# Embedding function for the DSPy Chroma retriever. Every setting is looked up in
# the environment; a missing variable yields None.
embedding_function = OpenAIEmbeddingFunction(
    api_key=os.getenv("OPENAI_API_KEY"),
    model_name=os.getenv("AZURE_OPENAI_EMBEDING_MODEL"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    api_base=os.getenv("AZURE_OPENAI_ENDPOINT"),
)

# Retrieval model over the Chroma collection and directory defined earlier, with k=3.
rm = ChromadbRM(
    CHROMA_COLLECTION_NAME,
    CHROMADB_DIR,
    embedding_function,
    k=3,
)

In [64]:
import dspy
from dspy.retrieve.chromadb_rm import ChromadbRM

# DSPy signature for the answer step: two input fields (context, question) and one
# output field (answer).
# NOTE(review): the docstring and the `desc` strings are presumably used by DSPy as
# prompt text — confirm before rewording any of them.
class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers."""

    context = dspy.InputField(desc="may contain relevant facts")  # filled with retrieved passages by the RAG module
    question = dspy.InputField()
    answer = dspy.OutputField(desc="often between 1 and 5 words")

# Alternative dspy LMs kept for reference. Credentials belong in environment
# variables and must not be pasted into the notebook, not even in comments.
#vanilla_dspy_rag_lm = dspy.OpenAI(model='gpt-3.5-turbo-instruct')
#vanilla_dspy_rag_lm = dspy.AzureOpenAI(model="gpt35", api_base=os.environ["AZURE_OPENAI_ENDPOINT"], api_key=os.environ["AZURE_OPENAI_API_KEY"], api_version=os.environ["AZURE_OPENAI_API_VERSION"])
# NOTE(review): `llm` is the LangChain AzureChatOpenAI object created earlier, not one
# of the dspy LM wrappers shown above — confirm dspy accepts it as `lm`.
dspy.settings.configure(lm=llm, rm=rm)

class vanilla_dspy_rag(dspy.Module):
    # Retrieve-then-answer module: fetch passages for the question, then run a
    # chain-of-thought predictor over the GenerateAnswer signature.

    def __init__(self, num_passages=1):
        # num_passages defaults to 1 to avoid the same passage being retrieved
        # repeatedly.
        super().__init__()

        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    def forward(self, question):
        # The retrieved passages serve as the context of the answer step and are
        # returned next to the answer.
        passages = self.retrieve(question).passages
        answer = self.generate_answer(context=passages, question=question).answer
        return dspy.Prediction(context=passages, answer=answer)

In [None]:
question_neuralink_industry = 'What industry or industries is Neuralink in?'
# Vector-query the Chroma DB using the question. The module must be instantiated
# first: calling the class directly with the question would pass the question to
# __init__ as `num_passages` and never run `forward`.
vanilla_dspy_rag()(question_neuralink_industry)

## Azure Vector Search

In [85]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import VectorizedQuery

In [82]:
# Azure AI Search connection settings; each one is a required environment variable
# (a missing name raises KeyError).
service_endpoint, index_name, key = (
    os.environ[name]
    for name in ("AZURE_SEARCH_ENDPOINT", "AZURE_SEARCH_INDEX", "AZURE_SEARCH_API_KEY")
)

def get_embeddings(text: str):
    """Return the embedding of ``text`` produced by the Azure OpenAI embeddings API.

    Endpoint, API version and embedding model name are read from the
    AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_API_VERSION and
    AZURE_OPENAI_EMBEDING_MODEL environment variables.

    Args:
        text: The string to embed; it is sent as a single-element input list.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    # There are a few ways to get embeddings. This is just one example.
    import openai

    # The client targets the Azure endpoint, so prefer the Azure key. Fall back to
    # OPENAI_API_KEY (the only variable consulted previously) when the Azure key is
    # unset or empty, so existing setups keep working.
    api_key = os.getenv("AZURE_OPENAI_API_KEY") or os.getenv("OPENAI_API_KEY")

    client = openai.AzureOpenAI(
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
        api_key=api_key,
        api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    )
    response = client.embeddings.create(
        input=[text], model=os.getenv("AZURE_OPENAI_EMBEDING_MODEL")
    )
    return response.data[0].embedding


## Vector Search

In [137]:

# Pure vector search: embed the question in `query` and request its 3 nearest
# neighbours on the "content_vector" field.
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))
vector_query = VectorizedQuery(
    vector=get_embeddings(query),
    k_nearest_neighbors=3,
    fields="content_vector",
)

results = search_client.search(
    vector_queries=[vector_query],
    select=["id", "content"],
)

# Print every hit and collect it in `result_array`, which the answer cells read.
result_array = []
for result in results:
    print(result)
    result_array += [result]
# [END single_vector_search]

{'content': "he made from the sale of PayPal, Musk founded SpaceX, a spaceflight services company, in 2002. \nIn 2004, Musk became an early investor in electric vehicle manufacturer Tesla Motors, Inc. (later Tesla, Inc.). He became the company's chairman and product architect, assuming the position of CEO in 2008. In 2006, Musk helped create SolarCity, a solar-energy company that was acquired by Tesla in 2016 and became Tesla Energy. In 2013, he proposed a hyperloop high-speed vactrain transportation system. In 2015, he co-founded OpenAI, a nonprofit artificial intelligence research company. The following year, Musk co-founded Neuralink—a neurotechnology company developing brain–computer interfaces—and the Boring Company, a tunnel construction company.", 'id': 'aHR0cHM6Ly9zdHJpbWtjbGZsaGQzeW1hLmJsb2IuY29yZS53aW5kb3dzLm5ldC9kb2N1bWVudHMvaHR0cHM6Ly9lbi53aWtpcGVkaWEub3JnL3dpa2kvRWxvbl9NdXNr0', '@search.score': 0.88719743, '@search.reranker_score': None, '@search.highlights': None, '@searc

## Hybrid Search (text + vector)

In [None]:
# Hybrid search: the question in `query` is sent both as search text and as an
# embedding (3 nearest neighbours on the "content_vector" field).
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))
vector_query = VectorizedQuery(
    vector=get_embeddings(query),
    k_nearest_neighbors=3,
    fields="content_vector",
)

results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    select=["id", "content"],
)

# Print every hit; nothing is stored by this cell.
for result in results:
    print(result)
# [END simple_hybrid_search]

## LLM Answer (No data sources)

In [None]:
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain_openai import ChatOpenAI

In [141]:
chat = llm

# Baseline answer: the model receives only the question, without graph or search results.
messages = [
    SystemMessage(
        content="You are a helpful assistant who generates information to questions. Please answer the question"
    ),
    HumanMessage(content=f"{query}"),
    HumanMessage(
        content="write a short answer based on the information above, provide a short answer, not nore than 1 line that must answer the question."
    ),
]
LLM_ans_colleague = chat.invoke(messages)
print(LLM_ans_colleague.content)

Elon Musk co-founded companies like Zip2, PayPal, SpaceX, Tesla, Inc., Neuralink, and The Boring Company; Tesla's subsidiary includes SolarCity.


## Vector LLM answer

In [None]:
# Print the question currently held in `query`.
print(query)

In [138]:
chat = llm

# Vector search answer: the question goes into the system message and the search
# hits collected in `result_array` are passed as a human message.
messages = [
    SystemMessage(
        content=f"You are a helpful assistant who generates information grounded with facts. Please enhance Vector search answer and generate the final answer. the question was: {query}"
    ),
    HumanMessage(content=f"Search Results: {result_array}"),
    HumanMessage(
        content="write a short answer based on the information above, provide a short answer, not nore than 1 line that must answer the question."
    ),
]
vector_ans_colleague = chat.invoke(messages)
print(vector_ans_colleague.content)

Elon Musk co-founded and is involved with companies such as SpaceX, Tesla, Inc., SolarCity (now Tesla Energy), Neuralink, The Boring Company, OpenAI, and he acquired Twitter, which was later integrated into X Corp.


# Boom!

In [139]:
chat = llm

# Full answer: the knowledge-graph answer (`kg_context`) and the vector search hits
# (`result_array`) are both passed to the model.
messages = [
    SystemMessage(
        content=f"You are a helpful assistant who generates information grounded with facts. Please enhance the original answer with complementary entity and relationship information from the knowledge graph to generate the final answer. the question was: {query}"
    ),
    HumanMessage(content=f"Graph Results: {kg_context}"),
    HumanMessage(content=f"Search Results: {result_array}"),
    HumanMessage(
        content="write a short answer based on the information above, provide a short answer, not nore than 1 line that must answer the question."
    ),
]
final_ans_colleague = chat.invoke(messages)
print(final_ans_colleague.content)

Elon Musk co-founded companies such as SpaceX, Tesla, Inc., SolarCity, OpenAI, Neuralink, and The Boring Company, with SpaceX owning The Boring Company as a subsidiary and OpenAI Global, LLC being a subsidiary of OpenAI.
