In [12]:
from langchain_huggingface import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.output_parsers import PydanticOutputParser, RegexParser
from pydantic import BaseModel
from typing import List, Dict, Any
import json
import torch
import re

from knowledgegraph import KnowledgeGraph

In [2]:
# Read the Hugging Face access token from a local key file so that it is
# never hardcoded in the notebook.
with open("hf.key") as f:
    # Strip surrounding whitespace: key files usually end with a newline,
    # which must not become part of the token sent to the Hub.
    hf_token = f.read().strip()

In [3]:
# Decimal gigabyte (10^9 bytes), used to convert raw byte counts for display.
BYTES_IN_GB = 1000_000_000

def print_mem(msg = ""):
    """Print how much CUDA memory is in use, in GB and as a percentage.

    The figures come from ``torch.cuda.mem_get_info()``, which returns a
    ``(free, total)`` pair in bytes; "used" is computed as ``total - free``.

    Args:
        msg: Optional label (e.g. "after model load"). When non-empty it is
            printed as a ``"<msg>: "`` prefix so successive readings can be
            told apart; when empty the output is just the usage line.
    """
    free, total = torch.cuda.mem_get_info()
    used = total - free

    perc_used = round(used / total * 100.0, 1)
    used_gb = round(used / BYTES_IN_GB, 1)
    total_gb = round(total / BYTES_IN_GB, 1)

    prefix = f'{msg}: ' if msg else ''
    print(f'{prefix}CUDA mem usage: {used_gb}/{total_gb}GB ({perc_used}%)')

print_mem()

CUDA mem usage: 0.8/12.5GB (6.7%)


In [4]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)

# Load a 4-bit quantized causal LM from the Hugging Face Hub and wrap it in a
# LangChain-compatible `llm` object used by the rest of the notebook.

# Alternative model that was tried earlier (kept for reference):
# MODEL_NAME = "Trelis/Mistral-7B-Instruct-v0.1-Summarize-16k"
MODEL_NAME= "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"

# bitsandbytes settings: 4-bit NF4 weights, float16 compute, and no double
# quantization.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    token=hf_token
)
# NOTE(review): use_cache=False is usually a training-time setting; for
# inference it may slow down generation. Confirm this is intentional.
model.config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    token=hf_token
)

# Text-generation pipeline: sampling is enabled at a low temperature, each
# call produces at most 512 new tokens, and the EOS token doubles as the
# padding token.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    temperature=0.1,
    repetition_penalty=1.2
)

# Adapter that lets the pipeline be composed with LangChain prompts (`|`).
llm = HuggingFacePipeline(pipeline=pipe)

# Report GPU memory after the model has been loaded.
print_mem()

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


CUDA mem usage: 8.8/12.5GB (71.0%)


In [32]:
# Output schema the LLM is asked to follow when extracting relationships.
#
# NOTE(review): documentation is kept in `#` comments rather than class
# docstrings on purpose -- pydantic may copy a class docstring into the
# generated JSON schema, which would change the format instructions printed
# below and embedded in the prompt. Confirm before converting to docstrings.

# A person mentioned in the text: a name plus a free-text role.
class Person(BaseModel):
    name: str
    role: str

# A directed, labelled link from `source` to `target`.
# NOTE(review): the schema declares source/target as Person objects, but the
# sample run in this notebook shows the model returning plain name strings --
# confirm which shape downstream code should expect.
class Relationship(BaseModel):
    source: Person
    target: Person
    relationship: str

# Top-level response object: the list of all extracted relationships.
class QueryResponse(BaseModel):
    relationships: List[Relationship]

# The parser is used here to generate format instructions for the prompt.
parser = PydanticOutputParser(pydantic_object=QueryResponse)

# Show the instructions text that gets injected into the extraction prompt.
print(parser.get_format_instructions())

The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"$defs": {"Person": {"properties": {"name": {"title": "Name", "type": "string"}, "role": {"title": "Role", "type": "string"}}, "required": ["name", "role"], "title": "Person", "type": "object"}, "Relationship": {"properties": {"source": {"$ref": "#/$defs/Person"}, "target": {"$ref": "#/$defs/Person"}, "relationship": {"title": "Relationship", "type": "string"}}, "required": ["source", "target", "relationship"], "title": "Relationship", "type": "object"}}, "properties": {"relationships": {"items": {"$ref": "#/$defs/Relationship"}, "title": "Re

In [39]:
class KnowledgeGraphLLM:
    """Populate and query a ``KnowledgeGraph`` with the help of an LLM.

    Two LangChain chains are built on top of the notebook-level ``llm``:

    * ``relation_chain`` asks the model to extract relationships from free
      text; the result is parsed and stored in ``self.kg``.
    * ``query_chain`` asks the model to answer a question using only a JSON
      dump of the graph.

    NOTE(review): the prompts use ``[INST] ... [/INST]`` markup; confirm this
    matches the chat format expected by the model configured in ``llm``.
    """

    # Matches a ```json ... ``` fenced block anywhere in the raw LLM output.
    _JSON_FENCE = re.compile(r'```json(.*?)```', re.DOTALL)

    def __init__(self):
        """Create an empty graph and build the extraction and query chains."""
        self.kg = KnowledgeGraph()
        self.llm = llm

        # Define prompts for different operations
        self.relation_prompt = PromptTemplate(
            input_variables=["subject", "context"],
            partial_variables={
                "format_instructions": parser.get_format_instructions()
            },
            template="""[INST] Given the following information about {subject}, identify potential relationships for each person.
                If you don't know the answer, return an empty JSON object instead.
            
                [CONTEXT]
                {context}
                [/CONTEXT]

                [FORMAT]                
                {format_instructions}
                [/FORMAT]
            
                [/INST]""")

        # The opening tag is spelled [GRAPH_DATA] so that it matches the
        # closing [/GRAPH_DATA] tag.
        self.query_prompt = PromptTemplate(
            input_variables=["question", "graph_data"],
            template="""[INST] Answer the following question using only the provided knowledge graph data.
            If you cannot answer with certainty, say "I cannot determine this from the available data."
            
            [GRAPH_DATA]
            {graph_data}
            [/GRAPH_DATA]
            
            [QUESTION]
            {question}
            [/QUESTION]
            
            [/INST]"""
        )

        # Create LangChain chains. The pydantic parser is deliberately not
        # piped in: the fenced JSON is extracted by hand in
        # `extract_relationships`.
        self.relation_chain = self.relation_prompt | self.llm # | parser
        self.query_chain = self.query_prompt | self.llm

    @classmethod
    def _parse_relationships(cls, output: str) -> List[Dict[str, str]]:
        """Pull the relationship list out of raw LLM output; [] if unusable."""
        fenced_blocks = cls._JSON_FENCE.findall(output)
        if not fenced_blocks:
            print("Error: No JSON block found in LLM response")
            return []

        try:
            # Use the last fenced block -- presumably the final answer after
            # any echoed prompt or intermediate reasoning text.
            payload = json.loads(fenced_blocks[-1].strip())
        except json.JSONDecodeError:
            print("Error: Could not parse LLM response as JSON")
            return []

        # The prompt allows "an empty JSON object" when nothing is known, so
        # a missing or null "relationships" key means "no relationships".
        if not isinstance(payload, dict):
            return []
        return payload.get("relationships") or []

    def extract_relationships(self, subject: str, context: str) -> List[Dict[str, str]]:
        """Extract relationships from unstructured text using LLM.

        Args:
            subject: Who or what the context is about; inserted in the prompt.
            context: Free text from which relationships are extracted.

        Returns:
            The list of relationship dicts found in the model's response,
            each of which has also been added to ``self.kg`` as two nodes
            and one edge. An empty list is returned (and the graph is left
            untouched) when the response contains no fenced JSON block, the
            JSON is malformed, or it carries no relationships.
        """
        # Get LLM's analysis
        output = self.relation_chain.invoke(input={
            "subject": subject, "context": context
        })

        relationships = self._parse_relationships(output)

        # Add all extracted relationships to the knowledge graph
        for rel in relationships:
            self.kg.add_node(rel["source"])
            self.kg.add_node(rel["target"])
            self.kg.add_edge(rel["source"], rel["target"], rel["relationship"])

        return relationships

    def graph_data(self):
        """Dump graph data to JSON (a string, pretty-printed with indent 2)."""
        return json.dumps(self.kg.dump(), indent=2)

    def smart_query(self, question: str) -> str:
        """Query the knowledge graph using LLM-powered reasoning.

        Returns the raw output of the query chain for ``question``, with the
        current graph dump supplied as the only data source in the prompt.
        """
        # Get LLM's analysis
        return self.query_chain.invoke(input={
            "question": question,
            "graph_data": self.graph_data()
        })

    def get_graph_summary(self) -> Dict[str, Any]:
        """Return a summary of the knowledge graph.

        Keys: ``node_count`` (``len`` of ``kg.nodes``), ``edge_count`` (sum
        of the lengths of the values of ``kg.edges``) and ``data`` (the JSON
        string produced by ``graph_data``).
        """
        return {
            "node_count": len(self.kg.nodes),
            "edge_count": sum(len(edges) for edges in self.kg.edges.values()),
            "data": self.graph_data()
        }

# Example usage
def example_usage():
    """Run an end-to-end demo: extract relationships, then query the graph.

    Builds a ``KnowledgeGraphLLM``, feeds it a short paragraph about "John",
    prints the extracted relationships, asks four questions against the
    resulting graph, and finally prints the graph summary.
    """
    # Initialize the system
    kg_llm = KnowledgeGraphLLM()

    # Add information through unstructured text
    context = """
    John is a senior software engineer at TechCorp. He graduated from MIT
    in 2015 with a degree in Computer Science. He currently leads the 
    machine learning team and mentors junior engineers like Sarah and Mike.
    """

    # Extract and add relationships
    relationships = kg_llm.extract_relationships("John", context)
    print("Extracted relationships:", json.dumps(relationships, indent=2))

    # Query the enhanced knowledge graph
    questions = [
        "What is John's role at TechCorp?",
        "Who does John mentor?",
        "Where did John study?",
        "What was John's graduation year?"
    ]

    for question in questions:
        raw_answer = kg_llm.smart_query(question)
        # Keep only the text after the prompt's closing tag. If the tag is
        # absent from the output, show the whole output rather than an
        # empty answer.
        _, marker, tail = raw_answer.partition("[/INST]")
        answer = tail if marker else raw_answer
        print(f"###\n\nQ: {question}\nA: {answer}\n\n###")

    # Get graph summary
    print("\nKnowledge Graph Summary:")
    print(json.dumps(kg_llm.get_graph_summary(), indent=2))

In [40]:
example_usage()

Extracted relationships: [
  {
    "source": "John",
    "target": "Sarah",
    "relationship": "Mentorship"
  },
  {
    "source": "John",
    "target": "Mike",
    "relationship": "Mentorship"
  }
]
###

Q: What is John's role at TechCorp?
A:  To determine John's role at TechCorp based on the given knowledge graph data, I first examine the nodes and relationships. The nodes include "John," "Sarah," and "Mike." There are no specific attributes assigned to any of these individuals within the relationship definitions.

The relationships show that John has a mentorship link with both Sarah and Mike but do not provide information about their roles or positions within the company like TechCorp. Without additional details such as job titles mentioned in the relationships or explicit node descriptions, it's impossible to ascertain what role John plays specifically at TechCorp.

Therefore, due to insufficient data indicating John's position or responsibilities at TechCorp beyond being a mente

<__main__.KnowledgeGraphLLM at 0x7d2e2d9c8f10>

# Conclusion

The LLM chain correctly answers the questions regarding the relationships.  
In some test runs the LLM answers the first question, about John's role, as that of a mentor.

It cannot answer the other questions, because we deliberately did not add those facts (employer, university, graduation year) to the knowledge graph that is passed to the LLM.  
This showcases that the answers really come from the graph that was built, and not from the original context.