In [None]:
# Jupyter already runs an asyncio event loop; patching it with nest_asyncio lets
# later cells call asyncio.run(...) without "event loop is already running" errors.
import nest_asyncio
nest_asyncio.apply()

In [None]:
import dspy
import asyncio
import os
from dotenv import load_dotenv
from datasets import load_dataset
import logging

from nano_graphrag._utils import compute_mdhash_id
from nano_graphrag.entity_extraction.extract import generate_dataset, compile_model
from nano_graphrag.entity_extraction.module import EntityRelationshipExtractor

In [None]:
# Directory that holds the artifacts this notebook produces (the dataset and
# compiled-module paths below are built from it).
WORKING_DIR = "./nano_graphrag_cache_finetune_entity_relationship_dspy"

# Pull variables from a local .env file into the process environment; the
# DEEPSEEK_* variables read further down are expected to be available afterwards.
load_dotenv()

# Root logger stays at WARNING to keep output quiet, while the "nano-graphrag"
# logger is opened up to DEBUG.
logging.basicConfig(level=logging.WARNING)
logging.getLogger("nano-graphrag").setLevel(logging.DEBUG)

In [None]:
# System prompt passed to the language model client below.
# The leading newline and indentation are part of the string that is sent.
system_prompt = """
    You are a world-class AI system, capable of complex reasoning and reflection. 
    Reason through the query, and then provide your final response. 
    If you detect that you made a mistake in your reasoning at any point, correct yourself.
    Think carefully.
"""
# The "deepseek-chat" model is reached through dspy's OpenAI client by pointing
# it at the base URL from the environment. Both DEEPSEEK_* variables must be set;
# os.environ[...] raises KeyError otherwise.
lm = dspy.OpenAI(
    model="deepseek-chat", 
    model_type="chat", 
    api_key=os.environ["DEEPSEEK_API_KEY"], 
    base_url=os.environ["DEEPSEEK_BASE_URL"], 
    system_prompt=system_prompt, 
    temperature=0.3,
    top_p=1.0,
    max_tokens=4096
)
# Register this client in dspy's global settings.
dspy.settings.configure(lm=lm)

In [None]:
# Make sure the working directory exists; exist_ok avoids an error on re-runs.
os.makedirs(WORKING_DIR, exist_ok=True)
# Number of rows taken from the END of the train split (see the negative slice below).
train_len = 20
# File paths handed to generate_dataset (.pkl) and compile_model (.json) in later cells.
entity_relationship_dataset_path = os.path.join(WORKING_DIR, "entity_relationship_extraction_news.pkl")
entity_relationship_module_path = os.path.join(WORKING_DIR, "entity_relationship_extraction_news.json")
# Financial news articles from the Hugging Face Hub.
ds = load_dataset("ashraq/financial-news-articles")
# Slice out the last `train_len` rows; the result is indexed by column name
# (e.g. train_data['text']) in the cells that follow.
train_data = ds['train'][-train_len:]

In [None]:
# Sanity check: show the text of the last selected article.
train_data['text'][-1]

In [None]:
# Build the chunk mapping expected by generate_dataset: one entry per article,
# keyed by a "chunk-"-prefixed id derived from the text, with the raw text under
# "content". Articles with identical text would collapse into a single entry
# (assuming compute_mdhash_id is deterministic — confirm in nano_graphrag._utils).
chunks = {
    compute_mdhash_id(text, prefix="chunk-"): {"content": text}
    for text in train_data["text"]
}
# generate_dataset is awaited via asyncio.run; inside Jupyter this relies on the
# nest_asyncio.apply() call made in the first cell of the notebook.
dataset = asyncio.run(
    generate_dataset(chunks=chunks, filepath=entity_relationship_dataset_path)
)

In [None]:
# Inspect the first item of the generated dataset.
dataset[0]

In [None]:
# Drill into the first item's relationships payload.
# NOTE(review): the exact meaning of `.context` is defined by nano_graphrag's models — confirm there.
dataset[0].relationships.context

In [None]:
# Instantiate the extractor with default arguments (not yet compiled) and display it.
model = EntityRelationshipExtractor()
model

In [None]:
# Compile the extractor using the dataset file produced by generate_dataset above.
# NOTE(review): module_path is presumably where compile_model stores the compiled
# module (.json) — confirm in nano_graphrag.entity_extraction.extract.
optimized_model = compile_model(
    model=model,
    dataset_path=entity_relationship_dataset_path,
    module_path=entity_relationship_module_path
)
# Display the compiled module returned by compile_model.
optimized_model