# Building LEED Knowledge Graph

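This notebook demonstrates how to build a knowledge graph from LEED materials (PDFs and Excel files): it extracts text from each file, runs entity and relation extraction on that text, saves the results as JSON, and then visualizes and explores the resulting graph.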

In [None]:
import os
import json
import logging
from pathlib import Path
from typing import List, Dict, Tuple, Optional
import pandas as pd
import fitz  # PyMuPDF
from PIL import Image
import io
import networkx as nx
import matplotlib.pyplot as plt
from kg_extractor import KnowledgeGraphExtractor

# Configure logging once for the whole notebook: INFO and above, each record
# prefixed with a timestamp and its level.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# Module-level logger used by the cells below.
logger = logging.getLogger(__name__)

## Initialize Knowledge Graph Builder

In [None]:
class LEEDKnowledgeGraphBuilder:
    """Build a knowledge graph from a directory of LEED PDFs and Excel workbooks.

    Text is pulled out of every supported file, passed chunk by chunk to a
    ``KnowledgeGraphExtractor``, and the entities and relations it returns are
    accumulated on the instance. The accumulated graph can then be written to
    JSON and drawn with networkx/matplotlib.
    """

    # Lower-cased file suffixes that ``build_knowledge_graph`` knows how to read.
    _PDF_SUFFIXES = ('.pdf',)
    _EXCEL_SUFFIXES = ('.xlsx', '.xls')

    def __init__(self, config_path: str = "../config.yaml"):
        """Initialize the knowledge graph builder.

        Args:
            config_path: Path handed unchanged to ``KnowledgeGraphExtractor``.
        """
        self.kg_extractor = KnowledgeGraphExtractor(config_path)
        # Entities keyed by their 'id' so repeated mentions collapse into one
        # record (the most recently extracted record wins).
        self.all_entities = {}
        # Relations in extraction order; these are NOT deduplicated.
        self.all_relations = []

    def extract_text_from_pdf(self, pdf_path: str) -> List[str]:
        """Extract the text of each page of a PDF.

        Embedded images are enumerated and decoded as a placeholder for a
        future vision-model step; they contribute nothing to the result yet.

        Args:
            pdf_path: Path of the PDF to read.

        Returns:
            One string per page that contains non-whitespace text, in page
            order. An empty list if the document cannot be opened or read.
        """
        logger.info(f"Processing PDF: {pdf_path}")
        text_chunks = []

        try:
            # The context manager closes the document even when a page fails.
            with fitz.open(pdf_path) as doc:
                for page_num, page in enumerate(doc):
                    text = page.get_text()
                    if text.strip():
                        text_chunks.append(text)

                    self._scan_page_images(doc, page, page_num)
        except Exception as e:
            logger.error(f"Error processing PDF {pdf_path}: {e}")
            return []

        return text_chunks

    def _scan_page_images(self, doc, page, page_num: int) -> None:
        """Decode and log every image embedded in ``page``.

        Failures are handled per image so that one undecodable image does not
        discard the text already extracted from the document.
        """
        for img_index, img in enumerate(page.get_images()):
            try:
                xref = img[0]
                base_image = doc.extract_image(xref)

                # Decode with PIL and release the image right away.
                with Image.open(io.BytesIO(base_image["image"])):
                    # TODO: Replace with actual vision model call
                    logger.info(f"Found image on page {page_num + 1}, image {img_index + 1}")
            except Exception as e:
                logger.warning(
                    f"Skipping image {img_index + 1} on page {page_num + 1}: {e}"
                )

    def extract_from_excel(self, excel_path: str) -> List[str]:
        """Convert every row of every sheet of a workbook into a statement.

        Args:
            excel_path: Path of the Excel workbook to read.

        Returns:
            Natural-language statements, sheet by sheet and row by row. An
            empty list if the workbook cannot be read.
        """
        logger.info(f"Processing Excel: {excel_path}")
        statements = []

        try:
            # Open the workbook once and parse each sheet from the same handle
            # rather than re-reading the file from disk for every sheet.
            with pd.ExcelFile(excel_path) as excel_file:
                for sheet_name in excel_file.sheet_names:
                    df = excel_file.parse(sheet_name)
                    statements.extend(self._dataframe_to_statements(df))
        except Exception as e:
            logger.error(f"Error processing Excel {excel_path}: {e}")
            return []

        return statements

    @staticmethod
    def _dataframe_to_statements(df: pd.DataFrame) -> List[str]:
        """Turn each row of ``df`` into one statement, ignoring missing cells.

        Sheets with both ``fixture_type`` and ``flow_rate`` columns produce a
        baseline-flow-rate sentence per row; rows missing either value are
        skipped. Any other sheet produces ``"col: val"`` pairs joined by
        spaces, with missing cells left out and fully empty rows skipped.
        """
        statements = []
        # Decided once per sheet; the answer cannot change between rows.
        is_fixture_table = 'fixture_type' in df.columns and 'flow_rate' in df.columns

        for _, row in df.iterrows():
            if is_fixture_table:
                fixture, rate = row['fixture_type'], row['flow_rate']
                # Avoid emitting statements such as "... is nan gpm".
                if pd.isna(fixture) or pd.isna(rate):
                    continue
                statements.append(f"The baseline flow rate for {fixture} is {rate} gpm")
            else:
                parts = [f"{col}: {val}" for col, val in row.items() if not pd.isna(val)]
                if parts:
                    statements.append(" ".join(parts))

        return statements

    def process_text_chunk(self, text: str):
        """Process a single text chunk and accumulate entities and relations.

        Assumes ``kg_extractor.process`` returns ``(entities, relations)``
        where each entity is a dict with an ``'id'`` key - TODO confirm
        against ``kg_extractor``.
        """
        entities, relations = self.kg_extractor.process(text)

        # Deduplicate entities by id; a later record replaces an earlier one.
        for entity in entities:
            self.all_entities[entity['id']] = entity

        self.all_relations.extend(relations)

    def build_knowledge_graph(self, input_dir: str, output_dir: str):
        """Build knowledge graph from all files in the input directory.

        Walks ``input_dir`` recursively, processes every PDF and Excel file,
        then writes ``entities.json`` and ``relations.json`` to ``output_dir``
        (created if needed).

        Args:
            input_dir: Directory searched recursively for source files.
            output_dir: Directory that receives the JSON output.
        """
        input_path = Path(input_dir)
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        if not input_path.is_dir():
            logger.warning(f"Input directory not found: {input_path}")

        # Sorted so runs are reproducible: entity deduplication is last-wins,
        # so the processing order affects which record is kept.
        for file_path in sorted(input_path.glob('**/*')):
            # Skip directories and the "~$..." lock files Excel leaves next to
            # open workbooks.
            if not file_path.is_file() or file_path.name.startswith('~$'):
                continue

            suffix = file_path.suffix.lower()
            if suffix in self._PDF_SUFFIXES:
                chunks = self.extract_text_from_pdf(str(file_path))
            elif suffix in self._EXCEL_SUFFIXES:
                chunks = self.extract_from_excel(str(file_path))
            else:
                continue

            for chunk in chunks:
                self.process_text_chunk(chunk)

        self._save_results(output_path)

        logger.info("Knowledge graph built successfully:")
        logger.info(f"- {len(self.all_entities)} unique entities")
        logger.info(f"- {len(self.all_relations)} relations")

    def _save_results(self, output_path: Path):
        """Save entities and relations to JSON files.

        Writes ``entities.json`` and ``relations.json`` inside ``output_path``,
        creating the directory first if it does not exist.
        """
        output_path.mkdir(parents=True, exist_ok=True)

        with open(output_path / 'entities.json', 'w', encoding='utf-8') as f:
            json.dump(list(self.all_entities.values()), f, indent=2)

        with open(output_path / 'relations.json', 'w', encoding='utf-8') as f:
            json.dump(self.all_relations, f, indent=2)

    def visualize_graph(self, max_nodes: int = 100, seed: Optional[int] = 42):
        """Create a visualization of the knowledge graph.

        Args:
            max_nodes: Maximum number of entities to draw, taken in insertion
                order. Relations are drawn only when both endpoints are drawn.
            seed: Seed for the spring layout so the figure is reproducible
                between runs; pass ``None` for a random layout.
        """
        G = nx.DiGraph()

        # Assumes each entity has 'id', 'text' and 'type' keys - TODO confirm
        # against the extractor's output schema.
        for entity in list(self.all_entities.values())[:max(0, max_nodes)]:
            G.add_node(entity['id'], label=entity['text'], type=entity['type'])

        for relation in self.all_relations:
            if relation['source'] in G and relation['target'] in G:
                G.add_edge(relation['source'], relation['target'], label=relation['type'])

        if G.number_of_nodes() == 0:
            logger.warning("No entities to visualize")
            return

        fig, ax = plt.subplots(figsize=(15, 15))
        pos = nx.spring_layout(G, seed=seed)

        nx.draw_networkx_nodes(G, pos, node_size=1000, alpha=0.8, ax=ax)
        nx.draw_networkx_edges(G, pos, width=1.0, alpha=0.5, arrows=True, ax=ax)

        # Show the human-readable entity text stored on each node rather than
        # the raw node id.
        node_labels = nx.get_node_attributes(G, 'label')
        nx.draw_networkx_labels(G, pos, labels=node_labels, font_size=8, ax=ax)

        edge_labels = nx.get_edge_attributes(G, 'label')
        nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_size=6, ax=ax)

        ax.set_title("LEED Knowledge Graph")
        ax.axis('off')
        plt.show()

## Build the Knowledge Graph

In [None]:
# Create a builder with its default config path, then run it over the LEED
# source materials; the JSON results are written under ../output.
builder = LEEDKnowledgeGraphBuilder()
builder.build_knowledge_graph(input_dir='../data/leed_materials', output_dir='../output')

## Visualize the Knowledge Graph

In [None]:
# Draw the graph, capped at the first 100 entities so the figure stays readable
builder.visualize_graph(max_nodes=100)

## Analyze the Knowledge Graph

In [None]:
# Load the saved entities and relations back from disk so the analysis below
# works from the persisted output rather than the in-memory builder.
with open('../output/entities.json', 'r', encoding='utf-8') as f:
    entities = json.load(f)

with open('../output/relations.json', 'r', encoding='utf-8') as f:
    relations = json.load(f)

# Create DataFrames for analysis
entities_df = pd.DataFrame(entities)
relations_df = pd.DataFrame(relations)

# Display the type distributions. An empty graph yields a DataFrame with no
# columns, so guard the 'type' lookup instead of failing with a KeyError.
print("Entity Type Distribution:")
if 'type' in entities_df.columns:
    print(entities_df['type'].value_counts())
else:
    print("(no entities found)")

print("\nRelation Type Distribution:")
if 'type' in relations_df.columns:
    print(relations_df['type'].value_counts())
else:
    print("(no relations found)")

## Interactive Graph Exploration

In [None]:
def explore_entity(
    entity_id: str,
    entity_records: Optional[List[Dict]] = None,
    relation_records: Optional[List[Dict]] = None,
):
    """Explore an entity and its relationships.

    Prints the entity's text and type, followed by its outgoing and incoming
    relations.

    Args:
        entity_id: Id of the entity to look up.
        entity_records: Entity dicts to search. Defaults to the notebook-level
            ``entities`` list loaded from ``entities.json``.
        relation_records: Relation dicts to search. Defaults to the
            notebook-level ``relations`` list loaded from ``relations.json``.

    Returns:
        None. All output goes to stdout.
    """
    # Fall back to the notebook globals only when no data is passed in, so the
    # function can also be used on any other entity/relation lists.
    if entity_records is None:
        entity_records = entities
    if relation_records is None:
        relation_records = relations

    # Index entities by id once instead of rescanning the list per relation.
    # setdefault keeps the first record for a repeated id.
    by_id = {}
    for record in entity_records:
        by_id.setdefault(record['id'], record)

    def text_of(some_id):
        """Return the text of the entity with ``some_id``, or None if unknown."""
        record = by_id.get(some_id)
        return record['text'] if record is not None else None

    entity = by_id.get(entity_id)
    if entity is None:
        print(f"Entity {entity_id} not found")
        return

    print(f"Entity: {entity['text']} (Type: {entity['type']})")

    outgoing = [r for r in relation_records if r['source'] == entity_id]
    incoming = [r for r in relation_records if r['target'] == entity_id]

    print("\nOutgoing relationships:")
    for rel in outgoing:
        print(f"- {rel['type']} -> {text_of(rel['target'])}")

    print("\nIncoming relationships:")
    for rel in incoming:
        print(f"- {text_of(rel['source'])} -> {rel['type']}")

In [None]:
# Example: explore a specific entity and print its relationships.
# 'CREDIT_1_1' is a placeholder - replace it with an actual entity ID from your
# graph; an unknown ID prints a "not found" message.
explore_entity('CREDIT_1_1')