# Graph RAG Pipeline - Demo Notebook

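This notebook demonstrates the Graph RAG pipeline end to end: data ingestion, entity extraction, graph construction, analysis, and visualization. Natural language querying is not implemented here yet; it is listed under Next Steps.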

In [None]:
# Import required libraries
import sys
sys.path.append('..')

from src.ingestion import WikipediaIngester
from src.extraction import CombinedExtractor
from src.graph import GraphBuilder
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

## Step 1: Data Ingestion

Fetch Wikipedia articles on AI-related topics.

In [None]:
# Topics to request from the ingester
topics = [
    "Artificial Intelligence",
    "Machine Learning",
    "Natural Language Processing",
]

# Create the ingester and fetch articles for the topics above.
# The call is capped with max_articles=3.
ingester = WikipediaIngester()
articles = ingester.fetch_articles(topics, max_articles=3)

# Report how many articles came back
print(f"Fetched {len(articles)} articles")

## Step 2: Entity Extraction

In [None]:
# Initialize extractor
extractor = CombinedExtractor()

# Process the articles stored under ../data/articles.
# Each result is read below as result['stats']['total_entities'].
results = extractor.process_articles('../data/articles')

# Display statistics
article_count = len(results)
total_entities = sum(r['stats']['total_entities'] for r in results)
print(f"Extracted {total_entities} total entities")
if article_count:
    print(f"Average entities per article: {total_entities / article_count:.1f}")
else:
    # With no results the average is undefined; report that instead of
    # letting the division raise ZeroDivisionError and abort the cell.
    print("Average entities per article: n/a (no articles were processed)")

## Step 3: Graph Construction

In [None]:
# Create the builder that assembles the graph
builder = GraphBuilder()

# Collect the extraction outputs (*_entities.json) from the entities directory
entities_dir = Path('../data/entities')
extraction_files = list(entities_dir.glob('*_entities.json'))

# Build the graph from those files, then persist it under the name 'demo_graph'
graph = builder.build_from_extractions(extraction_files)
builder.save_graph('demo_graph')

# Summarize the size of the resulting graph
node_count = len(graph.nodes)
edge_count = len(graph.edges)
print(f"Graph built with {node_count} nodes and {edge_count} edges")

## Step 4: Graph Analysis

In [None]:
# Basic graph statistics: density, average degree, and component count.
node_count = len(graph.nodes)
total_degree = sum(degree for _, degree in graph.degree())
# An empty graph has no meaningful average degree; report 0 rather than
# dividing by zero.
average_degree = total_degree / node_count if node_count else 0.0

# Weak connectivity is only defined for directed graphs; NetworkX raises
# NetworkXNotImplemented if it is requested for an undirected one, so pick
# the component counter that matches the graph type.
if graph.is_directed():
    component_count = nx.number_weakly_connected_components(graph)
else:
    component_count = nx.number_connected_components(graph)

print("Graph Statistics:")
print(f"  Density: {nx.density(graph):.4f}")
print(f"  Average degree: {average_degree:.2f}")
print(f"  Number of components: {component_count}")

In [None]:
# Rank nodes by degree centrality and keep the ten highest-scoring ones
centrality = nx.degree_centrality(graph)
ranked = sorted(centrality.items(), key=lambda pair: pair[1], reverse=True)
top_10 = ranked[:10]

# Assemble one table row per top node; nodes missing a 'text' or 'label'
# attribute fall back to the node key and 'UNKNOWN' respectively
rows = []
for node, score in top_10:
    attrs = graph.nodes[node]
    rows.append({
        'Entity': attrs.get('text', node),
        'Type': attrs.get('label', 'UNKNOWN'),
        'Centrality': score,
        'Connections': graph.degree(node),
    })

df_top = pd.DataFrame(rows)

# Bare expression so the notebook renders the table as the cell output
df_top

## Step 5: Visualization

In [None]:
# Ask the builder to render the graph to an HTML file (max_nodes=50)
builder.visualize('demo_graph.html', max_nodes=50)

# NOTE(review): the message assumes the builder writes into ../data/graphs;
# confirm against GraphBuilder.visualize.
print(
    "Visualization saved to ../data/graphs/demo_graph.html",
    "Open the HTML file in your browser to explore the graph interactively.",
    sep="\n",
)

In [None]:
# Simple matplotlib bar chart of how many nodes carry each entity label.
# Cell-local import keeps this cell runnable without touching the import cell.
from collections import Counter

# Tally nodes per 'label' attribute; nodes without one count as 'UNKNOWN'.
# Counter keeps first-seen order, so bars appear in node iteration order.
entity_types = Counter(
    data.get('label', 'UNKNOWN') for _, data in graph.nodes(data=True)
)

# Materialize the labels and counts as lists so matplotlib receives plain
# sequences rather than dict views.
labels = list(entity_types.keys())
counts = list(entity_types.values())

plt.figure(figsize=(10, 6))
plt.bar(labels, counts)
plt.xlabel('Entity Type')
plt.ylabel('Count')
plt.title('Entity Type Distribution')
plt.xticks(rotation=45)  # angle the tick labels
plt.tight_layout()
plt.show()

## Next Steps

1. **Validation**: Run `streamlit run ../src/validation/app.py` to launch the validation interface
2. **Query Interface**: Implement LLM integration with Ollama for natural language queries
3. **Scale Up**: Process more articles and build a larger knowledge graph
4. **Custom Entities**: Add domain-specific entity types using GLiNER