In [9]:
import os
import spacy
import spacy.cli
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from pyvis.network import Network
from typing import List, Tuple, Dict, Optional

In [10]:
def setup_spacy_pipeline(model_name: str = "en_core_web_md") -> spacy.language.Language:
    """
    Load a spaCy pipeline, downloading the model package first if loading fails.
    
    Args:
        model_name: Name of the spaCy model package to load. Defaults to
            "en_core_web_md", the model this notebook was written against.
    
    Returns:
        The loaded spaCy language model
    """
    try:
        return spacy.load(model_name)
    except OSError:
        # An OSError from spacy.load is treated as "model package not
        # installed": download it once, then retry the load.
        print("Download model...")
        spacy.cli.download(model_name)
        return spacy.load(model_name)

In [11]:
def extract_entities(nlp: spacy.language.Language, texts: List[str]) -> List[List[Dict]]:
    """
    Extract named entities from each of the given texts.
    
    Args:
        nlp: Configured spaCy language model
        texts: List of input texts
    
    Returns:
        One list per input text (in input order). Each inner list holds one
        dictionary per entity with the keys 'text', 'label', 'start' and
        'end', where 'start'/'end' are the entity's character offsets
        (ent.start_char / ent.end_char) within its document.
    """
    # nlp.pipe processes the texts as a stream; the outer comprehension keeps
    # the per-document grouping that downstream code iterates over.
    return [
        [
            {
                'text': ent.text,
                'label': ent.label_,
                'start': ent.start_char,
                'end': ent.end_char,
            }
            for ent in doc.ents
        ]
        for doc in nlp.pipe(texts)
    ]

In [5]:
def generate_relationships(entities: List[List[Dict]]) -> List[Tuple[str, str, str]]:
    """
    Generate co-occurrence relationships between entities of the same document.
    
    Every ordered pair of distinct entities found in the same document yields
    one tuple whose relation is "related_to_<subject label>_<object label>".
    
    Args:
        entities: One list of entity dictionaries per document; each
            dictionary must provide the keys 'text' and 'label'
    
    Returns:
        List of unique relationship tuples (subject, relation, object), in
        order of first occurrence
    """
    knowledge_tuples = []
    seen = set()
    
    for doc_entities in entities:
        for subject_ent in doc_entities:
            for object_ent in doc_entities:
                # Skip pairs with identical text: this covers an entity paired
                # with itself and repeated mentions of the same entity, both
                # of which would otherwise produce a self-referential edge.
                if subject_ent['text'] == object_ent['text']:
                    continue
                
                triple = (
                    subject_ent['text'],
                    f"related_to_{subject_ent['label']}_{object_ent['label']}",
                    object_ent['text'],
                )
                # Repeated mentions would otherwise emit the same triple
                # several times; keep only the first occurrence.
                if triple not in seen:
                    seen.add(triple)
                    knowledge_tuples.append(triple)
    
    return knowledge_tuples

In [12]:
def save_knowledge_base(knowledge_tuples: List[Tuple[str, str, str]], output_dir: str = 'outputs') -> str:
    """
    Write the relationship tuples to 'knowledge_base.csv' inside output_dir.
    
    Args:
        knowledge_tuples: List of (subject, relation, object) tuples
        output_dir: Target directory; created if it does not exist yet
    
    Returns:
        Path to the CSV file that was written
    """
    os.makedirs(output_dir, exist_ok=True)
    destination = os.path.join(output_dir, 'knowledge_base.csv')
    
    # One row per tuple, with a header row and no index column.
    column_names = ['Subject', 'Relation', 'Object']
    table = pd.DataFrame(knowledge_tuples, columns=column_names)
    table.to_csv(destination, index=False)
    
    print(f"Knowledge base saved to {destination}")
    return destination


In [7]:
def visualize_knowledge_graph(
    knowledge_tuples: List[Tuple[str, str, str]], 
    output_dir: str = 'outputs', 
    method: str = 'pyvis'
) -> Optional[str]:
    """
    Render the knowledge graph with directed edges and save it to disk.
    
    Args:
        knowledge_tuples: List of (subject, relation, object) tuples
        output_dir: Directory to save the output (created if missing)
        method: Visualization method: 'pyvis' writes an interactive HTML
            page, 'networkx' writes a static PNG drawn with Matplotlib
    
    Returns:
        Path to the saved visualization, or None when rendering failed or
        the method is not supported (the error is printed, not raised)
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)
    
    try:
        if method == 'pyvis':
            return _render_pyvis_graph(knowledge_tuples, output_dir)
        if method == 'networkx':
            return _render_networkx_graph(knowledge_tuples, output_dir)
        raise ValueError(f"Unsupported visualization method: {method}")
    except Exception as e:
        # Best-effort: any failure (including the unsupported-method
        # ValueError above) is reported and signalled to the caller as None.
        print(f"Graph visualization failed: {e}")
        return None


def _render_pyvis_graph(
    knowledge_tuples: List[Tuple[str, str, str]],
    output_dir: str
) -> str:
    """Build the interactive pyvis graph and save it as HTML; return the file path."""
    net = Network(
        height="750px", 
        width="100%", 
        bgcolor="#f5f5f5", 
        font_color="black",
        directed=True
    )
    
    # Track node ids already added so each entity is added only once
    added_nodes = set()
    
    for subject, relation, obj in knowledge_tuples:
        for node in (subject, obj):
            if node not in added_nodes:
                net.add_node(node, label=node, title=node)
                added_nodes.add(node)
        
        # Directed edge subject -> object, labelled with the relation
        net.add_edge(subject, obj, label=relation, title=relation, arrows='to')
    
    # Customize physics and interaction
    net.set_options('''
            var options = {
                "physics": {
                    "forceAtlas2Based": {
                        "gravitationalConstant": -50,
                        "centralGravity": 0.01,
                        "springLength": 100,
                        "springConstant": 0.08
                    },
                    "minVelocity": 0.75,
                    "solver": "forceAtlas2Based"
                }
            }
            ''')
    
    output_path = os.path.join(output_dir, 'interactive_knowledge_graph.html')
    net.save_graph(output_path)
    print(f"Interactive graph saved to {output_path}")
    return output_path


def _render_networkx_graph(
    knowledge_tuples: List[Tuple[str, str, str]],
    output_dir: str
) -> str:
    """Draw the graph with NetworkX/Matplotlib and save it as PNG; return the file path."""
    G = nx.DiGraph()
    for subject, relation, obj in knowledge_tuples:
        G.add_edge(subject, obj, relation=relation)
    
    output_path = os.path.join(output_dir, 'kg_interactive_networkx.png')
    
    fig = plt.figure(figsize=(12, 8))
    try:
        pos = nx.spring_layout(G, k=0.5, iterations=50)
        
        nx.draw_networkx_nodes(
            G, pos, 
            node_color='lightblue', 
            node_size=1500
        )
        nx.draw_networkx_labels(
            G, pos, 
            font_size=10, 
            font_weight='bold'
        )
        nx.draw_networkx_edges(
            G, pos, 
            edge_color='gray', 
            arrows=True,
            arrowsize=20
        )
        
        # Label each edge with its 'relation' attribute
        edge_labels = nx.get_edge_attributes(G, 'relation')
        nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
        
        plt.axis('off')
        plt.savefig(output_path, bbox_inches='tight')
    finally:
        # Close the figure even when drawing or saving raises; the caller
        # swallows the exception, so an unclosed figure would otherwise
        # stay registered with pyplot.
        plt.close(fig)
    
    print(f"Graph saved to {output_path}")
    return output_path

In [13]:
def main():
    """
    Run the demo workflow end to end: extract entities from a sample
    article, derive relationship tuples, save them as CSV, and render the
    graph with both visualization methods.
    """
    # Demo input: a single news snippet
    sample_texts = [
        """A man widely seen as the godfather of artificial intelligence (AI) has quit his job, warning about the growing dangers from developments in the field.

        Geoffrey Hinton, 75, announced his resignation from Google in a statement to the New York Times, saying he now regretted his work.

        He told the BBC some of the dangers of AI chatbots were \"quite scary\".

        \"Right now, they're not more intelligent than us, as far as I can tell. But I think they soon may be.\"

        Dr Hinton also accepted that his age had played into his decision to leave the tech giant, telling the BBC: \"I'm 75, so it's time to retire.\""""
    ]
    
    # Entities -> relationship tuples
    pipeline = setup_spacy_pipeline()
    entities_per_doc = extract_entities(pipeline, sample_texts)
    triples = generate_relationships(entities_per_doc)
    
    # Persist the tuples as CSV
    save_knowledge_base(triples)
    
    # Interactive HTML first, then the static PNG
    for viz_method in ('pyvis', 'networkx'):
        visualize_knowledge_graph(triples, method=viz_method)
# Entry-point guard: run the demo workflow only when this code is executed
# as the main module (which includes running this cell, as the output below shows).
if __name__ == "__main__":
    main()

Knowledge base saved to outputs/knowledge_base.csv
Interactive graph saved to outputs/interactive_knowledge_graph.html
Graph saved to outputs/kg_interactive_networkx.png
