In [2]:
# Knowledge Graph Agent Implementation
# Phase 2: Graph Representation and Validation Logic

import pandas as pd
import numpy as np
import networkx as nx
import json
import re
from collections import defaultdict, Counter
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

class KnowledgeGraphAgent:
    """
    MVP Knowledge Graph Agent with self-validation and extensible structure.

    Triples ``(start, relation, end)`` are stored as edges of a
    ``networkx.MultiDiGraph``. Every edge carries two attributes:
    ``relation`` (the relation name) and ``weight`` (a float).

    Attributes:
        graph: The underlying ``nx.MultiDiGraph``.
        validate_on_add: When True, ``add_triple`` validates before inserting.
        verbose: When True, rejections and periodic progress are printed.
        validation_stats: Counters updated by ``add_triple``.
        contradiction_rules: Maps a relation name to the relation names that
            must not already exist on the same ``start -> end`` pair.
    """

    def __init__(self, validate_on_add=True, verbose=True):
        """
        Create an empty agent.

        Args:
            validate_on_add: Run ``validate_triple`` inside ``add_triple``.
            verbose: Print rejections and periodic progress messages.
        """
        self.graph = nx.MultiDiGraph()  # Allows multiple edges between same nodes
        self.validate_on_add = validate_on_add
        self.verbose = verbose
        self.validation_stats = {
            'total_attempted': 0,
            'successful_adds': 0,
            'duplicates_rejected': 0,
            'contradictions_found': 0,
            'validation_errors': 0
        }

        # Contradiction rules - relations that shouldn't coexist.
        # NOTE(review): the table is asymmetric ('RelatedTo' does not list
        # 'Antonym'), so whether a pair is rejected depends on insertion
        # order. Confirm whether that is intended before changing it.
        self.contradiction_rules = {
            'Antonym': ['Synonym', 'RelatedTo'],
            'Synonym': ['Antonym'],
            'Causes': ['Prevents'],
            'Prevents': ['Causes']
        }

        # The start-up banner is printed regardless of ``verbose``.
        print("🧠 Knowledge Graph Agent initialized!")
        print(f"   Validation: {'ON' if validate_on_add else 'OFF'}")
        print(f"   Verbose mode: {'ON' if verbose else 'OFF'}")

    @staticmethod
    def _extract_concept(concept_string):
        """Return the concept name from a ``/c/en/<name>[/...]`` URI.

        Missing values become ``None``. Values that do not match the English
        concept pattern are returned unchanged.
        """
        if pd.isna(concept_string):
            return None

        # ConceptNet format: /c/en/concept_name/part_of_speech
        parts = str(concept_string).split('/')
        if len(parts) >= 4 and parts[1] == 'c' and parts[2] == 'en':
            # Underscores separate words inside a concept name.
            return parts[3].replace('_', ' ')
        return concept_string

    @staticmethod
    def _extract_relation(relation_string):
        """Return the relation name from a ``/r/<name>`` URI.

        Missing values become ``None``; non-matching values are returned
        unchanged.
        """
        if pd.isna(relation_string):
            return None
        parts = str(relation_string).split('/')
        if len(parts) >= 3 and parts[1] == 'r':
            return parts[2]
        return relation_string

    @staticmethod
    def _extract_weight(weight_value):
        """Return a float weight, falling back to 1.0 when none is usable.

        Accepts a JSON object string with a ``weight`` key, a bare JSON
        number string, or a value that is already numeric. Unparseable,
        missing and NaN values yield the default weight 1.0.
        """
        default_weight = 1.0

        # Already numeric (e.g. a pre-parsed column): keep the real value
        # instead of letting json.loads fail and discarding it.
        if isinstance(weight_value, (int, float, np.number)) and not isinstance(weight_value, bool):
            return default_weight if pd.isna(weight_value) else float(weight_value)

        try:
            parsed = json.loads(weight_value)
        except (TypeError, ValueError):  # JSONDecodeError is a ValueError
            return default_weight

        if isinstance(parsed, dict):
            parsed = parsed.get('weight', default_weight)

        try:
            weight = float(parsed)
        except (TypeError, ValueError):
            return default_weight
        return default_weight if np.isnan(weight) else weight

    def clean_conceptnet_data(self, df):
        """
        Properly clean ConceptNet data - fixes the string splitting issue.

        Args:
            df: DataFrame with ``start``, ``end``, ``relation`` and ``weight``
                columns holding raw ConceptNet values.

        Returns:
            A new DataFrame with the columns ``start_concept``,
            ``end_concept``, ``relation_type`` and ``edge_weight``. Rows whose
            start, end or relation is missing are dropped. ``df`` itself is
            not modified.
        """
        print("🧹 Cleaning ConceptNet data...")

        # Work on a copy so the caller's frame is left untouched.
        cleaned_df = df.copy()

        print("   Extracting concepts and relations...")
        cleaned_df['start_concept'] = df['start'].apply(self._extract_concept)
        cleaned_df['end_concept'] = df['end'].apply(self._extract_concept)
        cleaned_df['relation_type'] = df['relation'].apply(self._extract_relation)
        cleaned_df['edge_weight'] = df['weight'].apply(self._extract_weight)

        # Filter out invalid entries
        initial_count = len(cleaned_df)
        cleaned_df = cleaned_df.dropna(subset=['start_concept', 'end_concept', 'relation_type'])
        final_count = len(cleaned_df)

        print(f"   Filtered {initial_count - final_count:,} invalid entries")
        print(f"   Clean dataset: {final_count:,} triples")

        return cleaned_df[['start_concept', 'end_concept', 'relation_type', 'edge_weight']]

    def validate_triple(self, start, relation, end, weight=1.0):
        """
        Validate a triple before adding to graph.

        Only edges already stored in the same ``start -> end`` direction are
        inspected. The duplicate check takes priority over the contradiction
        check.

        Args:
            start: Source concept.
            relation: Relation name.
            end: Target concept.
            weight: Accepted for signature symmetry with ``add_triple``; it
                does not influence validation.

        Returns:
            ``(is_valid, reason)`` where ``reason`` is ``"Valid"`` or a
            message starting with ``"Duplicate"`` or ``"Contradiction"``.
        """
        if not self.graph.has_edge(start, end):
            return True, "Valid"

        existing_relations = [
            edge_data.get('relation')
            for edge_data in self.graph[start][end].values()
        ]

        # Check for duplicates
        if relation in existing_relations:
            return False, f"Duplicate: {start} --{relation}--> {end}"

        # Check for contradictions
        forbidden = self.contradiction_rules.get(relation, ())
        for existing_relation in existing_relations:
            if existing_relation in forbidden:
                return False, f"Contradiction: {start} already has {existing_relation} with {end}"

        # Passed all validation checks
        return True, "Valid"

    def add_triple(self, start, relation, end, weight=1.0, force=False):
        """
        Add a validated triple to the knowledge graph.

        Args:
            start: Source concept.
            relation: Relation name stored on the edge.
            end: Target concept.
            weight: Edge weight stored on the edge.
            force: When True, skip validation even if ``validate_on_add`` is set.

        Returns:
            True if the edge was added, False if validation rejected it.
        """
        self.validation_stats['total_attempted'] += 1

        if not force and self.validate_on_add:
            is_valid, reason = self.validate_triple(start, relation, end, weight)

            if not is_valid:
                # Classify on the message prefix only: a substring test would
                # misfile a contradiction whose concept name contains
                # "Duplicate".
                if reason.startswith("Duplicate"):
                    self.validation_stats['duplicates_rejected'] += 1
                elif reason.startswith("Contradiction"):
                    self.validation_stats['contradictions_found'] += 1
                else:
                    self.validation_stats['validation_errors'] += 1

                if self.verbose:
                    print(f"❌ Rejected: {reason}")
                return False

        # Add the triple to graph
        self.graph.add_edge(start, end, relation=relation, weight=weight)
        self.validation_stats['successful_adds'] += 1

        if self.verbose and self.validation_stats['total_attempted'] % 1000 == 0:
            print(f"✅ Added {self.validation_stats['successful_adds']:,} triples so far...")

        return True

    def bulk_load_triples(self, df, max_triples=None):
        """
        Efficiently load multiple triples with progress tracking.

        Args:
            df: DataFrame with ``start_concept``, ``relation_type``,
                ``end_concept`` and ``edge_weight`` columns.
            max_triples: If not None, only the first ``max_triples`` rows are
                loaded (``0`` loads nothing).
        """
        print(f"📊 Loading triples into knowledge graph...")

        # ``is not None`` so that an explicit 0 is honoured as a limit.
        if max_triples is not None:
            df = df.head(max_triples)
            print(f"   Limited to first {max_triples:,} triples")

        total_rows = len(df)
        print(f"   Processing {total_rows:,} triples...")

        # Zipping the columns avoids building a Series per row (iterrows).
        rows = zip(
            df['start_concept'],
            df['relation_type'],
            df['end_concept'],
            df['edge_weight'],
        )
        for start, relation, end, weight in tqdm(rows, total=total_rows, desc="Loading triples"):
            self.add_triple(
                start=str(start),
                relation=str(relation),
                end=str(end),
                weight=float(weight)
            )

        self.print_stats()

    def print_stats(self):
        """Print comprehensive statistics about the knowledge graph."""
        degrees = [degree for _, degree in self.graph.degree()]
        # An empty graph has no degrees; report 0 rather than NaN.
        average_degree = float(np.mean(degrees)) if degrees else 0.0

        print("\n📈 Knowledge Graph Statistics:")
        print("="*50)
        print(f"🔢 Total nodes: {self.graph.number_of_nodes():,}")
        print(f"🔗 Total edges: {self.graph.number_of_edges():,}")
        print(f"📊 Average degree: {average_degree:,.2f}")

        print(f"\n🔍 Validation Results:")
        print(f"   Attempted additions: {self.validation_stats['total_attempted']:,}")
        print(f"   ✅ Successful: {self.validation_stats['successful_adds']:,}")
        print(f"   🔄 Duplicates rejected: {self.validation_stats['duplicates_rejected']:,}")
        print(f"   ⚡ Contradictions found: {self.validation_stats['contradictions_found']:,}")
        print(f"   ❌ Other errors: {self.validation_stats['validation_errors']:,}")

        success_rate = (self.validation_stats['successful_adds'] / max(self.validation_stats['total_attempted'], 1)) * 100
        print(f"   📊 Success rate: {success_rate:.2f}%")

        # Relation type distribution
        relation_counts = Counter(
            data.get('relation', 'Unknown')
            for _, _, data in self.graph.edges(data=True)
        )

        print(f"\n🏷️  Top 10 Relation Types:")
        for relation, count in relation_counts.most_common(10):
            print(f"   {relation}: {count:,}")

    def query_concept(self, concept, max_results=10):
        """
        Query all relations for a given concept.

        At most ``max_results`` edges are returned in total. The budget is
        split between outgoing and incoming edges; if one direction has fewer
        edges than its share, the other direction may use the remainder.

        Args:
            concept: Node to look up.
            max_results: Upper bound on the number of returned edges.

        Returns:
            A list of ``(start, relation, end, weight, direction)`` tuples,
            outgoing edges first, where ``direction`` is ``'outgoing'`` or
            ``'incoming'``. Empty if the concept is not in the graph.
        """
        if concept not in self.graph:
            print(f"❓ Concept '{concept}' not found in graph")
            return []

        print(f"🔍 Relations for '{concept}':")

        limit = max(int(max_results), 0)
        outgoing = list(self.graph.out_edges(concept, data=True))[:limit]
        incoming = list(self.graph.in_edges(concept, data=True))[:limit]

        # Reserve half for incoming, give the rest to outgoing, then hand any
        # budget outgoing could not use back to incoming.
        incoming_quota = min(len(incoming), limit // 2)
        outgoing_quota = min(len(outgoing), limit - incoming_quota)
        incoming_quota = min(len(incoming), limit - outgoing_quota)

        results = []
        selected = (
            (outgoing[:outgoing_quota], 'outgoing'),
            (incoming[:incoming_quota], 'incoming'),
        )
        for edges, direction in selected:
            for source, target, edge in edges:
                relation = edge.get('relation', 'Unknown')
                weight = edge.get('weight', 1.0)
                results.append((source, relation, target, weight, direction))
                print(f"   {source} --{relation}--> {target} (weight: {weight:.2f})")

        return results

    def find_path(self, start_concept, end_concept, max_length=3):
        """
        Find connection paths between two concepts.

        Computes the unweighted shortest directed path and prints each hop
        with the relation of the first stored edge between the two nodes.

        Args:
            start_concept: Node the path starts from.
            end_concept: Node the path must reach.
            max_length: Maximum number of hops (edges) allowed. Pass None to
                accept a path of any length.

        Returns:
            The list of nodes on the path, or an empty list when either
            concept is unknown, no path exists, or the shortest path exceeds
            ``max_length`` hops.
        """
        if start_concept not in self.graph or end_concept not in self.graph:
            print(f"❓ One or both concepts not found in graph")
            return []

        try:
            # Find shortest path
            path = nx.shortest_path(self.graph, start_concept, end_concept, weight=None)
        except nx.NetworkXNoPath:
            print(f"❌ No path found between '{start_concept}' and '{end_concept}'")
            return []

        # The shortest path is the best candidate; if it is too long, no
        # acceptable path exists.
        if max_length is not None and len(path) - 1 > max_length:
            print(f"❌ No path within {max_length} hops between '{start_concept}' and '{end_concept}'")
            return []

        print(f"🛤️  Path from '{start_concept}' to '{end_concept}':")

        # Print the path with relations
        for current, next_node in zip(path, path[1:]):
            # Parallel edges may exist; show the first one stored.
            edge_data = next(iter(self.graph[current][next_node].values()))
            relation = edge_data.get('relation', 'Unknown')
            print(f"   {current} --{relation}--> {next_node}")

        return path


# Demo / smoke test: builds a tiny graph and exercises loading, querying
# and path finding. Runs only when this module is the entry point.
if __name__ == "__main__":
    print("🚀 Testing Knowledge Graph Agent\n")

    # Validation stays on; per-triple chatter is switched off.
    agent = KnowledgeGraphAgent(verbose=False, validate_on_add=True)

    # Hand-written sample triples, one record per row
    # (replace with your actual cleaned data).
    sample_data = pd.DataFrame(
        [
            ('dog', 'animal', 'IsA', 1.0),
            ('cat', 'animal', 'IsA', 1.0),
            ('dog', 'cat', 'RelatedTo', 0.8),
            ('happy', 'emotion', 'IsA', 1.0),
            ('sad', 'emotion', 'IsA', 1.0),
        ],
        columns=['start_concept', 'end_concept', 'relation_type', 'edge_weight'],
    )

    print("📝 Loading sample data...")
    agent.bulk_load_triples(sample_data)

    # Look up every stored relation touching 'dog'.
    print("\n🔍 Testing queries...")
    agent.query_concept('dog')

    # 'dog' and 'emotion' sit in separate components of the sample graph,
    # so this exercises the no-path branch.
    print("\n🛤️  Testing path finding...")
    agent.find_path('dog', 'emotion')

🚀 Testing Knowledge Graph Agent

🧠 Knowledge Graph Agent initialized!
   Validation: ON
   Verbose mode: OFF
📝 Loading sample data...
📊 Loading triples into knowledge graph...
   Processing 5 triples...


Loading triples: 100%|██████████| 5/5 [00:00<00:00, 8279.32it/s]


📈 Knowledge Graph Statistics:
🔢 Total nodes: 6
🔗 Total edges: 5
📊 Average degree: 1.67

🔍 Validation Results:
   Attempted additions: 5
   ✅ Successful: 5
   🔄 Duplicates rejected: 0
   ⚡ Contradictions found: 0
   ❌ Other errors: 0
   📊 Success rate: 100.00%

🏷️  Top 10 Relation Types:
   IsA: 4
   RelatedTo: 1

🔍 Testing queries...
🔍 Relations for 'dog':
   dog --IsA--> animal (weight: 1.00)
   dog --RelatedTo--> cat (weight: 0.80)

🛤️  Testing path finding...
❌ No path found between 'dog' and 'emotion'



