# In [None]:
import json
import re
from collections import defaultdict, Counter

class HTCPGraphBuilder:
    """
    Builds a hierarchical graph model for HTPC with 5 levels:
    - L1: Token transitions
    - L2: Bigram memory
    - L3: Phrase memory
    - L4: Phrase hierarchy
    - L5: Discourse patterns
    """
    
    def __init__(self):
        # Initialize the graph structure
        self.graph = {
            "nodes": {
                "token": {},      # L1: Token nodes
                "bigram": {},     # L2: Bigram nodes
                "phrase": {},     # L3: Phrase nodes
                "hierarchy": {},  # L4: Hierarchy nodes
                "discourse": {}   # L5: Discourse nodes
            },
            "edges": {
                "sequence": [],   # Token-to-token connections
                "composition": [], # Connections between levels
                "context": []     # Contextual relationships
            },
            "metadata": {
                "token_count": 0,
                "bigram_count": 0,
                "phrase_count": 0,
                "hierarchy_count": 0,
                "discourse_count": 0
            }
        }
        # Thresholds for each level
        self.thresholds = {
            "bigram": 2,      # Minimum frequency to create a bigram
            "phrase": 2,      # Minimum frequency to create a phrase
            "hierarchy": 2,   # Minimum frequency to create a hierarchy
            "discourse": 2    # Minimum frequency to create a discourse pattern
        }
        # Tracking counters
        self.token_frequencies = Counter()
        self.bigram_frequencies = Counter()
        self.phrase_frequencies = Counter()
        self.hierarchy_frequencies = Counter()
        # Temporary storage for sequence processing
        self.sequence_buffer = []
        # Node ID counters
        self.next_node_id = {
            "token": 0,
            "bigram": 0,
            "phrase": 0,
            "hierarchy": 0,
            "discourse": 0
        }
    
    def tokenize(self, text):
        """Convert text to tokens, handling basic punctuation"""
        # Remove excess whitespace and convert to lowercase
        text = text.strip().lower()
        # Simple tokenization: split on whitespace and keep punctuation
        tokens = re.findall(r'\b\w+\b|[.,!?;]', text)
        return tokens
    
    def process_sequence(self, sequence):
        """Process a single sequence (sentence) and update the graph"""
        tokens = self.tokenize(sequence)
        if not tokens:
            return
        
        # Store the sequence for higher-level processing
        self.sequence_buffer.append(tokens)
        
        # Process L1: Token transitions
        self._process_tokens(tokens)
        
        # Process higher levels if we have enough sequences
        if len(self.sequence_buffer) >= 5:  # Wait until we have enough context
            self._build_higher_levels()
            # Keep only the most recent sequences for sliding window
            self.sequence_buffer = self.sequence_buffer[-5:]
    
    def _process_tokens(self, tokens):
        """Process token-level (L1) structures"""
        # Add tokens to the graph
        previous_token = None
        for token in tokens:
            # Update token frequency
            self.token_frequencies[token] += 1
            
            # Add token node if it doesn't exist
            if token not in self.graph["nodes"]["token"]:
                token_id = f"t_{self.next_node_id['token']}"
                self.next_node_id["token"] += 1
                self.graph["nodes"]["token"][token] = {
                    "id": token_id,
                    "value": token,
                    "frequency": 1
                }
            else:
                # Update existing token frequency
                self.graph["nodes"]["token"][token]["frequency"] += 1
                token_id = self.graph["nodes"]["token"][token]["id"]
            
            # Create sequence edge if we have a previous token
            if previous_token:
                # Create a unique key for this token transition
                transition_key = f"{previous_token}_{token}"
                
                # Track bigram frequency for L2
                self.bigram_frequencies[transition_key] += 1
                
                # Add sequence edge
                edge_exists = False
                for edge in self.graph["edges"]["sequence"]:
                    if edge["source"] == self.graph["nodes"]["token"][previous_token]["id"] and \
                       edge["target"] == token_id:
                        edge["weight"] += 1
                        edge_exists = True
                        break
                
                if not edge_exists:
                    self.graph["edges"]["sequence"].append({
                        "source": self.graph["nodes"]["token"][previous_token]["id"],
                        "target": token_id,
                        "type": "sequence",
                        "weight": 1
                    })
            
            previous_token = token
    
    def _build_higher_levels(self):
        """Build higher-level structures (L2-L5) based on collected data"""
        # Process L2: Bigram memory
        self._build_bigrams()
        
        # Process L3: Phrase memory
        self._build_phrases()
        
        # Process L4: Phrase hierarchy
        self._build_hierarchies()
        
        # Process L5: Discourse patterns
        self._build_discourse_patterns()
    
    def _build_bigrams(self):
        """Build L2: Bigram nodes from token transitions"""
        for bigram, freq in self.bigram_frequencies.items():
            if freq < self.thresholds["bigram"]:
                continue
                
            # Parse the bigram key back into tokens
            token1, token2 = bigram.split("_")
            
            # Create bigram node if it doesn't exist
            if bigram not in self.graph["nodes"]["bigram"]:
                bigram_id = f"b_{self.next_node_id['bigram']}"
                self.next_node_id["bigram"] += 1
                self.graph["nodes"]["bigram"][bigram] = {
                    "id": bigram_id,
                    "tokens": [token1, token2],
                    "frequency": freq
                }
                
                # Add composition edges connecting tokens to this bigram
                self.graph["edges"]["composition"].append({
                    "source": self.graph["nodes"]["token"][token1]["id"],
                    "target": bigram_id,
                    "type": "composition",
                    "level": "L1_to_L2"
                })
                self.graph["edges"]["composition"].append({
                    "source": self.graph["nodes"]["token"][token2]["id"],
                    "target": bigram_id,
                    "type": "composition",
                    "level": "L1_to_L2"
                })
            else:
                # Update frequency if bigram already exists
                self.graph["nodes"]["bigram"][bigram]["frequency"] += freq
    
    def _build_phrases(self):
        """Build L3: Phrase nodes from sequences of bigrams"""
        # Process each sequence in the buffer to find phrases
        for tokens in self.sequence_buffer:
            # Find all potential phrases (3+ tokens)
            if len(tokens) < 3:
                continue
                
            # Generate all possible phrases from the sequence
            for i in range(len(tokens) - 2):
                for j in range(i + 2, min(i + 6, len(tokens))):  # Limit phrase length
                    phrase_tokens = tokens[i:j+1]
                    phrase_key = "_".join(phrase_tokens)
                    
                    # Update phrase frequency
                    self.phrase_frequencies[phrase_key] += 1
                    
                    # Check if this meets our threshold
                    if self.phrase_frequencies[phrase_key] >= self.thresholds["phrase"]:
                        # Create or update phrase node
                        if phrase_key not in self.graph["nodes"]["phrase"]:
                            phrase_id = f"p_{self.next_node_id['phrase']}"
                            self.next_node_id["phrase"] += 1
                            self.graph["nodes"]["phrase"][phrase_key] = {
                                "id": phrase_id,
                                "tokens": phrase_tokens,
                                "frequency": self.phrase_frequencies[phrase_key]
                            }
                            
                            # Connect to component bigrams
                            for k in range(len(phrase_tokens) - 1):
                                bigram_key = f"{phrase_tokens[k]}_{phrase_tokens[k+1]}"
                                if bigram_key in self.graph["nodes"]["bigram"]:
                                    self.graph["edges"]["composition"].append({
                                        "source": self.graph["nodes"]["bigram"][bigram_key]["id"],
                                        "target": phrase_id,
                                        "type": "composition",
                                        "level": "L2_to_L3"
                                    })
                        else:
                            # Update existing phrase
                            self.graph["nodes"]["phrase"][phrase_key]["frequency"] = \
                                self.phrase_frequencies[phrase_key]
    
    def _build_hierarchies(self):
        """Build L4: Hierarchy nodes representing relationships between phrases"""
        # Look for phrases that frequently appear in the same sentence
        phrase_co_occurrences = Counter()
        
        for tokens in self.sequence_buffer:
            # Find all phrases in this sequence
            sequence_phrases = []
            for phrase_key in self.graph["nodes"]["phrase"]:
                phrase_tokens = self.graph["nodes"]["phrase"][phrase_key]["tokens"]
                # Check if this phrase is in the sequence
                phrase_str = " ".join(phrase_tokens)
                sequence_str = " ".join(tokens)
                if phrase_str in sequence_str:
                    sequence_phrases.append(phrase_key)
            
            # Record co-occurrences of phrases
            for i in range(len(sequence_phrases)):
                for j in range(i+1, len(sequence_phrases)):
                    hierarchy_key = f"{sequence_phrases[i]}|{sequence_phrases[j]}"
                    phrase_co_occurrences[hierarchy_key] += 1
        
        # Create hierarchy nodes for frequent co-occurrences
        for hierarchy_key, freq in phrase_co_occurrences.items():
            if freq < self.thresholds["hierarchy"]:
                continue
                
            # Update our tracking counter
            self.hierarchy_frequencies[hierarchy_key] += freq
            
            if hierarchy_key not in self.graph["nodes"]["hierarchy"]:
                hierarchy_id = f"h_{self.next_node_id['hierarchy']}"
                self.next_node_id["hierarchy"] += 1
                
                phrase1, phrase2 = hierarchy_key.split("|")
                
                self.graph["nodes"]["hierarchy"][hierarchy_key] = {
                    "id": hierarchy_id,
                    "phrases": [phrase1, phrase2],
                    "frequency": freq
                }
                
                # Connect to component phrases
                if phrase1 in self.graph["nodes"]["phrase"]:
                    self.graph["edges"]["composition"].append({
                        "source": self.graph["nodes"]["phrase"][phrase1]["id"],
                        "target": hierarchy_id,
                        "type": "composition",
                        "level": "L3_to_L4"
                    })
                
                if phrase2 in self.graph["nodes"]["phrase"]:
                    self.graph["edges"]["composition"].append({
                        "source": self.graph["nodes"]["phrase"][phrase2]["id"],
                        "target": hierarchy_id,
                        "type": "composition",
                        "level": "L3_to_L4"
                    })
            else:
                # Update existing hierarchy node
                self.graph["nodes"]["hierarchy"][hierarchy_key]["frequency"] = \
                    self.hierarchy_frequencies[hierarchy_key]
    
    def _build_discourse_patterns(self):
        """Build L5: Discourse nodes representing patterns across multiple sentences"""
        # This is a simplified implementation - in a full system, you would
        # analyze patterns across multiple sequences
        
        # For our purposes, we'll connect hierarchy nodes that appear across different
        # sequences in our buffer
        
        # First, find which hierarchies appear in which buffer positions
        hierarchy_positions = defaultdict(list)
        
        for i, tokens in enumerate(self.sequence_buffer):
            sequence_str = " ".join(tokens)
            
            # Check which hierarchies might be present in this sequence
            for hierarchy_key in self.graph["nodes"]["hierarchy"]:
                hierarchy_node = self.graph["nodes"]["hierarchy"][hierarchy_key]
                phrase1, phrase2 = hierarchy_key.split("|")
                
                # Check if both phrases appear in the sequence
                if phrase1 in self.graph["nodes"]["phrase"] and phrase2 in self.graph["nodes"]["phrase"]:
                    phrase1_tokens = self.graph["nodes"]["phrase"][phrase1]["tokens"]
                    phrase2_tokens = self.graph["nodes"]["phrase"][phrase2]["tokens"]
                    
                    phrase1_str = " ".join(phrase1_tokens)
                    phrase2_str = " ".join(phrase2_tokens)
                    
                    if phrase1_str in sequence_str and phrase2_str in sequence_str:
                        hierarchy_positions[hierarchy_key].append(i)
        
        # Find patterns across sequences
        discourse_patterns = Counter()
        
        for h1 in hierarchy_positions:
            for h2 in hierarchy_positions:
                if h1 != h2:
                    # Look for sequential patterns
                    for pos1 in hierarchy_positions[h1]:
                        for pos2 in hierarchy_positions[h2]:
                            if pos2 == pos1 + 1:  # Adjacent sequences
                                discourse_key = f"{h1}||{h2}"
                                discourse_patterns[discourse_key] += 1
        
        # Create discourse nodes for frequent patterns
        for discourse_key, freq in discourse_patterns.items():
            if freq < self.thresholds["discourse"]:
                continue
                
            if discourse_key not in self.graph["nodes"]["discourse"]:
                discourse_id = f"d_{self.next_node_id['discourse']}"
                self.next_node_id["discourse"] += 1
                
                h1, h2 = discourse_key.split("||")
                
                self.graph["nodes"]["discourse"][discourse_key] = {
                    "id": discourse_id,
                    "hierarchies": [h1, h2],
                    "frequency": freq
                }
                
                # Connect to component hierarchies
                if h1 in self.graph["nodes"]["hierarchy"]:
                    self.graph["edges"]["composition"].append({
                        "source": self.graph["nodes"]["hierarchy"][h1]["id"],
                        "target": discourse_id,
                        "type": "composition",
                        "level": "L4_to_L5"
                    })
                
                if h2 in self.graph["nodes"]["hierarchy"]:
                    self.graph["edges"]["composition"].append({
                        "source": self.graph["nodes"]["hierarchy"][h2]["id"],
                        "target": discourse_id,
                        "type": "composition",
                        "level": "L4_to_L5"
                    })
            else:
                # Update existing discourse node
                self.graph["nodes"]["discourse"][discourse_key]["frequency"] += freq
    
    def update_metadata(self):
        """Update the metadata with current counts"""
        self.graph["metadata"]["token_count"] = len(self.graph["nodes"]["token"])
        self.graph["metadata"]["bigram_count"] = len(self.graph["nodes"]["bigram"])
        self.graph["metadata"]["phrase_count"] = len(self.graph["nodes"]["phrase"])
        self.graph["metadata"]["hierarchy_count"] = len(self.graph["nodes"]["hierarchy"])
        self.graph["metadata"]["discourse_count"] = len(self.graph["nodes"]["discourse"])
    
    def build_from_file(self, filepath):
        """Build the graph model from sequences in a text file"""
        try:
            with open(filepath, 'r', encoding='utf-8') as file:
                for line in file:
                    line = line.strip()
                    if line:  # Skip empty lines
                        self.process_sequence(line)
            
            # Ensure we process any remaining sequences
            self._build_higher_levels()
            
            # Update metadata
            self.update_metadata()
            
            return True
        except Exception as e:
            print(f"Error processing file: {e}")
            return False
    
    def save_to_json(self, output_path):
        """Save the graph model to a JSON file"""
        try:
            with open(output_path, 'w', encoding='utf-8') as file:
                json.dump(self.graph, file, indent=2)
            return True
        except Exception as e:
            print(f"Error saving JSON: {e}")
            return False


# Command-line entry point:
#     python htpc_graph_builder.py <input_file> <output_file>
# Reads one sequence per line from <input_file>, builds the graph and writes it
# to <output_file> as JSON. The process exits with status 1 on a usage error
# or when building or saving fails.
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 3:
        print("Usage: python htpc_graph_builder.py <input_file> <output_file>")
        sys.exit(1)

    # Honour the paths given on the command line (the usage check above
    # guarantees both are present).
    # NOTE(review): inside a notebook kernel sys.argv carries the kernel's own
    # launch arguments rather than file names; call build_from_file() and
    # save_to_json() directly there instead of relying on this block.
    input_file, output_file = sys.argv[1], sys.argv[2]

    builder = HTCPGraphBuilder()
    print(f"Building graph from {input_file}...")

    if not builder.build_from_file(input_file):
        print("Failed to build graph.")
        sys.exit(1)
    print("Graph built successfully.")

    if not builder.save_to_json(output_file):
        print("Failed to save graph.")
        sys.exit(1)
    print(f"Graph saved to {output_file}")
    print(f"Stats: {builder.graph['metadata']}")