# Semantica relation GRaph visualized

# Phase 1

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import numpy as np
from matplotlib.patches import FancyArrowPatch
import random
import io
import pandas as pd
from PIL import Image, ImageDraw
import os
import time
from tqdm import tqdm
import uuid
import warnings
import traceback
import json  # Added missing import
from scipy.spatial.distance import cosine  # Added missing import
import matplotlib
matplotlib.use('Agg')  # Use non-interactive backend for better memory handling

class ConceptNetProcessor:
    """
    Process ConceptNet data for semantic visualization
    """
    def __init__(self, english_data=None, german_data=None):
        self.english_data = english_data
        self.german_data = german_data
        self.semantic_graph = nx.DiGraph()
        self.concept_vectors = {}
        self.relation_types = set()
        
        print("ConceptNetProcessor initialized")
    
    def clean_concept_name(self, concept_str):
        """Extract clean concept name from ConceptNet format"""
        if not isinstance(concept_str, str):
            return "unknown"
            
        # Extract the concept name from the ConceptNet URI format
        parts = concept_str.split('/')
        if len(parts) >= 4:
            # Format is typically /c/LANG/CONCEPT
            concept = parts[-1]
            # Remove part-of-speech tags if present
            if '/' in concept:
                concept = concept.split('/')[0]
            return concept
        return concept_str
    
    def extract_relation_type(self, relation_str):
        """Extract relation type from ConceptNet format"""
        if not isinstance(relation_str, str):
            return "unknown"
            
        parts = relation_str.split('/')
        if len(parts) >= 3:
            # Format is typically /r/RELATION_TYPE
            return parts[-1]
        return relation_str
    
    def extract_language(self, concept_str):
        """Extract language from ConceptNet concept URI"""
        if not isinstance(concept_str, str):
            return "unknown"
            
        parts = concept_str.split('/')
        if len(parts) >= 4:
            # Format is typically /c/LANG/CONCEPT
            return parts[2]
        return "unknown"
    
    def parse_weight(self, weight_str):
        """Parse weight JSON string to extract numeric weight"""
        if not isinstance(weight_str, str):
            return 1.0
            
        try:
            weight_data = json.loads(weight_str)
            # ConceptNet weights are typically in 'weight' field
            return float(weight_data.get('weight', 1.0))
        except:
            return 1.0
    
    def build_semantic_graph(self, max_concepts=200, min_weight=1.0, sample_size=0.25):
        """Build semantic graph from ConceptNet data"""
        print("Building semantic graph from ConceptNet data...")
        
        if self.english_data is None and self.german_data is None:
            print("No ConceptNet data provided.")
            return
        
        # Combine datasets
        all_data = []
        if self.english_data is not None:
            print(f"Processing {len(self.english_data)} English ConceptNet assertions...")
            sample_size_en = int(len(self.english_data) * sample_size)
            print(f"Will sample {sample_size_en} English assertions")
            all_data.append(('en', self.english_data))
        
        if self.german_data is not None:
            print(f"Processing {len(self.german_data)} German ConceptNet assertions...")
            sample_size_de = int(len(self.german_data) * sample_size)
            print(f"Will sample {sample_size_de} German assertions")
            all_data.append(('de', self.german_data))
        
        # Track concepts and their occurrence count
        concept_counts = {}
        
        # Process each language dataset
        for lang, data in all_data:
            curr_sample_size = int(len(data) * sample_size)
            data_sample = data.sample(n=curr_sample_size, random_state=42)
            print(f"Sampling {curr_sample_size} assertions from {len(data)} {lang} assertions")
            
            # Process assertions
            for _, row in tqdm(data_sample.iterrows(), desc=f"Processing {lang} assertions", total=len(data_sample)):
                try:
                    # Extract source and target concepts
                    source_concept = self.clean_concept_name(row['start'])
                    target_concept = self.clean_concept_name(row['end'])
                    
                    # Extract relation type
                    relation_type = self.extract_relation_type(row['rel'])
                    self.relation_types.add(relation_type)
                    
                    # Extract languages
                    source_lang = self.extract_language(row['start'])
                    target_lang = self.extract_language(row['end'])
                    
                    # Parse weight
                    weight = self.parse_weight(row['weight'])
                    
                    # Skip low-weight relationships
                    if weight < min_weight:
                        continue
                    
                    # Track concept occurrences
                    concept_counts[source_concept] = concept_counts.get(source_concept, 0) + 1
                    concept_counts[target_concept] = concept_counts.get(target_concept, 0) + 1
                    
                    # Add to graph
                    self.semantic_graph.add_node(
                        source_concept,
                        lang=source_lang,
                        count=concept_counts[source_concept]
                    )
                    
                    self.semantic_graph.add_node(
                        target_concept,
                        lang=target_lang,
                        count=concept_counts[target_concept]
                    )
                    
                    # Add edge with relation data
                    self.semantic_graph.add_edge(
                        source_concept,
                        target_concept,
                        relation=relation_type,
                        weight=weight
                    )
                    
                except Exception as e:
                    warnings.warn(f"Error processing assertion: {e}")
        
        # Limit to top concepts if needed
        if len(concept_counts) > max_concepts:
            print(f"Limiting graph to top {max_concepts} concepts...")
            top_concepts = sorted(concept_counts.items(), key=lambda x: x[1], reverse=True)[:max_concepts]
            top_concept_names = {c[0] for c in top_concepts}
            
            # Create subgraph with only top concepts
            subgraph = nx.DiGraph()
            
            for node in top_concept_names:
                if self.semantic_graph.has_node(node):
                    subgraph.add_node(
                        node,
                        **self.semantic_graph.nodes[node]
                    )
            
            for source, target, data in self.semantic_graph.edges(data=True):
                if source in top_concept_names and target in top_concept_names:
                    subgraph.add_edge(
                        source,
                        target,
                        **data
                    )
            
            self.semantic_graph = subgraph
        
        print(f"Semantic graph built with {self.semantic_graph.number_of_nodes()} nodes and {self.semantic_graph.number_of_edges()} edges")
        
        # Infer semantic categories
        self.infer_semantic_categories()
        
        return self.semantic_graph
    
    def infer_semantic_categories(self):
        """Infer semantic categories for concepts based on relationships"""
        print("Inferring semantic categories...")
        categories = {}
        
        # Count relationship types for each concept
        for node in self.semantic_graph.nodes():
            # Initialize as generic
            categories[node] = 'generic'
            
            # Get all relationships involving this concept
            in_edges = self.semantic_graph.in_edges(node, data=True)
            out_edges = self.semantic_graph.out_edges(node, data=True)
            
            # Count relationship types
            person_relations = 0
            place_relations = 0
            animal_relations = 0
            
            for _, _, data in in_edges:
                rel = data.get('relation', '')
                if rel in {'IsA/person', 'CapableOf', 'HasA'}:
                    person_relations += 1
                elif rel in {'AtLocation', 'LocatedNear', 'HasA'}:
                    place_relations += 1
                elif rel in {'IsA/animal', 'CapableOf'}:
                    animal_relations += 1
            
            for _, _, data in out_edges:
                rel = data.get('relation', '')
                if rel in {'IsA/person', 'CapableOf', 'HasA'}:
                    person_relations += 1
                elif rel in {'AtLocation', 'LocatedNear', 'HasA'}:
                    place_relations += 1
                elif rel in {'IsA/animal', 'CapableOf'}:
                    animal_relations += 1
            
            # Assign category based on dominant relationships
            max_relations = max(person_relations, place_relations, animal_relations)
            if max_relations > 0:
                if max_relations == person_relations:
                    categories[node] = 'person'
                elif max_relations == place_relations:
                    categories[node] = 'place'
                elif max_relations == animal_relations:
                    categories[node] = 'animal'
        
        # Update graph with categories
        nx.set_node_attributes(self.semantic_graph, categories, 'category')
        
        # Print category statistics
        category_counts = {}
        for cat in categories.values():
            category_counts[cat] = category_counts.get(cat, 0) + 1
        print(f"Inferred categories: {category_counts}")
    
    def compute_important_relationships(self, threshold=0.5, max_relationships=30):
        """Compute the most important relationships between concepts based on vector similarity"""
        important_relationships = []
        
        # Get all pairs of concepts
        concepts = list(self.concept_vectors.keys())
        for i, concept1 in enumerate(concepts):
            for concept2 in concepts[i+1:]:
                # Get vectors
                vec1 = self.concept_vectors[concept1]['vector']
                vec2 = self.concept_vectors[concept2]['vector']
                
                # Compute cosine similarity
                similarity = 1 - cosine(vec1, vec2)
                
                if similarity > threshold:
                    important_relationships.append({
                        'source': concept1,
                        'target': concept2,
                        'similarity': similarity
                    })
        
        # Sort by similarity and take top N
        important_relationships.sort(key=lambda x: x['similarity'], reverse=True)
        return important_relationships[:max_relationships]
    
    def generate_concept_vectors(self, dimensions=5):
        """Generate concept vectors based on graph structure"""
        print(f"Generating {dimensions}-dimensional concept vectors...")
        
        # Use node2vec or similar embedding
        nodes = list(self.semantic_graph.nodes())
        
        # Simple embedding based on connectivity patterns
        adjacency_matrix = nx.adjacency_matrix(self.semantic_graph).todense()
        
        # Use SVD to reduce dimensionality
        U, _, _ = np.linalg.svd(adjacency_matrix)
        embeddings = U[:, :dimensions]
        
        # Normalize embeddings
        embeddings = (embeddings - embeddings.mean(axis=0)) / embeddings.std(axis=0)
        
        # Convert to dictionary with structured data
        for i, node in enumerate(nodes):
            self.concept_vectors[node] = {
                'vector': embeddings[i],
                'category': self.semantic_graph.nodes[node].get('category', 'generic')
            }
        
        print(f"Generated vectors for {len(self.concept_vectors)} concepts")
        return self.concept_vectors

class SemanticVisualizer:
    def __init__(self, concept_processor=None):
        self.concept_processor = concept_processor
        self.fig = None
        self.ax = None
        self.category_colors = {
            'person': '#FF6B6B',  # Warm red
            'place': '#4ECDC4',   # Teal
            'animal': '#FFD93D',  # Bright yellow
            'generic': '#95A5A6'  # Neutral gray
        }
        self.category_descriptions = {
            'person': 'Human Entities & Roles',
            'place': 'Locations & Spaces',
            'animal': 'Living Creatures',
            'generic': 'Abstract Concepts'
        }

    def visualize_concepts_improved(self, output_dir, num_frames=30):
        """Create an enhanced visualization with labels, legends, and smooth transitions"""
        try:
            print("Creating enhanced semantic visualization...")
            
            if not self.concept_processor or not self.concept_processor.concept_vectors:
                raise ValueError("Concept processor not initialized or no vectors generated")
            
            # Get important relationships
            try:
                important_relationships = self.concept_processor.compute_important_relationships(
                    threshold=0.5, 
                    max_relationships=30
                )
            except Exception as e:
                print(f"Warning: Could not compute relationships: {str(e)}")
                important_relationships = []

            # Setup figure with high DPI for crisp text
            plt.close('all')
            fig = plt.figure(figsize=(20, 14), facecolor='black', dpi=150)
            ax = fig.add_subplot(111, projection='3d', facecolor='black')
            
            # Add a title with project info
            fig.suptitle('Semantic Concept Space Visualization\nRelational Semantic Convergence (RSC) Theory', 
                        color='white', y=0.95, fontsize=16, fontweight='bold')

            def update(frame):
                try:
                    ax.clear()
                    ax.set_facecolor('black')
                    
                    # Configure axis appearance
                    ax.grid(True, alpha=0.1, color='white')
                    ax.xaxis.pane.fill = False
                    ax.yaxis.pane.fill = False
                    ax.zaxis.pane.fill = False
                    
                    # Remove axis labels but keep tick marks
                    ax.set_xticklabels([])
                    ax.set_yticklabels([])
                    ax.set_zticklabels([])
                    
                    # Calculate smooth rotation angle
                    theta = (frame / num_frames) * 2 * np.pi
                    
                    # Convert concept vectors to 3D points with rotation
                    points = {}
                    for concept, data in self.concept_processor.concept_vectors.items():
                        vector = data['vector'][:3]
                        
                        # Apply smooth rotation matrix
                        x, y, z = vector
                        x_rot = x * np.cos(theta) - y * np.sin(theta)
                        y_rot = x * np.sin(theta) + y * np.cos(theta)
                        
                        points[concept] = (x_rot, y_rot, z)
                    
                    # Plot points by category with enhanced visual elements
                    legend_elements = []
                    for category, color in self.category_colors.items():
                        cat_points = [
                            (concept, (x, y, z)) for concept, (x, y, z) in points.items()
                            if self.concept_processor.concept_vectors[concept]['category'] == category
                        ]
                        
                        if cat_points:
                            concepts, coords = zip(*cat_points)
                            xs, ys, zs = zip(*coords)
                            
                            # Create scatter plot with glowing effect
                            scatter = ax.scatter(xs, ys, zs, 
                                              c=color, 
                                              alpha=0.8, 
                                              s=100,  # Larger points
                                              edgecolors='white',
                                              linewidth=0.5)
                            
                            # Add category to legend
                            legend_elements.append(plt.Line2D([0], [0], 
                                                            marker='o', 
                                                            color='none',
                                                            markerfacecolor=color,
                                                            markeredgecolor='white',
                                                            markersize=10,
                                                            label=self.category_descriptions[category]))
                            
                            # Add labels for important concepts
                            for concept, (x, y, z) in zip(concepts, coords):
                                if len(concept) > 2:  # Only label non-trivial concepts
                                    ax.text(x, y, z, 
                                          concept,
                                          color='white',
                                          fontsize=8,
                                          alpha=0.7,
                                          backgroundcolor=(0, 0, 0, 0.3))
                    
                    # Add connections between related concepts
                    if important_relationships and frame == 0:  # Only on first frame for performance
                        for rel in important_relationships[:10]:  # Limit to top 10 relationships
                            if rel['source'] in points and rel['target'] in points:
                                x1, y1, z1 = points[rel['source']]
                                x2, y2, z2 = points[rel['target']]
                                ax.plot([x1, x2], [y1, y2], [z1, z2], 
                                      color='white',
                                      alpha=0.2,
                                      linestyle='--')
                    
                    # Add legend with enhanced styling
                    legend = ax.legend(handles=legend_elements,
                                     loc='center left',
                                     bbox_to_anchor=(1.15, 0.5),
                                     title='Semantic Categories',
                                     facecolor='black',
                                     edgecolor='white',
                                     framealpha=0.8)
                    legend.get_title().set_color('white')
                    for text in legend.get_texts():
                        text.set_color('white')
                    
                    # Add RSC theory info
                    ax.text2D(0.02, 0.02, 
                             'RSC Theory Visualization\nShowing semantic relationships and concept clustering',
                             transform=ax.transAxes,
                             color='white',
                             alpha=0.7,
                             fontsize=8)
                    
                    return ax
                
                except Exception as e:
                    print(f"Error in update frame {frame}: {str(e)}")
                    return ax

            # Create animation with enhanced parameters
            anim = animation.FuncAnimation(
                fig, 
                update,
                frames=num_frames,
                interval=50,  # Faster frame rate for smoother animation
                blit=False
            )
            
            # Create a static preview image
            print("Creating enhanced static preview...")
            update(0)
            
            # Save with high quality settings
            static_path = os.path.join(output_dir, f'semantica_readable_preview_{uuid.uuid4()}.png')
            plt.savefig(static_path, dpi=150, bbox_inches='tight')
            
            # Save animation with enhanced quality
            output_path = os.path.join(output_dir, f'semantica_readable_{uuid.uuid4()}.gif')
            anim.save(
                output_path,
                writer='pillow',
                fps=30,  # Higher FPS for smoother animation
                dpi=150
            )
            
            plt.close()
            print(f"Enhanced visualization saved successfully to {output_path}")
            return output_path
            
        except Exception as e:
            print(f"Error creating visualization: {str(e)}")
            traceback.print_exc()
            return None

In [21]:
# Load and process ConceptNet data
print("Loading ConceptNet data...")
english_conceptnet = pd.read_csv(
    '../Data/Input/conceptnet-assertions-5.7.0.en.tsv',
    sep='\t',
    names=['URI', 'rel', 'start', 'end', 'weight', 'source', 'id', 'dataset', 'surfaceText']
)
german_conceptnet = pd.read_csv(
    '../Data/Input/conceptnet-assertions-5.7.0.de.tsv',
    sep='\t',
    names=['URI', 'rel', 'start', 'end', 'weight', 'source', 'id', 'dataset', 'surfaceText']
)

print(f"English ConceptNet loaded with {len(english_conceptnet)} assertions.")
print(f"German ConceptNet loaded with {len(german_conceptnet)} assertions.")

# Initialize processor with larger sample size
processor = ConceptNetProcessor(english_conceptnet, german_conceptnet)

# Build semantic graph with 25% sampling
processor.build_semantic_graph(
    max_concepts=2_000,  # Keep reasonable number of concepts for visualization
    min_weight=1.0,
    sample_size=0.25  # Use 25% of the data
)

# Generate concept vectors
processor.generate_concept_vectors(dimensions=5)

# Initialize visualizer
visualizer = SemanticVisualizer(processor)

# Create improved visualization with more frames for smoother animation
output_dir = '../Data/Output'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

output_path = visualizer.visualize_concepts_improved(
    output_dir,
    num_frames=300  # Increased number of frames for smoother animation
)

print(f"Visualization created at: {output_path}")

Loading ConceptNet data...
English ConceptNet loaded with 3423004 assertions.
German ConceptNet loaded with 1078946 assertions.
ConceptNetProcessor initialized
Building semantic graph from ConceptNet data...
Processing 3423004 English ConceptNet assertions...
Will sample 855751 English assertions
Processing 1078946 German ConceptNet assertions...
Will sample 269736 German assertions
Sampling 855751 assertions from 3423004 en assertions


Processing en assertions: 100%|██████████| 855751/855751 [00:28<00:00, 30542.23it/s]


Sampling 269736 assertions from 1078946 de assertions


Processing de assertions: 100%|██████████| 269736/269736 [00:08<00:00, 31268.91it/s]


Limiting graph to top 2000 concepts...
Semantic graph built with 2000 nodes and 13518 edges
Inferring semantic categories...
Inferred categories: {'generic': 1503, 'person': 84, 'place': 413}
Generating 5-dimensional concept vectors...
Generated vectors for 2000 concepts
Creating improved readable visualization...
Creating static preview...
Visualization created at: ../Data/Output\semantica_readable_551255a7-0a1c-4798-928b-5d912f2715be.gif
