# NeuroGraph Semantic Analysis with Jupyter

This notebook demonstrates how to use the NeuroGraph Python client for semantic analysis and document clustering.

## Setup

In [None]:
# Install dependencies (uncomment if needed).
# seaborn is listed because the imports cell of this notebook uses it, and
# %pip (rather than !pip) installs into the environment of the running kernel.
# %pip install neurograph-python matplotlib scikit-learn pandas numpy seaborn

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

from neurograph import NeuroGraphClient

# Plot defaults applied to every figure in this notebook:
# seaborn's white grid theme and a 12x8 inch default canvas.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

## Connect to NeuroGraph

In [None]:
# Initialize client.
# Connection settings are read from the environment so that real credentials
# never have to be written into (or committed with) the notebook. The
# fallbacks are the values this notebook previously hard-coded, so it still
# runs unchanged when none of the variables is set.
client = NeuroGraphClient(
    base_url=os.environ.get("NEUROGRAPH_URL", "http://localhost:8000"),
    username=os.environ.get("NEUROGRAPH_USERNAME", "developer"),
    password=os.environ.get("NEUROGRAPH_PASSWORD", "developer123"),
)

# Test connection: report the version and status returned by the health check
health = client.health.check()
print(f"Connected to NeuroGraph {health.version}")
print(f"Status: {health.status}")

## Create Sample Documents

In [None]:
# Sample documents, grouped under the category they belong to.
# Both flat lists used by the rest of the notebook are derived from this one
# mapping, so a document and its label always stay aligned.
samples_by_category = {
    'tech': [
        "Python is a high-level programming language",
        "Machine learning algorithms can recognize patterns",
        "Deep neural networks are used in AI",
        "JavaScript is popular for web development",
        "Cloud computing provides scalable infrastructure",
    ],
    'science': [
        "DNA contains genetic information",
        "Photosynthesis converts light into energy",
        "Evolution explains biological diversity",
        "Atoms are the basic units of matter",
        "The theory of relativity revolutionized physics",
    ],
    'business': [
        "Marketing strategies increase brand awareness",
        "Financial planning is essential for businesses",
        "Customer service improves client satisfaction",
        "Supply chain management optimizes operations",
        "Data analytics drives business decisions",
    ],
}

# Flatten in insertion order: all tech texts, then science, then business
documents = [
    sample
    for group in samples_by_category.values()
    for sample in group
]
categories = [
    name
    for name, group in samples_by_category.items()
    for _ in group
]

# Create one token per document, tagging each with its category and origin
print("Creating tokens...")
tokens = []
for text, category in zip(documents, categories):
    token_metadata = {"category": category, "source": "jupyter_notebook"}
    token = client.tokens.create(text=text, metadata=token_metadata)
    tokens.append(token)
    print(f"✓ Created token {token.id}: {text[:50]}...")

print(f"\nCreated {len(tokens)} tokens")

## Extract Embeddings for Analysis

In [None]:
# Walk the created tokens once, collecting the embedding vector, the text and
# the category label of each into three aligned sequences.
vectors, texts, labels = [], [], []
for tok in tokens:
    vectors.append(tok.embedding)
    texts.append(tok.text)
    labels.append(tok.metadata['category'])

# Stack the per-token vectors into one array (one row per token)
embeddings = np.array(vectors)

print(f"Embedding shape: {embeddings.shape}")
print(f"Embedding dimension: {embeddings.shape[1]}")

## Visualize Embeddings with t-SNE

In [None]:
# Project the embeddings down to two dimensions with t-SNE
print("Running t-SNE dimensionality reduction...")
tsne = TSNE(n_components=2, random_state=42, perplexity=5)
embeddings_2d = tsne.fit_transform(embeddings)

# One row per document: its 2-D coordinates, text and category label
x_coords, y_coords = embeddings_2d.T
df = pd.DataFrame({
    'x': x_coords,
    'y': y_coords,
    'text': texts,
    'category': labels
})

# Scatter plot with one colour (and one legend entry) per category
colors = {'tech': 'blue', 'science': 'green', 'business': 'red'}
fig, ax = plt.subplots(figsize=(12, 8))

# sort=False keeps the categories in order of first appearance
for category, members in df.groupby('category', sort=False):
    ax.scatter(
        members['x'],
        members['y'],
        c=colors[category],
        label=category,
        s=100,
        alpha=0.6
    )

ax.set_title('Document Embeddings Visualization (t-SNE)', fontsize=16)
ax.set_xlabel('t-SNE Component 1')
ax.set_ylabel('t-SNE Component 2')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

# Number of documents in each category
print("\nDocuments by category:")
print(df.groupby('category').size())

## Semantic Search

In [None]:
# Search for similar documents
query = "artificial intelligence and neural networks"
top_k = 5  # number of results requested; also used in the heading below
print(f"Query: {query}\n")

# Create query token; its embedding is used as the search vector
query_token = client.tokens.create(text=query)

results_data = []
try:
    # Search
    results = client.tokens.query(
        query_vector=query_token.embedding,
        top_k=top_k
    )
    # NOTE(review): the query token was just created through the same client,
    # so it may itself show up among the results — confirm against the API.

    # Display results, keeping a record of each row for the summary table
    print(f"Top {top_k} similar documents:\n")
    for i, result in enumerate(results, 1):
        record = {
            'rank': i,
            'similarity': result.similarity,
            'text': result.token.text,
            'category': result.token.metadata.get('category')
        }
        results_data.append(record)

        print(f"{i}. Similarity: {record['similarity']:.4f}")
        print(f"   Text: {record['text']}")
        print(f"   Category: {record['category']}\n")
finally:
    # Cleanup query token. Done in `finally` so the temporary token is removed
    # even when the search or the display loop raises.
    client.tokens.delete(query_token.id)

# Create results DataFrame (shown as the cell output)
results_df = pd.DataFrame(results_data)
results_df

## Clustering Analysis

In [None]:
# Perform K-means clustering on the original (high-dimensional) embeddings
n_clusters = 3
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto', so leaving it implicit makes the result depend on the installed
# version (and emits a FutureWarning on some releases).
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
clusters = kmeans.fit_predict(embeddings)

# Add clusters to DataFrame
df['cluster'] = clusters

# Plot the documents at their t-SNE coordinates, coloured by assigned cluster
plt.figure(figsize=(12, 8))
scatter = plt.scatter(
    df['x'],
    df['y'],
    c=df['cluster'],
    cmap='viridis',
    s=100,
    alpha=0.6
)

# Plot cluster centers in 2D space.
# The centres live in the original embedding space and t-SNE cannot project
# new points into an existing layout. Re-fitting t-SNE on the three centres
# alone is not an option either: perplexity (5) must be smaller than the
# number of samples (3), and a separate fit would produce coordinates that
# have nothing to do with the points plotted above. Each cluster is therefore
# marked at the mean 2-D position of its member documents.
centers_2d = df.groupby('cluster')[['x', 'y']].mean().to_numpy()
plt.scatter(
    centers_2d[:, 0],
    centers_2d[:, 1],
    c='red',
    marker='X',
    s=200,
    label='Cluster Centers',
    edgecolors='black'
)

plt.title(f'K-Means Clustering (k={n_clusters})', fontsize=16)
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.colorbar(scatter, label='Cluster')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Analyze clusters: rows are clusters, columns are categories, cells are counts
print("\nCluster composition:")
cluster_composition = df.groupby(['cluster', 'category']).size().unstack(fill_value=0)
print(cluster_composition)

## Similarity Matrix

In [None]:
# Compute similarity matrix (cosine similarity) between every pair of documents
similarity_matrix = cosine_similarity(embeddings)

# Axis labels: document index plus the first 30 characters of its text.
# Built once and shared by both axes.
tick_labels = [f"{i}: {t[:30]}..." for i, t in enumerate(texts)]

# Plot heatmap
plt.figure(figsize=(14, 12))
sns.heatmap(
    similarity_matrix,
    cmap='YlOrRd',
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    cbar_kws={'label': 'Cosine Similarity'}
)
plt.title('Document Similarity Matrix', fontsize=16)
plt.xticks(rotation=45, ha='right', fontsize=8)
plt.yticks(rotation=0, fontsize=8)
plt.tight_layout()
plt.show()

# Find most similar pairs.
# k=1 selects the strict upper triangle, i.e. every unordered pair (i, j) with
# i < j exactly once and never a document paired with itself.
print("\nTop 5 most similar document pairs:")
first_idx, second_idx = np.triu_indices(len(texts), k=1)

similarity_df = pd.DataFrame({
    'doc1': [texts[i][:40] for i in first_idx],
    'doc2': [texts[j][:40] for j in second_idx],
    'similarity': similarity_matrix[first_idx, second_idx]
})
similarity_df = similarity_df.sort_values('similarity', ascending=False)
print(similarity_df.head())

## Cleanup

In [None]:
# Delete created tokens (optional)
print("Cleaning up tokens...")
try:
    for token in tokens:
        client.tokens.delete(token.id)
        print(f"✓ Deleted token {token.id}")

    print("\nCleanup complete!")
finally:
    # Close client. Done in `finally` so the client is closed even if one of
    # the deletions above raises.
    client.close()