# 02. Embedding Evaluation

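This notebook evaluates the quality of code embeddings generated by the Jina model in four steps: pairwise semantic similarity of code snippets, the distribution of embedding values, natural-language query-to-code matching, and clustering of embeddings by code category.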

In [None]:
# Setup
import sys
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

sys.path.append(str(Path.cwd().parent))

from src.yolo_assistant.indexer import CodeEmbedder

## 1. Test Semantic Similarity

In [None]:
# Initialize embedder
embedder = CodeEmbedder()

# Maximum number of characters of each snippet echoed in the report.
PREVIEW_CHARS = 50


def preview(code, limit=PREVIEW_CHARS):
    """Return ``code`` for display, cut to at most ``limit`` characters.

    The trailing "..." is appended only when text was actually removed, so a
    snippet that already fits is shown exactly as written.
    """
    return code if len(code) <= limit else code[:limit] + "..."


# Test cases: similar and dissimilar code snippets
test_cases = [
    # Similar functions (training related)
    ("def train_model(model, data, epochs=100):",
     "def train_yolo(yolo_model, dataset, num_epochs=100):"),

    # Similar functions (inference related)
    ("def predict(model, image):",
     "def inference(net, img):"),

    # Different functions
    ("def train_model(model, data):",
     "def save_checkpoint(state, filename):"),

    # Same concept, different implementation
    ("class YOLO(nn.Module):",
     "class YOLOv8(BaseModel):"),
]

# What each pair is probing, parallel to `test_cases`. Printed next to the
# score so the output can be interpreted without reading the source comments.
test_case_labels = [
    "Similar functions (training related)",
    "Similar functions (inference related)",
    "Different functions",
    "Same concept, different implementation",
]
assert len(test_case_labels) == len(test_cases), "every test case needs a label"

# Calculate similarities
for i, (label, (code1, code2)) in enumerate(zip(test_case_labels, test_cases)):
    emb1 = embedder.embed_text(code1)
    emb2 = embedder.embed_text(code2)

    # Each vector is wrapped in a list to form a single-row input; the result
    # is a 1x1 matrix whose only entry is the pair's similarity.
    similarity = cosine_similarity([emb1], [emb2])[0][0]

    print(f"Test case {i+1}: {label}")
    print(f"  Code 1: {preview(code1)}")
    print(f"  Code 2: {preview(code2)}")
    print(f"  Cosine similarity: {similarity:.4f}")
    print("-" * 50)

## 2. Embedding Distribution Analysis

In [None]:
# Generate embeddings for various code patterns
code_patterns = [
    # Functions
    "def forward(self, x): return self.model(x)",
    "def backward(self, grad): return grad * self.weight",
    "def loss(self, pred, target): return F.cross_entropy(pred, target)",

    # Classes
    "class Model(nn.Module): pass",
    "class Dataset(torch.utils.data.Dataset): pass",
    "class Trainer: pass",

    # Imports
    "import torch",
    "from ultralytics import YOLO",
    "import numpy as np",
]

embeddings = embedder.embed_batch(code_patterns)
embeddings_array = np.array(embeddings)

# Analyze embedding statistics
print("Embedding Statistics:")
print(f"  Shape: {embeddings_array.shape}")
print(f"  Mean: {embeddings_array.mean():.4f}")
print(f"  Std: {embeddings_array.std():.4f}")
print(f"  Min: {embeddings_array.min():.4f}")
print(f"  Max: {embeddings_array.max():.4f}")

# Plot distribution
# Rows of `embeddings_array` are code patterns, columns are embedding dimensions.
n_dims = embeddings_array.shape[1]
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: histogram of every embedding value, pooled over all patterns.
ax_hist.hist(embeddings_array.ravel(), bins=50, alpha=0.7)
ax_hist.set_title('Embedding Value Distribution')
ax_hist.set_xlabel('Value')
ax_hist.set_ylabel('Frequency')

# Right panel: one box per embedding dimension (per column of the 2-D array).
# boxplot places boxes at 1..N by default, which would shift every
# "Dimension Index" tick by one; explicit 0-based positions keep the tick at
# x = k on top of the box for dimension k.
dim_positions = np.arange(n_dims)
ax_box.boxplot(embeddings_array, positions=dim_positions, showfliers=False)
ax_box.set_title('Embedding Dimension Statistics')
ax_box.set_xlabel('Dimension Index')
ax_box.set_ylabel('Value')

# Label only every 100th dimension so the axis stays readable.
tick_positions = list(range(0, n_dims, 100))
ax_box.set_xticks(tick_positions)
ax_box.set_xticklabels(tick_positions)

fig.tight_layout()
plt.show()

## 3. Query-Code Matching

In [None]:
# Test query to code matching
queries = [
    "How to train a YOLO model?",
    "Export model to ONNX format",
    "Data augmentation techniques",
    "Calculate mAP metric",
]

code_snippets = [
    "def train(model, dataloader, epochs=100): optimizer = Adam(model.parameters())",
    "def export_onnx(model, filepath): torch.onnx.export(model, dummy_input, filepath)",
    "def augment_image(image): return transforms.RandomHorizontalFlip()(image)",
    "def calculate_map(predictions, ground_truth): return mean_average_precision(predictions, ground_truth)",
    "def load_dataset(path): return Dataset(path)",
    "def save_checkpoint(model, path): torch.save(model.state_dict(), path)",
]

# queries[i] is written to be answered by code_snippets[i]; the remaining
# snippets (load_dataset, save_checkpoint) have no matching query and act as
# distractors.
expected_match_idx = list(range(len(queries)))

# Embed queries and code
query_embeddings = embedder.embed_batch(queries)
code_embeddings = embedder.embed_batch(code_snippets)

# Calculate similarity matrix: entry [i, j] compares queries[i] with code_snippets[j].
similarity_matrix = cosine_similarity(query_embeddings, code_embeddings)

# Display results
print("Query-Code Similarity Matrix:")
print("(Rows: Queries, Columns: Code snippets)\n")
# Print the matrix the header announces, rounded for readability.
print(np.round(similarity_matrix, 4))
print()

hits = 0
for i, query in enumerate(queries):
    print(f"Query: {query}")
    similarities = similarity_matrix[i]
    best_match_idx = np.argmax(similarities)
    is_expected = bool(best_match_idx == expected_match_idx[i])
    hits += is_expected

    print(f"  Best match: {code_snippets[best_match_idx][:60]}...")
    print(f"  Similarity: {similarities[best_match_idx]:.4f}")
    print(f"  Matches expected snippet: {'yes' if is_expected else 'no'}")
    print()

# Summary: how many queries retrieved their intended snippet as the top result.
print(f"Top-1 retrieval accuracy: {hits}/{len(queries)}")

## 4. Embedding Clustering

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Create diverse code samples.
# The list is grouped by category, SAMPLES_PER_CATEGORY consecutive entries
# each, in the same order as `categories` below.
code_samples = [
    # Training functions
    "def train(model, data): pass",
    "def training_loop(net, dataset): pass",
    "def fit(model, X, y): pass",

    # Loss functions
    "def focal_loss(pred, target): pass",
    "def iou_loss(boxes1, boxes2): pass",
    "def classification_loss(logits, labels): pass",

    # Data processing
    "def preprocess_image(img): pass",
    "def augment_data(batch): pass",
    "def normalize(tensor): pass",

    # Model architecture
    "class ConvBlock(nn.Module): pass",
    "class Backbone(nn.Module): pass",
    "class DetectionHead(nn.Module): pass",
]

SAMPLES_PER_CATEGORY = 3
categories = ['Training', 'Loss', 'Data', 'Architecture']
colors = ['red', 'blue', 'green', 'orange']   # one colour per hand-assigned category
markers = ['o', 's', '^', 'D']                # one marker per KMeans cluster id
assert len(code_samples) == SAMPLES_PER_CATEGORY * len(categories)

# Generate embeddings
sample_embeddings = embedder.embed_batch(code_samples)

# Reduce dimensionality for visualization
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(sample_embeddings)

# Cluster embeddings (on the full-dimensional vectors, not the 2-D projection).
# n_init is pinned because its default differs between scikit-learn releases;
# an explicit value keeps results comparable and avoids the FutureWarning.
kmeans = KMeans(n_clusters=len(categories), n_init=10, random_state=42)
clusters = kmeans.fit_predict(sample_embeddings)

# Visualize: colour = hand-assigned category, marker shape = KMeans cluster.
# Points that share a colour but differ in shape are samples that KMeans split
# away from their intended category.
plt.figure(figsize=(10, 8))

for i, (x, y) in enumerate(embeddings_2d):
    plt.scatter(
        x, y,
        c=colors[i // SAMPLES_PER_CATEGORY],
        marker=markers[clusters[i]],
        s=100,
    )
    plt.annotate(code_samples[i][:20] + '...', (x, y), fontsize=8, alpha=0.7)

plt.title('Code Embedding Clusters (PCA Projection)')
plt.xlabel('First Principal Component')
plt.ylabel('Second Principal Component')

# Add legend: empty scatters act as proxy handles for both encodings.
for i, cat in enumerate(categories):
    plt.scatter([], [], c=colors[i], label=cat, s=100)
for cluster_id, marker in enumerate(markers):
    plt.scatter([], [], c='gray', marker=marker, label=f'KMeans cluster {cluster_id}', s=100)
plt.legend()

plt.grid(True, alpha=0.3)
plt.show()

print(f"PCA explained variance ratio: {pca.explained_variance_ratio_}")

# Report which cluster ids each category's samples received; a category whose
# samples all share one id was recovered as a single cluster.
print("KMeans cluster assignment per category:")
for i, cat in enumerate(categories):
    start = i * SAMPLES_PER_CATEGORY
    assigned = clusters[start:start + SAMPLES_PER_CATEGORY].tolist()
    print(f"  {cat}: {assigned}")