# 01. Indexing Exploration

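This notebook explores the indexing pipeline for the YOLO codebase step by step: parsing Python source with the Tree-sitter parser, splitting files into chunks, generating embeddings, and preparing chunks for storage.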

In [None]:
# Setup paths: make the project root (the parent of the current working
# directory) importable so the `src.yolo_assistant` package resolves.
import sys
from pathlib import Path

project_root = str(Path.cwd().parent)
# Guard the append so re-running this cell does not keep adding duplicate
# entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

from src.yolo_assistant.config import config
from src.yolo_assistant.indexer import TreeSitterParser, CodeChunker, CodeEmbedder

## 1. Test Tree-sitter Parser

In [None]:
# Parse a small, self-contained Python sample with TreeSitterParser and print
# every element it reports (type, name, line span and, when present, docstring).
test_code = '''
import torch
import torch.nn as nn

class YOLOModel(nn.Module):
    """YOLO model base class."""
    
    def __init__(self, cfg='yolov8n.yaml'):
        super().__init__()
        self.cfg = cfg
        
    def forward(self, x):
        """Forward pass through the network."""
        return self.model(x)
        
def train_model(model, dataloader, epochs=100):
    """Train a YOLO model."""
    optimizer = torch.optim.Adam(model.parameters())
    for epoch in range(epochs):
        pass
'''

# parse_file() takes a path, so the sample is written to a scratch file in the
# current working directory.
test_file = Path('test_yolo.py')

try:
    # Explicit encoding so the bytes on disk do not depend on the platform default.
    test_file.write_text(test_code, encoding='utf-8')

    # Parse the file
    parser = TreeSitterParser()
    elements = parser.parse_file(test_file)

    # Display results
    for element in elements:
        print(f"Type: {element.element_type}")
        print(f"Name: {element.name}")
        print(f"Lines: {element.start_line}-{element.end_line}")
        if element.docstring:
            print(f"Docstring: {element.docstring}")
        print("-" * 50)
finally:
    # Cleanup runs even when parsing or printing raises, so a failed run does
    # not leave the scratch file behind in the working directory.
    if test_file.exists():
        test_file.unlink()

## 2. Test Code Chunker

In [None]:
# Chunk a few real files from the local repository checkout and preview the
# first chunks produced for each file.
chunker = CodeChunker()

# Get a sample from the actual repo (if cloned); the location comes from the
# project config.
repo_path = config.ultralytics_repo_dir
if repo_path.exists():
    # sorted() makes the three-file sample deterministic: glob results come
    # back in filesystem-dependent order, so an unsorted slice could pick
    # different files on different machines or runs.
    sample_files = sorted((repo_path / "ultralytics/models").glob("*.py"))[:3]

    if not sample_files:
        # Say so explicitly instead of finishing the cell with no output.
        print("No Python files found under ultralytics/models.")

    for file_path in sample_files:
        print(f"\nChunking {file_path.name}:")
        chunks = chunker.chunk_file(file_path)

        for chunk in chunks[:3]:  # Show first 3 chunks
            print(f"  - {chunk.chunk_type}: {chunk.name}")
            print(f"    Lines: {chunk.start_line}-{chunk.end_line}")
else:
    print("Repository not cloned yet. Run 'python main.py --index' first.")

## 3. Test Embedding Generation

In [None]:
# Embed one code snippet and then a small batch, reporting the sizes of the
# vectors that come back.
embedder = CodeEmbedder()

# --- Single text ---
test_text = "def train_yolo_model(model, dataset, epochs=100):"
embedding = embedder.embed_text(test_text)

embedding_model_name = embedder.model_name
print(f"Embedding model: {embedding_model_name}")

embedding_dim = len(embedding)
print(f"Embedding dimension: {embedding_dim}")

# Only a short prefix is shown so the output stays readable.
embedding_preview = embedding[:10]
print(f"First 10 values: {embedding_preview}")

# --- Batch of texts ---
test_texts = ["class YOLO(nn.Module):", "def forward(self, x):", "def loss(self, pred, target):"]
batch_embeddings = embedder.embed_batch(test_texts)

# Reported as "<number of embeddings> x <length of the first embedding>".
batch_count = len(batch_embeddings)
batch_dim = len(batch_embeddings[0])
print(f"\nBatch embeddings shape: {batch_count} x {batch_dim}")

## 4. Test Full Indexing Pipeline

In [None]:
# Test the full pipeline on a small directory: chunk it, prepare a handful of
# chunks for storage, and show the structure of the resulting documents.
# Relies on `repo_path`, `chunker` and `embedder` created by the earlier cells.
if not repo_path.exists():
    # Same hint as the chunker cell, so a missing checkout is never silent.
    print("Repository not cloned yet. Run 'python main.py --index' first.")
else:
    test_dir = repo_path / "ultralytics/models/yolo"
    if not test_dir.exists():
        print(f"Directory not found: {test_dir}")
    else:
        # Create test chunks (top level of the directory only).
        chunks = chunker.chunk_directory(test_dir, recursive=False)
        print(f"Found {len(chunks)} chunks")

        # Prepare for storage - just the first 5 chunks as a small sample.
        documents = embedder.prepare_chunks_for_storage(chunks[:5])

        # Check document structure using the first document as representative.
        if documents:
            first_document = documents[0]
            print("\nDocument structure:")
            for key, value in first_document.items():
                # The embedding is summarised separately by its dimension
                # rather than by its type.
                if key != 'embedding':
                    print(f"  - {key}: {type(value)}")
            print(f"  - embedding: list[float] (dim={len(first_document['embedding'])})")
        else:
            print("No documents were prepared for storage.")