In [1]:
# Import necessary libraries
import sqlite3
import numpy as np
from sentence_transformers import SentenceTransformer
import json
import os
from typing import List, Dict, Any
import pandas as pd

# Load the sentence-embedding model once for the whole notebook; the
# module-level `model` is the object the embedding and search code calls
# `encode` on.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'  # Lightweight but effective model
print("Loading sentence transformer model...")
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print("Model loaded successfully!")

def get_db_connection():
    """Open and return a SQLite connection to `commits.db` in the current working directory."""
    working_dir = os.getcwd()
    database_file = os.path.join(working_dir, 'commits.db')
    connection = sqlite3.connect(database_file)
    return connection

def setup_embeddings(max_diff_chars: int = 8000):
    """
    One-time setup to add embeddings to existing commit data.
    Embeds both diffs and reason (explanation) fields.

    Ensures the ``diffs_embedding`` and ``reason_embedding`` TEXT columns exist
    on the ``commits`` table, then encodes every non-blank ``diffs`` / ``reason``
    value whose embedding is still NULL and stores it as a JSON-serialised list.
    Embeddings that are already stored are left untouched, so re-running only
    fills in what is missing.

    Args:
        max_diff_chars: Diffs longer than this many characters are truncated
            before encoding to avoid memory issues.

    Returns:
        None. Progress and a final summary are printed.

    Raises:
        sqlite3.Error: If the schema cannot be read or altered (for example the
            ``commits`` table is missing). Failures while processing a single
            row are printed and that row is skipped.
    """
    conn = get_db_connection()
    try:
        cursor = conn.cursor()

        # Inspect the real schema instead of treating every OperationalError
        # from ALTER TABLE as "column already exists": a missing table raises
        # the same exception type and would be misreported.
        cursor.execute("PRAGMA table_info(commits)")
        existing_columns = {column_info[1] for column_info in cursor.fetchall()}
        for column in ("diffs_embedding", "reason_embedding"):
            if column in existing_columns:
                print(f"{column} column already exists")
            else:
                # `column` comes from the fixed tuple above, never from user input.
                cursor.execute(f"ALTER TABLE commits ADD COLUMN {column} TEXT")
                print(f"Added {column} column to database")

        # Get records that are still missing at least one embedding. The
        # current embedding values are fetched too so that only the missing
        # one is computed for each row.
        cursor.execute("""
            SELECT rowid, diffs, reason, diffs_embedding, reason_embedding
            FROM commits
            WHERE diffs_embedding IS NULL OR reason_embedding IS NULL
        """)
        records = cursor.fetchall()

        if not records:
            print("All commits already have embeddings or no commits found.")
            return

        print(f"Processing {len(records)} commits...")
        processed_diffs = 0
        processed_reasons = 0

        for i, (rowid, diffs, reason, stored_diffs, stored_reason) in enumerate(records):
            try:
                # Process diffs embedding (skip if one is already stored)
                if stored_diffs is None and diffs and diffs.strip():
                    # Slicing is a no-op for short strings, so no length check is needed.
                    diffs_embedding = model.encode(diffs[:max_diff_chars])
                    cursor.execute(
                        "UPDATE commits SET diffs_embedding = ? WHERE rowid = ?",
                        (json.dumps(diffs_embedding.tolist()), rowid)
                    )
                    processed_diffs += 1

                # Process reason/explanation embedding (skip if one is already stored)
                if stored_reason is None and reason and reason.strip():
                    reason_embedding = model.encode(reason)
                    cursor.execute(
                        "UPDATE commits SET reason_embedding = ? WHERE rowid = ?",
                        (json.dumps(reason_embedding.tolist()), rowid)
                    )
                    processed_reasons += 1

                if (i + 1) % 10 == 0:
                    print(f"Processed {i + 1}/{len(records)} commits...")
                    conn.commit()  # Commit periodically so partial progress is kept

            except Exception as e:
                # Best effort: report the failing row and continue with the rest.
                print(f"Error processing record {rowid}: {e}")

        conn.commit()
    finally:
        # Always release the database handle, including on early return or error.
        conn.close()

    print(f"Successfully added embeddings to:")
    print(f"  - {processed_diffs} diffs")
    print(f"  - {processed_reasons} reasons/explanations")

# Run the setup: adds the embedding columns when they are missing and stores
# embeddings for commits that lack them. This executes every time the cell runs.
setup_embeddings()

  from .autonotebook import tqdm as notebook_tqdm


Loading sentence transformer model...
Model loaded successfully!
Added diffs_embedding column to database
Added reason_embedding column to database
Processing 30 commits...
Model loaded successfully!
Added diffs_embedding column to database
Added reason_embedding column to database
Processing 30 commits...
Processed 10/30 commits...
Processed 10/30 commits...
Processed 20/30 commits...
Processed 20/30 commits...
Processed 30/30 commits...
Successfully added embeddings to:
  - 30 diffs
  - 30 reasons/explanations
Processed 30/30 commits...
Successfully added embeddings to:
  - 30 diffs
  - 30 reasons/explanations


In [2]:
def _cosine_similarity(a, b):
    """Return the cosine similarity of two vectors, or 0.0 if either has zero norm."""
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Avoid a NaN score, which would make the later sort order unreliable.
        return 0.0
    return float(np.dot(a, b) / denominator)


def _preview(text, limit=500):
    """Return `text` cut to `limit` characters plus "..." when longer; None passes through."""
    if text is None or len(text) <= limit:
        return text
    return text[:limit] + "..."


def semantic_search_commits(query: str, search_type: str = "both", top_k: int = 10):
    """
    Perform semantic search on commits using embeddings.

    The query is encoded with the module-level `model` and compared by cosine
    similarity against the JSON embeddings stored in the `commits` table.

    Args:
        query: Search query string
        search_type: "diffs", "reason", or "both" to specify which embeddings to search.
            Any other value searches nothing and yields an empty list.
        top_k: Number of top results to return

    Returns:
        List of result dicts sorted by descending similarity, each with keys
        'rowid', 'repo', 'sha', 'title', 'diffs' (cut to 500 characters),
        'reason', 'similarity' and 'search_field'. With "both", a commit can
        appear twice: once per searched field.
    """
    # Searchable field -> column holding its embedding. Dict order keeps the
    # original "diffs first, then reason" ordering of equally scored results.
    embedding_columns = {'diffs': 'diffs_embedding', 'reason': 'reason_embedding'}
    selected_fields = [
        field for field in embedding_columns if search_type in (field, "both")
    ]

    # Encode the query
    query_embedding = model.encode(query)

    results = []
    conn = get_db_connection()
    try:
        cursor = conn.cursor()
        for field in selected_fields:
            # Column names come from the fixed mapping above, not from user input.
            column = embedding_columns[field]
            cursor.execute(f"""
                SELECT rowid, repo, sha, title, diffs, reason, {column}
                FROM commits
                WHERE {column} IS NOT NULL
            """)
            for rowid, repo, sha, title, diffs, reason, embedding_json in cursor.fetchall():
                if not embedding_json:
                    continue
                stored_embedding = np.array(json.loads(embedding_json))
                results.append({
                    'rowid': rowid,
                    'repo': repo,
                    'sha': sha,
                    'title': title,
                    # `diffs` may be NULL for rows that only have a reason embedding.
                    'diffs': _preview(diffs),
                    'reason': reason,
                    'similarity': _cosine_similarity(query_embedding, stored_embedding),
                    'search_field': field
                })
    finally:
        # Close the connection even if a query or JSON decode fails.
        conn.close()

    # Sort by similarity and return top_k
    results.sort(key=lambda hit: hit['similarity'], reverse=True)
    return results[:top_k]

def display_search_results(results):
    """Print each search hit as a labelled, human-readable block."""
    if results:
        # (label, result key) pairs in the order they are printed per hit
        labelled_keys = (
            ("Repo", 'repo'),
            ("SHA", 'sha'),
            ("Title", 'title'),
            ("Search Field", 'search_field'),
            ("Reason", 'reason'),
            ("Diffs (truncated)", 'diffs'),
        )
        for position, hit in enumerate(results, start=1):
            print(f"\n--- Result {position} (Similarity: {hit['similarity']:.4f}) ---")
            for label, key in labelled_keys:
                print(f"{label}: {hit[key]}")
            print("-" * 50)
    else:
        print("No results found.")

# Usage hint: shows how to call the two search helpers defined in this cell
print(
    "\nEmbedding setup complete! You can now use semantic search:",
    "Example: results = semantic_search_commits('bug fix authentication')",
    "         display_search_results(results)",
    sep="\n",
)


Embedding setup complete! You can now use semantic search:
Example: results = semantic_search_commits('bug fix authentication')
         display_search_results(results)


In [5]:
# Test semantic search with different queries.
# Each entry is (heading, query, search_type); every entry runs its own search
# so that the results shown always belong to the heading printed above them.
print("=== Testing Semantic Search ===\n")

search_demos = [
    # Test 1: Search for bug fixes
    ("1. Searching for 'bug fix':", 'bug fix', 'both'),
    # Test 2: Search for feature additions
    ("2. Searching for 'add new feature':", 'add new feature', 'both'),
    # Test 3: Search only in diffs
    ("3. Searching for 'function' in diffs only:", 'function', 'diffs'),
    # Test 4: Search only in reasons
    ("4. Searching for 'update' in reasons only:", 'update', 'reason'),
]

for demo_index, (heading, demo_query, demo_search_type) in enumerate(search_demos):
    if demo_index > 0:
        # Separator between consecutive demos (not before the first one)
        print("\n" + "="*80 + "\n")
    print(heading)
    results = semantic_search_commits(demo_query, search_type=demo_search_type, top_k=3)
    display_search_results(results)

=== Testing Semantic Search ===

1. Searching for 'bug fix':

--- Result 1 (Similarity: 0.4416) ---
Repo: https://github.com/hm-group/gcp-projectfactory
SHA: 0d5d0d0183a1613a065f17773e1e394cd170399c
Title: Merge pull request #15999 from hm-group/projects/experience-service-api/230ab143-833b-42f5-8497-18e6ee83d490
Search Field: reason
Reason: This commit is a basic configuration change for a project. It does not involve any bug fixes, feature additions, or significant improvements.
Diffs (truncated): [{"filename": "projects/experience-service-api.yaml", "diff": "@@ -0,0 +1,16 @@\n+project:\n+  status: active\n+  projectId: experience-service-api\n+  displayName: experience-service-api\n+  costCode: ps0500\n+  folder: autonomous\n+  environment: nprod\n+  owners:\n+    - venkatagiribabu.jeedigunta@hm.com\n+  budget: 100\n+  applicationService: not_set\n+  editorGroup: true\n+  viewerGroup: false\n+  aiadSreMonitoring: false\n+  bucket: false\n+  firebase: false"}]
-----------------------

In [4]:
# Database statistics and summary.
# All queries run inside try/finally so the connection is closed even when one
# of them fails (for example if an expected column is missing).
conn = get_db_connection()
try:
    cursor = conn.cursor()

    print("=== Database Summary ===\n")

    # Total commits
    cursor.execute("SELECT COUNT(*) FROM commits")
    total_commits = cursor.fetchone()[0]
    print(f"Total commits: {total_commits}")

    # Commits with diffs embeddings
    cursor.execute("SELECT COUNT(*) FROM commits WHERE diffs_embedding IS NOT NULL")
    diffs_embedded = cursor.fetchone()[0]
    print(f"Commits with diffs embeddings: {diffs_embedded}")

    # Commits with reason embeddings
    cursor.execute("SELECT COUNT(*) FROM commits WHERE reason_embedding IS NOT NULL")
    reason_embedded = cursor.fetchone()[0]
    print(f"Commits with reason embeddings: {reason_embedded}")

    # Repository breakdown, largest first
    cursor.execute("SELECT repo, COUNT(*) as count FROM commits GROUP BY repo ORDER BY count DESC")
    repo_stats = cursor.fetchall()
    print("\nCommits by repository:")
    for repo, count in repo_stats:
        print(f"  {repo}: {count} commits")

    # File type breakdown, largest first
    cursor.execute("SELECT file_type, COUNT(*) as count FROM commits WHERE file_type IS NOT NULL GROUP BY file_type ORDER BY count DESC")
    file_type_stats = cursor.fetchall()
    print("\nCommits by file type:")
    for file_type, count in file_type_stats[:10]:  # Top 10
        print(f"  {file_type}: {count} commits")
finally:
    conn.close()

print("\n=== Embedding Setup Complete ===")
print("Your commits database now has semantic search capabilities!")
print("You can search through both code diffs and explanations using natural language queries.")

=== Database Summary ===

Total commits: 30
Commits with diffs embeddings: 30
Commits with reason embeddings: 30

Commits by repository:
  https://github.com/hm-group/gcp-projectfactory: 30 commits

Commits by file type:
  yaml: 30 commits

=== Embedding Setup Complete ===
Your commits database now has semantic search capabilities!
You can search through both code diffs and explanations using natural language queries.
