From 9dab07afa69bf794efbd0f356d7d0d0b1d87550d Mon Sep 17 00:00:00 2001 From: Yilin Xia Date: Thu, 11 Sep 2025 16:44:28 -0500 Subject: [PATCH] add embeddings --- backend/app/config.example.json | 6 +- backend/app/config_manager.py | 12 +- backend/app/main.py | 253 ++++---- ...phrag_service.py => deepgit_ai_service.py} | 569 +++++++++++++++--- backend/requirements.txt | 4 +- environment.yml | 4 +- src/lib/config.ts | 23 +- src/styles/_graph.scss | 46 +- src/styles/_layout.scss | 80 ++- src/views/ContextPanel.tsx | 16 +- src/views/GraphRAGPanel.tsx | 230 +++---- start.sh | 2 +- 12 files changed, 858 insertions(+), 387 deletions(-) rename backend/app/services/{graphrag_service.py => deepgit_ai_service.py} (80%) diff --git a/backend/app/config.example.json b/backend/app/config.example.json index c36d8cf..9273c83 100644 --- a/backend/app/config.example.json +++ b/backend/app/config.example.json @@ -24,11 +24,13 @@ "token": "ghp_your-github-personal-access-token-here", "rate_limit_per_hour": 5000 }, - "graphrag": { + "deepgit_ai": { "timeout_minutes": 50, "batch_size": 50, "cache_hours": 24, - "max_repos_per_request": 1000 + "max_repos_per_request": 1000, + "strict_database_only": true, + "include_database_context": true }, "server": { "host": "127.0.0.1", diff --git a/backend/app/config_manager.py b/backend/app/config_manager.py index 3c60180..6c0b1d2 100644 --- a/backend/app/config_manager.py +++ b/backend/app/config_manager.py @@ -62,11 +62,13 @@ def _get_default_config(self) -> Dict[str, Any]: "token": "", "rate_limit_per_hour": 5000 }, - "graphrag": { + "deepgit_ai": { "timeout_minutes": 50, "batch_size": 50, "cache_hours": 24, - "max_repos_per_request": 1000 + "max_repos_per_request": 1000, + "strict_database_only": True, + "include_database_context": True }, "server": { "host": "127.0.0.1", @@ -143,9 +145,9 @@ def get_github_token(self) -> str: """Get GitHub token from configuration.""" return self.get("github.token", "") - def get_graphrag_config(self) -> Dict[str, Any]: - """Get GraphRAG configuration.""" - return self.get("graphrag", {}) + def get_deepgit_ai_config(self) -> Dict[str, Any]: + """Get DeepGitAI configuration.""" + return self.get("deepgit_ai", {}) def get_server_config(self) -> Dict[str, Any]: """Get server configuration.""" diff --git a/backend/app/main.py b/backend/app/main.py index 07a78b0..774f552 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -4,7 +4,7 @@ from services.ai_service import AITopicProcessor from services.gexf_node_service import GexfNodeGenerator from services.edge_generation_service import EdgeGenerationService -from services.graphrag_service import graphrag_service +from services.deepgit_ai_service import DeepGitAIService from config_manager import config_manager import os import asyncio @@ -29,9 +29,10 @@ ai_processor = AITopicProcessor() gexf_node_service = GexfNodeGenerator() edge_generation_service = EdgeGenerationService() +deepgit_ai_service = DeepGitAIService() -# Global progress tracking for GraphRAG setup -graphrag_progress = { +# Global progress tracking for DeepGitAI setup +deepgit_ai_progress = { "current_step": "", "current": 0, "total": 0, @@ -39,32 +40,54 @@ "status": "idle" # idle, running, completed, error } -# Global variable to track if GraphRAG is set up -graphrag_ready = False +# Global variable to track if DeepGitAI is set up +deepgit_ai_ready = False -@app.route("/api/graphrag-health", methods=["GET"]) -def graphrag_health(): - """Check if GraphRAG backend is ready and set up.""" - global graphrag_ready 
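Note on the new config block above: both `strict_database_only` and `include_database_context` default to true and are readable through the existing dot-path accessor, so callers do not need `get_deepgit_ai_config()` for individual flags. A minimal sketch (the `is_strict_mode` helper is illustrative, not part of this patch):

```python
# Sketch: reading the new deepgit_ai flags via the dot-path API that
# config_manager already exposes elsewhere in this codebase.
from config_manager import config_manager

def is_strict_mode() -> bool:
    # Hypothetical helper; defaults mirror config.example.json.
    return bool(config_manager.get("deepgit_ai.strict_database_only", True))

include_context = config_manager.get("deepgit_ai.include_database_context", True)
```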
+@app.route("/api/deepgit-ai-health", methods=["GET"]) +def deepgit_ai_health(): + """Check if DeepGitAI backend is ready and set up.""" + global deepgit_ai_ready try: - if graphrag_ready: + if deepgit_ai_ready: return jsonify({ "success": True, "ready": True, - "message": "GraphRAG backend is ready" + "message": "DeepGitAI backend is ready" }) else: return jsonify({ "success": True, "ready": False, - "message": "GraphRAG backend is not set up" + "message": "DeepGitAI backend is not set up" }), 503 except Exception as e: return jsonify({ "success": False, "ready": False, "error": str(e), - "message": "Error checking GraphRAG health" + "message": "Error checking DeepGitAI health" + }), 500 + + +@app.route("/api/deepgit-ai-scope", methods=["GET"]) +def deepgit_ai_scope(): + """Get information about the database scope and limitations.""" + try: + if not deepgit_ai_ready: + return jsonify({ + "success": False, + "error": "DeepGitAI not initialized", + "message": "Please initialize DeepGitAI first" + }), 503 + + result = deepgit_ai_service.validate_query_scope("") + return jsonify(result) + + except Exception as e: + return jsonify({ + "success": False, + "error": str(e), + "message": "Error getting database scope information" }), 500 @@ -688,33 +711,33 @@ def create_edges_on_graph(): }), 500 -@app.route("/api/graphrag-reset-progress", methods=["POST", "OPTIONS"]) -def graphrag_reset_progress_endpoint(): - """Reset GraphRAG progress status to initial state.""" +@app.route("/api/deepgit-ai-reset-progress", methods=["POST", "OPTIONS"]) +def deepgit_ai_reset_progress_endpoint(): + """Reset DeepGitAI progress status to initial state.""" if request.method == "OPTIONS": return "", 200 - global graphrag_progress - graphrag_progress = { + global deepgit_ai_progress + deepgit_ai_progress = { "current_step": "Initializing...", "current": 0, "total": 100, - "message": "Preparing GraphRAG setup", + "message": "Preparing DeepGitAI setup", "status": "running" } return jsonify({"success": True, "message": "Progress reset"}) -@app.route("/api/graphrag-progress", methods=["GET"]) -def graphrag_progress_endpoint(): - """Server-Sent Events endpoint for GraphRAG progress updates.""" +@app.route("/api/deepgit-ai-progress", methods=["GET"]) +def deepgit_ai_progress_endpoint(): + """Server-Sent Events endpoint for DeepGitAI progress updates.""" def generate(): while True: # Send current progress - data = f"data: {json.dumps(graphrag_progress)}\n\n" + data = f"data: {json.dumps(deepgit_ai_progress)}\n\n" yield data # If completed or error, stop streaming - if graphrag_progress["status"] in ["completed", "error"]: + if deepgit_ai_progress["status"] in ["completed", "error"]: break time.sleep(0.5) # Update every 0.5 seconds for more responsive updates @@ -722,10 +745,10 @@ def generate(): return Response(generate(), mimetype="text/event-stream") -@app.route("/api/graphrag-setup", methods=["POST"]) -def graphrag_setup_endpoint(): - """GraphRAG setup endpoint with progress tracking.""" - global graphrag_progress, graphrag_ready +@app.route("/api/deepgit-ai-setup", methods=["POST"]) +def deepgit_ai_setup_endpoint(): + """DeepGitAI setup endpoint with progress tracking.""" + global deepgit_ai_progress, deepgit_ai_ready try: data = request.get_json() @@ -736,31 +759,31 @@ def graphrag_setup_endpoint(): graph_file = data.get("graphFile", "") session_id = data.get("sessionId", "") - # Merge frontend API keys with config and convert to GraphRAG format - graphrag_api_keys = {} + # Merge frontend API keys with config and convert 
to DeepGitAI format + deepgit_ai_api_keys = {} if provider == "openai": api_key = api_keys.get("openaiKey") or config_manager.get("ai_providers.openai.api_key", "") - graphrag_api_keys = {"openaiKey": api_key} + deepgit_ai_api_keys = {"openaiKey": api_key} elif provider == "azure_openai": api_key = api_keys.get("azureOpenAIKey") or config_manager.get("ai_providers.azure_openai.api_key", "") endpoint = api_keys.get("azureOpenAIEndpoint") or config_manager.get("ai_providers.azure_openai.endpoint", "") deployment = api_keys.get("azureOpenAIDeployment") or config_manager.get("ai_providers.azure_openai.deployment_name", "") - graphrag_api_keys = { + deepgit_ai_api_keys = { "azureOpenAIKey": api_key, "azureOpenAIEndpoint": endpoint, "azureOpenAIDeployment": deployment } elif provider == "google_genai": api_key = api_keys.get("geminiKey") or config_manager.get("ai_providers.google_genai.api_key", "") - graphrag_api_keys = {"geminiKey": api_key} - provider = "gemini" # Map to GraphRAG service provider name + deepgit_ai_api_keys = {"geminiKey": api_key} + provider = "gemini" # Map to DeepGitAI service provider name elif provider == "gemini": # Support direct "gemini" provider selection from frontend api_key = api_keys.get("geminiKey") or config_manager.get("ai_providers.google_genai.api_key", "") - graphrag_api_keys = {"geminiKey": api_key} + deepgit_ai_api_keys = {"geminiKey": api_key} elif provider == "anthropic": api_key = api_keys.get("anthropicKey") or config_manager.get("ai_providers.anthropic.api_key", "") - graphrag_api_keys = {"anthropicKey": api_key} + deepgit_ai_api_keys = {"anthropicKey": api_key} if not graph_file: return jsonify({ @@ -778,7 +801,7 @@ def graphrag_setup_endpoint(): }), 400 # Reset progress - graphrag_progress = { + deepgit_ai_progress = { "current_step": "Checking existing database...", "current": 0, "total": 100, @@ -787,79 +810,79 @@ def graphrag_setup_endpoint(): } # Check if database already exists for this graph - db_check = graphrag_service.check_database_exists(graph_file) + db_check = deepgit_ai_service.check_database_exists(graph_file) if db_check["exists"]: # Database exists, just update provider and README data if needed print(f"🔄 Database exists: {db_check['message']}") - graphrag_service.db_path = Path(db_check["db_path"]) + deepgit_ai_service.db_path = Path(db_check["db_path"]) # Update progress - graphrag_progress["current_step"] = "Updating README data..." - graphrag_progress["current"] = 20 - graphrag_progress["message"] = "Updating README content with new GitHub token" + deepgit_ai_progress["current_step"] = "Updating README data..." + deepgit_ai_progress["current"] = 20 + deepgit_ai_progress["message"] = "Updating README content with new GitHub token" # Update README data with new GitHub token - readme_result = graphrag_service.update_readme_data(github_token, graphrag_progress) + readme_result = deepgit_ai_service.update_readme_data(github_token, deepgit_ai_progress) if not readme_result["success"]: print(f"⚠️ README update failed: {readme_result.get('error', 'Unknown error')}") # Continue anyway, as README update is not critical # Update progress - graphrag_progress["current_step"] = "Database ready..." - graphrag_progress["current"] = 30 - graphrag_progress["message"] = "Database is ready, initializing AI system" + deepgit_ai_progress["current_step"] = "Database ready..." 
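The progress endpoint above emits `data: {...}` frames every 0.5 s until `status` becomes `completed` or `error`, so any SSE-capable client can follow setup. A sketch using `requests` (host and port assume the default local server configuration):

```python
import json
import requests

# Follow DeepGitAI setup progress via the SSE endpoint defined above.
with requests.get("http://127.0.0.1:5000/api/deepgit-ai-progress", stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue  # skip blank keep-alive lines between SSE frames
        progress = json.loads(line[len("data: "):])
        print(f"[{progress['status']}] {progress['current']}/{progress['total']} {progress['message']}")
        if progress["status"] in ("completed", "error"):
            break
```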
+ deepgit_ai_progress["current"] = 30 + deepgit_ai_progress["message"] = "Database is ready, initializing AI system" else: # Database doesn't exist, create it print(f"🆕 Creating new database: {db_check['message']}") - graphrag_progress["current_step"] = "Creating database..." - graphrag_progress["current"] = 10 - graphrag_progress["message"] = "Creating new database from graph data" + deepgit_ai_progress["current_step"] = "Creating database..." + deepgit_ai_progress["current"] = 10 + deepgit_ai_progress["message"] = "Creating new database from graph data" # Setup database from GEXF content with progress updates - setup_result = graphrag_service.setup_database_from_gexf_with_progress(graph_file, github_token, graphrag_progress, session_id) + setup_result = deepgit_ai_service.setup_database_from_gexf_with_progress(graph_file, github_token, deepgit_ai_progress, session_id) if not setup_result["success"]: - graphrag_progress["status"] = "error" - graphrag_progress["message"] = setup_result.get("error", "Setup failed") + deepgit_ai_progress["status"] = "error" + deepgit_ai_progress["message"] = setup_result.get("error", "Setup failed") return jsonify(setup_result), 500 - # Initialize GraphRAG with the selected provider - graphrag_progress["current_step"] = "Initializing AI system..." - graphrag_progress["current"] = 90 - graphrag_progress["message"] = "Setting up AI analysis system" + # Initialize DeepGitAI with the selected provider + deepgit_ai_progress["current_step"] = "Initializing AI system..." + deepgit_ai_progress["current"] = 90 + deepgit_ai_progress["message"] = "Setting up AI analysis system" - init_result = graphrag_service.initialize_graphrag(provider, graphrag_api_keys) + init_result = deepgit_ai_service.initialize_deepgit_ai(provider, deepgit_ai_api_keys) if not init_result["success"]: - graphrag_progress["status"] = "error" - graphrag_progress["message"] = init_result.get("error", "AI initialization failed") + deepgit_ai_progress["status"] = "error" + deepgit_ai_progress["message"] = init_result.get("error", "AI initialization failed") return jsonify(init_result), 500 # Mark as completed and set ready flag - graphrag_progress["status"] = "completed" - graphrag_progress["current"] = 100 - graphrag_progress["message"] = "GraphRAG setup completed successfully!" - graphrag_ready = True + deepgit_ai_progress["status"] = "completed" + deepgit_ai_progress["current"] = 100 + deepgit_ai_progress["message"] = "DeepGitAI setup completed successfully!" 
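For reference, the setup endpoint reads `provider`, `apiKeys`, `graphFile` (raw GEXF content, which is hashed to locate an existing database), and `sessionId` from the JSON body. A hedged example call (key and session values are placeholders, default local server assumed):

```python
import requests

# Kick off DeepGitAI setup; field names match the data.get(...) calls above.
payload = {
    "provider": "openai",
    "apiKeys": {"openaiKey": "sk-placeholder"},              # placeholder key
    "graphFile": open("graph.gexf", encoding="utf-8").read(),  # raw GEXF content, not a path
    "sessionId": "session-123",                              # placeholder session id
}
resp = requests.post("http://127.0.0.1:5000/api/deepgit-ai-setup", json=payload)
print(resp.json())
```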
+ deepgit_ai_ready = True return jsonify({ "success": True, - "message": "GraphRAG setup completed successfully", + "message": "DeepGitAI setup completed successfully", "ready": True }) except Exception as e: - graphrag_progress["status"] = "error" - graphrag_progress["message"] = str(e) + deepgit_ai_progress["status"] = "error" + deepgit_ai_progress["message"] = str(e) return jsonify({ "success": False, "error": str(e), - "message": "An error occurred during GraphRAG setup" + "message": "An error occurred during DeepGitAI setup" }), 500 -@app.route("/api/graphrag-change-provider", methods=["POST"]) -def graphrag_change_provider_endpoint(): - """Change GraphRAG provider without recreating the database.""" +@app.route("/api/deepgit-ai-change-provider", methods=["POST"]) +def deepgit_ai_change_provider_endpoint(): + """Change DeepGitAI provider without recreating the database.""" try: data = request.get_json() @@ -867,31 +890,31 @@ def graphrag_change_provider_endpoint(): provider = data.get("provider", "openai") api_keys = data.get("apiKeys", {}) - # Merge frontend API keys with config and convert to GraphRAG format - graphrag_api_keys = {} + # Merge frontend API keys with config and convert to DeepGitAI format + deepgit_ai_api_keys = {} if provider == "openai": api_key = api_keys.get("openaiKey") or config_manager.get("ai_providers.openai.api_key", "") - graphrag_api_keys = {"openaiKey": api_key} + deepgit_ai_api_keys = {"openaiKey": api_key} elif provider == "azure_openai": api_key = api_keys.get("azureOpenAIKey") or config_manager.get("ai_providers.azure_openai.api_key", "") endpoint = api_keys.get("azureOpenAIEndpoint") or config_manager.get("ai_providers.azure_openai.endpoint", "") deployment = api_keys.get("azureOpenAIDeployment") or config_manager.get("ai_providers.azure_openai.deployment_name", "") - graphrag_api_keys = { + deepgit_ai_api_keys = { "azureOpenAIKey": api_key, "azureOpenAIEndpoint": endpoint, "azureOpenAIDeployment": deployment } elif provider == "google_genai": api_key = api_keys.get("geminiKey") or config_manager.get("ai_providers.google_genai.api_key", "") - graphrag_api_keys = {"geminiKey": api_key} - provider = "gemini" # Map to GraphRAG service provider name + deepgit_ai_api_keys = {"geminiKey": api_key} + provider = "gemini" # Map to DeepGitAI service provider name elif provider == "gemini": # Support direct "gemini" provider selection from frontend api_key = api_keys.get("geminiKey") or config_manager.get("ai_providers.google_genai.api_key", "") - graphrag_api_keys = {"geminiKey": api_key} + deepgit_ai_api_keys = {"geminiKey": api_key} elif provider == "anthropic": api_key = api_keys.get("anthropicKey") or config_manager.get("ai_providers.anthropic.api_key", "") - graphrag_api_keys = {"anthropicKey": api_key} + deepgit_ai_api_keys = {"anthropicKey": api_key} if not provider: return jsonify({ @@ -901,7 +924,7 @@ def graphrag_change_provider_endpoint(): }), 400 # Change provider - result = graphrag_service.change_provider(provider, graphrag_api_keys) + result = deepgit_ai_service.change_provider(provider, deepgit_ai_api_keys) if result["success"]: return jsonify({ @@ -920,8 +943,8 @@ def graphrag_change_provider_endpoint(): }), 500 -@app.route("/api/graphrag-update-readme", methods=["POST"]) -def graphrag_update_readme_endpoint(): +@app.route("/api/deepgit-ai-update-readme", methods=["POST"]) +def deepgit_ai_update_readme_endpoint(): """Update README data with new GitHub token without recreating the database.""" try: data = request.get_json() @@ -937,7 +960,7 @@ 
def graphrag_update_readme_endpoint(): }), 400 # Update README data - result = graphrag_service.update_readme_data(github_token) + result = deepgit_ai_service.update_readme_data(github_token) if result["success"]: return jsonify({ @@ -955,19 +978,19 @@ def graphrag_update_readme_endpoint(): }), 500 -@app.route("/api/graphrag-fix-schema", methods=["POST"]) -def graphrag_fix_schema_endpoint(): +@app.route("/api/deepgit-ai-fix-schema", methods=["POST"]) +def deepgit_ai_fix_schema_endpoint(): """Fix database schema by adding missing README properties.""" try: - if not graphrag_service.db_path: + if not deepgit_ai_service.db_path: return jsonify({ "success": False, "error": "No database path set", - "message": "Please run GraphRAG setup first" + "message": "Please run DeepGitAI setup first" }), 400 # Fix database schema - success = graphrag_service.fix_database_schema(graphrag_service.db_path) + success = deepgit_ai_service.fix_database_schema(deepgit_ai_service.db_path) if success: return jsonify({ @@ -1009,7 +1032,7 @@ def get_config_endpoint(): "rate_limit_per_hour": config_manager.get("github.rate_limit_per_hour", 5000), "has_token": bool(config_manager.get("github.token", "")) }, - "graphrag": config_manager.get_graphrag_config(), + "deepgit_ai": config_manager.get_deepgit_ai_config(), "server": config_manager.get_server_config(), "database": config_manager.get_database_config() } @@ -1029,7 +1052,7 @@ def get_config_endpoint(): @app.route("/api/config/keys", methods=["GET"]) def get_config_keys_endpoint(): - """Get API keys from configuration for GraphRAG setup.""" + """Get API keys from configuration for DeepGitAI setup.""" try: keys_config = { "github": { @@ -1104,9 +1127,9 @@ def create_example_config_endpoint(): }), 500 -@app.route("/api/graphrag", methods=["POST"]) -def graphrag_endpoint(): - """GraphRAG endpoint for AI-powered graph analysis.""" +@app.route("/api/deepgit-ai", methods=["POST"]) +def deepgit_ai_endpoint(): + """DeepGitAI endpoint for AI-powered graph analysis.""" try: data = request.get_json() @@ -1115,31 +1138,31 @@ def graphrag_endpoint(): provider = data.get("provider", "openai") api_keys = data.get("apiKeys", {}) - # Merge frontend API keys with config and convert to GraphRAG format - graphrag_api_keys = {} + # Merge frontend API keys with config and convert to DeepGitAI format + deepgit_ai_api_keys = {} if provider == "openai": api_key = api_keys.get("openaiKey") or config_manager.get("ai_providers.openai.api_key", "") - graphrag_api_keys = {"openaiKey": api_key} + deepgit_ai_api_keys = {"openaiKey": api_key} elif provider == "azure_openai": api_key = api_keys.get("azureOpenAIKey") or config_manager.get("ai_providers.azure_openai.api_key", "") endpoint = api_keys.get("azureOpenAIEndpoint") or config_manager.get("ai_providers.azure_openai.endpoint", "") deployment = api_keys.get("azureOpenAIDeployment") or config_manager.get("ai_providers.azure_openai.deployment_name", "") - graphrag_api_keys = { + deepgit_ai_api_keys = { "azureOpenAIKey": api_key, "azureOpenAIEndpoint": endpoint, "azureOpenAIDeployment": deployment } elif provider == "google_genai": api_key = api_keys.get("geminiKey") or config_manager.get("ai_providers.google_genai.api_key", "") - graphrag_api_keys = {"geminiKey": api_key} - provider = "gemini" # Map to GraphRAG service provider name + deepgit_ai_api_keys = {"geminiKey": api_key} + provider = "gemini" # Map to DeepGitAI service provider name elif provider == "gemini": # Support direct "gemini" provider selection from frontend api_key = 
api_keys.get("geminiKey") or config_manager.get("ai_providers.google_genai.api_key", "") - graphrag_api_keys = {"geminiKey": api_key} + deepgit_ai_api_keys = {"geminiKey": api_key} elif provider == "anthropic": api_key = api_keys.get("anthropicKey") or config_manager.get("ai_providers.anthropic.api_key", "") - graphrag_api_keys = {"anthropicKey": api_key} + deepgit_ai_api_keys = {"anthropicKey": api_key} if not query: return jsonify({ @@ -1148,13 +1171,13 @@ def graphrag_endpoint(): "message": "Please provide a query" }), 400 - # Initialize GraphRAG with the selected provider if not already initialized - init_result = graphrag_service.initialize_graphrag(provider, graphrag_api_keys) + # Initialize DeepGitAI with the selected provider if not already initialized + init_result = deepgit_ai_service.initialize_deepgit_ai(provider, deepgit_ai_api_keys) if not init_result["success"]: return jsonify(init_result), 500 # Execute the query - query_result = graphrag_service.query_graphrag(query) + query_result = deepgit_ai_service.query_deepgit_ai(query) if not query_result["success"]: return jsonify(query_result), 500 @@ -1167,23 +1190,23 @@ def graphrag_endpoint(): return jsonify({ "success": False, "error": str(e), - "message": "An error occurred while processing the GraphRAG query" + "message": "An error occurred while processing the DeepGitAI query" }), 500 -@app.route("/api/graphrag-cleanup", methods=["POST"]) -def graphrag_cleanup_endpoint(): - """Clean up GraphRAG database and resources when user closes window.""" +@app.route("/api/deepgit-ai-cleanup", methods=["POST"]) +def deepgit_ai_cleanup_endpoint(): + """Clean up DeepGitAI database and resources when user closes window.""" try: data = request.get_json() or {} session_id = data.get("sessionId", "") - # Clean up GraphRAG resources - result = graphrag_service.cleanup(session_id) + # Clean up DeepGitAI resources + result = deepgit_ai_service.cleanup(session_id) return jsonify({ "success": True, - "message": "GraphRAG cleanup completed successfully", + "message": "DeepGitAI cleanup completed successfully", "details": result }) @@ -1191,13 +1214,13 @@ def graphrag_cleanup_endpoint(): return jsonify({ "success": False, "error": str(e), - "message": "An error occurred during GraphRAG cleanup" + "message": "An error occurred during DeepGitAI cleanup" }), 500 -@app.route("/api/graphrag-check-changes", methods=["POST"]) -def graphrag_check_changes_endpoint(): - """Check if GraphRAG database needs to be rebuilt due to graph changes.""" +@app.route("/api/deepgit-ai-check-changes", methods=["POST"]) +def deepgit_ai_check_changes_endpoint(): + """Check if DeepGitAI database needs to be rebuilt due to graph changes.""" try: data = request.get_json() or {} gexf_content = data.get("gexfContent", "") @@ -1210,7 +1233,7 @@ def graphrag_check_changes_endpoint(): }), 400 # Check if database should be rebuilt - result = graphrag_service.should_rebuild_database(gexf_content) + result = deepgit_ai_service.should_rebuild_database(gexf_content) return jsonify({ "success": True, diff --git a/backend/app/services/graphrag_service.py b/backend/app/services/deepgit_ai_service.py similarity index 80% rename from backend/app/services/graphrag_service.py rename to backend/app/services/deepgit_ai_service.py index 38fe1d4..dae7eff 100644 --- a/backend/app/services/graphrag_service.py +++ b/backend/app/services/deepgit_ai_service.py @@ -10,8 +10,9 @@ import csv import hashlib from pathlib import Path -from typing import Dict, Any, Optional +from typing import Dict, Any, 
Optional, List
 from dataclasses import dataclass
+from config_manager import ConfigManager
 # from collections import defaultdict  # Removed unused import

 # Import required libraries
@@ -39,10 +40,11 @@
     from langgraph.prebuilt import ToolNode
     from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
     from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
-    from langchain_openai import ChatOpenAI
-    from langchain_google_genai import ChatGoogleGenerativeAI
+    from langchain_openai import ChatOpenAI, OpenAIEmbeddings, AzureOpenAIEmbeddings
+    from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
     from langchain_core.tools import tool
     from langchain_core.output_parsers import JsonOutputParser
+    from langchain_community.embeddings import HuggingFaceEmbeddings
     from pydantic import BaseModel, Field
 except ImportError as e:
     print(f"❌ Failed to import LangGraph modules: {e}")
@@ -56,14 +58,17 @@
     MessagesPlaceholder = None
     ChatOpenAI = None
     ChatGoogleGenerativeAI = None
+    OpenAIEmbeddings = AzureOpenAIEmbeddings = None
+    GoogleGenerativeAIEmbeddings = None
+    HuggingFaceEmbeddings = None
     tool = None
     JsonOutputParser = None
     BaseModel = None
     Field = None

 @dataclass
-class GraphRAGState:
-    """State for the GraphRAG workflow."""
+class DeepGitAIState:
+    """State for the DeepGitAI workflow."""
     messages: list = None
     query: str = ""
     graph_results: dict = None
@@ -79,23 +84,126 @@ def __post_init__(self):
         if self.readme_content is None:
             self.readme_content = []

-class GraphRAGService:
-    """Service for handling GraphRAG operations."""
+class DeepGitAIService:
+    """Service for handling DeepGitAI operations."""

     def __init__(self):
         self.db_path = None
-        self.graphrag_instance = None
+        self.deepgit_ai_instance = None
         self.session_id = None  # Track which session owns this database
         # Use DuckDB in cache folder for README storage
         self.cache_dir = Path(__file__).parent.parent / "cache"
         self.cache_dir.mkdir(exist_ok=True)
         self.readme_cache_db_path = self.cache_dir / "readme_cache.duckdb"
         self._cache_stats = {"hits": 0, "misses": 0}
+        # Embedding model for README content
+        self.embedding_model = None
+        self.embedding_provider = None

     def get_cache_key(self, owner, repo):
         """Generate a cache key for a repository."""
         return hashlib.md5(f"{owner}/{repo}".encode()).hexdigest()

+    def initialize_embeddings(self, provider: str, api_keys: Dict[str, str]) -> Dict[str, Any]:
+        """Initialize the embedding model based on the provider."""
+        try:
+            if provider == "openai":
+                api_key = api_keys.get("openaiKey", "")
+                if not api_key:
+                    return {"success": False, "error": "OpenAI API key not provided"}
+
+                self.embedding_model = OpenAIEmbeddings(
+                    model="text-embedding-3-small",
+                    api_key=api_key
+                )
+                self.embedding_provider = "openai"
+
+            elif provider == "azure_openai":
+                api_key = api_keys.get("azureOpenAIKey", "")
+                endpoint = api_keys.get("azureOpenAIEndpoint", "")
+                deployment = api_keys.get("azureOpenAIDeployment", "")
+
+                if not api_key or not endpoint or not deployment:
+                    return {"success": False, "error": "Azure OpenAI credentials not provided"}
+
+                # Azure endpoints need the dedicated class; OpenAIEmbeddings rejects azure_endpoint
+                self.embedding_model = AzureOpenAIEmbeddings(
+                    azure_deployment=deployment,
+                    api_key=api_key,
+                    azure_endpoint=endpoint.rstrip('/'),
+                    api_version="2024-02-15-preview"
+                )
+                self.embedding_provider = "azure_openai"
+
+            elif provider == "gemini":
+                api_key = api_keys.get("geminiKey", "")
+                if not api_key:
+                    return {"success": False, "error": "Gemini API key not provided"}
+
+                self.embedding_model = GoogleGenerativeAIEmbeddings(
+                    model="models/embedding-001",
+                    google_api_key=api_key
+                )
+                self.embedding_provider = "gemini"
+
+            else:
+                # Fallback to HuggingFace embeddings (no API key required)
+                try:
+                    self.embedding_model = HuggingFaceEmbeddings(
+                        model_name="sentence-transformers/all-MiniLM-L6-v2",
+                        model_kwargs={'device': 'cpu'}
+                    )
+                    self.embedding_provider = "huggingface"
+                    print("Using HuggingFace embeddings as fallback")
+                except Exception as e:
+                    return {"success": False, "error": f"Failed to initialize HuggingFace embeddings: {e}"}
+
+            return {"success": True, "message": f"Embeddings initialized with {self.embedding_provider}"}
+
+        except Exception as e:
+            return {"success": False, "error": str(e), "message": f"Failed to initialize embeddings with {provider}"}
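As a quick sanity check of the provider branches above, the service can be driven directly; a sketch (any unrecognized provider name falls through to the HuggingFace path, which needs no API key but downloads the MiniLM model on first use):

```python
# Sketch: exercising initialize_embeddings() and embed_query() directly.
service = DeepGitAIService()
result = service.initialize_embeddings("huggingface", api_keys={})
assert result["success"], result.get("error")

vector = service.embedding_model.embed_query("graph database for code search")
print(service.embedding_provider, len(vector))  # all-MiniLM-L6-v2 produces 384 dims
```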
+    def create_readme_embedding(self, readme_content: str) -> Optional[List[float]]:
+        """Create embedding for README content."""
+        if not self.embedding_model or not readme_content:
+            return None
+
+        try:
+            # Clean and truncate content for embedding
+            cleaned_content = self.clean_text_for_embedding(readme_content)
+            if not cleaned_content:
+                return None
+
+            # Create embedding
+            embedding = self.embedding_model.embed_query(cleaned_content)
+            return embedding
+
+        except Exception as e:
+            print(f"Error creating embedding: {e}")
+            return None
+
+    def clean_text_for_embedding(self, text: str) -> str:
+        """Clean text for embedding generation."""
+        if not text or not isinstance(text, str):
+            return ""
+
+        # Remove markdown formatting (fenced blocks must be stripped before
+        # inline backticks, or the ``` fences are consumed pair-by-pair first)
+        text = re.sub(r'#+\s*', '', text)  # Remove headers
+        text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)  # Remove bold
+        text = re.sub(r'\*(.*?)\*', r'\1', text)  # Remove italic
+        text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)  # Remove fenced code blocks
+        text = re.sub(r'`(.*?)`', r'\1', text)  # Unwrap inline code
+        text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)  # Remove links, keep text
+
+        # Remove extra whitespace and newlines
+        text = re.sub(r'\s+', ' ', text)
+        text = text.strip()
+
+        # Truncate if too long (most embedding models have limits)
+        if len(text) > 8000:  # Conservative limit
+            text = text[:8000] + "..."
+
+        return text
+
     def get_cached_repos_batch(self, repo_list):
         """Get list of repositories that already have cached READMEs."""
         if not os.path.exists(self.readme_cache_db_path):
@@ -214,7 +322,7 @@ def get_cached_readme(self, owner, repo):
         return None

     def cache_readme(self, owner, repo, content):
-        """Store README content in cache database, including 'no README' cases."""
+        """Store README content and embedding in cache database, including 'no README' cases."""
         try:
             # Initialize cache database if it doesn't exist
             self._initialize_cache_database()
@@ -226,16 +334,25 @@ def cache_readme(self, owner, repo, content):
                 # Clean the content for database storage
                 cleaned_content = self.clean_text_for_csv(content)
                 content_length = len(content)
+
+                # Create embedding if embedding model is available
+                embedding = None
+                embedding_provider = None
+                if self.embedding_model:
+                    embedding = self.create_readme_embedding(content)
+                    embedding_provider = self.embedding_provider
             else:
                 # Cache "no README" case to avoid repeated GitHub API calls
                 cleaned_content = ""  # Empty string indicates no README found
                 content_length = 0
+                embedding = None
+                embedding_provider = None

-            # Insert or update README content in cache database
+            # Insert or update README content and embedding in cache database
             query = """
                 INSERT OR REPLACE INTO repository_cache
-                (owner, repo, readme_content, readme_length, cached_at)
-                VALUES (?, ?, ?, ?, ?)
+ (owner, repo, readme_content, readme_length, readme_embedding, embedding_provider, cached_at) + VALUES (?, ?, ?, ?, ?, ?, ?) """ conn.execute(query, [ @@ -243,6 +360,8 @@ def cache_readme(self, owner, repo, content): repo, cleaned_content, content_length, + embedding, + embedding_provider, int(time.time()) ]) @@ -266,6 +385,8 @@ def _initialize_cache_database(self): repo VARCHAR, readme_content TEXT, readme_length BIGINT, + readme_embedding DOUBLE[], + embedding_provider VARCHAR, cached_at BIGINT, PRIMARY KEY (owner, repo) ) @@ -446,6 +567,8 @@ def create_kuzu_database(self, gexf_file, db_path): stargazers STRING, readme_content STRING, readme_length INT64, + readme_embedding DOUBLE[], + embedding_provider STRING, PRIMARY KEY (id) ) """ @@ -541,7 +664,7 @@ def create_kuzu_database(self, gexf_file, db_path): node_id, label, github_url, stars, forks, watchers, is_archived, language_count, pull_requests, issues, primary_language, created_at_year, license_info, topics, - contributors, stargazers, '', 0 # readme_content, readme_length + contributors, stargazers, '', 0, None, None # readme_content, readme_length, readme_embedding, embedding_provider ]) node_csv_path = node_csv.name @@ -680,7 +803,7 @@ def get_github_readme_optimized(self, owner, repo, token=None): headers = { 'Accept': 'application/vnd.github.v3+json', - 'User-Agent': 'DeepGit-GraphRAG/1.0' + 'User-Agent': 'DeepGit-DeepGitAI/1.0' } if token: headers['Authorization'] = f'token {token}' @@ -818,7 +941,15 @@ def add_readme_to_database(self, db_path, token=None): # Clean the content for CSV storage cleaned_content = self.clean_text_for_csv(readme_content) original_length = len(readme_content) - readme_data.append((repo_id, cleaned_content, original_length)) + + # Create embedding if embedding model is available + embedding = None + embedding_provider = None + if self.embedding_model: + embedding = self.create_readme_embedding(readme_content) + embedding_provider = self.embedding_provider + + readme_data.append((repo_id, cleaned_content, original_length, embedding, embedding_provider)) print(f" ✓ Found README ({original_length} characters, stored: {len(cleaned_content)} characters)") else: print(f" ✗ No README found") @@ -832,8 +963,8 @@ def add_readme_to_database(self, db_path, token=None): # Create temporary CSV file for README data with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False, newline='') as readme_csv: readme_writer = csv.writer(readme_csv, quoting=csv.QUOTE_ALL) - for repo_id, content, length in readme_data: - readme_writer.writerow([repo_id, content, length]) + for repo_id, content, length, embedding, embedding_provider in readme_data: + readme_writer.writerow([repo_id, content, length, embedding, embedding_provider]) readme_csv_path = readme_csv.name # Import README data with parallel disabled @@ -925,7 +1056,14 @@ def add_readme_to_database_optimized(self, db_path, token=None): original_length = len(cached_content) else: original_length = len(str(cached_content)) - readme_data.append((repo_id, cleaned_content, original_length)) + # Create embedding if embedding model is available + embedding = None + embedding_provider = None + if self.embedding_model: + embedding = self.create_readme_embedding(cached_content) + embedding_provider = self.embedding_provider + + readme_data.append((repo_id, cleaned_content, original_length, embedding, embedding_provider)) successful += 1 continue @@ -935,7 +1073,15 @@ def add_readme_to_database_optimized(self, db_path, token=None): # Clean the content for CSV storage 
cleaned_content = self.clean_text_for_csv(readme_content) original_length = len(readme_content) - readme_data.append((repo_id, cleaned_content, original_length)) + + # Create embedding if embedding model is available + embedding = None + embedding_provider = None + if self.embedding_model: + embedding = self.create_readme_embedding(readme_content) + embedding_provider = self.embedding_provider + + readme_data.append((repo_id, cleaned_content, original_length, embedding, embedding_provider)) successful += 1 print(f" ✓ Found README ({original_length} characters, stored: {len(cleaned_content)} characters)") else: @@ -960,8 +1106,8 @@ def add_readme_to_database_optimized(self, db_path, token=None): # Create temporary CSV file for README data with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False, newline='') as readme_csv: readme_writer = csv.writer(readme_csv, quoting=csv.QUOTE_ALL) - for repo_id, content, length in readme_data: - readme_writer.writerow([repo_id, content, length]) + for repo_id, content, length, embedding, embedding_provider in readme_data: + readme_writer.writerow([repo_id, content, length, embedding, embedding_provider]) readme_csv_path = readme_csv.name # Import README data with parallel disabled @@ -1126,7 +1272,15 @@ def add_readme_to_database_with_progress(self, db_path, token=None, progress_dic if cached_content: # Clean the content for CSV storage cleaned_content = self.clean_text_for_csv(cached_content) - readme_data.append((repo_id, cleaned_content, cached_length)) + + # Create embedding if embedding model is available + embedding = None + embedding_provider = None + if self.embedding_model: + embedding = self.create_readme_embedding(cached_content) + embedding_provider = self.embedding_provider + + readme_data.append((repo_id, cleaned_content, cached_length, embedding, embedding_provider)) successful += 1 else: failed += 1 @@ -1142,7 +1296,15 @@ def add_readme_to_database_with_progress(self, db_path, token=None, progress_dic # Clean the content for CSV storage cleaned_content = self.clean_text_for_csv(readme_content) original_length = len(readme_content) - readme_data.append((repo_id, cleaned_content, original_length)) + + # Create embedding if embedding model is available + embedding = None + embedding_provider = None + if self.embedding_model: + embedding = self.create_readme_embedding(readme_content) + embedding_provider = self.embedding_provider + + readme_data.append((repo_id, cleaned_content, original_length, embedding, embedding_provider)) successful += 1 else: failed += 1 @@ -1185,22 +1347,24 @@ def add_readme_to_database_with_progress(self, db_path, token=None, progress_dic if processed % 100 == 0 and readme_data: try: # Insert current batch to database - for repo_id, content, length in readme_data: + for repo_id, content, length, embedding, embedding_provider in readme_data: try: conn.execute(""" MERGE (r:Repository {id: $repo_id}) - SET r.readme_content = $content, r.readme_length = $length - """, parameters={"repo_id": repo_id, "content": content, "length": length}) + SET r.readme_content = $content, r.readme_length = $length, r.readme_embedding = $embedding, r.embedding_provider = $embedding_provider + """, parameters={"repo_id": repo_id, "content": content, "length": length, "embedding": embedding, "embedding_provider": embedding_provider}) except Exception as e: if "Cannot find property" in str(e): # Try to add the properties first conn.execute("ALTER TABLE Repository ADD COLUMN readme_content STRING") conn.execute("ALTER TABLE 
Repository ADD COLUMN readme_length INT64") + conn.execute("ALTER TABLE Repository ADD COLUMN readme_embedding DOUBLE[]") + conn.execute("ALTER TABLE Repository ADD COLUMN embedding_provider STRING") # Retry the insertion conn.execute(""" MERGE (r:Repository {id: $repo_id}) - SET r.readme_content = $content, r.readme_length = $length - """, parameters={"repo_id": repo_id, "content": content, "length": length}) + SET r.readme_content = $content, r.readme_length = $length, r.readme_embedding = $embedding, r.embedding_provider = $embedding_provider + """, parameters={"repo_id": repo_id, "content": content, "length": length, "embedding": embedding, "embedding_provider": embedding_provider}) else: print(f" ⚠️ Checkpoint insertion failed for {repo_id}: {e}") @@ -1227,7 +1391,7 @@ def add_readme_to_database_with_progress(self, db_path, token=None, progress_dic # Insert remaining README data directly into Repository table if readme_data: print(f"\nInserting final batch of {len(readme_data)} README files into database...") - for repo_id, content, length in readme_data: + for repo_id, content, length, embedding, embedding_provider in readme_data: try: # Try to insert README data with error handling for missing properties conn.execute(""" @@ -1245,8 +1409,8 @@ def add_readme_to_database_with_progress(self, db_path, token=None, progress_dic # Retry the insertion conn.execute(""" MERGE (r:Repository {id: $repo_id}) - SET r.readme_content = $content, r.readme_length = $length - """, parameters={"repo_id": repo_id, "content": content, "length": length}) + SET r.readme_content = $content, r.readme_length = $length, r.readme_embedding = $embedding, r.embedding_provider = $embedding_provider + """, parameters={"repo_id": repo_id, "content": content, "length": length, "embedding": embedding, "embedding_provider": embedding_provider}) except Exception as retry_e: print(f" ❌ Failed to add properties or retry insertion: {retry_e}") else: @@ -1304,7 +1468,7 @@ def setup_database_from_gexf(self, gexf_content: str, github_token: str, session gexf_hash = hashlib.md5(gexf_content.encode()).hexdigest() # Look for existing database with the same hash - existing_db_path = kuzu_dir / f"graphrag_db_{gexf_hash}" + existing_db_path = kuzu_dir / f"deepgit_ai_db_{gexf_hash}" if existing_db_path.exists(): # Database already exists for this graph, reuse it @@ -1409,7 +1573,7 @@ def setup_database_from_gexf_with_progress(self, gexf_content: str, github_token gexf_hash = hashlib.md5(gexf_content.encode()).hexdigest() # Look for existing database with the same hash - existing_db_path = kuzu_dir / f"graphrag_db_{gexf_hash}" + existing_db_path = kuzu_dir / f"deepgit_ai_db_{gexf_hash}" if existing_db_path.exists(): # Database already exists for this graph, reuse it @@ -1574,7 +1738,7 @@ def check_database_exists(self, gexf_content: str) -> Dict[str, Any]: # Look for existing database with the same hash kuzu_dir = Path(__file__).parent.parent / "kuzu" - existing_db_path = kuzu_dir / f"graphrag_db_{gexf_hash}" + existing_db_path = kuzu_dir / f"deepgit_ai_db_{gexf_hash}" if not existing_db_path.exists(): return { @@ -1652,7 +1816,7 @@ def update_readme_data(self, github_token: str, progress_dict: dict = None) -> D } def change_provider(self, provider: str, api_keys: Dict[str, str]) -> Dict[str, Any]: - """Change the GraphRAG provider without recreating the database.""" + """Change the DeepGitAI provider without recreating the database.""" try: if not self.db_path or not os.path.exists(self.db_path): return { @@ -1661,8 +1825,8 @@ def 
change_provider(self, provider: str, api_keys: Dict[str, str]) -> Dict[str, "message": "Database not found" } - # Initialize GraphRAG with new provider - result = self.initialize_graphrag(provider, api_keys) + # Initialize DeepGitAI with new provider + result = self.initialize_deepgit_ai(provider, api_keys) if result["success"]: return { @@ -1680,8 +1844,8 @@ def change_provider(self, provider: str, api_keys: Dict[str, str]) -> Dict[str, "message": "Failed to change provider" } - def initialize_graphrag(self, provider: str, api_keys: Dict[str, str]) -> Dict[str, Any]: - """Initialize the GraphRAG system with the specified provider.""" + def initialize_deepgit_ai(self, provider: str, api_keys: Dict[str, str]) -> Dict[str, Any]: + """Initialize the DeepGitAI system with the specified provider.""" try: # Check if required modules are available if StateGraph is None or END is None: @@ -1698,6 +1862,12 @@ def initialize_graphrag(self, provider: str, api_keys: Dict[str, str]) -> Dict[s "message": "Database not found" } + # Initialize embeddings first + embedding_result = self.initialize_embeddings(provider, api_keys) + if not embedding_result["success"]: + print(f"⚠️ Embedding initialization failed: {embedding_result.get('error', 'Unknown error')}") + # Continue without embeddings - not critical for basic functionality + # Set environment variables for API keys if provider == "openai": os.environ["OPENAI_API_KEY"] = api_keys.get("openaiKey", "") @@ -1708,33 +1878,205 @@ def initialize_graphrag(self, provider: str, api_keys: Dict[str, str]) -> Dict[s elif provider == "gemini": os.environ["GEMINI_API_KEY"] = api_keys.get("geminiKey", "") - # Initialize GraphRAG - self.graphrag_instance = MultiLLMGraphRAG(str(self.db_path), provider) + # Initialize DeepGitAI + self.deepgit_ai_instance = MultiLLMDeepGitAI(str(self.db_path), provider) return { "success": True, - "message": f"GraphRAG initialized with {provider}" + "message": f"DeepGitAI initialized with {provider}", + "embeddings_initialized": embedding_result["success"] } except Exception as e: return { "success": False, "error": str(e), - "message": f"Failed to initialize GraphRAG with {provider}" + "message": f"Failed to initialize DeepGitAI with {provider}" } - def query_graphrag(self, query: str) -> Dict[str, Any]: - """Execute a query using the GraphRAG system.""" + def semantic_search_readmes(self, query: str, limit: int = 10) -> List[Dict[str, Any]]: + """Perform semantic search on README content using embeddings.""" + if not self.embedding_model or not self.db_path: + return [] + try: - if not self.graphrag_instance: + # Create embedding for the query + query_embedding = self.create_readme_embedding(query) + if not query_embedding: + return [] + + # Connect to database + db = kuzu.Database(str(self.db_path), read_only=True) + conn = kuzu.Connection(db) + + # Get all repositories with embeddings + repos_query = """ + MATCH (r:Repository) + WHERE r.readme_embedding IS NOT NULL AND r.readme_content IS NOT NULL AND r.readme_content <> '' + RETURN r.id, r.readme_content, r.readme_embedding, r.stars, r.primaryLanguage, r.topics + """ + + result = conn.execute(repos_query).get_as_df() + + if result.empty: + return [] + + # Calculate cosine similarity for each repository + similarities = [] + for _, row in result.iterrows(): + repo_id = row['r.id'] + readme_content = row['r.readme_content'] + embedding = row['r.readme_embedding'] + stars = row['r.stars'] + language = row['r.primaryLanguage'] + topics = row['r.topics'] + + if embedding and 
len(embedding) > 0: + # Calculate cosine similarity + similarity = self._cosine_similarity(query_embedding, embedding) + similarities.append({ + 'repo_id': repo_id, + 'readme_content': readme_content, + 'similarity': similarity, + 'stars': stars, + 'language': language, + 'topics': topics + }) + + # Sort by similarity and return top results + similarities.sort(key=lambda x: x['similarity'], reverse=True) + return similarities[:limit] + + except Exception as e: + print(f"Error in semantic search: {e}") + return [] + + def _cosine_similarity(self, vec1: List[float], vec2: List[float]) -> float: + """Calculate cosine similarity between two vectors.""" + try: + import numpy as np + + # Convert to numpy arrays + a = np.array(vec1) + b = np.array(vec2) + + # Calculate cosine similarity + dot_product = np.dot(a, b) + norm_a = np.linalg.norm(a) + norm_b = np.linalg.norm(b) + + if norm_a == 0 or norm_b == 0: + return 0.0 + + return dot_product / (norm_a * norm_b) + + except Exception as e: + print(f"Error calculating cosine similarity: {e}") + return 0.0 + + def validate_query_scope(self, query: str) -> Dict[str, Any]: + """Validate that the query is within the scope of the database.""" + try: + # Connect to database to check available data + db = kuzu.Database(str(self.db_path), read_only=True) + conn = kuzu.Connection(db) + + # Get database statistics + repo_count = conn.execute("MATCH (r:Repository) RETURN COUNT(r)").get_as_df().iloc[0, 0] + readme_count = conn.execute(""" + MATCH (r:Repository) + WHERE r.readme_content IS NOT NULL AND r.readme_content <> '' + RETURN COUNT(r) + """).get_as_df().iloc[0, 0] + + # Get available languages + languages = conn.execute(""" + MATCH (r:Repository) + WHERE r.primaryLanguage IS NOT NULL AND r.primaryLanguage <> '' + RETURN DISTINCT r.primaryLanguage + ORDER BY r.primaryLanguage + """).get_as_df() + + # Get available topics + topics = conn.execute(""" + MATCH (r:Repository) + WHERE r.topics IS NOT NULL AND r.topics <> '' + RETURN DISTINCT r.topics + ORDER BY r.topics + """).get_as_df() + + available_languages = languages['r.primaryLanguage'].tolist() if not languages.empty else [] + available_topics = topics['r.topics'].tolist() if not topics.empty else [] + + return { + "success": True, + "database_stats": { + "total_repositories": repo_count, + "repositories_with_readmes": readme_count, + "available_languages": available_languages[:20], # Limit to first 20 + "available_topics": available_topics[:20], # Limit to first 20 + "database_path": str(self.db_path) + }, + "query_scope": "limited_to_database", + "message": f"Query will be limited to {repo_count} repositories in the database" + } + + except Exception as e: + return { + "success": False, + "error": str(e), + "message": "Failed to validate query scope" + } + + def add_database_context_to_query(self, query: str) -> str: + """Add database context information to the query to limit LLM responses.""" + try: + # Get database statistics + scope_info = self.validate_query_scope(query) + + if not scope_info["success"]: + return query + + db_stats = scope_info["database_stats"] + + # Create context prefix + context_prefix = f""" +DATABASE CONTEXT: +- This query is limited to {db_stats['total_repositories']} repositories in the Kuzu database +- {db_stats['repositories_with_readmes']} repositories have README content available +- Available programming languages: {', '.join(db_stats['available_languages'][:10])} +- Available topics: {', '.join(db_stats['available_topics'][:10])} + +IMPORTANT: Only provide 
information about repositories, languages, and topics that exist in this specific database. Do not use external knowledge. + +USER QUERY: {query} +""" + + return context_prefix + + except Exception as e: + print(f"Error adding database context: {e}") + return query + + def query_deepgit_ai(self, query: str) -> Dict[str, Any]: + """Execute a query using the DeepGitAI system.""" + try: + if not self.deepgit_ai_instance: return { "success": False, - "error": "GraphRAG not initialized", - "message": "Please initialize GraphRAG first" + "error": "DeepGitAI not initialized", + "message": "Please initialize DeepGitAI first" } - # Execute the query - result = self.graphrag_instance.query(query) + # Execute the query with database context if enabled + config_manager = ConfigManager() + include_context = config_manager.get("deepgit_ai.include_database_context", True) + + if include_context: + contextualized_query = self.add_database_context_to_query(query) + result = self.deepgit_ai_instance.query(contextualized_query) + else: + result = self.deepgit_ai_instance.query(query) return { "success": True, @@ -1752,14 +2094,14 @@ def query_graphrag(self, query: str) -> Dict[str, Any]: def get_database_stats(self) -> Dict[str, Any]: """Get database statistics.""" try: - if not self.graphrag_instance: + if not self.deepgit_ai_instance: return { "success": False, - "error": "GraphRAG not initialized", - "message": "Please initialize GraphRAG first" + "error": "DeepGitAI not initialized", + "message": "Please initialize DeepGitAI first" } - stats = self.graphrag_instance.get_database_stats() + stats = self.deepgit_ai_instance.get_database_stats() return { "success": True, @@ -1794,13 +2136,13 @@ def cleanup(self, session_id: str = None): if self.db_path and os.path.exists(self.db_path): try: # Close any active connections first - if self.graphrag_instance: + if self.deepgit_ai_instance: # Close database connections - if hasattr(self.graphrag_instance, 'conn'): - self.graphrag_instance.conn.close() - if hasattr(self.graphrag_instance, 'db'): - self.graphrag_instance.db.close() - self.graphrag_instance = None + if hasattr(self.deepgit_ai_instance, 'conn'): + self.deepgit_ai_instance.conn.close() + if hasattr(self.deepgit_ai_instance, 'db'): + self.deepgit_ai_instance.db.close() + self.deepgit_ai_instance = None # Delete the database file (Kuzu databases are files, not directories) os.remove(self.db_path) @@ -1828,7 +2170,7 @@ def cleanup(self, session_id: str = None): # print(f"⚠️ {error_msg}") # cleanup_details["errors"].append(error_msg) - print("✅ GraphRAG cleanup completed") + print("✅ DeepGitAI cleanup completed") except Exception as e: error_msg = f"Unexpected error during cleanup: {e}" @@ -1838,34 +2180,34 @@ def cleanup(self, session_id: str = None): return cleanup_details def detect_graph_changes(self, current_gexf_content: str) -> Dict[str, Any]: - """Detect if the current graph has changed compared to the GraphRAG database.""" + """Detect if the current graph has changed compared to the DeepGitAI database.""" try: if not self.db_path or not os.path.exists(self.db_path): return { "has_changes": False, - "message": "No GraphRAG database exists to compare against" + "message": "No DeepGitAI database exists to compare against" } # Calculate hash of current GEXF content import hashlib current_hash = hashlib.md5(current_gexf_content.encode()).hexdigest() - # Extract hash from database path (format: graphrag_db_{hash}) + # Extract hash from database path (format: deepgit_ai_db_{hash}) db_name = 
os.path.basename(self.db_path) - if db_name.startswith("graphrag_db_"): - stored_hash = db_name[12:] # Remove "graphrag_db_" prefix + if db_name.startswith("deepgit_ai_db_"): + stored_hash = db_name[14:] # Remove "deepgit_ai_db_" prefix if current_hash != stored_hash: return { "has_changes": True, "current_hash": current_hash, "stored_hash": stored_hash, - "message": "Graph structure has changed. GraphRAG database may be outdated." + "message": "Graph structure has changed. DeepGitAI database may be outdated." } else: return { "has_changes": False, - "message": "Graph structure matches current GraphRAG database" + "message": "Graph structure matches current DeepGitAI database" } else: return { @@ -1881,7 +2223,7 @@ def detect_graph_changes(self, current_gexf_content: str) -> Dict[str, Any]: } def should_rebuild_database(self, current_gexf_content: str) -> Dict[str, Any]: - """Check if GraphRAG database should be rebuilt based on graph changes.""" + """Check if DeepGitAI database should be rebuilt based on graph changes.""" change_detection = self.detect_graph_changes(current_gexf_content) if change_detection.get("has_changes", False): @@ -1889,31 +2231,34 @@ def should_rebuild_database(self, current_gexf_content: str) -> Dict[str, Any]: "should_rebuild": True, "reason": "Graph structure has changed", "details": change_detection, - "message": "The graph structure has changed since the GraphRAG database was created. Would you like to rebuild the database to include the latest changes?" + "message": "The graph structure has changed since the DeepGitAI database was created. Would you like to rebuild the database to include the latest changes?" } else: return { "should_rebuild": False, - "message": "GraphRAG database is up to date" + "message": "DeepGitAI database is up to date" } -class MultiLLMGraphRAG: - """Enhanced GraphRAG system with support for multiple LLM providers.""" +class MultiLLMDeepGitAI: + """Enhanced DeepGitAI system with support for multiple LLM providers.""" def __init__(self, db_path: str, llm_provider: str = "openai"): - """Initialize the GraphRAG system.""" + """Initialize the DeepGitAI system.""" self.db_path = db_path - self.db = kuzu.Database(db_path, read_only=True) # Use read-only mode for GraphRAG + self.db = kuzu.Database(db_path, read_only=True) # Use read-only mode for DeepGitAI self.conn = kuzu.Connection(self.db) self.llm_provider = llm_provider + # Get reference to the service instance for semantic search + self.service = deepgit_ai_service + # Initialize LLM based on provider self.llm = self._initialize_llm() # Initialize the workflow self.workflow = self._create_workflow() - print(f"Multi-LLM GraphRAG system initialized with database: {db_path}") + print(f"Multi-LLM DeepGitAI system initialized with database: {db_path}") print(f"LLM Provider: {llm_provider}") if llm_provider == "azure_openai": deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME", "unknown") @@ -2001,7 +2346,7 @@ def _create_workflow(self) -> StateGraph: """Create the LangGraph workflow.""" # Define the workflow - workflow = StateGraph(GraphRAGState) + workflow = StateGraph(DeepGitAIState) # Add nodes workflow.add_node("analyze_query", self._analyze_query_node) @@ -2018,10 +2363,10 @@ def _create_workflow(self) -> StateGraph: return workflow.compile() - def _analyze_query_node(self, state: GraphRAGState) -> GraphRAGState: + def _analyze_query_node(self, state: DeepGitAIState) -> DeepGitAIState: """Analyze the user query to determine what graph queries to run.""" - system_prompt = """You 
are a query analyzer for a GraphRAG system. Your job is to analyze user queries and determine what graph queries should be executed to find relevant information. + system_prompt = """You are a query analyzer for a DeepGitAI system. Your job is to analyze user queries and determine what graph queries should be executed to find relevant information. Available graph queries: 1. find_similar_repos - Find repositories similar to a given one @@ -2031,6 +2376,7 @@ def _analyze_query_node(self, state: GraphRAGState) -> GraphRAGState: 5. find_popular_repos - Find most popular repositories 6. find_recent_repos - Find recently created repositories 7. find_by_activity - Find repositories by activity level (issues/PRs) +8. semantic_search - Find repositories using semantic search on README content Analyze the query and return a JSON with: - query_type: The type of graph query to run @@ -2071,7 +2417,8 @@ def _analyze_query_node(self, state: GraphRAGState) -> GraphRAGState: elif "popular" in state.query.lower(): state.graph_results["query_type"] = "find_popular_repos" else: - state.graph_results["query_type"] = "find_popular_repos" # default + # Default to semantic search for general queries + state.graph_results["query_type"] = "semantic_search" except Exception as e: state.error = f"Error analyzing query: {e}" @@ -2079,7 +2426,7 @@ def _analyze_query_node(self, state: GraphRAGState) -> GraphRAGState: return state - def _query_graph_node(self, state: GraphRAGState) -> GraphRAGState: + def _query_graph_node(self, state: DeepGitAIState) -> DeepGitAIState: """Execute graph queries based on the analysis.""" query_type = state.graph_results.get("query_type", "find_popular_repos") @@ -2096,6 +2443,8 @@ def _query_graph_node(self, state: GraphRAGState) -> GraphRAGState: results = self._find_by_topics(topic) elif query_type == "find_popular_repos": results = self._find_popular_repositories() + elif query_type == "semantic_search": + results = self._find_by_semantic_search(state.query) else: results = self._find_popular_repositories() @@ -2107,7 +2456,7 @@ def _query_graph_node(self, state: GraphRAGState) -> GraphRAGState: return state - def _retrieve_readmes_node(self, state: GraphRAGState) -> GraphRAGState: + def _retrieve_readmes_node(self, state: DeepGitAIState) -> DeepGitAIState: """Retrieve README content for the found repositories.""" repos = state.graph_results.get("results", []) @@ -2129,26 +2478,33 @@ def _retrieve_readmes_node(self, state: GraphRAGState) -> GraphRAGState: state.readme_content = readme_data return state - def _generate_answer_node(self, state: GraphRAGState) -> GraphRAGState: + def _generate_answer_node(self, state: DeepGitAIState) -> DeepGitAIState: """Generate the final answer using the graph results and README content.""" - system_prompt = """You are a helpful assistant that provides information about GitHub repositories based on graph analysis and README content. + system_prompt = """You are a helpful assistant that provides information about GitHub repositories based ONLY on the data provided in the context below. 
+
+CRITICAL CONSTRAINTS:
+- You MUST ONLY use information from the provided context
+- You MUST NOT use any external knowledge or general information
+- You MUST NOT make assumptions about repositories not in the context
+- You MUST NOT provide information about technologies, languages, or concepts not mentioned in the context
+- If information is not available in the context, you MUST say "This information is not available in the current dataset"
 
 You have access to:
-1. Graph analysis results showing repository relationships and metadata
-2. README content from the repositories
+1. Graph analysis results showing repository relationships and metadata from the Kuzu database
+2. README content from the repositories stored in the database
 
 Provide a comprehensive answer that:
-- Explains the repositories found
-- Highlights key features from READMEs
-- Mentions relationships between repositories
-- Suggests which repositories might be most relevant
+- Explains the repositories found using ONLY the provided data
+- Highlights key features from READMEs in the context
+- Mentions relationships between repositories from the graph data
+- Suggests which repositories might be most relevant based on the data
 
 IMPORTANT: When mentioning repositories, use the format [repository_name](repo_id) to make them clickable. For example:
 - "The [SWI-Prolog/swipl-devel](SWI-Prolog/swipl-devel) repository..."
 - "Check out [souffle-lang/souffle](souffle-lang/souffle) for..."
 
-Be informative but concise."""
+Be informative but concise. If you cannot answer based on the provided context, explicitly state that the information is not available in the current dataset."""
 
         # Prepare context
         graph_results = state.graph_results.get("results", [])
@@ -2226,8 +2582,32 @@ def _find_by_topics(self, topic: str, limit: int = 10) -> list:
 
         return result.to_dict('records')
 
+    def _find_by_semantic_search(self, query: str, limit: int = 10) -> list:
+        """Find repositories using semantic search on README content."""
+        try:
+            # Use the service's semantic search method
+            semantic_results = self.service.semantic_search_readmes(query, limit)
+
+            # Convert to the expected format
+            results = []
+            for result in semantic_results:
+                results.append({
+                    "id": result["repo_id"],
+                    "stars": result["stars"],
+                    "primaryLanguage": result["language"],
+                    "topics": result["topics"],
+                    "similarity": result["similarity"],
+                    "readme_content": result["readme_content"]
+                })
+
+            return results
+
+        except Exception as e:
+            print(f"Error in semantic search: {e}")
+            return []
+
     def _find_popular_repositories(self, limit: int = 10) -> list:
         """Find most popular repositories."""
 
         query = """
         MATCH (r:Repository)
@@ -2261,10 +2640,10 @@ def _get_readme_content(self, repo_id: str) -> Optional[str]:
         return None
 
     def query(self, user_query: str) -> str:
-        """Main query interface for the GraphRAG system."""
+        """Main query interface for the DeepGitAI system."""
 
         # Initialize state
-        state = GraphRAGState(
+        state = DeepGitAIState(
             query=user_query,
             messages=[HumanMessage(content=user_query)]
         )
@@ -2319,4 +2698,4 @@ def get_database_stats(self) -> Dict[str, Any]:
         return stats
 
 # Global instance
-graphrag_service = GraphRAGService()
+deepgit_ai_service = DeepGitAIService()
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 52dd534..cfc81a2 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -20,4 +20,6 @@ langchain-openai>=0.1.0
 langchain-community>=0.2.0
 langchain-google-genai>=0.1.0
 python-dotenv>=1.0.0
-matplotlib>=3.7.0
\ No
newline at end of file +matplotlib>=3.7.0 +sentence-transformers>=2.2.0 +torch>=1.9.0 \ No newline at end of file diff --git a/environment.yml b/environment.yml index b8b04a4..89063ca 100644 --- a/environment.yml +++ b/environment.yml @@ -29,4 +29,6 @@ dependencies: - langchain-community>=0.2.0 - langchain-google-genai>=0.1.0 - python-dotenv>=1.0.0 - - matplotlib>=3.7.0 \ No newline at end of file + - matplotlib>=3.7.0 + - sentence-transformers>=2.2.0 + - torch>=1.9.0 \ No newline at end of file diff --git a/src/lib/config.ts b/src/lib/config.ts index b04a9a8..04ad15f 100644 --- a/src/lib/config.ts +++ b/src/lib/config.ts @@ -16,15 +16,16 @@ export const API_ENDPOINTS = { GENERATE_GRAPH_WITH_EDGES: `${API_BASE_URL}/api/generate-graph-with-edges`, GET_UNIQUE_REPOS: `${API_BASE_URL}/api/get-unique-repos`, CREATE_EDGES_ON_GRAPH: `${API_BASE_URL}/api/create-edges-on-graph`, - // GraphRAG endpoints - GRAPHRAG_HEALTH: `${API_BASE_URL}/api/graphrag-health`, - GRAPHRAG_SETUP: `${API_BASE_URL}/api/graphrag-setup`, - GRAPHRAG_PROGRESS: `${API_BASE_URL}/api/graphrag-progress`, - GRAPHRAG_RESET_PROGRESS: `${API_BASE_URL}/api/graphrag-reset-progress`, - GRAPHRAG_CHANGE_PROVIDER: `${API_BASE_URL}/api/graphrag-change-provider`, - GRAPHRAG_UPDATE_README: `${API_BASE_URL}/api/graphrag-update-readme`, - GRAPHRAG_FIX_SCHEMA: `${API_BASE_URL}/api/graphrag-fix-schema`, - GRAPHRAG_QUERY: `${API_BASE_URL}/api/graphrag`, - GRAPHRAG_CLEANUP: `${API_BASE_URL}/api/graphrag-cleanup`, - GRAPHRAG_CHECK_CHANGES: `${API_BASE_URL}/api/graphrag-check-changes`, + // DeepGitAI endpoints + DEEPGIT_AI_HEALTH: `${API_BASE_URL}/api/deepgit-ai-health`, + DEEPGIT_AI_SCOPE: `${API_BASE_URL}/api/deepgit-ai-scope`, + DEEPGIT_AI_SETUP: `${API_BASE_URL}/api/deepgit-ai-setup`, + DEEPGIT_AI_PROGRESS: `${API_BASE_URL}/api/deepgit-ai-progress`, + DEEPGIT_AI_RESET_PROGRESS: `${API_BASE_URL}/api/deepgit-ai-reset-progress`, + DEEPGIT_AI_CHANGE_PROVIDER: `${API_BASE_URL}/api/deepgit-ai-change-provider`, + DEEPGIT_AI_UPDATE_README: `${API_BASE_URL}/api/deepgit-ai-update-readme`, + DEEPGIT_AI_FIX_SCHEMA: `${API_BASE_URL}/api/deepgit-ai-fix-schema`, + DEEPGIT_AI_QUERY: `${API_BASE_URL}/api/deepgit-ai`, + DEEPGIT_AI_CLEANUP: `${API_BASE_URL}/api/deepgit-ai-cleanup`, + DEEPGIT_AI_CHECK_CHANGES: `${API_BASE_URL}/api/deepgit-ai-check-changes`, } as const; \ No newline at end of file diff --git a/src/styles/_graph.scss b/src/styles/_graph.scss index e2c1b7c..3f51b15 100644 --- a/src/styles/_graph.scss +++ b/src/styles/_graph.scss @@ -102,15 +102,17 @@ background: white; } - // GraphRAG Chat Interface Styles - .graphrag-chat { + // DeepGitAI Chat Interface Styles + .deepgit-ai-chat { min-height: 100%; + height: 100%; // Ensure full height display: flex; flex-direction: column; - overflow-y: auto; // Make the entire GraphRAG panel scrollable + position: relative; // Create positioning context + + // Custom scrollbar for the entire chat container scrollbar-width: thin; scrollbar-color: #ccc transparent; - -webkit-overflow-scrolling: touch; // For better iOS scrolling &::-webkit-scrollbar { width: 6px; @@ -125,11 +127,17 @@ border-radius: 3px; } + // Setup panel (not sticky, scrolls with content) + .setup-panel { + background: #f8f9fa; + border-bottom: 1px solid #e9ecef; + flex-shrink: 0; // Prevent setup panel from shrinking + } + .chat-messages { - flex: 1; - overflow-y: visible; // Remove individual scrolling from chat messages - overflow-x: hidden; - min-height: 200px; + flex: 1 1 auto; + min-height: 0; // critical for Chrome flexbox + overflow: 
visible; // let the panel-content be the scroll container max-height: none; // Remove max-height constraint } @@ -159,28 +167,6 @@ } } - .chat-input { - border-top: 1px solid #e9ecef; - background: white; - - .input-group { - .form-control { - border-radius: 20px 0 0 20px; - border: 1px solid #ced4da; - - &:focus { - border-color: #007bff; - box-shadow: 0 0 0 0.2rem rgba(0, 123, 255, 0.25); - } - } - - .btn { - border-radius: 0 20px 20px 0; - border: 1px solid #007bff; - } - } - } - .setup-panel { background: #f8f9fa; border-bottom: 1px solid #e9ecef; diff --git a/src/styles/_layout.scss b/src/styles/_layout.scss index fc2f1fa..855265c 100644 --- a/src/styles/_layout.scss +++ b/src/styles/_layout.scss @@ -70,15 +70,89 @@ body { flex-direction: column; @extend .custom-scrollbar; - overflow-y: auto; + overflow-y: auto; // single scroll ancestor for sticky &>*>*:not(hr) { padding: 1rem; } - // Special handling for GraphRAG panel - .graphrag-chat { + // Special handling for DeepGitAI panel + .deepgit-ai-chat { padding: 0; + height: 100%; + display: flex; + flex-direction: column; + } + + // When DeepGitAI tab is active, keep header/input sticky relative to panel scroll + &.deepgit-ai-mode { + position: relative; + + .deepgit-ai-header { + position: sticky; // sticks to panel-content + top: 0; + z-index: 1000; + background: white; + border-bottom: 1px solid #e9ecef; + box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); + backdrop-filter: blur(10px); + } + + .chat-input { + position: sticky; // sticks to panel-content + bottom: 0; + z-index: 999; + background: white; + border-top: 1px solid #e9ecef; + box-shadow: 0 -2px 4px rgba(0, 0, 0, 0.1); + backdrop-filter: blur(10px); + } + + .chat-messages { + flex: 1 1 auto; + min-height: 0; // allows shrinking so header/input remain visible + overflow: visible; // do not create nested scroll + } + } + + // Make DeepGitAI header sticky relative to panel content + .deepgit-ai-header { + position: sticky; + top: 0; + z-index: 1000; + background: white; + border-bottom: 1px solid #e9ecef; + box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); + backdrop-filter: blur(10px); + } + + // Make DeepGitAI chat input sticky relative to panel content + .chat-input { + position: sticky; + bottom: 0; + z-index: 999; + background: white; + border-top: 1px solid #e9ecef; + box-shadow: 0 -2px 4px rgba(0, 0, 0, 0.1); + backdrop-filter: blur(10px); + flex-shrink: 0; // Prevent input from shrinking + + .input-group { + .form-control { + border-radius: 20px 0 0 20px; + border: 1px solid #ced4da; + + &:focus { + border-color: #007bff; + box-shadow: 0 0 0 0.2rem rgba(0, 123, 255, 0.25); + } + } + + .btn { + border-radius: 0 20px 20px 0; + border: 1px solid #007bff; + } + } } } } diff --git a/src/views/ContextPanel.tsx b/src/views/ContextPanel.tsx index 7ed3d99..4b4b3a1 100644 --- a/src/views/ContextPanel.tsx +++ b/src/views/ContextPanel.tsx @@ -14,7 +14,7 @@ import GraphSumUp from "./GraphSumUp"; import NodesAppearanceBlock from "./NodesAppearanceBlock"; import Settings from "./Settings"; import SelectedNodePanel from "./SelectedNodePanel"; -import GraphRAGPanel from "./GraphRAGPanel"; +import DeepGitAIPanel from "./GraphRAGPanel"; const ContextPanel: FC = () => { const { navState, data, panel, setPanel } = useContext(GraphContext); @@ -35,8 +35,8 @@ const ContextPanel: FC = () => { let content: JSX.Element; if (panel === "settings") { content = ; - } else if (panel === "graphrag") { - content = ; + } else if (panel === "deepgit-ai") { + content = ; } else if (selectedNode) { content = ; } else { 
@@ -71,11 +71,11 @@ const ContextPanel: FC = () => { Settings {/*