Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .github/labels.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -83,4 +83,8 @@

# Label that forces the release-please workflow to run on a PR.
- name: 'release-please:force-run'
  color: bdca82
  description: Manually trigger the release please workflow on a PR.

# Label checked by cloudbuild.yaml: the evaluation step only proceeds when the
# PR carries this label or 'autorelease: pending'.
- name: 'ci:run-evals'
  color: 4285f4
  description: Manually trigger the evaluation CI pipeline on a PR.
106 changes: 106 additions & 0 deletions cloudbuild.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Cloud Build pipeline that runs the evalbench evaluation against a pull
# request. The single step below exits early (successfully) unless the PR
# carries the 'autorelease: pending' or 'ci:run-evals' label.
options:
  logging: CLOUD_LOGGING_ONLY

steps:

  # --- Evaluation Step ---
  - name: 'us-central1-docker.pkg.dev/cloud-db-nl2sql/evalbench/eval_server:latest'
    entrypoint: 'bash'
    # Exposes the Secret Manager entries declared under availableSecrets as the
    # MSSQL_DB_PASSWORD and GITHUB_TOKEN environment variables of this step.
    secretEnv: ['MSSQL_DB_PASSWORD', 'GITHUB_TOKEN']
    args:
      - '-c'
      - |
        set -e

        echo "Fetching PR data from GitHub API..."

        # Fetch PR data and status code (-S keeps curl transport errors visible despite -s)
        HTTP_STATUS=$(curl -sS -o pr_data.json -w "%{http_code}" -H "Authorization: token $$GITHUB_TOKEN" \
          "https://api.github.com/repos/$REPO_FULL_NAME/pulls/$_PR_NUMBER")

        if [ "$$HTTP_STATUS" -ne 200 ]; then
          echo "Error fetching PR data: HTTP $$HTTP_STATUS"
          cat pr_data.json
          exit 1
        fi

        # Extract labels and title from PR data (Use $$ to escape bash variables)
        PR_LABELS=$(jq -r '[.labels[].name] | join(",")' pr_data.json)
        PR_TITLE=$(jq -r '.title' pr_data.json)

        # Check if execution labels are present
        if [[ "$$PR_LABELS" != *"autorelease: pending"* && "$$PR_LABELS" != *"ci:run-evals"* ]]; then
          echo "PR does not have 'autorelease: pending' or 'ci:run-evals' label. Skipping execution."
          exit 0
        fi
        echo "Execution label detected. Processing release version context..."

        # Determine Release Version based on branch name
        if [[ "$_HEAD_BRANCH" == release-please-* ]]; then
          # Release PRs: take the X.Y.Z version that follows the word "release" in the PR title
          if [[ "$$PR_TITLE" =~ release\ ([0-9]+\.[0-9]+\.[0-9]+) ]]; then
            export RELEASE_VERSION="$${BASH_REMATCH[1]}"
          else
            export RELEASE_VERSION="pr-$_PR_NUMBER-release-unknown"
          fi
        else
          export RELEASE_VERSION="pr-$_PR_NUMBER-ci-run-evals"
        fi

        # Workaround for evalbench bug: settings are only applied if path basename matches extension ID
        # (-f and -n replace an already existing link instead of aborting the script)
        ln -sfn /workspace /workspace/cloud-sql-sqlserver

        # evalbench specific environment variables
        export EVAL_GCP_PROJECT_ID="$PROJECT_ID"
        export EVAL_GCP_PROJECT_REGION="$_CLOUD_SQL_REGION"
        export EVAL_REPORTING_PROJECT="$_EVAL_REPORTING_PROJECT"
        export GOOGLE_CLOUD_PROJECT="$PROJECT_ID"

        # Cloud SQL SQL Server specific environment variables
        export CLOUD_SQL_MSSQL_PROJECT="$PROJECT_ID"
        export CLOUD_SQL_MSSQL_INSTANCE="$_CLOUD_SQL_INSTANCE"
        export CLOUD_SQL_MSSQL_REGION="$_CLOUD_SQL_REGION"
        export CLOUD_SQL_MSSQL_DATABASE="$_CLOUD_SQL_DATABASE"
        export CLOUD_SQL_MSSQL_USER="$_CLOUD_SQL_USER"
        export CLOUD_SQL_MSSQL_IP_TYPE="$_CLOUD_SQL_IP_TYPE"

        # Maps the decrypted MSSQL_DB_PASSWORD to the exact variable expected by gemini_cli and extension skills
        export CLOUD_SQL_MSSQL_PASSWORD="$$MSSQL_DB_PASSWORD"

        # Combine CI metadata with run config. The blank line written first guarantees that the
        # appended content starts on its own line even if run_config.yaml lacks a trailing newline.
        printf '\n' >> /workspace/evals/run_config.yaml
        cat /workspace/evals/ci_metadata.yaml >> /workspace/evals/run_config.yaml

        # Substitute environment variables in model_config.yaml, run_config.yaml and dataset.json
        python3 /workspace/evals/substitute_env.py

        # The PYTHONPATH entries and the evalbench.py path below are relative to /evalbench
        cd /evalbench
        export PYTHONPATH=./evalbench:./evalbench/evalproto
        export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python

        echo "Launching Standalone Evaluation..."
        python3 evalbench/evalbench.py --experiment_config=/workspace/evals/run_config.yaml


# Secret Manager versions made available to steps that list them in secretEnv.
availableSecrets:
  secretManager:
    - versionName: projects/$PROJECT_ID/secrets/MSSQL_DB_PASSWORD/versions/latest
      env: 'MSSQL_DB_PASSWORD'
    - versionName: projects/$PROJECT_ID/secrets/GITHUB_TOKEN/versions/latest
      env: 'GITHUB_TOKEN'
22 changes: 22 additions & 0 deletions evals/ci_metadata.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

############################################################
### CI Metadata (Repository Specific)
### Note: These fields are used for version tracking in BigQuery (BQ)
### and are not part of the core Evalbench schema.
############################################################

extension_id: cloud-sql-sqlserver
# Placeholder resolved from the RELEASE_VERSION environment variable by
# evals/substitute_env.py once cloudbuild.yaml has appended this file to
# run_config.yaml. Quoted like every other placeholder in the eval configs so
# the substituted value is always read as a string.
release_version: "${RELEASE_VERSION}"
45 changes: 45 additions & 0 deletions evals/dataset.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
{
"scenarios": [
{
"id": "cloud-sql-debug-instance",
"starting_prompt": "Check on my databases in project ${GOOGLE_CLOUD_PROJECT}.",
"conversation_plan": "Ask the agent to list all Cloud SQL instances in the project. Once all instances are listed, if '${CLOUD_SQL_MSSQL_INSTANCE}' exists, get its details and validate it is RUNNABLE.",
"expected_trajectory": [
"list_instances",
"get_instance"
],
"env": {
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
},
"kind": "tools",
"max_turns": 3
},
{
"id": "cloud-sql-schema-tables-explore",
"starting_prompt": "I want to understand the structure of my database.",
"conversation_plan": "First, ask the agent to list the databases in the instance. After the agent provides the databases, ask it to list the tables specifically for that database.",
"expected_trajectory": [
"list_databases",
"list_tables"
],
"env": {
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
},
"kind": "tools",
"max_turns": 3
},
{
"id": "cloud-sql-performance-check",
"starting_prompt": "Our database performance seems degraded.",
"conversation_plan": "Start by asking the agent to check the CPU utilization system metrics for the database instance to see if it's overloaded.",
"expected_trajectory": [
"get_system_metrics"
],
"env": {
"GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}"
},
"kind": "tools",
"max_turns": 3
}
]
}
18 changes: 18 additions & 0 deletions evals/gemini_2.5_pro_model.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Model settings that evals/run_config.yaml references for the simulated user
# and for each judge-based scorer.
generator: 'gcp_vertex_gemini'
vertex_model: 'gemini-2.5-pro'
base_prompt: ''
execs_per_minute: 5
33 changes: 33 additions & 0 deletions evals/model_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Settings for the gemini_cli generator; evals/run_config.yaml references this
# file as its model_config. The ${...} placeholders are replaced with
# environment variable values by evals/substitute_env.py before the run.
gemini_cli_version: '@google/gemini-cli@latest'
generator: 'gemini_cli'

# Environment entries for the generator; the flag values are strings, not booleans.
env:
  GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}"
  GOOGLE_CLOUD_LOCATION: 'global'
  GOOGLE_GENAI_USE_VERTEXAI: 'true'
  GEMINI_CLI_TRUST_WORKSPACE: 'true'

setup:
  extensions:
    # Points to the symlink created in cloudbuild.yaml to match the extension ID
    '/workspace/cloud-sql-sqlserver':
      settings:
        CLOUD_SQL_MSSQL_PROJECT: "${CLOUD_SQL_MSSQL_PROJECT}"
        CLOUD_SQL_MSSQL_INSTANCE: "${CLOUD_SQL_MSSQL_INSTANCE}"
        CLOUD_SQL_MSSQL_REGION: "${CLOUD_SQL_MSSQL_REGION}"
        CLOUD_SQL_MSSQL_DATABASE: "${CLOUD_SQL_MSSQL_DATABASE}"
        CLOUD_SQL_MSSQL_USER: "${CLOUD_SQL_MSSQL_USER}"
        # Kept single-quoted as in the original; the quoting style decides how
        # the substituted text is parsed.
        CLOUD_SQL_MSSQL_PASSWORD: '${CLOUD_SQL_MSSQL_PASSWORD}'
        CLOUD_SQL_MSSQL_IP_TYPE: "${CLOUD_SQL_MSSQL_IP_TYPE}"
41 changes: 41 additions & 0 deletions evals/run_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Experiment configuration passed to evalbench.py via --experiment_config.
# cloudbuild.yaml appends evals/ci_metadata.yaml to the end of this file before
# the run, so keep the file terminated by a newline.

# Scenario dataset and its format
dataset_config: '/workspace/evals/dataset.json'
dataset_format: 'gemini-cli-format'

# Orchestrator, the model under test, and the simulated user model
orchestrator: 'geminicli'
model_config: '/workspace/evals/model_config.yaml'
simulated_user_model_config: '/workspace/evals/gemini_2.5_pro_model.yaml'

scorers:
  # Qualitative (Judge-based)
  goal_completion:
    model_config: '/workspace/evals/gemini_2.5_pro_model.yaml'
  behavioral_metrics:
    model_config: '/workspace/evals/gemini_2.5_pro_model.yaml'
  skills_best_practices:
    model_config: '/workspace/evals/gemini_2.5_pro_model.yaml'
    skills_dir: '/workspace/cloud-sql-sqlserver/skills'

  # Performance
  turn_count: {}
  end_to_end_latency: {}
  tool_call_latency: {}
  token_consumption: {}
  skills_trajectory: {}

# Placeholder below is resolved by evals/substitute_env.py
reporting:
  bigquery:
    gcp_project_id: "${EVAL_REPORTING_PROJECT}"
18 changes: 18 additions & 0 deletions evals/substitute_env.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import os
import re

# Config files rewritten in place when main() is called without arguments.
_DEFAULT_CONFIG_PATHS = (
    '/workspace/evals/model_config.yaml',
    '/workspace/evals/run_config.yaml',
    '/workspace/evals/dataset.json',
)

# Matches ${NAME} placeholders; group 1 is the environment variable name.
_PLACEHOLDER_RE = re.compile(r'\$\{(\w+)\}')


def _substitute_placeholders(content, escape_json=False):
    """Replace ${NAME} placeholders in ``content`` with environment values.

    Args:
        content: Text to process.
        escape_json: When True, each substituted value is escaped so that it
            stays valid inside a JSON string literal.

    Returns:
        A ``(new_content, unresolved)`` tuple. ``unresolved`` lists, in order of
        first appearance, the placeholder names that have no environment
        variable; those placeholders are left untouched in ``new_content``.
    """
    import json  # Local import: the module header only imports os and re.

    unresolved = []

    def _replace(match):
        name = match.group(1)
        value = os.environ.get(name)
        if value is None:
            if name not in unresolved:
                unresolved.append(name)
            return match.group(0)
        if escape_json:
            # json.dumps yields a quoted literal; strip the surrounding quotes
            # because the placeholder already sits inside a JSON string.
            return json.dumps(value)[1:-1]
        return value

    return _PLACEHOLDER_RE.sub(_replace, content), unresolved


def main(yaml_paths=None):
    """Substitute environment variables into the evaluation config files.

    Each existing file is read, every ${NAME} placeholder whose NAME is set in
    the environment is replaced by its value, and the file is written back in
    place. Missing files are reported and skipped. Placeholders without a
    matching environment variable are kept as-is and reported as a warning.

    NOTE(review): values written into YAML files are inserted verbatim; a value
    containing the quote character that surrounds its placeholder would still
    produce invalid YAML.

    Args:
        yaml_paths: Optional iterable of file paths to process. Defaults to the
            model config, run config and dataset under /workspace/evals.
    """
    if yaml_paths is None:
        yaml_paths = _DEFAULT_CONFIG_PATHS

    for yaml_path in yaml_paths:
        if not os.path.exists(yaml_path):
            print(f"File not found: {yaml_path}")
            continue

        # Explicit UTF-8 so the result does not depend on the container locale.
        with open(yaml_path, 'r', encoding='utf-8') as f:
            content = f.read()

        is_json = os.fspath(yaml_path).lower().endswith('.json')
        content, unresolved = _substitute_placeholders(content, escape_json=is_json)

        with open(yaml_path, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"Successfully substituted environment variables in {yaml_path}")

        if unresolved:
            print(
                f"Warning: no environment value for {', '.join(unresolved)} "
                f"in {yaml_path}; placeholders left unchanged"
            )


if __name__ == '__main__':
    main()
Loading