diff --git a/.github/labels.yaml b/.github/labels.yaml index 5974f23..abf2d71 100644 --- a/.github/labels.yaml +++ b/.github/labels.yaml @@ -83,4 +83,8 @@ - name: 'release-please:force-run' color: bdca82 - description: Manually trigger the release please workflow on a PR. \ No newline at end of file + description: Manually trigger the release please workflow on a PR. + +- name: 'ci:run-evals' + color: 4285f4 + description: Manually trigger the evaluation CI pipeline on a PR. \ No newline at end of file diff --git a/cloudbuild.yaml b/cloudbuild.yaml index dd446dd..921717b 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -27,12 +27,7 @@ steps: - | set -e - # Only run on release branches - if [[ "$_HEAD_BRANCH" != release-please-* ]]; then - echo "Not a release-please branch. Exiting." - exit 0 - fi - echo "Release branch detected. Fetching PR data from GitHub API..." + echo "Fetching PR data from GitHub API..." # Fetch PR data and status code HTTP_STATUS=$(curl -s -o pr_data.json -w "%{http_code}" -H "Authorization: token $$GITHUB_TOKEN" \ @@ -46,28 +41,39 @@ steps: PR_DATA=$(cat pr_data.json) - # Extract labels and title from PR data (Use $$ to escape bash variables) - PR_LABELS=$(echo "$$PR_DATA" | jq -r '[.labels[].name] | join(",")') + # Extract title from PR data (Use $$ to escape bash variables) PR_TITLE=$(echo "$$PR_DATA" | jq -r '.title') - # Determine Release Version (Use double quotes and $$ for bash variables) - if [[ "$$PR_LABELS" == *"autorelease: pending"* ]]; then + # Check if execution labels are present using exact matching via jq + if ! jq -e '.labels | any(.name == "autorelease: pending" or .name == "ci:run-evals")' pr_data.json > /dev/null; then + echo "PR does not have 'autorelease: pending' or 'ci:run-evals' label. Skipping execution." + exit 0 + fi + echo "Execution label detected. Processing release version context..." + + # Determine Release Version based on branch name + if [[ "$_HEAD_BRANCH" == release-please-* ]]; then if [[ "$$PR_TITLE" =~ release\ ([0-9]+\.[0-9]+\.[0-9]+) ]]; then export RELEASE_VERSION="$${BASH_REMATCH[1]}" else - export RELEASE_VERSION="unknown" + export RELEASE_VERSION="pr-$_PR_NUMBER-release-unknown" fi else - export RELEASE_VERSION="unknown" + export RELEASE_VERSION="pr-$_PR_NUMBER-ci-run-evals" fi # Workaround for evalbench bug: settings are only applied if path basename matches extension ID ln -s /workspace /workspace/cloud-sql-postgresql cd /evalbench + # Set environment variables for evalbench export EVAL_GCP_PROJECT_ID=$PROJECT_ID export EVAL_GCP_PROJECT_REGION=$_CLOUD_SQL_REGION export GOOGLE_CLOUD_PROJECT=$PROJECT_ID + export EVAL_REPORTING_PROJECT=$_EVAL_REPORTING_PROJECT + + + # Set environment variables for extension export CLOUD_SQL_POSTGRES_PROJECT=$PROJECT_ID export CLOUD_SQL_POSTGRES_INSTANCE=$_CLOUD_SQL_INSTANCE export CLOUD_SQL_POSTGRES_REGION=$_CLOUD_SQL_REGION diff --git a/evals/dataset.json b/evals/dataset.json index a42bbae..654015f 100644 --- a/evals/dataset.json +++ b/evals/dataset.json @@ -2,14 +2,14 @@ "scenarios": [ { "id": "cloud-sql-debug-instance", - "starting_prompt": "Check on my databases in project ext-test-cloud-sql-postgres.", - "conversation_plan": "Ask the agent to list all Cloud SQL instances in the project. Once all instances are listed, if 'daily-ci-evals-db' exists, get its details and validate it is RUNNABLE.", + "starting_prompt": "Check on my databases in project ${GOOGLE_CLOUD_PROJECT}.", + "conversation_plan": "Ask the agent to list all Cloud SQL instances in the project. Once all instances are listed, if '${CLOUD_SQL_POSTGRES_INSTANCE}' exists, get its details and validate it is RUNNABLE.", "expected_trajectory": [ "list_instances", "get_instance" ], "env": { - "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres" + "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}" }, "kind": "tools", "max_turns": 3 @@ -23,7 +23,7 @@ "list_tables" ], "env": { - "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres" + "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}" }, "kind": "tools", "max_turns": 3 @@ -37,21 +37,21 @@ "list_locks" ], "env": { - "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres" + "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}" }, "kind": "tools", "max_turns": 3 }, { "id": "cloud-sql-metrics-cpu-investigation", - "starting_prompt": "I'm worried about the database load for daily-ci-evals-db.", - "conversation_plan": "First, ask the agent to check the CPU utilization for the instance 'daily-ci-evals-db' for the last 5 minutes. After the agent provides the CPU data, ask it to check the overall database stats to see connection counts or transaction volume.", + "starting_prompt": "I'm worried about the database load for ${CLOUD_SQL_POSTGRES_INSTANCE}.", + "conversation_plan": "First, ask the agent to check the CPU utilization for the instance '${CLOUD_SQL_POSTGRES_INSTANCE}' for the last 5 minutes. After the agent provides the CPU data, ask it to check the overall database stats to see connection counts or transaction volume.", "expected_trajectory": [ "get_system_metrics", "list_database_stats" ], "env": { - "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres" + "GOOGLE_CLOUD_PROJECT": "${GOOGLE_CLOUD_PROJECT}" }, "kind": "tools", "max_turns": 3 diff --git a/evals/model_config.yaml b/evals/model_config.yaml index 485c758..2973cb4 100644 --- a/evals/model_config.yaml +++ b/evals/model_config.yaml @@ -12,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -gemini_cli_version: "@google/gemini-cli@0.38.1" +gemini_cli_version: "@google/gemini-cli@latest" generator: gemini_cli env: GOOGLE_CLOUD_PROJECT: "${GOOGLE_CLOUD_PROJECT}" GOOGLE_CLOUD_LOCATION: "global" GOOGLE_GENAI_USE_VERTEXAI: "true" + GEMINI_CLI_TRUST_WORKSPACE: "true" setup: extensions: # Points to the symlink created in cloudbuild.yaml to match the extension ID diff --git a/evals/run_config.yaml b/evals/run_config.yaml index 0f45e6e..600bddd 100644 --- a/evals/run_config.yaml +++ b/evals/run_config.yaml @@ -25,13 +25,18 @@ scorers: model_config: /workspace/evals/gemini_2.5_pro_model.yaml behavioral_metrics: model_config: /workspace/evals/gemini_2.5_pro_model.yaml + skills_best_practices: + model_config: /workspace/evals/gemini_2.5_pro_model.yaml + skills_dir: /workspace/cloud-sql-postgresql/skills # Performance turn_count: {} end_to_end_latency: {} tool_call_latency: {} token_consumption: {} + skills_trajectory: {} reporting: bigquery: - gcp_project_id: cloud-db-nl2sql + gcp_project_id: "${EVAL_REPORTING_PROJECT}" + diff --git a/evals/substitute_env.py b/evals/substitute_env.py index f10c8e3..cbe1a3a 100644 --- a/evals/substitute_env.py +++ b/evals/substitute_env.py @@ -2,7 +2,7 @@ import re def main(): - yaml_paths = ['/workspace/evals/model_config.yaml', '/workspace/evals/run_config.yaml'] + yaml_paths = ['/workspace/evals/model_config.yaml', '/workspace/evals/run_config.yaml', '/workspace/evals/dataset.json'] for yaml_path in yaml_paths: if os.path.exists(yaml_path): with open(yaml_path, 'r') as f: