<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/VERTEXAI_DEMO_DEC2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
CONFIG = {
    # ‚ö†Ô∏è HARDCODED CRITICAL PROJECT VALUES (Verified from previous logs) ‚ö†Ô∏è
    "PROJECT_ID": "gen-lang-client-0870511801",
    #"PROJECT_ID": "gen-lang-client-XXXXXXXXXX",
    "PROJECT_NUMBER": "677155171887",
    #"PROJECT_NUMBER": "XXXXXXXXXXXX",
    "REGION": "us-central1",
    "BUCKET_NAME": "poc-my-new-staging-bucket-2025-1",
    #"BUCKET_NAME": "poc-my-new-staging-bucket-XXXXX",

    # Model and Tuning Parameters
    #"BASE_MODEL": "gemini-2.0-flash-001",
    "BASE_MODEL": "gemini-2.5-pro",
    "FINAL_MODEL_DISPLAY_NAME": "cmapss-rul-gemini-final-launch-v2", # For reference only
    "EPOCHS": 10,
    "LEARNING_RATE_MULTIPLIER": 1.0,

    # Dataset Files
    "TRAIN_FILE_NAME": "cmapss_FD004_train_text.jsonl",
    "VALIDATION_FILE_NAME": "cmapss_FD004_test_text.jsonl",
}

PROJECT_ID = CONFIG['PROJECT_ID']
REGION = CONFIG['REGION']
BUCKET_NAME = CONFIG['BUCKET_NAME']
STAGING_BUCKET = f"gs://{CONFIG['BUCKET_NAME']}/staging"
TRAIN_DATASET_URI = f"gs://{CONFIG['BUCKET_NAME']}/{CONFIG['TRAIN_FILE_NAME']}"
VALIDATION_DATASET_URI = f"gs://{CONFIG['BUCKET_NAME']}/{CONFIG['VALIDATION_FILE_NAME']}"
MODEL_DISPLAY_NAME = CONFIG['FINAL_MODEL_DISPLAY_NAME']
BASE_MODEL = CONFIG['BASE_MODEL']

## TRAINING

In [8]:
import os
import time
from google.colab import auth
from google.cloud import aiplatform
from vertexai.preview.tuning import sft
import vertexai
from google.auth import default
from google.auth.transport.requests import Request as AuthRequest
import sys

# Assume CONFIG, STAGING_BUCKET, TRAIN_DATASET_URI, VALIDATION_DATASET_URI are defined here or globally

# --- CRITICAL STEP: MANUAL AUTHENTICATION (Must run successfully) ---
print("--- Interactive Authentication ---")
auth.authenticate_user()


# --- 1. INITIALIZATION (TOKEN REFRESH) ---
print("--- 1. Initializing SDK ---")
try:
    # Refresh credentials using the token established by auth.authenticate_user()
    credentials, project = default()
    credentials.refresh(AuthRequest())

    # Initialize both the aiplatform and vertexai libraries
    # Assuming CONFIG and STAGING_BUCKET are defined globally
    aiplatform.init(project=CONFIG['PROJECT_ID'], location=CONFIG['REGION'], staging_bucket=STAGING_BUCKET)
    vertexai.init(project=CONFIG['PROJECT_ID'], location=CONFIG['REGION'], staging_bucket=STAGING_BUCKET)
    print(f"‚úÖ Vertex AI SDK initialized for Project: {CONFIG['PROJECT_ID']}")
except Exception as e:
    print(f"‚ùå Initialization failed: {e}. Please ensure your authentication completed successfully.")
    sys.exit(1)


# --- 2. START THE FINE-TUNING JOB (FINAL SYNTAX) ---
print("\n--- 2. Starting Fine-Tuning Job ---")
print(f"   BASE_MODEL: {CONFIG['BASE_MODEL']}")
try:
    sft_tuning_job = sft.train(
        source_model=CONFIG['BASE_MODEL'],
        train_dataset=TRAIN_DATASET_URI,
        validation_dataset=VALIDATION_DATASET_URI,

        # The display_name argument is safely omitted to ensure compatibility

        epochs=CONFIG['EPOCHS'],
        learning_rate_multiplier=CONFIG['LEARNING_RATE_MULTIPLIER'],
    )

    # --- JOB ID EXTRACTION ---
    # The job ID is the last segment of the resource_name string
    job_id = sft_tuning_job.resource_name.split('/')[-1]

    # Building a stable URL using the extracted job_id
    job_monitor_url = (f"https://console.cloud.google.com/vertex-ai/locations/{CONFIG['REGION']}/"
                       f"training/{job_id}?project={CONFIG['PROJECT_ID']}")

    print(f"\n‚úÖ Tuning Job Submitted!")
    print(f"   Job ID (for SDK/CLI): {job_id}")
    print(f"   Monitor Job Here: {job_monitor_url}")

except Exception as e:
    print(f"\n‚ùå Job Submission Failed: {e}")

--- Interactive Authentication ---
--- 1. Initializing SDK ---


INFO:vertexai.tuning._tuning:Creating SupervisedTuningJob


‚úÖ Vertex AI SDK initialized for Project: gen-lang-client-0870511801

--- 2. Starting Fine-Tuning Job ---
   BASE_MODEL: gemini-2.5-pro


INFO:vertexai.tuning._tuning:SupervisedTuningJob created. Resource name: projects/677155171887/locations/us-central1/tuningJobs/5516984636737060864
INFO:vertexai.tuning._tuning:To use this SupervisedTuningJob in another session:
INFO:vertexai.tuning._tuning:tuning_job = sft.SupervisedTuningJob('projects/677155171887/locations/us-central1/tuningJobs/5516984636737060864')
INFO:vertexai.tuning._tuning:View Tuning Job:
https://console.cloud.google.com/vertex-ai/generative/language/locations/us-central1/tuning/tuningJob/5516984636737060864?project=677155171887



‚úÖ Tuning Job Submitted!
   Job ID (for SDK/CLI): 5516984636737060864
   Monitor Job Here: https://console.cloud.google.com/vertex-ai/locations/us-central1/training/5516984636737060864?project=gen-lang-client-0870511801


In [9]:
import os
import sys
from google.colab import auth
from google.cloud import aiplatform
from vertexai.preview.tuning import sft
import vertexai
from google.auth import default
from google.auth.transport.requests import Request as AuthRequest

# --- CONFIGURATION (HARDCODED IDS) ---
CONFIG = {
    "PROJECT_ID": "gen-lang-client-0870511801",
    "REGION": "us-central1",
    # Target the new, correct job ID
    "TARGET_JOB_ID": job_id,
}

# --- AUTHENTICATION AND INITIALIZATION ---
print("--- 1. Initialization ---")
try:
    auth.authenticate_user()

    credentials, project = default()
    credentials.refresh(AuthRequest())

    aiplatform.init(project=CONFIG['PROJECT_ID'], location=CONFIG['REGION'])
    vertexai.init(project=CONFIG['PROJECT_ID'], location=CONFIG['REGION'])
    print("‚úÖ SDK Initialized.")
except Exception as e:
    print(f"‚ùå Initialization failed: {e}")
    sys.exit(1)


# --- 2. MONITOR JOB STATUS VIA PYTHON SDK (FIXED) ---
print(f"\n--- 2. Monitoring Job: {CONFIG['TARGET_JOB_ID']} ---")
try:
    # List all tuning jobs in the project
    jobs = sft.SupervisedTuningJob.list()

    found_job = None
    for job in jobs:
        # Check if the resource name ends with the target ID
        if job.resource_name.endswith(CONFIG['TARGET_JOB_ID']):
            found_job = job
            break

    if found_job:
        job_state = found_job.state.name

        # FIX: Removed the conflicting 'found_job.display_name' print statement
        print(f"‚úÖ Job ID: {CONFIG['TARGET_JOB_ID']} found.")
        print(f"Current Job State: {job_state}")

        if job_state == 'JOB_STATE_SUCCEEDED':
            print("\nüéâ JOB SUCCEEDED! You can now run the evaluation cell (Cell 2).")
        elif job_state == 'JOB_STATE_FAILED':
             print("\n‚ùå JOB FAILED! Please check the logs in the Google Cloud Console.")
        else:
             print("\n‚è≥ Job is still PENDING or RUNNING. Please wait and re-run this cell.")

    else:
        print(f"‚ùå Job with ID {CONFIG['TARGET_JOB_ID']} not found in the project list.")

except Exception as e:
    print(f"‚ùå Monitoring failed: {e}")

--- 1. Initialization ---
‚úÖ SDK Initialized.

--- 2. Monitoring Job: 5516984636737060864 ---
‚úÖ Job ID: 5516984636737060864 found.
Current Job State: JOB_STATE_RUNNING

‚è≥ Job is still PENDING or RUNNING. Please wait and re-run this cell.


In [10]:
import os
import sys
import time
from datetime import datetime
from google.colab import auth
from google.cloud import aiplatform
from vertexai.preview.tuning import sft
import vertexai
from google.auth import default
from google.auth.transport.requests import Request as AuthRequest

from warnings import filterwarnings
filterwarnings('ignore')

# --- CONFIGURATION (HARDCODED IDS) ---
CONFIG = {
    "PROJECT_ID": "gen-lang-client-0870511801",
    #"PROJECT_ID": "gen-lang-client-XXXXXXXXXX",
    "REGION": "us-central1",
    # Target the correct job ID
    "TARGET_JOB_ID": job_id,
    "POLLING_INTERVAL_SECONDS": 300, # Check every 5 minutes
}

# --- AUTHENTICATION AND INITIALIZATION ---
print("--- 1. Initialization ---")
try:
    auth.authenticate_user()

    credentials, project = default()
    credentials.refresh(AuthRequest())

    aiplatform.init(project=CONFIG['PROJECT_ID'], location=CONFIG['REGION'])
    vertexai.init(project=CONFIG['PROJECT_ID'], location=CONFIG['REGION'])
    print("‚úÖ SDK Initialized.")
except Exception as e:
    print(f"‚ùå Initialization failed: {e}")
    sys.exit(1)


# --- 2. MONITOR JOB STATUS WITH WHILE LOOP ---
print(f"\n--- 2. Monitoring Job: {CONFIG['TARGET_JOB_ID']} (Polling every {CONFIG['POLLING_INTERVAL_SECONDS']} seconds) ---")

# Construct the full resource name once
JOB_RESOURCE_NAME = f"projects/{CONFIG['PROJECT_ID']}/locations/{CONFIG['REGION']}/tuningJobs/{CONFIG['TARGET_JOB_ID']}"

try:
    # Get the job object once
    job = sft.SupervisedTuningJob(JOB_RESOURCE_NAME)

    # Get the creation time (it's a datetime object)
    creation_time = job.create_time.replace(tzinfo=None)

    print(f"Job started at: {creation_time.strftime('%Y-%m-%d %H:%M:%S')} UTC")

    while True:
        # Re-instantiate the job object to reliably fetch the absolute latest state and metadata
        job = sft.SupervisedTuningJob(JOB_RESOURCE_NAME)
        current_state = job.state.name

        # Calculate elapsed time
        current_time_utc = datetime.utcnow()
        elapsed_time = current_time_utc - creation_time
        elapsed_minutes = elapsed_time.total_seconds() / 60

        print(f"[{time.strftime('%H:%M:%S')}] State: {current_state} | Elapsed: {elapsed_minutes:.1f} minutes")

        if current_state == 'JOB_STATE_SUCCEEDED':
            print("\nüéâ JOB SUCCEEDED! Exiting monitor loop.")
            break

        elif current_state in ['JOB_STATE_FAILED', 'JOB_STATE_CANCELLED', 'JOB_STATE_ERROR']:
            print(f"\n‚ùå JOB TERMINATED with state: {current_state}. Exiting monitor loop.")
            break

        # If still running, wait for the defined interval
        print(f"‚è≥ Waiting {CONFIG['POLLING_INTERVAL_SECONDS']} seconds...")
        time.sleep(CONFIG['POLLING_INTERVAL_SECONDS'])

    print("\n--- Monitoring Complete ---")

except Exception as e:
    print(f"‚ùå Monitoring loop failed unexpectedly: {e}")

--- 1. Initialization ---
‚úÖ SDK Initialized.

--- 2. Monitoring Job: 5516984636737060864 (Polling every 300 seconds) ---


Job started at: 2025-12-10 15:48:32 UTC


[15:53:17] State: JOB_STATE_RUNNING | Elapsed: 4.8 minutes
‚è≥ Waiting 300 seconds...


[15:58:18] State: JOB_STATE_RUNNING | Elapsed: 9.8 minutes
‚è≥ Waiting 300 seconds...


[16:03:19] State: JOB_STATE_RUNNING | Elapsed: 14.8 minutes
‚è≥ Waiting 300 seconds...


[16:08:20] State: JOB_STATE_RUNNING | Elapsed: 19.8 minutes
‚è≥ Waiting 300 seconds...


[16:13:21] State: JOB_STATE_RUNNING | Elapsed: 24.8 minutes
‚è≥ Waiting 300 seconds...


[16:18:22] State: JOB_STATE_RUNNING | Elapsed: 29.8 minutes
‚è≥ Waiting 300 seconds...


[16:23:23] State: JOB_STATE_RUNNING | Elapsed: 34.9 minutes
‚è≥ Waiting 300 seconds...


[16:28:25] State: JOB_STATE_RUNNING | Elapsed: 39.9 minutes
‚è≥ Waiting 300 seconds...


[16:33:28] State: JOB_STATE_RUNNING | Elapsed: 44.9 minutes
‚è≥ Waiting 300 seconds...


[16:38:29] State: JOB_STATE_RUNNING | Elapsed: 50.0 minutes
‚è≥ Waiting 300 seconds...


[16:43:31] State: JOB_STATE_RUNNING | Elapsed: 55.0 minutes
‚è≥ Waiting 300 seconds...


[16:48:32] State: JOB_STATE_RUNNING | Elapsed: 60.0 minutes
‚è≥ Waiting 300 seconds...


[16:53:34] State: JOB_STATE_RUNNING | Elapsed: 65.0 minutes
‚è≥ Waiting 300 seconds...


[16:58:35] State: JOB_STATE_RUNNING | Elapsed: 70.1 minutes
‚è≥ Waiting 300 seconds...


[17:03:37] State: JOB_STATE_SUCCEEDED | Elapsed: 75.1 minutes

üéâ JOB SUCCEEDED! Exiting monitor loop.

--- Monitoring Complete ---


## EVALUATION

In [12]:
!gcloud ai endpoints list --region=us-central1 --project=gen-lang-client-0870511801

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
ENDPOINT_ID          DISPLAY_NAME
203144668836265984   SupervisedTuningJob 2025-12-10 15:48:31.106257
1018296201390325760  SupervisedTuningJob 2025-12-10 15:48:31.106257
6415860354793865216  SupervisedTuningJob 2025-12-10 15:48:31.106257
3157506024391311360  SupervisedTuningJob 2025-12-10 15:48:31.106257
1799952211715817472  SupervisedTuningJob 2025-12-10 15:48:31.106257
3900599962907443200  SupervisedTuningJob 2025-12-10 15:48:31.106257
1022799801017696256  SupervisedTuningJob 2025-12-10 15:48:31.106257
1941534125001277440  SupervisedTuningJob 2025-12-10 15:48:31.106257
9045962537178234880  SupervisedTuningJob 2025-12-10 15:48:31.106257
405806652067938304   SupervisedTuningJob 2025-12-10 15:48:31.106257


In [13]:
from google.cloud import aiplatform
from google.api_core import exceptions as api_exceptions
from typing import List

# --- CONFIGURATION (Use your previous settings) ---
PROJECT_ID = "gen-lang-client-0870511801"
REGION = "us-central1"

def list_vertex_ai_endpoints(project_id: str, location: str) -> List[str]:
    """
    Lists all Vertex AI Endpoints, using robust attribute access.
    Returns a list of all found endpoint display names.
    """

    print("-" * 50)
    print(f"--- Listing Endpoints in {project_id}/{location} ---")

    # --- CHANGE 1: Define the list before the loop ---
    all_endpoint_display_names = []

    try:
        aiplatform.init(project=project_id, location=location)
        endpoints = aiplatform.Endpoint.list(location=location)

        if not endpoints:
            print("\nüéâ No Endpoints found in this region.")
            print("-" * 50)
            return all_endpoint_display_names # Return the empty list

        print("\n‚úÖ Found Endpoints:")
        print("| Display Name | Endpoint ID | Deployed Models | State |")
        print("| :--- | :--- | :--- | :--- |")

        for endpoint in endpoints:

            # --- CHANGE 2: Assign/append within the loop ---
            endpoint_display_name = getattr(endpoint, 'display_name', 'Unknown Endpoint')
            all_endpoint_display_names.append(endpoint_display_name)

            try:
                # ... (rest of the robust attribute checks remain the same)
                endpoint_id = endpoint.name.split('/')[-1]
                deployed_models = getattr(endpoint, 'deployed_models', None)
                if deployed_models is None:
                    model_count = "N/A (Old SDK)"
                else:
                    model_count = len(deployed_models)

                endpoint_state_obj = getattr(endpoint, 'state', None)
                if endpoint_state_obj is None:
                    state_name = "N/A (Old SDK)"
                else:
                    state_name = endpoint_state_obj.name

                print(f"| {endpoint_display_name} | {endpoint_id} | {model_count} | **{state_name}** |")

            except Exception as e:
                print(f"| ‚ùå ERROR processing {endpoint_display_name} | | | |")
                print(f"  (Details: {e})")
                continue

    except Exception as e:
        print(f"\n‚ùå A critical error occurred during initialization or listing: {e}")

    print("-" * 50)

    # --- CHANGE 3: Return the list so it can be used outside the function ---
    return all_endpoint_display_names

# Execute the listing function
if __name__ == "__main__":
    # The result of the function is captured in this variable
    endpoint_names = list_vertex_ai_endpoints(PROJECT_ID, REGION)

    # --- Example of using the variable outside the function/loop ---
    if endpoint_names:
        print("\n--- Summary of Names Available Outside Loop ---")
        print(f"Total endpoints found: {len(endpoint_names)}")
        print(f"First endpoint name: {endpoint_names[0]}")
        print("---------------------------------------------")

--------------------------------------------------
--- Listing Endpoints in gen-lang-client-0870511801/us-central1 ---

‚úÖ Found Endpoints:
| Display Name | Endpoint ID | Deployed Models | State |
| :--- | :--- | :--- | :--- |
| SupervisedTuningJob 2025-12-10 15:48:31.106257 | 203144668836265984 | N/A (Old SDK) | **N/A (Old SDK)** |
| SupervisedTuningJob 2025-12-10 15:48:31.106257 | 1018296201390325760 | N/A (Old SDK) | **N/A (Old SDK)** |
| SupervisedTuningJob 2025-12-10 15:48:31.106257 | 6415860354793865216 | N/A (Old SDK) | **N/A (Old SDK)** |
| SupervisedTuningJob 2025-12-10 15:48:31.106257 | 3157506024391311360 | N/A (Old SDK) | **N/A (Old SDK)** |
| SupervisedTuningJob 2025-12-10 15:48:31.106257 | 1799952211715817472 | N/A (Old SDK) | **N/A (Old SDK)** |
| SupervisedTuningJob 2025-12-10 15:48:31.106257 | 3900599962907443200 | N/A (Old SDK) | **N/A (Old SDK)** |
| SupervisedTuningJob 2025-12-10 15:48:31.106257 | 1022799801017696256 | N/A (Old SDK) | **N/A (Old SDK)** |
| Supervis

In [14]:
import time
import json
import numpy as np
import re
import os
from google.cloud import aiplatform
from google.colab import auth
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tqdm import tqdm

# --- IMPORT THE WORKING CLIENTS ---
from google import genai
from google.genai import types
import vertexai

# --- CONFIGURATION (UPDATED TO USE LATEST JOB NAME) ---
CONFIG = {
    "PROJECT_ID": os.environ.get("GOOGLE_CLOUD_PROJECT", "gen-lang-client-0870511801"),
    "PROJECT_NUMBER": os.environ.get("GOOGLE_CLOUD_PROJECT_NUMBER", "677155171887"),
    "REGION": os.environ.get("GOOGLE_CLOUD_REGION", "us-central1"),
    "BUCKET_NAME": os.environ.get("GOOGLE_CLOUD_BUCKET_NAME", "poc-my-new-staging-bucket-2025-1"),

    # Use the name of the latest, successful training job to find the endpoint
    #"MODEL_DISPLAY_NAME": "SupervisedTuningJob 2025-12-10 12:14:07.704792"
    "MODEL_DISPLAY_NAME": endpoint_names[0]
}

# Derived paths
EVAL_DATASET_URI = f"gs://{CONFIG['BUCKET_NAME']}/cmapss_FD004_test_text.jsonl"
LOCAL_DATASET_PATH = '/content/cmapss_FD004_test_text.jsonl'
# TUNED_MODEL_ENDPOINT will be set dynamically below


# --- 1. AUTHENTICATION AND INITIALIZATION ---
print("--- 1. Authentication and Initialization ---")
try:
    auth.authenticate_user()
    aiplatform.init(project=CONFIG['PROJECT_ID'], location=CONFIG['REGION'])

    # Initialize the working client (no endpoint needed yet)
    client = genai.Client(
        vertexai=True,
        project=CONFIG['PROJECT_ID'],
        location=CONFIG['REGION'],
    )
    print("‚úÖ Client initialized successfully.")

except Exception as e:
    print(f"‚ùå Initialization failed: {e}")
    exit()


# --- 2. RETRIEVE THE NEW ENDPOINT ID ---
print("\n--- 2. Retrieving New Endpoint ID ---")
try:
    # 1. Search for the Endpoint associated with the latest job name
    endpoint_list = aiplatform.Endpoint.list(
        filter=f'display_name="{CONFIG["MODEL_DISPLAY_NAME"]}"',
        order_by='create_time desc'
    )

    if not endpoint_list:
        raise Exception("Endpoint for the latest job not found. Ensure the model is deployed.")

    NEW_ENDPOINT_ID = endpoint_list[0].name
    # Construct the final endpoint path using the new ID
    TUNED_MODEL_ENDPOINT = f"projects/{CONFIG['PROJECT_NUMBER']}/locations/{CONFIG['REGION']}/endpoints/{NEW_ENDPOINT_ID}"

    print(f"‚úÖ Found new Endpoint ID: {NEW_ENDPOINT_ID}")
    #print(f"‚úÖ Full Endpoint Path: {TUNED_MODEL_ENDPOINT}")

except Exception as e:
    print(f"‚ùå Failed to find the latest Endpoint: {e}")
    exit()


# --- 3. GENERATE AND EVALUATE (RUL METRICS) ---
print("\n--- 3. Starting RUL Prediction and Evaluation ---")
try:
    # Copy the validation dataset locally
    !gsutil cp {EVAL_DATASET_URI} {LOCAL_DATASET_PATH}
    print("‚úÖ Test data copied locally.")

    # Variables for RUL metrics
    ground_truth_ruls = []
    predicted_ruls = []
    RUL_PATTERN = re.compile(r'Remaining Useful Life:\s*(\d+\.?\d*)')

    num_lines = sum(1 for line in open(LOCAL_DATASET_PATH))

    # Read and process the dataset file
    with open(LOCAL_DATASET_PATH, 'r') as f:
        for line in tqdm(f, total=num_lines, desc="Running Predictions"):
            data = json.loads(line)

            try:
                prompt = data['contents'][0]['parts'][0]['text']
                gt_text = data['contents'][1]['parts'][0]['text']
            except (IndexError, KeyError):
                continue

            # 1. Extract Ground Truth (GT) RUL
            gt_match = RUL_PATTERN.search(gt_text)
            if gt_match:
                ground_truth_ruls.append(float(gt_match.group(1)))
            else: continue

            # 2. Generate Prediction using the WORKING CLIENT
            generated_text = ""
            try:
                contents = [prompt]

                # Use the working stream method from your reference code
                for chunk in client.models.generate_content_stream(
                    model=TUNED_MODEL_ENDPOINT, # USES NEW ENDPOINT ID
                    contents=contents,
                    config=types.GenerateContentConfig(
                        temperature=0.0,
                        max_output_tokens=128,
                    ),
                ):
                    generated_text += chunk.text
            except Exception as e:
                # Log the specific error for tracking and continue
                print(f"Error during text generation: {e}")
                continue

            # 3. Extract Predicted (P) RUL
            pred_match = RUL_PATTERN.search(generated_text)
            if pred_match:
                predicted_ruls.append(float(pred_match.group(1)))
            else:
                predicted_ruls.append(0.0)

    # --- 4. CALCULATE METRICS ---
    min_len = min(len(ground_truth_ruls), len(predicted_ruls))
    gt_ruls = np.array(ground_truth_ruls[:min_len])
    pred_ruls = np.array(predicted_ruls[:min_len])

    if min_len > 0:
        mae = mean_absolute_error(gt_ruls, pred_ruls)
        rmse = np.sqrt(mean_squared_error(gt_ruls, pred_ruls))

        print("\n\n--- RUL Prediction Performance ---")
        print(f"Total Test Samples Evaluated: {min_len}")
        print(f"Mean Absolute Error (MAE): {mae:.3f} cycles")
        print(f"Root Mean Squared Error (RMSE): {rmse:.3f} cycles")
        print("\nLower MAE and RMSE imply higher accuracy of the regression model.")
    else:
        print("Evaluation failed: No valid RUL predictions were generated.")

except Exception as e:
    print(f"\n‚ùå Final Prediction/Evaluation Failed: {e}")

--- 1. Authentication and Initialization ---
‚úÖ Client initialized successfully.

--- 2. Retrieving New Endpoint ID ---
‚úÖ Found new Endpoint ID: 203144668836265984

--- 3. Starting RUL Prediction and Evaluation ---
Copying gs://poc-my-new-staging-bucket-2025-1/cmapss_FD004_test_text.jsonl...
- [1 files][  1.4 MiB/  1.4 MiB]                                                
Operation completed over 1 objects/1.4 MiB.                                      
‚úÖ Test data copied locally.


Running Predictions:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 236/252 [09:39<00:17,  1.09s/it]

Error during text generation: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource exhausted. Please try again later. Please refer to https://cloud.google.com/vertex-ai/generative-ai/docs/error-code-429 for more details.', 'status': 'RESOURCE_EXHAUSTED'}}


Running Predictions:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 237/252 [09:40<00:13,  1.14it/s]

Error during text generation: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'Resource exhausted. Please try again later. Please refer to https://cloud.google.com/vertex-ai/generative-ai/docs/error-code-429 for more details.', 'status': 'RESOURCE_EXHAUSTED'}}


Running Predictions: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 252/252 [10:28<00:00,  2.50s/it]



--- RUL Prediction Performance ---
Total Test Samples Evaluated: 250
Mean Absolute Error (MAE): 26.000 cycles
Root Mean Squared Error (RMSE): 26.000 cycles

Lower MAE and RMSE imply higher accuracy of the regression model.





In [16]:
!pip install rouge_score -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [17]:
from google import genai
from google.genai import types
import json
import os
from google.cloud import aiplatform
from rouge_score import rouge_scorer
from tqdm import tqdm

# --- CONFIGURATION (UPDATED TO USE ENDPOINT ID) ---
CONFIG = {
    "PROJECT_ID": os.environ.get("GOOGLE_CLOUD_PROJECT", "gen-lang-client-0870511801"),
    "PROJECT_NUMBER": os.environ.get("GOOGLE_CLOUD_PROJECT_NUMBER", "677155171887"),
    "REGION": os.environ.get("GOOGLE_CLOUD_REGION", "us-central1"),
    "BUCKET_NAME": os.environ.get("GOOGLE_CLOUD_BUCKET_NAME", "poc-my-new-staging-bucket-2025-1"),

    # *** CRITICAL FIX: Use the specific, numeric ENDPOINT_ID instead of the DISPLAY_NAME ***
    # I am using '5982670365636165632' as found in your list.
    "ENDPOINT_ID": NEW_ENDPOINT_ID
}

PROJECT_NUMBER=CONFIG['PROJECT_NUMBER']
REGION=CONFIG['REGION']
BUCKET_NAME=CONFIG['BUCKET_NAME']
PROJECT_ID=CONFIG['PROJECT_ID']
# Removed MODEL_DISPLAY_NAME as it's no longer used for the path
ENDPOINT_ID=CONFIG['ENDPOINT_ID']

STAGING_BUCKET=f"gs://{BUCKET_NAME}"

# --- CORRECTED RESOURCE NAME ---
# The ENDPOINT_ID is now the numeric ID, ensuring a unique path.
ENDPOINT_RESOURCE_NAME = f"projects/{PROJECT_NUMBER}/locations/{REGION}/endpoints/{ENDPOINT_ID}"

EVAL_DATASET_URI = f"gs://{BUCKET_NAME}/cmapss_FD004_test_text.jsonl"
# Keeping this placeholder variable for the print statement
model_resource_name_id = f'projects/{PROJECT_NUMBER}/locations/{REGION}/models/YOUR_MODEL_ID'

aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

def generate_and_evaluate():
    client = genai.Client(
        vertexai=True,
        project=PROJECT_ID,
        location=REGION,
    )

    # Use the full, numeric endpoint resource name
    model_identifier = ENDPOINT_RESOURCE_NAME


    #print('\n')
    #report=f"Evaluation of the model in Vertex AI: {model_resource_name_id} using the endpoint {model_identifier}, with the dataset: {EVAL_DATASET_URI}"
    #print(report)
    #print('\n\n')


    validation_dataset_uri = EVAL_DATASET_URI

    # Copy the validation dataset locally
    local_dataset_path = '/content/cmapss_FD004_test_text.jsonl'
    !gsutil cp {validation_dataset_uri} .
    print('\n')

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    all_scores = []

    # Calculate total lines beforehand
    try:
        num_lines = sum(1 for line in open(local_dataset_path))
    except FileNotFoundError:
        print(f"Error: Dataset not found at {local_dataset_path}. Check gsutil copy command.")
        return


    # Read and process the dataset file
    with open(local_dataset_path, 'r') as f:
        for line in tqdm(f, total=num_lines, desc="Processing dataset"):
            data = json.loads(line)

            # Extract prompt and ground truth from JSON structure
            try:
                prompt = data['contents'][0]['parts'][0]['text']
                ground_truth_text = data['contents'][1]['parts'][0]['text']
            except (IndexError, KeyError):
                print("Skipping invalid data point:", line)
                continue

            if prompt and ground_truth_text:
                contents = [prompt]

                # Generate content for the current prompt
                generated_text = ""
                try:
                    for chunk in client.models.generate_content_stream(
                        model=model_identifier,
                        contents=contents,
                        config=types.GenerateContentConfig(
                            temperature=1,
                            top_p=0.95,
                            max_output_tokens=8192,
                            response_modalities=["TEXT"],
                            safety_settings=[types.SafetySetting(category=c, threshold="OFF") for c in [
                                "HARM_CATEGORY_HATE_SPEECH",
                                "HARM_CATEGORY_DANGEROUS_CONTENT",
                                "HARM_CATEGORY_SEXUALLY_EXPLICIT",
                                "HARM_CATEGORY_HARASSMENT",
                            ]],
                        ),
                    ):
                        generated_text += chunk.text
                except Exception as e:
                    print(f"Error during text generation for prompt '{prompt[:50]}...': {e}")
                    continue

                # Calculate ROUGE scores
                scores = scorer.score(ground_truth_text, generated_text)
                all_scores.append(scores)


    # Calculate and print average ROUGE scores
    if all_scores:
        avg_rouge1 = sum(s['rouge1'].fmeasure for s in all_scores) / len(all_scores)
        avg_rougeL = sum(s['rougeL'].fmeasure for s in all_scores) / len(all_scores)
        print('\n\n')
        print(f"Average ROUGE-1: {avg_rouge1}")
        print(f"Average ROUGE-L: {avg_rougeL}")
        print('\n')
    else:
        print("No ROUGE scores were calculated. Check the dataset and text generation process.")

generate_and_evaluate()

Copying gs://poc-my-new-staging-bucket-2025-1/cmapss_FD004_test_text.jsonl...
- [1 files][  1.4 MiB/  1.4 MiB]                                                
Operation completed over 1 objects/1.4 MiB.                                      




Processing dataset: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 252/252 [09:46<00:00,  2.33s/it]




Average ROUGE-1: 0.75
Average ROUGE-L: 0.75







## DELETE

In [76]:
!gcloud ai endpoints list --region=us-central1 --project=gen-lang-client-0870511801

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
ENDPOINT_ID          DISPLAY_NAME
4931924277575286784  SupervisedTuningJob 2025-12-10 14:30:36.418412
5607464221680861184  SupervisedTuningJob 2025-12-10 14:30:36.418412
6129881778455838720  SupervisedTuningJob 2025-12-10 14:30:36.418412
3265592415448203264  SupervisedTuningJob 2025-12-10 14:30:36.418412
4470305315769810944  SupervisedTuningJob 2025-12-10 14:30:36.418412
6776148324983504896  SupervisedTuningJob 2025-12-10 14:30:36.418412
8705940765311762432  SupervisedTuningJob 2025-12-10 14:30:36.418412
8453739186179014656  SupervisedTuningJob 2025-12-10 14:30:36.418412
2999880037433344000  SupervisedTuningJob 2025-12-10 14:30:36.418412
5623226820376657920  SupervisedTuningJob 2025-12-10 14:30:36.418412
1197595761555013632  SupervisedTuningJob 2025-12-10 13:58:08.977437
8854559553014988800  SupervisedTuningJob 2025-12-10 13:58:08.977437
8228559204810489856  SupervisedTuningJob 2025-12-10 13:58:08.977437
183344773394438553

In [77]:
from google.cloud import aiplatform
from google.api_core import exceptions as api_exceptions
import os
import time # Added time for better handling of long-running operations

# --- CONFIGURATION ---
CONFIG = {
    "PROJECT_ID": "gen-lang-client-0870511801",  # Your Project ID
    "REGION": "us-central1",                     # The region where your endpoints are deployed
}

PROJECT_ID = CONFIG['PROJECT_ID']
REGION = CONFIG['REGION']

def cleanup_all_vertex_ai_endpoints(project_id: str, location: str):
    """
    Lists all Vertex AI Endpoints in the specified project/location,
    undeploys any remaining models, and then deletes the endpoints.
    """
    print(f"--- Starting Cleanup Process ---")
    print(f"Project: {project_id}, Region: {location}")

    # Initialize the Vertex AI client
    aiplatform.init(project=project_id, location=location)

    try:
        # List all endpoints in the specified region
        endpoints = aiplatform.Endpoint.list(location=location)

    except api_exceptions.NotFound:
        print("\nNo endpoints found in this location, or the project/region is incorrect.")
        return

    if not endpoints:
        print("\nNo endpoints found in the list. Nothing to delete.")
        return

    print(f"\nFound {len(endpoints)} endpoints to process.")

    deleted_count = 0

    for endpoint in endpoints:
        endpoint_id = endpoint.name.split('/')[-1]

        print("-" * 50)
        print(f"Endpoint found: ID={endpoint_id}, Name='{endpoint.display_name}'")

        # 1. Undeploy Models (Run this again just in case, though it ran successfully before)
        deployed_models = endpoint.list_models()

        if deployed_models:
            print(f"  > Found {len(deployed_models)} deployed models. Undeploying...")

            try:
                # The undeploy_all() method undeploys all models and waits for the operation to complete.
                endpoint.undeploy_all()
                print("  ‚úÖ All models successfully undeployed.")
                # Add a small pause to ensure the resource state is updated before deletion
                time.sleep(5)
            except Exception as e:
                print(f"  ‚ùå ERROR during undeploy for {endpoint.display_name}: {e}")
                print("  Skipping deletion of this endpoint.")
                continue # Cannot delete if undeploy fails

        else:
            print("  > No models currently deployed to this endpoint. Proceeding to deletion.")


        # 2. Delete the Endpoint

        # --- DELETION LOGIC IS NOW UNCOMMENTED AND ACTIVE ---
        print(f"  > Deleting endpoint {endpoint.display_name} ({endpoint_id})...")
        try:
            # The delete() method waits for the operation to complete
            endpoint.delete()
            print("  ‚úÖ Endpoint successfully deleted.")
            deleted_count += 1
        except Exception as e:
            print(f"  ‚ùå ERROR during deletion for {endpoint.display_name}: {e}")

        # --------------------------------------------------------

    print("-" * 50)
    print(f"\n--- Cleanup Process Finished ---")
    print(f"Total endpoints deleted: {deleted_count} out of {len(endpoints)} processed.")

# Execute the cleanup function
if __name__ == "__main__":
    cleanup_all_vertex_ai_endpoints(PROJECT_ID, REGION)

--- Starting Cleanup Process ---
Project: gen-lang-client-0870511801, Region: us-central1

Found 14 endpoints to process.
--------------------------------------------------
Endpoint found: ID=4931924277575286784, Name='SupervisedTuningJob 2025-12-10 14:30:36.418412'
  > Found 1 deployed models. Undeploying...
  ‚úÖ All models successfully undeployed.
  > Deleting endpoint SupervisedTuningJob 2025-12-10 14:30:36.418412 (4931924277575286784)...
  ‚úÖ Endpoint successfully deleted.
--------------------------------------------------
Endpoint found: ID=5607464221680861184, Name='SupervisedTuningJob 2025-12-10 14:30:36.418412'
  > Found 1 deployed models. Undeploying...
  ‚úÖ All models successfully undeployed.
  > Deleting endpoint SupervisedTuningJob 2025-12-10 14:30:36.418412 (5607464221680861184)...
  ‚úÖ Endpoint successfully deleted.
--------------------------------------------------
Endpoint found: ID=6129881778455838720, Name='SupervisedTuningJob 2025-12-10 14:30:36.418412'
  > Found

In [78]:
!gcloud ai endpoints list --region=us-central1 --project=gen-lang-client-0870511801

Using endpoint [https://us-central1-aiplatform.googleapis.com/]
Listed 0 items.


In [79]:
!pip install --upgrade google-cloud-aiplatform

Collecting google-cloud-aiplatform
  Downloading google_cloud_aiplatform-1.130.0-py2.py3-none-any.whl.metadata (46 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m46.1/46.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Downloading google_cloud_aiplatform-1.130.0-py2.py3-none-any.whl (8.1 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m8.1/8.1 MB[0m [31m80.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google-cloud-aiplatform
  Attempting uninstall: google-cloud-aiplatform
    Found existing installation: google-cloud-aiplatform 1.129.0
    Uninstalling google-cloud-aiplatform-1.129.0:
      Successfully uninstalled google-cloud-aiplatform-1.129.0
Successfully installed google-cloud-aiplatform-1.130.0


In [1]:
from google.cloud import aiplatform
# Import the low-level client components
from google.cloud.aiplatform_v1.services.pipeline_service import PipelineServiceClient
from google.api_core import exceptions as api_exceptions
import time
import os
import re
from typing import Tuple, Dict

# --- CONFIGURATION ---
PROJECT_ID = "gen-lang-client-0870511801"
REGION = "us-central1"

DELETABLE_STATE_NAMES: Tuple[str, ...] = (
    'JOB_STATE_SUCCEEDED',
    'JOB_STATE_FAILED',
    'JOB_STATE_CANCELLED'
)

# --- CRITICAL FIX: MANUAL MAPPING FOR LOW-LEVEL TRAINING PIPELINE STATES ---
# This dictionary maps the integer value returned by the low-level client
# (TrainingPipeline.State) to the string name for comparison.
TRAINING_PIPELINE_STATE_MAP: Dict[int, str] = {
    0: 'JOB_STATE_UNSPECIFIED',
    1: 'JOB_STATE_QUEUED',
    2: 'JOB_STATE_PENDING',
    3: 'JOB_STATE_RUNNING',
    4: 'JOB_STATE_SUCCEEDED', # Deletable
    5: 'JOB_STATE_FAILED',    # Deletable
    6: 'JOB_STATE_CANCELLING',
    7: 'JOB_STATE_CANCELLED', # Deletable
    8: 'JOB_STATE_PAUSED',
    9: 'JOB_STATE_EXPIRED',
}


def delete_completed_jobs_and_pipelines(
    project_id: str,
    location: str,
    deletable_states: Tuple[str, ...],
    state_map: Dict[int, str]
) -> None:
    """
    Lists and deletes completed Vertex AI resources. Uses the low-level
    PipelineServiceClient for Training Pipelines with the manual State enum mapping.
    """

    print("-" * 70)
    print(f"--- Starting Job and Pipeline Cleanup in {project_id}/{location} ---")

    aiplatform.init(project=project_id, location=location)
    deleted_count = 0

    # Initialize the low-level client for the Training Pipeline Service outside the deletion logic
    pipeline_client = PipelineServiceClient(client_options={"api_endpoint": f"{location}-aiplatform.googleapis.com"})
    parent_path = f"projects/{project_id}/locations/{location}"

    # Function to handle the repeated deletion logic
    def perform_deletion(resource, resource_type: str, is_low_level: bool = False) -> bool:
        """Attempts to delete a resource and handles exceptions."""
        nonlocal deleted_count
        try:
            time.sleep(1) # CRITICAL: Rate Limit Delay

            if is_low_level:
                # Low-level client delete call
                pipeline_client.delete_training_pipeline(name=resource.name)
            else:
                # High-level SDK delete call
                resource.delete()

            print(f"  ‚úÖ {resource_type} deleted.")
            deleted_count += 1
            return True
        except api_exceptions.NotFound:
            print(f"  ‚ö†Ô∏è {resource_type} already deleted.")
        except api_exceptions.ResourceExhausted:
            print(f"  ‚ùå ERROR: Quota Exceeded (429). Please wait 1-2 minutes and re-run the script.")
            return False # Signal to stop execution
        except Exception as e:
            print(f"  ‚ùå UNEXPECTED ERROR deleting {resource_type}: {e}")
        return True # Signal to continue


    # 1. Process Custom Jobs (High-level SDK)
    print("\n--- Processing Custom Jobs ---")
    try:
        custom_jobs = aiplatform.CustomJob.list(location=location)
        for job in custom_jobs:
            if job.state.name in deletable_states:
                print(f"Deleting Custom Job: {job.display_name} (ID: {job.name.split('/')[-1]})")
                if not perform_deletion(job, "Custom Job"): return
            else:
                print(f"Skipping active Custom Job: {job.display_name} (State: {job.state.name})")
    except Exception as e:
        print(f"Error processing Custom Jobs: {e}")

    # 2. Process Training Pipelines (FIXED: Using Low-Level Client and Manual State Mapping)
    print("\n--- Processing Training Pipelines (Low-Level Client) ---")
    try:
        training_pipelines_page = pipeline_client.list_training_pipelines(parent=parent_path)

        for pipeline in training_pipelines_page:
            # FIX: Get the integer state and map it to the string name
            state_name = state_map.get(pipeline.state, 'UNKNOWN_STATE')

            if state_name in deletable_states:
                print(f"Deleting Training Pipeline: {pipeline.display_name} (ID: {pipeline.name.split('/')[-1]})")
                if not perform_deletion(pipeline, "Training Pipeline", is_low_level=True): return
            else:
                print(f"Skipping active Training Pipeline: {pipeline.display_name} (State: {state_name})")

    except Exception as e:
        print(f"Error processing Training Pipelines: {e}")

    # 3. Process Hyperparameter Tuning Jobs (High-level SDK)
    print("\n--- Processing Hyperparameter Tuning Jobs ---")
    try:
        tuning_jobs = aiplatform.HyperparameterTuningJob.list(location=location)
        for job in tuning_jobs:
            if job.state.name in deletable_states:
                print(f"Deleting Hyperparameter Tuning Job: {job.display_name} (ID: {job.name.split('/')[-1]})")
                if not perform_deletion(job, "Hyperparameter Tuning Job"): return
            else:
                print(f"Skipping active Hyperparameter Tuning Job: {job.display_name} (State: {job.state.name})")
    except Exception as e:
        print(f"Error processing Hyperparameter Tuning Jobs: {e}")


    print("-" * 70)
    print(f"\n--- Job and Pipeline Cleanup Finished ---")
    print(f"Total completed resources deleted this run: {deleted_count}")
    print("If Quota Exceeded errors occurred, please wait 1-2 minutes and run the script again.")

# Execute the cleanup function
if __name__ == "__main__":
    delete_completed_jobs_and_pipelines(
        project_id=PROJECT_ID,
        location=REGION,
        deletable_states=DELETABLE_STATE_NAMES,
        state_map=TRAINING_PIPELINE_STATE_MAP # Pass the new mapping
    )

----------------------------------------------------------------------
--- Starting Job and Pipeline Cleanup in gen-lang-client-0870511801/us-central1 ---

--- Processing Custom Jobs ---

--- Processing Training Pipelines (Low-Level Client) ---

--- Processing Hyperparameter Tuning Jobs ---
----------------------------------------------------------------------

--- Job and Pipeline Cleanup Finished ---
Total completed resources deleted this run: 0
If Quota Exceeded errors occurred, please wait 1-2 minutes and run the script again.


In [3]:
from google.cloud import aiplatform
from google.cloud.aiplatform_v1.services.pipeline_service import PipelineServiceClient
from google.api_core import exceptions as api_exceptions
import time
import os
import re
from typing import Tuple, Dict

# --- CONFIGURATION ---
PROJECT_ID = "gen-lang-client-0870511801"
REGION = "us-central1"

DELETABLE_STATE_NAMES: Tuple[str, ...] = (
    'JOB_STATE_SUCCEEDED',
    'JOB_STATE_FAILED',
    'JOB_STATE_CANCELLED'
)

# --- NEW CONFIGURATION FOR MODELS AND ENDPOINTS ---

DELETABLE_NAME_PREFIXES: Tuple[str, ...] = (
    # Existing prefixes (can generally remain if you might create more with these names)
    "NASA-cmapss-rul-",
    "POC-my-custom-training-job",
    "my-custom-training-job",
    "test-",
    "SupervisedTuningJob",
    "cmapss-rul-jsonl-model",
    "cmapss-text-tuned-gemini-",

    # NEW PREFIXES FOR SKIPPED MODELS:
    "cmapss-rul-gemini-final-launch", # Targets the exact name of the first skipped model
    "my-pytorch-model"               # Targets the exact name of the second skipped model
)



# --- CRITICAL FIX: MANUAL MAPPING FOR LOW-LEVEL TRAINING PIPELINE STATES ---
TRAINING_PIPELINE_STATE_MAP: Dict[int, str] = {
    0: 'JOB_STATE_UNSPECIFIED', 1: 'JOB_STATE_QUEUED', 2: 'JOB_STATE_PENDING',
    3: 'JOB_STATE_RUNNING', 4: 'JOB_STATE_SUCCEEDED', 5: 'JOB_STATE_FAILED',
    6: 'JOB_STATE_CANCELLING', 7: 'JOB_STATE_CANCELLED', 8: 'JOB_STATE_PAUSED',
    9: 'JOB_STATE_EXPIRED',
}


def delete_completed_jobs_and_pipelines(
    project_id: str,
    location: str,
    deletable_states: Tuple[str, ...],
    state_map: Dict[int, str],
    name_prefixes: Tuple[str, ...]
) -> None:
    """
    Lists and deletes completed Vertex AI Jobs/Pipelines, and conditionally
    deletes Models/Endpoints based on name prefixes.
    """

    print("-" * 70)
    print(f"--- Starting Cleanup in {project_id}/{location} ---")

    aiplatform.init(project=project_id, location=location)
    deleted_count = 0

    # Low-level client initialization
    pipeline_client = PipelineServiceClient(client_options={"api_endpoint": f"{location}-aiplatform.googleapis.com"})
    parent_path = f"projects/{project_id}/locations/{location}"

    # Helper function to check if a resource name contains a deletable prefix
    def should_delete_by_prefix(display_name: str) -> bool:
        if not name_prefixes:
            return True # Delete everything if the list is empty
        return any(display_name.startswith(prefix) for prefix in name_prefixes)

    # Function to handle the repeated deletion logic
    def perform_deletion(resource, resource_type: str, is_low_level: bool = False) -> bool:
        """Attempts to delete a resource and handles exceptions."""
        nonlocal deleted_count
        try:
            time.sleep(1) # CRITICAL: Rate Limit Delay

            if is_low_level:
                pipeline_client.delete_training_pipeline(name=resource.name)
            else:
                resource.delete()

            print(f"  ‚úÖ {resource_type} deleted.")
            deleted_count += 1
            return True
        except api_exceptions.NotFound:
            print(f"  ‚ö†Ô∏è {resource_type} already deleted.")
        except api_exceptions.ResourceExhausted:
            print(f"  ‚ùå ERROR: Quota Exceeded (429). Please wait 1-2 minutes and re-run the script.")
            return False # Signal to stop execution
        except Exception as e:
            print(f"  ‚ùå UNEXPECTED ERROR deleting {resource_type}: {e}")
        return True # Signal to continue


    # 1. Process Custom Jobs (High-level SDK)
    print("\n--- Processing Custom Jobs ---")
    try:
        custom_jobs = aiplatform.CustomJob.list(location=location)
        for job in custom_jobs:
            if job.state.name in deletable_states:
                print(f"Deleting Custom Job: {job.display_name} (State: {job.state.name})")
                if not perform_deletion(job, "Custom Job"): return
            else:
                print(f"Skipping active Custom Job: {job.display_name} (State: {job.state.name})")
    except Exception as e:
        print(f"Error processing Custom Jobs: {e}")

    # 2. Process Training Pipelines (Low-Level Client and Manual State Mapping)
    print("\n--- Processing Training Pipelines ---")
    try:
        training_pipelines_page = pipeline_client.list_training_pipelines(parent=parent_path)

        for pipeline in training_pipelines_page:
            state_name = state_map.get(pipeline.state, 'UNKNOWN_STATE')

            if state_name in deletable_states:
                print(f"Deleting Training Pipeline: {pipeline.display_name} (State: {state_name})")
                if not perform_deletion(pipeline, "Training Pipeline", is_low_level=True): return
            else:
                print(f"Skipping active Training Pipeline: {pipeline.display_name} (State: {state_name})")

    except Exception as e:
        print(f"Error processing Training Pipelines: {e}")

    # 3. Process Hyperparameter Tuning Jobs (High-level SDK)
    print("\n--- Processing Hyperparameter Tuning Jobs ---")
    try:
        tuning_jobs = aiplatform.HyperparameterTuningJob.list(location=location)
        for job in tuning_jobs:
            if job.state.name in deletable_states:
                print(f"Deleting Hyperparameter Tuning Job: {job.display_name} (State: {job.state.name})")
                if not perform_deletion(job, "Hyperparameter Tuning Job"): return
            else:
                print(f"Skipping active Hyperparameter Tuning Job: {job.display_name} (State: {job.state.name})")
    except Exception as e:
        print(f"Error processing Hyperparameter Tuning Jobs: {e}")

    # --- NEW CLEANUP SECTIONS ---

    # 4. Process Endpoints (Deployment cleanup)
    print("\n--- Processing Endpoints ---")
    try:
        endpoints = aiplatform.Endpoint.list(location=location)
        for endpoint in endpoints:
            # Check the prefix for deletion
            if should_delete_by_prefix(endpoint.display_name):
                print(f"Deleting Endpoint: {endpoint.display_name} (ID: {endpoint.name.split('/')[-1]})")

                # Check if deployed models exist (must undeploy first)
                if endpoint.deployed_models:
                    print(f"  ‚ö†Ô∏è Undeploying models from Endpoint: {endpoint.display_name}...")

                    # Undeploy all models from the endpoint before deleting the endpoint itself
                    for deployed_model in endpoint.deployed_models:
                        try:
                            # This call is blocking and can take time
                            endpoint.undeploy(deployed_model_id=deployed_model.id)
                            print(f"    - Undeployed Model ID: {deployed_model.id}")
                            time.sleep(1)
                        except Exception as e:
                            print(f"    ‚ùå ERROR undeploying model {deployed_model.id}: {e}")

                # Attempt to delete the now-empty endpoint
                if not perform_deletion(endpoint, "Endpoint"): return
            else:
                print(f"Skipping Endpoint (Prefix Filter): {endpoint.display_name}")
    except Exception as e:
        print(f"Error processing Endpoints: {e}")

    # 5. Process Models (Storage cleanup)
    print("\n--- Processing Models ---")
    try:
        models = aiplatform.Model.list(location=location)
        for model in models:
            if should_delete_by_prefix(model.display_name):
                # The Model resource delete is straightforward
                print(f"Deleting Model: {model.display_name} (ID: {model.name.split('/')[-1]})")
                if not perform_deletion(model, "Model"): return
            else:
                print(f"Skipping Model (Prefix Filter): {model.display_name}")
    except Exception as e:
        print(f"Error processing Models: {e}")


    print("-" * 70)
    print(f"\n--- Resource Cleanup Finished ---")
    print(f"Total completed/filtered resources deleted this run: {deleted_count}")
    print("If Quota Exceeded errors occurred, please wait 1-2 minutes and run the script again.")

# Execute the cleanup function
if __name__ == "__main__":
    delete_completed_jobs_and_pipelines(
        project_id=PROJECT_ID,
        location=REGION,
        deletable_states=DELETABLE_STATE_NAMES,
        state_map=TRAINING_PIPELINE_STATE_MAP,
        name_prefixes=DELETABLE_NAME_PREFIXES
    )

----------------------------------------------------------------------
--- Starting Cleanup in gen-lang-client-0870511801/us-central1 ---

--- Processing Custom Jobs ---

--- Processing Training Pipelines ---

--- Processing Hyperparameter Tuning Jobs ---

--- Processing Endpoints ---

--- Processing Models ---
----------------------------------------------------------------------

--- Resource Cleanup Finished ---
Total completed/filtered resources deleted this run: 0
If Quota Exceeded errors occurred, please wait 1-2 minutes and run the script again.
