<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/Vertex_AI_SDK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install google-cloud-aiplatform -q
!pip install google-cloud-storage -q
!pip install google-cloud-bigquery -q
!pip install google-cloud-bigquery-storage -q
!pip install google-cloud-aiplatform -q
!pip install datasets -q
!pip install colab-env -q

# Install necessary libraries
!pip install  -q gcsfs==2024.3.1
!pip install  -q accelerate==0.31.0
!pip install  -q transformers==4.45.2
!pip install  -q  datasets==2.19.2
!pip install google-cloud-aiplatform[all] -q
!pip install vertexai  -q
!pip install tensorflow_datasets -q

## DATA PREPARATION

In [None]:
import os
import pandas as pd
import json
import zipfile
from google.cloud import storage
from google.colab import auth


# Project details (replace with your values if not using env vars)
PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")
REGION = os.environ.get("GOOGLE_CLOUD_REGION")
BUCKET_NAME = os.environ.get("GOOGLE_CLOUD_BUCKET_NAME")
# NOTE: if GOOGLE_CLOUD_BUCKET_NAME is unset, this evaluates to "gs://None/staging".
STAGING_BUCKET = f"gs://{BUCKET_NAME}/staging"

# Authentication and Initialization
# This cell's import list only brings in `storage`; without importing the
# Vertex AI SDK here, the `aiplatform.init` call below raises NameError.
from google.cloud import aiplatform

auth.authenticate_user()
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)


# --- Data Loading from Google Drive ---
# The archive is read from a fixed Drive path and unpacked into a local folder.
zip_path = '/content/gdrive/MyDrive/datasets/CMAPSSData.zip'
extract_dir = 'data/cmapss'
os.makedirs(extract_dir, exist_ok=True)

# Fail early, with a readable hint, when the archive is not where we expect it.
archive_missing = not os.path.exists(zip_path)
if archive_missing:
    print(f"Error: CMAPSSData.zip not found at {zip_path}. Please ensure the file is correctly located in your Google Drive.")
    raise FileNotFoundError(f"CMAPSSData.zip not found at {zip_path}")

try:
    with zipfile.ZipFile(zip_path, 'r') as archive:
        # testzip() returns the name of the first corrupt member, or None if all are fine
        first_bad_member = archive.testzip()
        if first_bad_member is not None:
            print("Error: ZIP file integrity check failed. The file may not be a valid ZIP file.")
            # Raised inside the try so the handler below also prints its guidance
            raise zipfile.BadZipFile("ZIP file integrity check failed.")

        archive.extractall(extract_dir)
        print(f"Extracted dataset files to: {extract_dir}")

except zipfile.BadZipFile as e:
    print(f"Error extracting ZIP file: {e}")
    print(
        "The uploaded file may not be a valid or complete ZIP file. "
        "Please ensure you have uploaded the correct file, that it is not corrupted, "
        "and that it is a standard ZIP archive."
    )
    raise  # Stop execution if extraction fails

# --- Prepare NASA CMAPSS Data and Save to JSONL in GCS ---
extract_dir = 'data/cmapss'
os.makedirs(extract_dir, exist_ok=True)

# Process all four subsets
data_subsets = ['FD001', 'FD002', 'FD003', 'FD004']

# Column layout shared by every train/test file: unit number, cycle counter,
# three operating settings and 21 sensor channels. Built once, outside the loop.
SENSOR_COLUMNS = ['sensor' + str(i).zfill(2) for i in range(1, 22)]
OP_SETTING_COLUMNS = ['op_setting_' + str(i) for i in range(1, 4)]
DATA_COLUMNS = ['unit_nr', 'time_cycles'] + OP_SETTING_COLUMNS + SENSOR_COLUMNS

for data_subset in data_subsets:
    train_file = os.path.join(extract_dir, f'train_{data_subset}.txt')
    test_file = os.path.join(extract_dir, f'test_{data_subset}.txt')
    rul_file = os.path.join(extract_dir, f'RUL_{data_subset}.txt')

    # Load training, test and RUL data for this subset
    try:
        # sep=r'\s+' replaces the deprecated delim_whitespace=True; `names=`
        # already assigns the column labels, so no later reassignment is needed.
        train_df = pd.read_csv(train_file, names=DATA_COLUMNS, sep=r'\s+', header=None)
        test_df = pd.read_csv(test_file, names=DATA_COLUMNS, sep=r'\s+', header=None)
        rul_df = pd.read_csv(rul_file, names=['RUL'], sep=r'\s+', header=None)

        print(f"\nProcessing data subset: {data_subset}")
        print("Shape of train_df after loading:", train_df.shape)
        print("train_df head after loading:\n", train_df.head())
        print("Shape of test_df:", test_df.shape)
        print("test_df head after loading:\n", test_df.head())
        print("Shape of RUL data:", rul_df.shape)

    except FileNotFoundError as e:
        print(f"Error loading data files for subset {data_subset}: {e}")
        raise  # Stop execution if a file is missing

    def create_jsonl(df, rul_df, output_path, sequence_length=30, is_test=False):
        """Write sliding-window sequences with RUL labels to a JSONL file.

        Every engine (rows sharing a ``unit_nr``) yields one record per window
        of ``sequence_length`` consecutive rows. Each record holds the window
        (all columns except ``unit_nr``), its length, and the remaining useful
        life (RUL) at the window's last row. Engines with fewer rows than
        ``sequence_length`` yield no records.

        Args:
            df: Frame with a ``unit_nr`` column; rows of each unit are assumed
                to already be in cycle order (TODO confirm for other inputs).
            rul_df: Frame whose first column is the RUL at the last recorded
                row of each engine, ordered by ascending ``unit_nr``. Only
                read when ``is_test`` is true.
            output_path: Destination file; overwritten if it exists.
            sequence_length: Number of rows per window (must be >= 1).
            is_test: If false, the last row of each engine is treated as the
                failure point, so the label is the number of rows after the
                window. If true, the engine's value from ``rul_df`` is added
                to that count.

        Raises:
            ValueError: If ``sequence_length`` is smaller than 1.
        """
        if sequence_length < 1:
            raise ValueError("sequence_length must be at least 1")

        final_ruls = rul_df.values.tolist()

        # Open the file once: reopening it in 'w' mode for every engine
        # truncated it, leaving only the last engine's records.
        with open(output_path, 'w') as f:
            # groupby iterates units in ascending unit_nr, so the position in
            # this loop lines up with the row order of rul_df.
            for engine_idx, (_, unit_data) in enumerate(df.groupby('unit_nr')):
                num_cycles = len(unit_data)
                data_values = unit_data.drop(['unit_nr'], axis=1).values.tolist()

                if is_test and engine_idx < len(final_ruls):
                    rul_at_last_row = final_ruls[engine_idx][0]
                else:
                    # Training runs end at failure; also the fallback when the
                    # RUL data has fewer rows than there are engines.
                    rul_at_last_row = 0

                for start in range(max(0, num_cycles - sequence_length + 1)):
                    end = start + sequence_length
                    sequence = data_values[start:end]
                    json_record = {
                        "sequence": sequence,
                        "sequence_length": len(sequence),
                        "rul": rul_at_last_row + (num_cycles - end),
                    }
                    f.write(json.dumps(json_record) + '\n')

    local_train_jsonl_path = f"cmapss_{data_subset}_train_sequences.jsonl"
    local_test_jsonl_path = f"cmapss_{data_subset}_test_sequences.jsonl"

    # Create JSONL for training
    create_jsonl(train_df, rul_df, local_train_jsonl_path, is_test=False)
    print(f"Created {local_train_jsonl_path}")

    # Create JSONL for testing
    create_jsonl(test_df, rul_df, local_test_jsonl_path, is_test=True)
    print(f"Created {local_test_jsonl_path}")

    # --- Upload JSONL files to GCS ---
    client = storage.Client(project=PROJECT_ID)
    bucket = client.bucket(BUCKET_NAME)

    blob_train = bucket.blob(f"cmapss_{data_subset}_train_sequences.jsonl")  # Adapt to your naming scheme
    blob_test = bucket.blob(f"cmapss_{data_subset}_test_sequences.jsonl")   # Adapt to your naming scheme

    blob_train.upload_from_filename(local_train_jsonl_path)
    print(f"Uploaded training data to: gs://{BUCKET_NAME}/cmapss_{data_subset}_train_sequences.jsonl")

    blob_test.upload_from_filename(local_test_jsonl_path)
    print(f"Uploaded evaluation data to: gs://{BUCKET_NAME}/cmapss_{data_subset}_test_sequences.jsonl")

print("JSONL files created and uploaded.")

## FINE TUNING - NASA DATASET

In [None]:
import colab_env
import os
from google.cloud import aiplatform, storage
import logging
from google.colab import auth
import pandas as pd
import json
import zipfile
import requests
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from google.cloud import storage
from torch.utils.data import Dataset, DataLoader

# Project settings come from environment variables; each is None when unset
# (replace with literal values if you are not using env vars).
PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
REGION = os.getenv("GOOGLE_CLOUD_REGION")
SERVICEACCOUNT = os.getenv("GOOGLE_CLOUD_SERVICEACCOUNT")
PROJECT_NUMBER = os.getenv("GOOGLE_CLOUD_PROJECT_NUMBER")
BUCKET_NAME = os.getenv("GOOGLE_CLOUD_BUCKET_NAME")
STAGING_BUCKET = f"gs://{BUCKET_NAME}/staging"

# Authenticate the Colab user, then point the Vertex AI SDK at the project
auth.authenticate_user()
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=STAGING_BUCKET,
)

# Only the FD004 subset is used for this training run
TRAINING_DATA_PATH = f"gs://{BUCKET_NAME}/cmapss_FD004_train_sequences.jsonl"
EVAL_DATA_PATH = f"gs://{BUCKET_NAME}/cmapss_FD004_test_sequences.jsonl"

# --- Define trainer/train.py content ---
train_py_content = """
import argparse
import os
import json
import logging
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils import data
import time
import subprocess  # Import for gsutil cp
from google.cloud import storage

logging.basicConfig(level=logging.INFO)

# Function to create the model directory in GCS
def create_model_dir(model_dir):
    '''Create empty placeholder objects for every level of a GCS path.

    Args:
        model_dir: A path of the form gs://BUCKET/level1/level2; the bucket
            name is taken from the third slash-separated component.

    For each level an empty object whose name ends in a slash is uploaded.
    Empty path components (from a trailing or doubled slash) are skipped so
    that no object with a double slash in its name is created.
    '''
    storage_client = storage.Client()
    parts = model_dir.split('/')
    bucket_name = parts[2]  # Extract bucket name
    bucket = storage_client.bucket(bucket_name)

    # Object names always use forward slashes, so build them with plain
    # string joins instead of the platform-dependent os.path.join.
    subdirs = [segment for segment in parts[3:] if segment]
    current_prefix = ''
    for subdir in subdirs:
        current_prefix = current_prefix + '/' + subdir if current_prefix else subdir
        blob = bucket.blob(current_prefix + '/')  # Trailing slash marks a directory placeholder
        blob.upload_from_string('')  # Upload empty string to create the placeholder
        logging.info("Created model subdirectory: %s", current_prefix)


class CMAPSSJSONLDataset(data.Dataset):
    '''In-memory dataset of (sequence, rul) tensor pairs loaded from a JSONL object in GCS.

    Each valid line of the object must be a JSON record with a "sequence"
    entry and a "rul" entry; both are converted to float32 tensors. Lines
    that are not valid JSON are skipped with a warning.
    '''

    def __init__(self, data_path, sequence_length=30):
        '''Download the object at data_path (gs://BUCKET/OBJECT) and parse it.

        Args:
            data_path: GCS path; bucket and object name are split on slashes.
            sequence_length: Stored on the instance; not used while loading.
        '''
        # Local import so this class does not depend on the script's import header.
        import tempfile

        self.data = []
        self.sequence_length = sequence_length
        storage_client = storage.Client()
        bucket_name = data_path.split('/')[2]
        blob_name = '/'.join(data_path.split('/')[3:])
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(blob_name)

        # A unique temp file keeps two dataset instances from clobbering each
        # other's download; the finally clause removes it even if loading fails.
        fd, tmp_file = tempfile.mkstemp(suffix=".jsonl")
        os.close(fd)
        try:
            blob.download_to_filename(tmp_file)
            with open(tmp_file, 'r') as f:
                for line in f:
                    try:
                        record = json.loads(line)
                    except json.JSONDecodeError as e:
                        # %r already applies repr, so pass the raw line
                        logging.warning("Skipping invalid JSON line: %r, Error: %s", line, e)
                        continue
                    sequence = torch.tensor(record["sequence"], dtype=torch.float32)
                    rul = torch.tensor([record["rul"]], dtype=torch.float32)
                    self.data.append((sequence, rul))
        finally:
            os.remove(tmp_file)

    def __len__(self):
        '''Return the number of loaded (sequence, rul) pairs.'''
        return len(self.data)

    def __getitem__(self, idx):
        '''Return the (sequence, rul) tensor pair at position idx.'''
        return self.data[idx]

class RULPredictionModel(nn.Module):
    '''LSTM followed by a linear layer that maps a sequence to one scalar.'''

    def __init__(self, input_size, hidden_size, num_layers=2):
        '''Build the network.

        Args:
            input_size: Number of features per time step.
            hidden_size: Width of the LSTM hidden state.
            num_layers: Number of stacked LSTM layers.
        '''
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        '''Return one prediction per sequence in the batch.

        x is indexed as (batch, time, feature) because the LSTM is built with
        batch_first=True; the result has one column per batch row.
        '''
        # Debug-level log instead of an unconditional print, which wrote one
        # line to stdout for every batch of every epoch.
        logging.debug("Input x shape: %s", x.shape)
        out, _ = self.lstm(x)
        # Only the output at the last time step feeds the regression head
        out = self.fc(out[:, -1, :])
        return out

def train_model(model_name, train_dataset_path, eval_dataset_path, staging_bucket, bucket_name, base_output_dir):
    '''Train the LSTM RUL regressor on CPU and save its weights.

    Args:
        model_name: Logged only.
        train_dataset_path: gs:// path of the training JSONL object.
        eval_dataset_path: gs:// path of the evaluation JSONL object.
        staging_bucket: Logged only.
        bucket_name: Logged only.
        base_output_dir: Logged only.

    The state dict is written to local_model.pth. When the AIP_MODEL_DIR
    environment variable is set, the file is also copied there with gsutil
    under the name model-nasa.pth.

    Raises:
        ValueError: If the training dataset contains no sequences.
        subprocess.CalledProcessError: If the gsutil copy fails.
    '''
    logging.info("Model name: %s", model_name)
    logging.info("Train Dataset Path: %s", train_dataset_path)
    logging.info("Eval Dataset Path: %s", eval_dataset_path)
    logging.info("Staging Bucket: %s", staging_bucket)
    logging.info("Bucket Name: %s", bucket_name)
    logging.info("Base Output Dir: %s", base_output_dir)

    train_dataset = CMAPSSJSONLDataset(train_dataset_path)
    eval_dataset = CMAPSSJSONLDataset(eval_dataset_path)

    # The feature width is read from the first sample, so an empty training
    # set must be rejected with a clear message rather than an IndexError.
    if len(train_dataset) == 0:
        raise ValueError(f"No training sequences loaded from {train_dataset_path}")

    train_loader = data.DataLoader(train_dataset, batch_size=64, shuffle=True)
    eval_loader = data.DataLoader(eval_dataset, batch_size=64)

    input_size = train_dataset[0][0].shape[-1]
    hidden_size = 64

    model = RULPredictionModel(input_size, hidden_size)

    device = torch.device("cpu")
    model.to(device)

    print("Using device:", device)

    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.MSELoss()

    num_epochs = 10

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch_idx, (sequences, ruls) in enumerate(train_loader):
            sequences = sequences.to(device)
            ruls = ruls.to(device)

            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs, ruls)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            if batch_idx % 50 == 0:
                logging.info("Epoch: %s, Batch: %s, Loss: %s", epoch + 1, batch_idx, loss.item())
        avg_loss = total_loss / len(train_loader)
        logging.info("Epoch: %s, Average Training Loss: %s", epoch + 1, avg_loss)

        model.eval()
        eval_loss = 0
        with torch.no_grad():
            for sequences_eval, ruls_eval in eval_loader:
                sequences_eval = sequences_eval.to(device)
                ruls_eval = ruls_eval.to(device)

                outputs_eval = model(sequences_eval)
                loss_eval = criterion(outputs_eval, ruls_eval)
                eval_loss += loss_eval.item()
        if len(eval_loader) > 0:
            avg_eval_loss = eval_loss / len(eval_loader)
            logging.info("Epoch: %s, Average Evaluation Loss: %s", epoch + 1, avg_eval_loss)
        else:
            # An empty evaluation set would otherwise divide by zero
            logging.warning("Epoch: %s, evaluation dataset is empty; skipping evaluation loss", epoch + 1)

    logging.info("Starting model saving...")

    # Local path to temporarily save the model
    local_model_path = 'local_model.pth'
    torch.save(model.state_dict(), local_model_path)

    # Tracks where the weights finally live, for the closing log line
    model_save_path = local_model_path

    if 'AIP_MODEL_DIR' in os.environ:
        gcs_model_path = os.path.join(os.environ['AIP_MODEL_DIR'], 'model-nasa.pth')
        # check_call raises when gsutil exits non-zero, so a failed copy is
        # not reported as a success
        subprocess.check_call(['gsutil', 'cp', local_model_path, gcs_model_path])
        logging.info("Copied model to Vertex AI path: %s", gcs_model_path)
        model_save_path = gcs_model_path
    else:
        logging.info("Saving model to local path: %s", local_model_path)

    logging.info("Model saved to: %s", model_save_path)

    print("Training completed.")


if __name__ == "__main__":
    # Command-line entry point: one optional model name plus five required options
    cli = argparse.ArgumentParser()
    cli.add_argument("--model", type=str, default="rul_predictor_jsonl", help="Model name")

    required_options = (
        ("--train_dataset", "Path to training dataset JSONL"),
        ("--eval_dataset", "Path to evaluation dataset JSONL"),
        ("--staging_bucket", "Staging bucket for Vertex AI"),
        ("--bucket_name", "Bucket name"),
        ("--base_output_dir", "Base output directory in GCS"),
    )
    for flag, description in required_options:
        cli.add_argument(flag, type=str, required=True, help=description)

    opts = cli.parse_args()

    train_model(
        opts.model,
        opts.train_dataset,
        opts.eval_dataset,
        opts.staging_bucket,
        opts.bucket_name,
        opts.base_output_dir,
    )
"""

# Write the training script to trainer/train.py, replacing any earlier copy
os.makedirs('trainer', exist_ok=True)
with open('trainer/train.py', 'w') as script_file:
    script_file.write(train_py_content)

# --- Define and run custom training job ---
BASE_MODEL_NAME = "rul_predictor_cmapss_jsonl"
BASE_OUTPUT_DIR = f"gs://{BUCKET_NAME}/model_output"  # Passed to the script as --base_output_dir

# The job packages trainer/train.py and runs it inside the training container
job = aiplatform.CustomTrainingJob(
    display_name="NASA-cmapss-rul-prediction-jsonl",
    script_path="trainer/train.py",
    container_uri='us-docker.pkg.dev/vertex-ai/training/pytorch-xla.2-4.py310:latest',
    requirements=["google-cloud-aiplatform", "torch", "google-cloud-storage"],
    model_serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/pytorch-cpu.1-11:latest",
    # service_account is intentionally left unset here; previously tried value:
    # f"{PROJECT_NUMBER}-compute@developer.gserviceaccount.com"
)

# Flags parsed by the argparse section of trainer/train.py
training_args = [
    "--model", BASE_MODEL_NAME,
    "--train_dataset", TRAINING_DATA_PATH,
    "--eval_dataset", EVAL_DATA_PATH,
    "--staging_bucket", STAGING_BUCKET,
    "--bucket_name", BUCKET_NAME,
    "--base_output_dir", BASE_OUTPUT_DIR,
]

model = job.run(
    args=training_args,
    replica_count=1,
    machine_type="n1-standard-8",
    model_display_name="cmapss-rul-jsonl-model",
)

# Report the resource name of the model returned by the training job.
# NOTE: this INFO record is only shown if logging is configured at INFO level
# in the current session.
logging.info("Fine-tuned model: %s", model.resource_name)

# --- Potential Next Steps (Outline) ---
# A single backslash is correct here: this line is ordinary cell code, not part
# of the train.py string, so "\\n" printed a literal backslash-n instead of a
# blank line.
print("\nPotential Next Steps:")
print("- Monitor the training job in the Google Cloud Console.")
print("- Evaluate the model performance using more comprehensive metrics.")
print("- Deploy the trained model to a Vertex AI Endpoint for predictions.")
print("- Experiment with different model architectures and hyperparameters.")

In [None]:
# Add a roles/storage.objectAdmin binding on the bucket for the
# {PROJECT_NUMBER}@cloudbuild.gserviceaccount.com service account.
# {PROJECT_NUMBER} and {BUCKET_NAME} are substituted from the notebook's
# Python variables of the same names.
!gsutil iam ch serviceAccount:{PROJECT_NUMBER}@cloudbuild.gserviceaccount.com:roles/storage.objectAdmin gs://{BUCKET_NAME}

In [None]:
# Print the bucket's current IAM policy so the bindings can be inspected.
!gsutil iam get gs://{BUCKET_NAME}

In [None]:
# Add a roles/storage.objectViewer binding for the same service account.
# NOTE(review): presumably redundant once objectAdmin has been granted — confirm.
!gsutil iam ch serviceAccount:{PROJECT_NUMBER}@cloudbuild.gserviceaccount.com:roles/storage.objectViewer gs://{BUCKET_NAME}

In [None]:
# Add a roles/storage.objectCreator binding for the same service account.
# NOTE(review): presumably redundant once objectAdmin has been granted — confirm.
!gsutil iam ch serviceAccount:{PROJECT_NUMBER}@cloudbuild.gserviceaccount.com:roles/storage.objectCreator gs://{BUCKET_NAME}