<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/Vertex_AI_SDK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install google-cloud-aiplatform -q
!pip install google-cloud-storage -q
!pip install google-cloud-bigquery -q
!pip install google-cloud-bigquery-storage -q
!pip install google-cloud-aiplatform -q
!pip install datasets -q
!pip install colab-env -q

# Install necessary libraries
!pip install  -q gcsfs==2024.3.1
!pip install  -q accelerate==0.31.0
!pip install  -q transformers==4.45.2
!pip install  -q  datasets==2.19.2
!pip install google-cloud-aiplatform[all] -q
!pip install vertexai  -q
!pip install tensorflow_datasets -q

## DATA PREPARATION

In [None]:
import os
import pandas as pd
import json
import zipfile
from google.cloud import storage
from google.colab import auth


# Project details (replace with your values if not using env vars)
PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")
REGION = os.environ.get("GOOGLE_CLOUD_REGION")
SERVICEACCOUNT = os.environ.get("GOOGLE_CLOUD_SERVICEACCOUNT")
PROJECT_NUMBER = os.environ.get("GOOGLE_CLOUD_PROJECT_NUMBER")
BUCKET_NAME = os.environ.get("GOOGLE_CLOUD_BUCKET_NAME")
STAGING_BUCKET = f"gs://{BUCKET_NAME}/staging"

# Authentication and Initialization
auth.authenticate_user()
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

# --- Data Loading from Google Drive ---
zip_path = '/content/gdrive/MyDrive/datasets/CMAPSSData.zip'
extract_dir = 'data/cmapss'
os.makedirs(extract_dir, exist_ok=True)

if not os.path.exists(zip_path):
    print(f"Error: CMAPSSData.zip not found at {zip_path}. Please ensure the file is correctly located in your Google Drive.")
    raise FileNotFoundError(f"CMAPSSData.zip not found at {zip_path}")

try:
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        if zip_ref.testzip() is None:  # Check for ZIP file integrity
            zip_ref.extractall(extract_dir)
            print(f"Extracted dataset files to: {extract_dir}")
        else:
            print("Error: ZIP file integrity check failed. The file may not be a valid ZIP file.")
            raise zipfile.BadZipFile("ZIP file integrity check failed.")

except zipfile.BadZipFile as e:
    print(f"Error extracting ZIP file: {e}")
    print(
        "The uploaded file may not be a valid or complete ZIP file. "
        "Please ensure you have uploaded the correct file, that it is not corrupted, "
        "and that it is a standard ZIP archive."
    )
    raise  # Stop execution if extraction fails

# --- Prepare NASA CMAPSS Data and Save to JSONL in GCS ---
extract_dir = 'data/cmapss'
os.makedirs(extract_dir, exist_ok=True)

# Process all four subsets
data_subsets = ['FD001', 'FD002', 'FD003', 'FD004']

for data_subset in data_subsets:
    train_file = os.path.join(extract_dir, f'train_{data_subset}.txt')
    test_file = os.path.join(extract_dir, f'test_{data_subset}.txt')
    rul_file = os.path.join(extract_dir, f'RUL_{data_subset}.txt')

    SENSOR_COLUMNS = ['sensor' + str(i).zfill(2) for i in range(1, 22)]
    OP_SETTING_COLUMNS = ['op_setting_' + str(i) for i in range(1, 4)]
    DATA_COLUMNS = ['unit_nr', 'time_cycles'] + OP_SETTING_COLUMNS + SENSOR_COLUMNS

    # Load training data
    try:
        train_df = pd.read_csv(train_file, names=DATA_COLUMNS, delim_whitespace=True, header=None)
        test_df = pd.read_csv(test_file, names=DATA_COLUMNS, delim_whitespace=True, header=None)
        rul_df = pd.read_csv(rul_file, names=['RUL'], delim_whitespace=True, header=None)

        train_df.columns = DATA_COLUMNS
        test_df.columns = DATA_COLUMNS

        print(f"\nProcessing data subset: {data_subset}")
        print("Shape of train_df after loading:", train_df.shape)
        print("train_df head after loading:\n", train_df.head())
        print("Shape of test_df:", test_df.shape)
        print("test_df head after loading:\n", test_df.head())
        print("Shape of RUL data:", rul_df.shape)

    except FileNotFoundError as e:
        print(f"Error loading data files for subset {data_subset}: {e}")
        raise  # Stop execution if a file is missing

    def create_jsonl(df, rul_df, output_path, sequence_length=30, is_test=False):
        grouped_data = df.groupby('unit_nr')
        rul_values = rul_df.values.tolist()  # Convert RUL DataFrame to list
        engine_count = 0  # To track which RUL value to use

        with open(output_path, 'w') as f:
            for unit_nr, unit_data in grouped_data:
                num_cycles = len(unit_data)
                data_values = unit_data.drop(['unit_nr'], axis=1).values.tolist()
                json_data = []  # Initialize an empty list to hold JSON objects

                for i in range(max(0, num_cycles - sequence_length + 1)):
                    sequence = data_values[i:i + sequence_length]
                    rul = num_cycles - (i + sequence_length)

                    # Ensure RUL is not out of bounds
                    if engine_count < len(rul_values):
                        current_rul = rul_values[engine_count][0]  # Get the RUL value
                    else:
                        current_rul = 0  # Or some default value if RUL data is exhausted

                    if len(sequence) == sequence_length:
                        json_record = {"sequence": sequence, "sequence_length": len(sequence), "rul": current_rul}  # Include sequence length
                        json_data.append(json_record)

                # Write all JSON objects to the file at once
                with open(output_path, 'w') as f:
                    for json_record in json_data:
                        f.write(json.dumps(json_record) + '\n')

                engine_count += 1  # Increment engine counter

    local_train_jsonl_path = f"cmapss_{data_subset}_train_sequences.jsonl"
    local_test_jsonl_path = f"cmapss_{data_subset}_test_sequences.jsonl"

    # Create JSONL for training
    create_jsonl(train_df, rul_df, local_train_jsonl_path, is_test=False)
    print(f"Created {local_train_jsonl_path}")

    # Create JSONL for testing
    create_jsonl(test_df, rul_df, local_test_jsonl_path, is_test=True)
    print(f"Created {local_test_jsonl_path}")

    # --- Upload JSONL files to GCS ---
    client = storage.Client(project=PROJECT_ID)
    bucket = client.bucket(BUCKET_NAME)

    blob_train = bucket.blob(f"cmapss_{data_subset}_train_sequences.jsonl")  # Adapt to your naming scheme
    blob_test = bucket.blob(f"cmapss_{data_subset}_test_sequences.jsonl")   # Adapt to your naming scheme

    blob_train.upload_from_filename(local_train_jsonl_path)
    print(f"Uploaded training data to: gs://{BUCKET_NAME}/cmapss_{data_subset}_train_sequences.jsonl")

    blob_test.upload_from_filename(local_test_jsonl_path)
    print(f"Uploaded evaluation data to: gs://{BUCKET_NAME}/cmapss_{data_subset}_test_sequences.jsonl")

print("JSONL files created and uploaded.")

## FINE TUNING - NASA DATASET

In [None]:
import colab_env
import os
from google.cloud import aiplatform, storage
import logging
from google.colab import auth
import pandas as pd
import json
import zipfile
import requests
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from google.cloud import storage
from torch.utils.data import Dataset, DataLoader

# Project details (replace with your values if not using env vars)
PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")
REGION = os.environ.get("GOOGLE_CLOUD_REGION")
SERVICEACCOUNT = os.environ.get("GOOGLE_CLOUD_SERVICEACCOUNT")
PROJECT_NUMBER = os.environ.get("GOOGLE_CLOUD_PROJECT_NUMBER")
BUCKET_NAME = os.environ.get("GOOGLE_CLOUD_BUCKET_NAME")
STAGING_BUCKET = f"gs://{BUCKET_NAME}/staging"

# Authentication and Initialization
auth.authenticate_user()
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

TRAINING_DATA_PATH = f"gs://{BUCKET_NAME}/cmapss_FD002_train_sequences.jsonl"
EVAL_DATA_PATH = f"gs://{BUCKET_NAME}/cmapss_FD002_test_sequences.jsonl"


# --- Define trainer/train.py content ---
#train_py_content_with_early_stopping
train_py_content = """
import argparse
import os
import json
import logging
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import time
import subprocess
from google.cloud import storage
import pandas as pd
from typing import List
import numpy as np
import torch.utils.data as data

logging.basicConfig(level=logging.INFO)

# --- Helper Functions ---

def create_gcs_dir(model_dir):
    #Creates the model directory in Google Cloud Storage.
    try:
        storage_client = storage.Client()
        bucket_name = model_dir.split('/')[2]
        blob_prefix = '/'.join(model_dir.split('/')[3:])
        bucket = storage_client.bucket(bucket_name)

        subdirs = blob_prefix.split('/')
        current_prefix = ''
        for subdir in subdirs:
            current_prefix = os.path.join(current_prefix, subdir)
            blob = bucket.blob(current_prefix + '/')
            blob.upload_from_string('')
            logging.info("Created GCS directory: %s", current_prefix)
    except Exception as e:
        logging.error(f"Error creating GCS directory: {e}")
        raise

def load_jsonl_dataset(data_path, sequence_length=30):
    #Loads a CMAPSS dataset from a JSONL file (local or GCS).

    data = []
    try:
        if data_path.startswith('gs://'):
            storage_client = storage.Client()
            bucket_name = data_path.split('/')[2]
            blob_name = '/'.join(data_path.split('/')[3:])
            bucket = storage_client.bucket(bucket_name)
            blob = bucket.blob(blob_name)
            tmp_file = "/tmp/temp_data.jsonl"
            blob.download_to_filename(tmp_file)
            file_path = tmp_file
        else:
            file_path = data_path

        with open(file_path, 'r') as f:
            for line in f:
                try:
                    record = json.loads(line)
                    sequence = torch.tensor(record["sequence"], dtype=torch.float32)
                    rul = torch.tensor([record["rul"]], dtype=torch.float32)
                    data.append((sequence, rul))
                except json.JSONDecodeError as e:
                    logging.warning("Skipping invalid JSON line: %r, Error: %s", repr(line), e)

        if data_path.startswith('gs://'):
            os.remove(tmp_file)  # Clean up temporary file
        return data

    except Exception as e:
        logging.error(f"Error loading dataset: {e}")
        raise

# --- Feature Engineering Functions ---

def create_rolling_features(df, window_size=10):

    for sensor in [col for col in df.columns if col.startswith('sensor')]:
        df[f'{sensor}_mean'] = df.groupby('unit_nr')[sensor].transform(lambda x: x.rolling(window=window_size).mean())
        df[f'{sensor}_std'] = df.groupby('unit_nr')[sensor].transform(lambda x: x.rolling(window=window_size).std())
    return df

# --- Dataset Class ---

class CMAPSSJSONLDataset(data.Dataset):
    #CMAPSS dataset loader from JSONL.

    def __init__(self, data_path, sequence_length=30, use_rolling_features=False): #Accepts single path
        self.data = load_jsonl_dataset(data_path, sequence_length)
        self.sequence_length = sequence_length
        self.use_rolling_features = use_rolling_features

        if self.use_rolling_features:
            # Convert data to DataFrame for feature engineering
            df = pd.DataFrame([item[0].numpy() for item in self.data])
            # Add unit_nr as a placeholder (assuming it's needed for grouping)
            df['unit_nr'] = np.repeat(np.arange(len(df) / self.sequence_length), self.sequence_length)
            df = create_rolling_features(df)

            # Update self.data with the modified sequences (and RUL)
            new_data = []
            for i in range(len(self.data)):
                rul = self.data[i][1]
                seq = df.iloc[i].values
                new_data.append((torch.tensor(seq, dtype=torch.float32), rul))
            self.data = new_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# --- Model Definition ---

class RULPredictionModel(nn.Module):
    #LSTM-based model for RUL prediction

    def __init__(self, input_size, hidden_size, num_layers=2, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc1 = nn.Linear(hidden_size, hidden_size // 2)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_size // 2, 1)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc1(out[:, -1, :])
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out

# --- Training Function ---

def train_model(model_name, train_dataset_path, eval_dataset_path, #Accepts single path
                staging_bucket, bucket_name, base_output_dir,
                use_rolling_features=False):

    #Trains the RUL prediction model with FD004 dataset.

    logging.info("Training configuration:")
    logging.info(f"Model name: {model_name}")
    logging.info(f"Train Dataset Path: {train_dataset_path}")
    logging.info(f"Eval Dataset Path: {eval_dataset_path}")
    logging.info(f"Staging Bucket: {staging_bucket}")
    logging.info(f"Bucket Name: {bucket_name}")
    logging.info(f"Base Output Dir: {base_output_dir}")
    logging.info(f"Use Rolling Features: {use_rolling_features}")

    # 1. Data Loaders
    train_dataset = CMAPSSJSONLDataset(train_dataset_path, use_rolling_features=use_rolling_features)
    eval_dataset = CMAPSSJSONLDataset(eval_dataset_path, use_rolling_features=use_rolling_features)

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    eval_loader = DataLoader(eval_dataset, batch_size=64)

    # 2. Model Initialization
    input_size = train_dataset[0][0].shape[-1]
    hidden_size = 128
    model = RULPredictionModel(input_size, hidden_size, num_layers=3, dropout=0.3)

    # 3. Device Configuration (CPU)
    device = torch.device("cpu")  # Force CPU
    model.to(device)
    logging.info(f"Using device: {device}")

    # 4. Optimizer and Loss Function
    optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-4)
    criterion = nn.MSELoss()

    # 5. Training Loop
    num_epochs = 100000  # Adjusted epochs for testing
    best_eval_loss = float('inf')
    patience = 10
    epochs_no_improve = 0

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch_idx, (sequences, ruls) in enumerate(train_loader):
            sequences = sequences.to(device)
            ruls = ruls.to(device)

            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs, ruls)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            if batch_idx % 50 == 0:
                logging.info(f"Epoch: {epoch + 1}, Batch: {batch_idx}, Loss: {loss.item()}")

        avg_loss = total_loss / len(train_loader)
        logging.info(f"Epoch: {epoch + 1}, Average Training Loss: {avg_loss}")

        # 6. Evaluation
        model.eval()
        eval_loss = 0
        with torch.no_grad():
            for sequences_eval, ruls_eval in eval_loader:
                sequences_eval = sequences_eval.to(device)
                ruls_eval = ruls_eval.to(device)

                outputs_eval = model(sequences_eval)
                loss_eval = criterion(outputs_eval, ruls_eval)
                eval_loss += loss_eval.item()

        avg_eval_loss = eval_loss / len(eval_loader)
        logging.info(f"Epoch: {epoch + 1}, Average Evaluation Loss: {avg_eval_loss}")

        # 7. Early Stopping
        if avg_eval_loss < best_eval_loss:
            best_eval_loss = avg_eval_loss
            epochs_no_improve = 0
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            epochs_no_improve += 1
            if epochs_no_improve == patience:
                logging.info("Early stopping triggered")
                break

    # 8. Load the best model
    model.load_state_dict(torch.load('best_model.pth'))

    logging.info("Starting model saving...")

    # 9. Save Model (Local and GCS)
    local_model_path = 'model-nasa.pth'
    torch.save(model.state_dict(), local_model_path)

    try:
        base_output_path = base_output_dir
        subprocess.run(['gsutil', 'cp', local_model_path, base_output_path], check=True)
        logging.info(f"Copied model to GCS BUCKET path: {base_output_path}")

        if 'AIP_MODEL_DIR' in os.environ:
            gcs_model_path = os.path.join(os.environ['AIP_MODEL_DIR'], 'model-nasa.pth')
            subprocess.run(['gsutil', 'cp', local_model_path, gcs_model_path], check=True)
            logging.info(f"Copied model to Vertex AI path: {gcs_model_path}")
            model_save_path = gcs_model_path
        else:
            logging.info(f"Saving model to local path: {local_model_path}")
            model_save_path = local_model_path

        logging.info(f"Model saved to: {model_save_path}")

    except subprocess.CalledProcessError as e:
        logging.error(f"Error saving model: {e}")
        raise

    print("Training completed.")

# --- Main Execution ---

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default="rul_predictor_jsonl", help="Model name")
    parser.add_argument("--train_dataset", type=str, required=True, help="Path to training dataset JSONL") #Single path
    parser.add_argument("--eval_dataset", type=str, required=True, help="Path to evaluation dataset JSONL")  #Single path
    parser.add_argument("--staging_bucket", type=str, required=True, help="Staging bucket for Vertex AI")
    parser.add_argument("--bucket_name", type=str, required=True, help="Bucket name")
    parser.add_argument("--base_output_dir", type=str, required=True, help="Base output directory in GCS")
    parser.add_argument("--use_rolling_features", action='store_true', help="Use rolling window features")
    args = parser.parse_args()

    train_model(args.model, args.train_dataset, args.eval_dataset,
                args.staging_bucket, args.bucket_name, args.base_output_dir,
                args.use_rolling_features)
"""

# --- Define trainer/train.py content ---
train_py_content_original = """
import argparse
import os
import json
import logging
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils import data
import time
import subprocess  # Import for gsutil cp
from google.cloud import storage

logging.basicConfig(level=logging.INFO)

# Function to create the model directory in GCS
def create_model_dir(model_dir):
    storage_client = storage.Client()
    bucket_name = model_dir.split('/')[2]  # Extract bucket name
    blob_prefix = '/'.join(model_dir.split('/')[3:])  # Extract blob prefix
    bucket = storage_client.bucket(bucket_name)

    # Create each subdirectory individually
    subdirs = blob_prefix.split('/')
    current_prefix = ''
    for subdir in subdirs:
        current_prefix = os.path.join(current_prefix, subdir)
        blob = bucket.blob(current_prefix + '/')  # Add trailing slash to create directory
        blob.upload_from_string('')  # Upload empty string to create directory
        logging.info("Created model subdirectory: %s", current_prefix)


class CMAPSSJSONLDataset(data.Dataset):
    def __init__(self, data_path, sequence_length=30):
        self.data = []
        self.sequence_length = sequence_length
        storage_client = storage.Client()
        bucket_name = data_path.split('/')[2]
        blob_name = '/'.join(data_path.split('/')[3:])
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(blob_name)
        tmp_file = "/tmp/temp_data.jsonl"
        blob.download_to_filename(tmp_file)

        with open(tmp_file, 'r') as f:
            for line in f:
                try:
                    record = json.loads(line)
                    sequence = torch.tensor(record["sequence"], dtype=torch.float32)
                    rul = torch.tensor([record["rul"]], dtype=torch.float32)
                    self.data.append((sequence, rul))
                except json.JSONDecodeError as e:
                    logging.warning("Skipping invalid JSON line: %r, Error: %s", repr(line), e)
        os.remove(tmp_file)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

class RULPredictionModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        print("Input x shape:", x.shape)
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out

def train_model(model_name, train_dataset_path, eval_dataset_path, staging_bucket, bucket_name, base_output_dir):
    import logging
    import torch

    logging.info("Model name: %s", model_name)
    logging.info("Train Dataset Path: %s", train_dataset_path)
    logging.info("Eval Dataset Path: %s", eval_dataset_path)
    logging.info("Staging Bucket: %s", staging_bucket)
    logging.info("Bucket Name: %s", bucket_name)
    logging.info("Base Output Dir: %s", base_output_dir)

    train_dataset = CMAPSSJSONLDataset(train_dataset_path)
    eval_dataset = CMAPSSJSONLDataset(eval_dataset_path)

    train_loader = data.DataLoader(train_dataset, batch_size=64, shuffle=True)
    eval_loader = data.DataLoader(eval_dataset, batch_size=64)

    input_size = train_dataset[0][0].shape[-1]
    hidden_size = 64

    model = RULPredictionModel(input_size, hidden_size)

    device = torch.device("cpu")
    model.to(device)

    print("Using device:", device)

    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.MSELoss()

    num_epochs = 120000

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch_idx, (sequences, ruls) in enumerate(train_loader):
            print("Sequence shape:", sequences.shape)
            sequences = sequences.to(device)
            ruls = ruls.to(device)

            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs, ruls)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            if batch_idx % 50 == 0:
                log_msg = f"Epoch: {epoch + 1}, Batch: {batch_idx}, Loss: {loss.item()}"
                logging.info(log_msg)
        avg_loss = total_loss / len(train_loader)
        log_msg = f"Epoch: {epoch + 1}, Average Training Loss: {avg_loss}"
        logging.info(log_msg)

        model.eval()
        eval_loss = 0
        with torch.no_grad():
            for sequences_eval, ruls_eval in eval_loader:
                sequences_eval = sequences_eval.to(device)
                ruls_eval = ruls_eval.to(device)

                outputs_eval = model(sequences_eval)
                loss_eval = criterion(outputs_eval, ruls_eval)
                # Ensure loss_eval is a tensor
                eval_loss += loss_eval.item()  # Extract the value
        avg_eval_loss = eval_loss / len(eval_loader)
        log_msg = f"Epoch: {epoch + 1}, Average Evaluation Loss: {avg_eval_loss}"
        logging.info(log_msg)

    logging.info("Starting model saving...")

    # Local path to temporarily save the model
    local_model_path = 'model-nasa.pth'
    torch.save(model.state_dict(), local_model_path)

    # use gsutil to copy the model
    base_output_path = base_output_dir
    #base_output_path = os.path.join(base_output_dir, model_name)
    subprocess.call(['gsutil', 'cp', local_model_path, base_output_path])
    logging.info("Copied model to GCS BUCKET path: %s", base_output_path)

    if 'AIP_MODEL_DIR' in os.environ:
        gcs_model_path = os.path.join(os.environ['AIP_MODEL_DIR'], 'model-nasa.pth')
        # use gsutil to copy the model
        subprocess.call(['gsutil', 'cp', local_model_path, gcs_model_path])
        logging.info("Copied model to Vertex AI path: %s", gcs_model_path)
        # Assign the correct path to model_save_path
        model_save_path = gcs_model_path
    else:

        logging.info("Saving model to local path: %s", local_model_path)
        # Assign the correct path to model_save_path
        model_save_path = local_model_path

    logging.info("Model saved to: %s", model_save_path)  # Now model_save_path is defined

    print("Training completed.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default="rul_predictor_jsonl", help="Model name")
    parser.add_argument("--train_dataset", type=str, required=True, help="Path to training dataset JSONL")
    parser.add_argument("--eval_dataset", type=str, required=True, help="Path to evaluation dataset JSONL")
    parser.add_argument("--staging_bucket", type=str, required=True, help="Staging bucket for Vertex AI")
    parser.add_argument("--bucket_name", type=str, required=True, help="Bucket name")
    parser.add_argument("--base_output_dir", type=str, required=True, help="Base output directory in GCS")
    args = parser.parse_args()

    train_model(args.model, args.train_dataset, args.eval_dataset, args.staging_bucket, args.bucket_name, args.base_output_dir)
"""

# Create or overwrite trainer/train.py
os.makedirs('trainer', exist_ok=True)
with open('trainer/train.py', 'w') as f:
    f.write(train_py_content)

# --- Define and run custom training job ---
BASE_MODEL_NAME = "rul_predictor_cmapss_jsonl"
BASE_OUTPUT_DIR = f"gs://{BUCKET_NAME}/model_output"  # Define base output directory

job = aiplatform.CustomTrainingJob(
    display_name="NASA-cmapss-rul-prediction-jsonl",
    script_path="trainer/train.py",
    container_uri='us-docker.pkg.dev/vertex-ai/training/pytorch-xla.2-4.py310:latest',
    requirements=["google-cloud-aiplatform", "torch", "google-cloud-storage"],
    model_serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/pytorch-cpu.1-11:latest",
    #service_account=f"{PROJECT_NUMBER}-compute@developer.gserviceaccount.com" # Added to job definition
)

model = job.run(
    args=[
        "--model", BASE_MODEL_NAME,
        "--train_dataset", TRAINING_DATA_PATH,
        "--eval_dataset", EVAL_DATA_PATH,
        "--staging_bucket", STAGING_BUCKET,
        "--bucket_name", BUCKET_NAME,
        "--base_output_dir", BASE_OUTPUT_DIR,
    ],
    replica_count=1,
    machine_type="n1-standard-8",
    model_display_name="cmapss-rul-jsonl-model"
)

logging.info(f"Fine-tuned model: {model.resource_name}")

In [3]:
# --- Potential Next Steps (Outline) ---
print("Potential Next Steps:")
print("- Monitor the training job in the Google Cloud Console.")
print("- Evaluate the model performance using more comprehensive metrics.")
print("- Deploy the trained model to a Vertex AI Endpoint for predictions.")
print("- Experiment with different model architectures and hyperparameters.")

Potential Next Steps:
- Monitor the training job in the Google Cloud Console.
- Evaluate the model performance using more comprehensive metrics.
- Deploy the trained model to a Vertex AI Endpoint for predictions.
- Experiment with different model architectures and hyperparameters.


In [4]:
print(f'The Fine Tune {model.display_name} is complete')

The Fine Tune cmapss-rul-jsonl-model is complete


In [None]:
!gsutil iam ch serviceAccount:{PROJECT_NUMBER}@cloudbuild.gserviceaccount.com:roles/storage.objectAdmin gs://{BUCKET_NAME}

In [None]:
!gsutil iam ch serviceAccount:{PROJECT_NUMBER}@cloudbuild.gserviceaccount.com:roles/storage.objectAdmin gs://{BUCKET_NAME}

In [None]:
!gsutil iam get gs://{BUCKET_NAME}

In [None]:
# Grant Storage Object Creator role

!gsutil iam ch \
"serviceAccount:$SERVICEACCOUNT:objectCreator" \
"gs://$BUCKET_NAME"

# Grant Storage Object Viewer role

!gsutil iam ch \
"serviceAccount:$SERVICEACCOUNT:objectViewer" \
"gs://$BUCKET_NAME"

# (Optional) Grant Storage Object Admin role (If Necessary)
!gsutil iam ch \
"serviceAccount:$SERVICEACCOUNT:objectAdmin" \
"gs://$BUCKET_NAME"

In [None]:
!gsutil ls gs://{BUCKET_NAME}/model_output/

## MODEL EVALUATION

In [None]:
from google.cloud import storage
import torch

# Initialize the GCS client
client = storage.Client()

# Specify the bucket and blob (file)
bucket_name = os.environ.get("GOOGLE_CLOUD_BUCKET_NAME")
bucket = client.bucket(bucket_name)
blob_name = "model_output/model-nasa.pth"

blob_name = "model_output/model-nasa.pth"
bucket = client.bucket(bucket_name)
blob = bucket.blob(blob_name)

# Download the file to a local path
local_file_path = "model-nasa.pth"  # Or your desired local path
blob.download_to_filename(local_file_path)

print(f"Downloaded '{blob_name}' from '{bucket_name}' to '{local_file_path}'")

In [7]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Assuming this is how the CMAPSSJSONLDataset is defined in the document
import torch
from torch.utils import data
import json
from google.cloud import storage
import logging
import os

class CMAPSSJSONLDataset(data.Dataset):
    def __init__(self, data_path, sequence_length=30):
        self.data = []
        self.sequence_length = sequence_length

        # Check if data_path is a local file or a GCS URI
        if data_path.startswith('gs://'):
            # If GCS URI, download to a temporary file
            storage_client = storage.Client()
            bucket_name = data_path.split('/')[2]
            blob_name = '/'.join(data_path.split('/')[3:])
            bucket = storage_client.bucket(bucket_name)
            blob = bucket.blob(blob_name)
            tmp_file = "/tmp/temp_data.jsonl"
            blob.download_to_filename(tmp_file)
            data_file = tmp_file
        else:
            # If local file, use it directly
            data_file = data_path

        with open(data_file, 'r') as f:
            for line in f:
                try:
                    record = json.loads(line)
                    sequence = torch.tensor(record["sequence"], dtype=torch.float32)
                    rul = torch.tensor([record["rul"]], dtype=torch.float32)
                    self.data.append((sequence, rul))
                except json.JSONDecodeError as e:
                    logging.warning("Skipping invalid JSON line: %s, Error: %s", line, e)

        # Remove the temporary file if it was created
        if data_path.startswith('gs://'):
            os.remove(tmp_file)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

class RULPredictionModel(nn.Module):
    #LSTM-based model for RUL prediction
    def __init__(self, input_size, hidden_size, num_layers=3, dropout=0.3): # Match the architecture in train.py
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc1 = nn.Linear(hidden_size, hidden_size // 2)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_size // 2, 1)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc1(out[:, -1, :])
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out

import numpy as np

def calculate_cmapss_score(true_rul, predicted_rul):
    """Calculates a simplified CMAPSS score."""
    d = np.array(predicted_rul) - np.array(true_rul)  # Difference between predicted and true RUL
    score = sum([
        np.exp(-d[i] / 13) - 1 if d[i] < 0 else np.exp(d[i] / 10) - 1
        for i in range(len(d))
    ])
    return score


def evaluate_model(model_path, eval_dataset_path, input_size, hidden_size, sequence_length):
    """Evaluates the trained RUL prediction model."""

    # Load the saved model using the correct model architecture
    model = RULPredictionModel(input_size, hidden_size, num_layers=3)  # Change num_layers to 3 to match the training architecture
    model.load_state_dict(torch.load(model_path))
    model.eval()  # Set to evaluation mode


    # Load the evaluation dataset
    eval_dataset = CMAPSSJSONLDataset(eval_dataset_path, sequence_length)
    eval_loader = DataLoader(eval_dataset, batch_size=64, shuffle=False)  # No need to shuffle for evaluation

    # Make predictions and calculate metrics
    all_predictions = []
    all_targets = []
    with torch.no_grad():  # Disable gradient calculations during evaluation
        for sequences, ruls in eval_loader:
            predictions = model(sequences)
            all_predictions.extend(predictions.flatten().tolist())
            all_targets.extend(ruls.flatten().tolist())

    # Calculate evaluation metrics (e.g., MAE, RMSE)
    mae = mean_absolute_error(all_targets, all_predictions)
    rmse = np.sqrt(mean_squared_error(all_targets, all_predictions))
    mse = mean_squared_error(all_targets, all_predictions)  # Calculate MSE

    # Calculate CMAPSS score
    cmapss_score = calculate_cmapss_score(all_targets, all_predictions)

    # Print the results
    print(f"Evaluation Results:")
    print(f"Average Evaluation Loss (MSE): {mse:.2f}")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
    print(f"CMAPSS Score: {cmapss_score:.2f}")



# --- Example Usage (if __name__ == "__main__": block) ---
if __name__ == "__main__":
    model_path = "model-nasa.pth"  # Replace with the actual path to your saved model
    eval_dataset_path = f"gs://{BUCKET_NAME}/cmapss_FD004_test_sequences.jsonl"  # Replace with your evaluation data path (GCS URI)
    input_size = 25  # Updated to match the input size used during training
    hidden_size = 128 # Updated to match the hidden size used during training
    sequence_length = 30  # Replace with the sequence length used during training

    evaluate_model(model_path, eval_dataset_path, input_size, hidden_size, sequence_length)

Evaluation Results:
Average Evaluation Loss (MSE): 675.07
Mean Absolute Error (MAE): 25.98
Root Mean Squared Error (RMSE): 25.98
CMAPSS Score: 1607.48


N=10

Evaluation Results:
Average Evaluation Loss (MSE): 675.91
Mean Absolute Error (MAE): 26.00
Root Mean Squared Error (RMSE): 26.00
CMAPSS Score: 1609.80

N=200

Evaluation Results:
Average Evaluation Loss (MSE): 676.00
Mean Absolute Error (MAE): 26.00
Root Mean Squared Error (RMSE): 26.00
CMAPSS Score: 1610.04



## Fine Tune Commandline (FTC)

In [2]:
train_py_content="""

import argparse
import os
import json
import logging
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import time
import subprocess
from google.cloud import storage
import pandas as pd
from typing import List
import torch.utils.data as data

import numpy as np

logging.basicConfig(level=logging.INFO)

# --- Helper Functions ---

def create_gcs_dir(model_dir):
    try:
        storage_client = storage.Client()
        bucket_name = model_dir.split('/')[2]
        blob_prefix = '/'.join(model_dir.split('/')[3:])
        bucket = storage_client.bucket(bucket_name)

        subdirs = blob_prefix.split('/')
        current_prefix = ''
        for subdir in subdirs:
            current_prefix = os.path.join(current_prefix, subdir)
            blob = bucket.blob(current_prefix + '/')
            blob.upload_from_string('')
            logging.info("Created GCS directory: %s", current_prefix)
    except Exception as e:
        logging.error(f"Error creating GCS directory: {e}")
        raise

def load_jsonl_dataset(data_path, sequence_length=30):
    data = []
    try:
        if data_path.startswith('gs://'):
            storage_client = storage.Client()
            bucket_name = data_path.split('/')[2]
            blob_name = '/'.join(data_path.split('/')[3:])
            bucket = storage_client.bucket(bucket_name)
            blob = bucket.blob(blob_name)
            tmp_file = "/tmp/temp_data.jsonl"
            blob.download_to_filename(tmp_file)
            file_path = tmp_file
        else:
            file_path = data_path

        with open(file_path, 'r') as f:
            for line in f:
                try:
                    record = json.loads(line)
                    sequence = torch.tensor(record["sequence"], dtype=torch.float32)
                    rul = torch.tensor([record["rul"]], dtype=torch.float32)
                    data.append((sequence, rul))
                except json.JSONDecodeError as e:
                    logging.warning("Skipping invalid JSON line: %r, Error: %s", repr(line), e)

        if data_path.startswith('gs://'):
            os.remove(tmp_file)
        return data

    except Exception as e:
        logging.error(f"Error loading dataset: {e}")
        raise

# --- Feature Engineering Functions ---

def create_rolling_features(df, window_size=10):

    for col in df.columns:
        if isinstance(col, str) and col.startswith('sensor'):
            df[f'{col}_mean'] = df.groupby('unit_nr')[col].transform(lambda x: x.rolling(window=window_size).mean())
            df[f'{col}_std'] = df.groupby('unit_nr')[sensor].transform(lambda x: x.rolling(window=window_size).std())  # Corrected 'sensor' to 'col'
    return df

# --- Dataset Class ---
class CMAPSSJSONLDataset(Dataset):

    def __init__(self, data_path, sequence_length=30, use_rolling_features=False):
        self.data = load_jsonl_dataset(data_path, sequence_length)
        self.sequence_length = sequence_length
        self.use_rolling_features = use_rolling_features

        if self.use_rolling_features:
            # Prepare data for feature engineering
            sequences = [item[0].numpy() for item in self.data]

            # Reshape sequences for DataFrame
            num_sequences = len(sequences)
            num_cycles = sequences[0].shape[0]  # Assuming all sequences have the same number of cycles
            num_features = sequences[0].shape[1]

            reshaped_sequences = np.concatenate(sequences).reshape(num_sequences * num_cycles, num_features)
            df = pd.DataFrame(reshaped_sequences)

            # Create a unit_nr column to group data
            df['unit_nr'] = np.repeat(np.arange(num_sequences), num_cycles)

            # Apply rolling features
            df = create_rolling_features(df)

            # Create sequences from the processed DataFrame
            processed_sequences = []
            for i in range(num_sequences):
                start_idx = i * num_cycles
                end_idx = (i + 1) * num_cycles
                seq = df[start_idx:end_idx].values
                processed_sequences.append(seq)

            # Update self.data with processed sequences and RUL
            new_data = []
            for i in range(len(self.data)):
                rul = self.data[i][1]
                # Ensure the processed sequence is the correct length
                if processed_sequences[i].shape[0] >= self.sequence_length:
                    seq = processed_sequences[i][-self.sequence_length:]  # Take the last part of the sequence
                else:
                    # Handle cases where processed sequence is shorter than desired length
                    seq = np.zeros((self.sequence_length, processed_sequences[i].shape[1]))
                    seq[-processed_sequences[i].shape[0]:] = processed_sequences[i]
                new_data.append((torch.tensor(seq, dtype=torch.float32), rul))
            self.data = new_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# --- Model Definition ---

class RULPredictionModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=2, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc1 = nn.Linear(hidden_size, hidden_size // 2)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_size // 2, 1)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc1(out[:, -1, :])
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out

# --- Training Function ---

def train_model(model_name, train_dataset_path, eval_dataset_path,
                staging_bucket, bucket_name, base_output_dir,
                use_rolling_features=False,
                hidden_size=128,
                num_layers=3,
                dropout=0.3,
                learning_rate=0.0005,
                weight_decay=1e-4,
                num_epochs=50,
                batch_size=64,
                patience=10,
                project_id=None,
                region=None,
                ):



    # --- Project Configuration ---
    if not project_id:
        project_id = os.environ.get("GOOGLE_CLOUD_PROJECT")
        if not project_id:
            raise ValueError("project_id must be provided as a command-line argument or set in the GOOGLE_CLOUD_PROJECT environment variable.")
    if not region:
        region = os.environ.get("GOOGLE_CLOUD_REGION")
        if not region:
            raise ValueError("region must be provided as a command-line argument or set in the GOOGLE_CLOUD_REGION environment variable.")

    logging.info("Training configuration:")
    logging.info(f"Model name: {model_name}")
    logging.info(f"Train Dataset Path: {train_dataset_path}")
    logging.info(f"Eval Dataset Path: {eval_dataset_path}")
    logging.info(f"Staging Bucket: {staging_bucket}")
    logging.info(f"Bucket Name: {bucket_name}")
    logging.info(f"Base Output Dir: {base_output_dir}")
    logging.info(f"Use Rolling Features: {use_rolling_features}")
    logging.info(f"Hidden Size: {hidden_size}")
    logging.info(f"Number of Layers: {num_layers}")
    logging.info(f"Dropout: {dropout}")
    logging.info(f"Learning Rate: {learning_rate}")
    logging.info(f"Weight Decay: {weight_decay}")
    logging.info(f"Number of Epochs: {num_epochs}")
    logging.info(f"Batch Size: {batch_size}")
    logging.info(f"Patience: {patience}")
    logging.info(f"Project ID: {project_id}")
    logging.info(f"Region: {region}")

    # 1. Data Loaders

    train_dataset = CMAPSSJSONLDataset(train_dataset_path, use_rolling_features=use_rolling_features)
    eval_dataset = CMAPSSJSONLDataset(eval_dataset_path, use_rolling_features=use_rolling_features)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    eval_loader = DataLoader(eval_dataset, batch_size=batch_size)

    # Print input sizes
    print("Training Dataset Input Size:", train_dataset[0][0].shape[-1])
    print("Evaluation Dataset Input Size:", eval_dataset[0][0].shape[-1])

    # 2. Model Initialization
    input_size = train_dataset[0][0].shape[-1]
    model = RULPredictionModel(input_size, hidden_size, num_layers=num_layers, dropout=dropout)

    # 3. Device Configuration (CPU)
    #device = torch.device("cpu")  # Force CPU
    #model.to(device)
    #logging.info(f"Using device: {device}")

    # 3. Device Configuration (GPU)
    if torch.cuda.is_available():
        device = torch.device("cuda")  # Use GPU if available
        print("GPU is available and being used.")
    else:
        device = torch.device("cpu")  # Fallback to CPU if GPU is not available
        print("GPU is not available, using CPU instead.")

    model.to(device)
    logging.info(f"Using device: {device}")





    # 4. Optimizer and Loss Function
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    criterion = nn.MSELoss()

    # 5. Training Loop
    best_eval_loss = float('inf')
    patience = 10
    epochs_no_improve = 0

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch_idx, (sequences, ruls) in enumerate(train_loader):
            sequences = sequences.to(device)
            ruls = ruls.to(device)

            optimizer.zero_grad()
            outputs = model(sequences)
            loss = criterion(outputs, ruls)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            if batch_idx % 50 == 0:
                logging.info(f"Epoch: {epoch + 1}, Batch: {batch_idx}, Loss: {loss.item()}")

        avg_loss = total_loss / len(train_loader)
        logging.info(f"Epoch: {epoch + 1}, Average Training Loss: {avg_loss}")

        # 6. Evaluation
        model.eval()
        eval_loss = 0
        with torch.no_grad():
            for sequences_eval, ruls_eval in eval_loader:
                sequences_eval = sequences_eval.to(device)
                ruls_eval = ruls_eval.to(device)

                outputs_eval = model(sequences_eval)
                loss_eval = criterion(outputs_eval, ruls_eval)
                eval_loss += loss_eval.item()

        avg_eval_loss = eval_loss / len(eval_loader)
        logging.info(f"Epoch: {epoch + 1}, Average Evaluation Loss: {avg_eval_loss}")

        # 7. Early Stopping
        if avg_eval_loss < best_eval_loss:
            best_eval_loss = avg_eval_loss
            epochs_no_improve = 0
            torch.save(model.state_dict(), 'best_model.pth')
        else:
            epochs_no_improve += 1
            if epochs_no_improve == patience:
                logging.info("Early stopping triggered")
                break

    # 8. Load the best model
    model.load_state_dict(torch.load('best_model.pth'))

    logging.info("Starting model saving...")

    # 9. Save Model (Local and GCS)
    local_model_path = 'model-nasa-ftc.pth'
    torch.save(model.state_dict(), local_model_path)

    try:
        base_output_path = base_output_dir
        subprocess.run(['gsutil', 'cp', local_model_path, base_output_path], check=True)
        logging.info(f"Copied model to GCS BUCKET path: {base_output_path}")

        if 'AIP_MODEL_DIR' in os.environ:
            gcs_model_path = os.path.join(os.environ['AIP_MODEL_DIR'], 'model-nasa-ftc.pth')
            subprocess.run(['gsutil', 'cp', local_model_path, gcs_model_path], check=True)
            logging.info(f"Copied model to Vertex AI path: {gcs_model_path}")
            model_save_path = gcs_model_path
        else:
            logging.info(f"Saving model to local path: {local_model_path}")
            model_save_path = local_model_path

        logging.info(f"Model saved to: {model_save_path}")

    except subprocess.CalledProcessError as e:
        logging.error(f"Error saving model: {e}")
        raise

    print("Training completed.")

# --- Main Execution ---

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default="rul_predictor_jsonl", help="Model name")
    parser.add_argument("--train_dataset", type=str, required=True, help="Path to training dataset JSONL") #Single path
    parser.add_argument("--eval_dataset", type=str, required=True, help="Path to evaluation dataset JSONL")  #Single path
    parser.add_argument("--staging_bucket", type=str, required=True, help="Staging bucket for Vertex AI")
    parser.add_argument("--bucket_name", type=str, required=True, help="Bucket name")
    parser.add_argument("--base_output_dir", type=str, required=True, help="Base output directory in GCS")
    parser.add_argument("--use_rolling_features", action='store_true', help="Use rolling window features")
    parser.add_argument("--hidden_size", type=int, default=128, help="Hidden size of LSTM")
    parser.add_argument("--num_layers", type=int, default=3, help="Number of LSTM layers")
    parser.add_argument("--dropout", type=float, default=0.3, help="Dropout rate")
    parser.add_argument("--learning_rate", type=float, default=0.0005, help="Learning rate")
    parser.add_argument("--weight_decay", type=float, default=1e-4, help="Weight decay")
    parser.add_argument("--num_epochs", type=int, default=50, help="Number of epochs")
    parser.add_argument("--batch_size", type=int, default=64, help="Batch size")
    parser.add_argument("--patience", type=int, default=10, help="Early stopping patience")
    parser.add_argument("--project_id", type=str, help="Google Cloud Project ID")  # Added
    parser.add_argument("--region", type=str, help="Google Cloud Region")       # Added


    args = parser.parse_args()

    train_model(args.model, args.train_dataset, args.eval_dataset,
                args.staging_bucket, args.bucket_name, args.base_output_dir,
                args.use_rolling_features, args.hidden_size, args.num_layers,
                args.dropout, args.learning_rate, args.weight_decay,
                args.num_epochs, args.batch_size, args.patience,
                args.project_id, args.region)
"""

import os
# Create or overwrite trainer/train.py
os.makedirs('trainer', exist_ok=True)
with open('trainer/train_cmd.py', 'w') as f:
    f.write(train_py_content)

In [8]:
import colab_env
import os
from google.cloud import aiplatform, storage
import logging
from google.colab import auth
import pandas as pd
import json
import zipfile
import requests
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from google.cloud import storage
from torch.utils.data import Dataset, DataLoader

# Project details (replace with your values if not using env vars)
PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")
REGION = os.environ.get("GOOGLE_CLOUD_REGION")
SERVICEACCOUNT = os.environ.get("GOOGLE_CLOUD_SERVICEACCOUNT")
PROJECT_NUMBER = os.environ.get("GOOGLE_CLOUD_PROJECT_NUMBER")
BUCKET_NAME = os.environ.get("GOOGLE_CLOUD_BUCKET_NAME")
STAGING_BUCKET = f"gs://{BUCKET_NAME}/staging"

# Authentication and Initialization
auth.authenticate_user()
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

!python trainer/train_cmd.py \
    --train_dataset gs://{BUCKET_NAME}/cmapss_FD004_train_sequences.jsonl \
    --eval_dataset gs://{BUCKET_NAME}/cmapss_FD004_test_sequences.jsonl \
    --staging_bucket {STAGING_BUCKET} \
    --bucket_name {BUCKET_NAME} \
    --base_output_dir gs://{BUCKET_NAME}/model_output \
    --use_rolling_features \
    --hidden_size 256 \
    --num_layers 4 \
    --dropout 0.4 \
    --learning_rate 0.001 \
    --weight_decay 1e-5 \
    --num_epochs 50000 \
    --batch_size 32 \
    --patience 15

INFO:root:Training configuration:
INFO:root:Model name: rul_predictor_jsonl
INFO:root:Train Dataset Path: gs://poc-my-new-staging-bucket-2025-1/cmapss_FD004_train_sequences.jsonl
INFO:root:Eval Dataset Path: gs://poc-my-new-staging-bucket-2025-1/cmapss_FD004_test_sequences.jsonl
INFO:root:Staging Bucket: gs://poc-my-new-staging-bucket-2025-1/staging
INFO:root:Bucket Name: poc-my-new-staging-bucket-2025-1
INFO:root:Base Output Dir: gs://poc-my-new-staging-bucket-2025-1/model_output
INFO:root:Use Rolling Features: True
INFO:root:Hidden Size: 256
INFO:root:Number of Layers: 4
INFO:root:Dropout: 0.4
INFO:root:Learning Rate: 0.001
INFO:root:Weight Decay: 1e-05
INFO:root:Number of Epochs: 50000
INFO:root:Batch Size: 32
INFO:root:Patience: 15
INFO:root:Project ID: gen-lang-client-0870511801
INFO:root:Region: us-central1
Training Dataset Input Size: 26
Evaluation Dataset Input Size: 26
GPU is available and being used.
INFO:root:Using device: cuda
INFO:root:Epoch: 1, Batch: 0, Loss: 0.000131960

## EVAL - FTC

In [9]:
from google.cloud import storage
import torch

# Initialize the GCS client
client = storage.Client()

# Specify the bucket and blob (file)
bucket_name = os.environ.get("GOOGLE_CLOUD_BUCKET_NAME")
bucket = client.bucket(bucket_name)
blob_name = "model_output/model-nasa-ftc.pth"

blob_name = "model_output/model-nasa-ftc.pth"
bucket = client.bucket(bucket_name)
blob = bucket.blob(blob_name)

# Download the file to a local path
local_file_path = "model-nasa-ftc.pth"  # Or your desired local path
blob.download_to_filename(local_file_path)

print(f"Downloaded '{blob_name}' from '{bucket_name}' to '{local_file_path}'")

Downloaded 'model_output/model-nasa-ftc.pth' from 'poc-my-new-staging-bucket-2025-1' to 'model-nasa-ftc.pth'


In [10]:
import os
import json
import logging
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import time
import subprocess
from google.cloud import storage
import pandas as pd
from typing import List
import torch.utils.data as data
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from google.colab import auth
from google.cloud import aiplatform


# --- Dataset and Model Definitions ---

class CMAPSSJSONLDataset(data.Dataset):
    def __init__(self, data_path, sequence_length=30, use_rolling_features=False):
        self.data = []
        self.sequence_length = sequence_length
        self.use_rolling_features = use_rolling_features

        try:
            if data_path.startswith('gs://'):
                storage_client = storage.Client()
                bucket_name = data_path.split('/')[2]
                blob_name = '/'.join(data_path.split('/')[3:])
                bucket = storage_client.bucket(bucket_name)
                blob = bucket.blob(blob_name)
                tmp_file = "/tmp/temp_data.jsonl"
                blob.download_to_filename(tmp_file)
                file_path = tmp_file
            else:
                file_path = data_path

            with open(file_path, 'r') as f:
                for line in f:
                    try:
                        record = json.loads(line)
                        sequence = torch.tensor(record["sequence"], dtype=torch.float32)
                        rul = torch.tensor([record["rul"]], dtype=torch.float32)
                        self.data.append((sequence, rul))
                    except json.JSONDecodeError as e:
                        logging.warning("Skipping invalid JSON line: %r, Error: %s", repr(line), e)

            if data_path.startswith('gs://'):
                os.remove(tmp_file)

            if self.use_rolling_features:
                # Convert data to DataFrame for feature engineering
                sequences = [item[0].numpy() for item in self.data]
                num_sequences = len(sequences)
                num_cycles = sequences[0].shape[0]  # Assuming all sequences have the same number of cycles
                num_features = sequences[0].shape[1]
                reshaped_sequences = np.concatenate(sequences).reshape(num_sequences * num_cycles, num_features)
                df = pd.DataFrame(reshaped_sequences)
                df['unit_nr'] = np.repeat(np.arange(num_sequences), num_cycles)
                # Function to create rolling features
                def create_rolling_features(df, window_size=10):
                    for col in df.columns:
                        if isinstance(col, str) and col.startswith('sensor'):
                            df[f'{col}_mean'] = df.groupby('unit_nr')[col].transform(lambda x: x.rolling(window=window_size).mean())
                            df[f'{col}_std'] = df.groupby('unit_nr')[col].transform(lambda x: x.rolling(window=window_size).std())
                    return df

                df = create_rolling_features(df)
                processed_sequences = []
                for i in range(num_sequences):
                    start_idx = i * num_cycles
                    end_idx = (i + 1) * num_cycles
                    seq = df[start_idx:end_idx].values
                    processed_sequences.append(seq)

                # Update self.data with the modified sequences (and RUL)
                new_data = []
                for i in range(len(self.data)):
                    rul = self.data[i][1]
                    # Ensure the processed sequence is the correct length
                    if processed_sequences[i].shape[0] >= self.sequence_length:
                        seq = processed_sequences[i][-self.sequence_length:]
                    else:
                        seq = np.zeros((self.sequence_length, processed_sequences[i].shape[1]))
                        seq[-processed_sequences[i].shape[0]:] = processed_sequences[i]
                    new_data.append((torch.tensor(seq, dtype=torch.float32), rul))
                self.data = new_data


        except Exception as e:
            logging.error(f"Error loading dataset: {e}")
            raise

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]


class RULPredictionModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=2, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc1 = nn.Linear(hidden_size, hidden_size // 2)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_size // 2, 1)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc1(out[:, -1, :])
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out


# --- Evaluation Function ---

def calculate_cmapss_score(true_rul, predicted_rul):
    """Calculates a simplified CMAPSS score."""
    d = np.array(predicted_rul) - np.array(true_rul)  # Difference between predicted and true RUL
    score = sum([
        np.exp(-d[i] / 13) - 1 if d[i] < 0 else np.exp(d[i] / 10) - 1
        for i in range(len(d))
    ])
    return score


def evaluate_model_ftc(model, eval_dataset_path, input_size, hidden_size, sequence_length):
    """Evaluates the trained RUL prediction model."""

    model.eval()  # Set to evaluation mode

    # Load the evaluation dataset
    eval_dataset = CMAPSSJSONLDataset(eval_dataset_path, sequence_length, use_rolling_features=True)
    eval_loader = DataLoader(eval_dataset, batch_size=32, shuffle=False)  # No need to shuffle for evaluation

    # Make predictions and calculate metrics
    all_predictions = []
    all_targets = []
    with torch.no_grad():  # Disable gradient calculations during evaluation
        for sequences, ruls in eval_loader:
            predictions = model(sequences)
            all_predictions.extend(predictions.flatten().tolist())
            all_targets.extend(ruls.flatten().tolist())

    # Calculate evaluation metrics
    mae = mean_absolute_error(all_targets, all_predictions)
    rmse = np.sqrt(mean_squared_error(all_targets, all_predictions))
    mse = mean_squared_error(all_targets, all_predictions)

    # Calculate CMAPSS score
    cmapss_score = calculate_cmapss_score(all_targets, all_predictions)

    # Print the results
    print(f"Evaluation Results:")
    print(f"Average Evaluation Loss (MSE): {mse:.2f}")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
    print(f"CMAPSS Score: {cmapss_score:.2f}")


# --- Main Execution ---

if __name__ == "__main__":
    # Authentication and Initialization
    auth.authenticate_user()

    # Get project details from environment variables
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")
    REGION = os.environ.get("GOOGLE_CLOUD_REGION")
    BUCKET_NAME = os.environ.get("GOOGLE_CLOUD_BUCKET_NAME")
    STAGING_BUCKET = f"gs://{BUCKET_NAME}/staging"

    # Initialize Vertex AI
    aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

    # --- Model Details ---
    input_size = 26  # Adjust if needed
    hidden_size = 256  # Adjust if needed
    sequence_length = 30  # Adjust if needed
    num_layers = 4  # Adjust if needed
    eval_dataset_path = f"gs://{BUCKET_NAME}/cmapss_FD004_test_sequences.jsonl"  # Adjust if needed
    model_path = "model-nasa-ftc.pth"  # Update with the correct model path in GCS


    # --- Model Loading ---
    model = RULPredictionModel(input_size, hidden_size, num_layers=num_layers)

    # --- Load model state dict using either local path or GCS URI: ---
    if model_path.startswith("gs://"):
        # Load from GCS
        client = storage.Client()
        bucket_name = model_path.split('/')[2]
        blob_name = '/'.join(model_path.split('/')[3:])
        bucket = client.bucket(bucket_name)
        blob = bucket.blob(blob_name)
        tmp_file = "/tmp/temp_model.pth"
        blob.download_to_filename(tmp_file)
        model.load_state_dict(torch.load(tmp_file))
        os.remove(tmp_file)  # Clean up temporary file
    else:
        # Load from local path
        model.load_state_dict(torch.load(model_path))

    # --- Evaluation ---
    evaluate_model_ftc(model, eval_dataset_path, input_size, hidden_size, sequence_length)

Evaluation Results:
Average Evaluation Loss (MSE): 675.44
Mean Absolute Error (MAE): 25.99
Root Mean Squared Error (RMSE): 25.99
CMAPSS Score: 1608.49
