<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/gemini_ft_vertexai_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install google-cloud-aiplatform -q
!pip install google-cloud-storage -q
!pip install google-cloud-bigquery -q
!pip install google-cloud-bigquery-storage -q
!pip install google-cloud-aiplatform -q
!pip install datasets -q
!pip install colab-env -q

# Install necessary libraries
!pip install  -q gcsfs==2024.3.1
!pip install  -q accelerate==0.31.0
!pip install  -q transformers==4.45.2
!pip install  -q  datasets==2.19.2
!pip install google-cloud-aiplatform[all] -q
!pip install vertexai  -q
!pip install tensorflow_datasets -q

In [None]:
import colab_env
import os
from google.cloud import aiplatform, storage
import logging
from google.colab import auth
import pandas as pd
import json
import zipfile
import requests
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from google.cloud import storage
from torch.utils.data import Dataset, DataLoader

# Project details (replace with your values if not using env vars)
PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")
REGION = os.environ.get("GOOGLE_CLOUD_REGION")
SERVICEACCOUNT = os.environ.get("GOOGLE_CLOUD_SERVICEACCOUNT")
PROJECT_NUMBER = os.environ.get("GOOGLE_CLOUD_PROJECT_NUMBER")
BUCKET_NAME = os.environ.get("GOOGLE_CLOUD_BUCKET_NAME")

# Every gs:// URI below is built from BUCKET_NAME. Without this check an unset
# variable would be formatted as the literal text "None" ("gs://None/staging"),
# and the mistake would only surface later as a confusing storage error.
if not BUCKET_NAME:
    raise ValueError(
        "GOOGLE_CLOUD_BUCKET_NAME is not set; export it or assign BUCKET_NAME directly."
    )
STAGING_BUCKET = f"gs://{BUCKET_NAME}/staging"

# Authentication and Initialization
auth.authenticate_user()
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

# Locations of the raw sequence files in the bucket. Both names are reassigned
# further down in this notebook to point at the text-format files used for tuning.
TRAIN_DATASET_URI = f"gs://{BUCKET_NAME}/cmapss_FD004_train_sequences.jsonl"
VALIDATION_DATASET_URI = f"gs://{BUCKET_NAME}/cmapss_FD004_test_sequences.jsonl"

Models currently supported for supervised fine-tuning (SFT) via the `vertexai.preview.tuning.sft` module and Generative AI Studio:

* Gemini 2.0 Flash-Lite (gemini-2.0-flash-lite-001)
* Gemini 2.0 Flash (gemini-2.0-flash-001)
* Gemini 1.5 Flash (gemini-1.5-flash-002)
* Gemini 1.5 Pro (gemini-1.5-pro-002)
* Gemini 1.0 Pro (gemini-1.0-pro-002)

In [None]:
import os
import pandas as pd
import json
import zipfile
from google.cloud import storage
from google.colab import auth


# Project settings read from the environment
# (assign literal values here instead if the variables are not exported).
PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
REGION = os.getenv("GOOGLE_CLOUD_REGION")
SERVICEACCOUNT = os.getenv("GOOGLE_CLOUD_SERVICEACCOUNT")
PROJECT_NUMBER = os.getenv("GOOGLE_CLOUD_PROJECT_NUMBER")
BUCKET_NAME = os.getenv("GOOGLE_CLOUD_BUCKET_NAME")
STAGING_BUCKET = f"gs://{BUCKET_NAME}/staging"

# Sign in from Colab, then point the Vertex AI SDK at the project.
# `aiplatform` is not imported in this cell; it must already be in scope from
# the import cell earlier in the notebook.
auth.authenticate_user()
aiplatform.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=STAGING_BUCKET,
)

# --- Data Loading from Google Drive ---
# NOTE(review): the path assumes Google Drive is already mounted under
# /content/gdrive -- confirm how the mount is performed in this runtime.
zip_path = '/content/gdrive/MyDrive/datasets/CMAPSSData.zip'
extract_dir = 'data/cmapss'
os.makedirs(extract_dir, exist_ok=True)

# Stop early, with a readable hint, when the archive is not where we expect it.
archive_missing = not os.path.exists(zip_path)
if archive_missing:
    print(f"Error: CMAPSSData.zip not found at {zip_path}. Please ensure the file is correctly located in your Google Drive.")
    raise FileNotFoundError(f"CMAPSSData.zip not found at {zip_path}")

try:
    with zipfile.ZipFile(zip_path, 'r') as archive:
        # testzip() returns the name of the first corrupt member, or None when
        # every member passes its check.
        first_bad_member = archive.testzip()
        if first_bad_member is not None:
            print("Error: ZIP file integrity check failed. The file may not be a valid ZIP file.")
            raise zipfile.BadZipFile("ZIP file integrity check failed.")

        archive.extractall(extract_dir)
        print(f"Extracted dataset files to: {extract_dir}")

except zipfile.BadZipFile as err:
    # Covers both an unreadable archive and the integrity failure raised above.
    print(f"Error extracting ZIP file: {err}")
    print(
        "The uploaded file may not be a valid or complete ZIP file. "
        "Please ensure you have uploaded the correct file, that it is not corrupted, "
        "and that it is a standard ZIP archive."
    )
    raise  # Stop execution if extraction fails

# --- Prepare NASA CMAPSS Data and Save to JSONL in GCS ---
extract_dir = 'data/cmapss'
os.makedirs(extract_dir, exist_ok=True)

# Process all four subsets
data_subsets = ['FD001', 'FD002', 'FD003', 'FD004']

# Column layout shared by every train/test file: unit id, cycle counter,
# 3 operational settings and 21 sensor channels. Defined once because it does
# not depend on the subset being processed.
SENSOR_COLUMNS = ['sensor' + str(i).zfill(2) for i in range(1, 22)]
OP_SETTING_COLUMNS = ['op_setting_' + str(i) for i in range(1, 4)]
DATA_COLUMNS = ['unit_nr', 'time_cycles'] + OP_SETTING_COLUMNS + SENSOR_COLUMNS

for data_subset in data_subsets:
    train_file = os.path.join(extract_dir, f'train_{data_subset}.txt')
    test_file = os.path.join(extract_dir, f'test_{data_subset}.txt')
    rul_file = os.path.join(extract_dir, f'RUL_{data_subset}.txt')

    # Load the three whitespace-separated, header-less files of this subset.
    # sep=r'\s+' is the documented replacement for the deprecated
    # delim_whitespace=True; `names=` already assigns the column labels.
    try:
        train_df = pd.read_csv(train_file, names=DATA_COLUMNS, sep=r'\s+', header=None)
        test_df = pd.read_csv(test_file, names=DATA_COLUMNS, sep=r'\s+', header=None)
        rul_df = pd.read_csv(rul_file, names=['RUL'], sep=r'\s+', header=None)

        print(f"\nProcessing data subset: {data_subset}")
        print("Shape of train_df after loading:", train_df.shape)
        print("train_df head after loading:\n", train_df.head())
        print("Shape of test_df:", test_df.shape)
        print("test_df head after loading:\n", test_df.head())
        print("Shape of RUL data:", rul_df.shape)

    except FileNotFoundError as e:
        print(f"Error loading data files for subset {data_subset}: {e}")
        raise  # Stop execution if a file is missing

    def create_jsonl(df, rul_df, output_path, sequence_length=30, is_test=False):
        """Write fixed-length sliding windows for every engine unit to a JSONL file.

        One JSON object is written per window, with the keys ``sequence`` (the
        window's rows without ``unit_nr``), ``sequence_length`` and ``rul``.
        Units with fewer than ``sequence_length`` rows produce no records.

        Args:
            df: DataFrame with a ``unit_nr`` column; rows are taken in their
                existing order within each unit.
            rul_df: DataFrame whose first column holds one value per unit.
                Only read when ``is_test`` is True.
            output_path: Destination file; created or overwritten once.
            sequence_length: Number of consecutive rows per window.
            is_test: Selects how the ``rul`` label is derived (see below).

        Labels:
            * ``is_test=False``: the label is the number of rows of the unit
              that follow the window (0 for the last window). This assumes each
              training record runs until failure -- confirm against the dataset
              description.
            * ``is_test=True``: the label is the unit's value from ``rul_df``
              plus the number of rows that follow the window. This assumes
              ``rul_df`` lists the remaining life at each unit's final row, in
              ascending ``unit_nr`` order -- confirm against the dataset
              description. If ``rul_df`` has fewer rows than there are units,
              the missing values are treated as 0.
        """
        rul_values = rul_df.values.tolist() if is_test else []

        # Open the output exactly once. Re-opening it in 'w' mode for each unit
        # would truncate the file and keep only the last unit's windows.
        with open(output_path, 'w') as f:
            # groupby sorts its keys, so units are visited in ascending unit_nr.
            for engine_idx, (_, unit_data) in enumerate(df.groupby('unit_nr')):
                num_cycles = len(unit_data)
                data_values = unit_data.drop(['unit_nr'], axis=1).values.tolist()

                # Remaining life at the unit's last recorded row.
                if is_test and engine_idx < len(rul_values):
                    rul_at_last_row = rul_values[engine_idx][0]
                else:
                    rul_at_last_row = 0

                for start in range(max(0, num_cycles - sequence_length + 1)):
                    end = start + sequence_length
                    json_record = {
                        "sequence": data_values[start:end],
                        "sequence_length": sequence_length,
                        # Rows after the window still lie ahead of it in time.
                        "rul": rul_at_last_row + (num_cycles - end),
                    }
                    f.write(json.dumps(json_record) + '\n')


    local_train_jsonl_path = f"cmapss_{data_subset}_train_sequences.jsonl"
    local_test_jsonl_path = f"cmapss_{data_subset}_test_sequences.jsonl"

    # Create JSONL for training
    create_jsonl(train_df, rul_df, local_train_jsonl_path, is_test=False)
    print(f"Created {local_train_jsonl_path}")

    # Create JSONL for testing
    create_jsonl(test_df, rul_df, local_test_jsonl_path, is_test=True)
    print(f"Created {local_test_jsonl_path}")

    # --- Upload JSONL files to GCS ---
    client = storage.Client(project=PROJECT_ID)
    bucket = client.bucket(BUCKET_NAME)

    blob_train = bucket.blob(f"cmapss_{data_subset}_train_sequences.jsonl")  # Adapt to your naming scheme
    blob_test = bucket.blob(f"cmapss_{data_subset}_test_sequences.jsonl")   # Adapt to your naming scheme

    blob_train.upload_from_filename(local_train_jsonl_path)
    print(f"Uploaded training data to: gs://{BUCKET_NAME}/cmapss_{data_subset}_train_sequences.jsonl")

    blob_test.upload_from_filename(local_test_jsonl_path)
    print(f"Uploaded evaluation data to: gs://{BUCKET_NAME}/cmapss_{data_subset}_test_sequences.jsonl")

print("JSONL files created and uploaded.")

In [9]:
import json
import numpy as np

def create_textual_dataset(input_file, output_file):
    """Convert a JSONL file of sequence records into user/model text examples.

    Each input line is parsed as JSON and its ``sequence`` and ``rul`` values
    are read. Lines whose ``sequence`` is missing or empty are skipped. For the
    rest, one JSON line is written to ``output_file`` containing a ``contents``
    list with a ``user`` turn (the flattened readings as text) and a ``model``
    turn (the RUL text, or a placeholder when ``rul`` is absent).

    Lines that fail to parse, or that raise any other error while being
    converted, are reported with ``print`` and skipped.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to create or overwrite.
    """
    with open(input_file, 'r') as source, open(output_file, 'w') as sink:
        for raw_line in source:
            try:
                record = json.loads(raw_line)
                readings = record.get("sequence")
                remaining_life = record.get("rul")  # may be absent

                # Nothing to describe without readings.
                if not readings:
                    continue

                # Flatten the window into one list and embed it in the prompt text.
                prompt = f"Engine sensor readings over time: {np.array(readings).flatten().tolist()}"

                if remaining_life is None:
                    answer = "RUL prediction needed."
                else:
                    answer = f"Remaining Useful Life: {remaining_life}"

                example = {
                    "contents": [
                        {"role": "user", "parts": [{"text": prompt}]},
                        {"role": "model", "parts": [{"text": answer}]},
                    ]
                }
                sink.write(json.dumps(example) + '\n')
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON: {e}")
            except Exception as e:
                print(f"An error occurred: {e}")

# FD004 sequence files (inputs) and the text-format files derived from them (outputs)
input_train_file = "cmapss_FD004_train_sequences.jsonl"
output_train_file_text = "cmapss_FD004_train_text.jsonl"

input_test_file = "cmapss_FD004_test_sequences.jsonl"
output_test_file_text = "cmapss_FD004_test_text.jsonl"

# Convert the training split first, then the test split
for source_path, target_path in (
    (input_train_file, output_train_file_text),
    (input_test_file, output_test_file_text),
):
    create_textual_dataset(source_path, target_path)

print(f"Textual training data created: {output_train_file_text}")
print(f"Textual testing data created: {output_test_file_text}")

Textual training data created: cmapss_FD004_train_text.jsonl
Textual testing data created: cmapss_FD004_test_text.jsonl


In [None]:
# Copy the two FD004 text-format JSONL files from the working directory to the
# root of the bucket. {BUCKET_NAME} is filled in from the Python variable of the
# same name by the notebook's shell escape, so that variable must already be set.
!gsutil cp cmapss_FD004_train_text.jsonl gs://{BUCKET_NAME}/
!gsutil cp cmapss_FD004_test_text.jsonl gs://{BUCKET_NAME}/

In [None]:
from vertexai.preview.tuning import sft
import vertexai
import os
from google.colab import auth

# Project settings read from the environment
# (assign literal values here instead if the variables are not exported).
PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
REGION = os.getenv("GOOGLE_CLOUD_REGION")
SERVICEACCOUNT = os.getenv("GOOGLE_CLOUD_SERVICEACCOUNT")
PROJECT_NUMBER = os.getenv("GOOGLE_CLOUD_PROJECT_NUMBER")
BUCKET_NAME = os.getenv("GOOGLE_CLOUD_BUCKET_NAME")
STAGING_BUCKET = f"gs://{BUCKET_NAME}/staging"

# Sign in from Colab, then initialise the vertexai SDK for this project.
auth.authenticate_user()
vertexai.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=STAGING_BUCKET,
)

# Tuning parameters
BASE_MODEL = "gemini-1.5-pro-002"  # Using the specific stable version
TRAIN_DATASET_URI = f"gs://{BUCKET_NAME}/cmapss_FD004_train_text.jsonl"  # text-format training file
VALIDATION_DATASET_URI = f"gs://{BUCKET_NAME}/cmapss_FD004_test_text.jsonl"  # text-format validation file
TUNED_MODEL_DISPLAY_NAME = "cmapss-text-tuned-gemini-1.5-pro"
EPOCHS = 3  # Adjust as needed
LEARNING_RATE_MULTIPLIER = 1.0  # Adjust as needed

# Collect the arguments in one place so the call below stays short.
tuning_request = dict(
    source_model=BASE_MODEL,
    train_dataset=TRAIN_DATASET_URI,
    validation_dataset=VALIDATION_DATASET_URI,
    tuned_model_display_name=TUNED_MODEL_DISPLAY_NAME,
    epochs=EPOCHS,
    learning_rate_multiplier=LEARNING_RATE_MULTIPLIER,
)

# Submit the fine-tuning job. Any failure is reported with a hint instead of
# being re-raised, so the cell itself always completes.
try:
    sft_tuning_job = sft.train(**tuning_request)

    print(f"Tuning job started: {sft_tuning_job.resource_name}")

except Exception as e:
    print(f"An error occurred: {e}")
    print("Please double-check the base model name and your Vertex AI setup.")