<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/FT_GEMINI_NASA_VERTEXAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install colab-env -q
!pip install google-generativeai -q
!pip install rouge-score -q


In [None]:
from vertexai.preview.tuning import sft
import vertexai
import os
from google.colab import auth
import colab_env
import time

# --- Environment-driven settings ---------------------------------------------
# Each value comes from an environment variable and is None when the variable
# is unset; hard-code the value here if you are not using env vars.
PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
REGION = os.getenv("GOOGLE_CLOUD_REGION")
BUCKET_NAME = os.getenv("GOOGLE_CLOUD_BUCKET_NAME")
STAGING_BUCKET = "gs://{}/staging".format(BUCKET_NAME)

# --- Sign in first, then point the Vertex AI SDK at the project ---------------
auth.authenticate_user()
vertexai.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=STAGING_BUCKET,
)

# --- Supervised fine-tuning parameters ----------------------------------------
# Base model to tune (Gemini 2.0 Flash).
BASE_MODEL = "gemini-2.0-flash-001"

# Training and validation data, both JSONL files stored in the bucket.
# NOTE(review): the validation file is the *test* split, which the evaluation
# cell later scores as well - confirm this overlap is intended.
TRAIN_DATASET_URI = "gs://{}/cmapss_FD004_train_text.jsonl".format(BUCKET_NAME)
VALIDATION_DATASET_URI = "gs://{}/cmapss_FD004_test_text.jsonl".format(BUCKET_NAME)

# Display name given to the tuned model.
TUNED_MODEL_DISPLAY_NAME = "cmapss-text-tuned-gemini-2.0-flash-001"

# Hyperparameters; adjust as needed.
EPOCHS = 10
LEARNING_RATE_MULTIPLIER = 1.0



# Start the fine-tuning job and wait until it reaches a terminal state.
# Any failure is reported on stdout instead of raising, so the notebook cell
# itself never errors out.
try:
    sft_tuning_job = sft.train(
        source_model=BASE_MODEL,
        train_dataset=TRAIN_DATASET_URI,
        validation_dataset=VALIDATION_DATASET_URI,
        tuned_model_display_name=TUNED_MODEL_DISPLAY_NAME,
        epochs=EPOCHS,
        learning_rate_multiplier=LEARNING_RATE_MULTIPLIER,
    )

    print(f"Tuning job started: {sft_tuning_job.resource_name}")

    # Poll once a minute until the job has ended.
    # `state` is served from the locally held copy of the job and is a
    # JobState enum, not a string: comparing it with "SUCCEEDED"/"FAILED"
    # never matches, and without refresh() the value never changes. Use the
    # SDK's own terminal-state check and re-fetch the job on every iteration.
    job_status = sft_tuning_job.state
    while not sft_tuning_job.has_ended:
        print(f"Job status: {job_status}, waiting...")
        time.sleep(60)  # Wait for 60 seconds before checking again
        sft_tuning_job.refresh()
        job_status = sft_tuning_job.state

    print(f"Tuning job completed with status: {job_status}. Resource name: {sft_tuning_job.resource_name}")

except Exception as e:
    # Deliberately broad: this is the top-level boundary of a notebook cell.
    print(f"An error occurred: {e}")
    print("Please double-check the base model name and your Vertex AI setup.")

In [None]:
# Show the resource name of the tuning job created by the fine-tuning cell.
print("Tuning job completed with Resource name: {}".format(sft_tuning_job.resource_name))

In [None]:
# Example of one record in the layout the evaluation loop reads:
# contents[0] is the "user" turn whose text is the prompt, and contents[1] is
# the "model" turn whose text is the expected completion.
# NOTE(review): the "..." inside the prompt presumably marks readings omitted
# for brevity - confirm against the real JSONL files.
{
  "contents": [
    {
      "role": "user",
      "parts": [
        {
          "text": "Engine sensor readings over time: [1.0, 41.9993, 0.8409, 100.0, 445.0, 548.68, 1343.85, 1111.03, 3.91, 5.69, 137.26, 2211.96, 8296.96, ..., 8054.65, 9.2728, 0.02, 331.0, 2223.0, 100.0, 14.78, 8.8922]"
        }
      ]
    },
    {
      "role": "model",
      "parts": [
        {
          "text": "Remaining Useful Life: 0"
        }
      ]
    }
  ]
}

In [None]:
# List the contents of the bucket named by the BUCKET_NAME Python variable.
!gsutil ls gs://{BUCKET_NAME}

In [None]:
# Copy the local `batch_prediction_output` path into the bucket, then list the
# bucket again to show the result.
# NOTE(review): no visible cell creates `batch_prediction_output` - confirm it
# exists in the working directory before running this cell.
!gsutil cp -pr batch_prediction_output gs://{BUCKET_NAME}/
!gsutil ls gs://{BUCKET_NAME}/

In [None]:
from vertexai.preview.tuning import sft
import vertexai
import time
import os
import json
from google.colab import auth
from google.cloud import aiplatform
from google.api_core import exceptions
from google.cloud.aiplatform_v1.types import JobState
# Initialise the aiplatform SDK with the project, region and staging bucket
# values defined in the fine-tuning cell (PROJECT_ID, REGION, STAGING_BUCKET).
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=STAGING_BUCKET)

## Evaluation

- **Model name:** `projects/677155171887/locations/us-central1/models/1440268972921454592@1`
- **Tuning job:** `projects/677155171887/locations/us-central1/tuningJobs/5437787329584431104`
- **Status:** Succeeded
- **Region:** us-central1
- **Created:** Apr 5, 2025, 5:49:28 AM
- **Ended:** Apr 5, 2025, 6:19:46 AM

In [None]:
!pip install rouge-score -q
!pip install google-generativeai -q
!pip install colab-env -q

In [None]:
from rouge_score import rouge_scorer
import json
import os
from google.colab import userdata, auth
from google.cloud import aiplatform_v1beta1 as aiplatform

# Sign in before any Google Cloud client is created.
auth.authenticate_user()

# Project settings, read from the environment (None when a variable is unset).
PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT")
REGION = os.getenv("GOOGLE_CLOUD_REGION")
BUCKET_NAME = os.getenv("GOOGLE_CLOUD_BUCKET_NAME")

# Dataset to score, and the tuned model to score it with.
EVAL_DATASET_URI = "gs://{}/cmapss_FD004_test_text.jsonl".format(BUCKET_NAME)
# Replace with your tuned model name.
tuned_model_resource_name = 'projects/677155171887/locations/us-central1/models/1440268972921454592@1'

# Prediction client bound to the regional API host.
endpoint = "{}-aiplatform.googleapis.com".format(REGION)
client_options = dict(api_endpoint=endpoint)
prediction_client = aiplatform.PredictionServiceClient(
    client_options=client_options,
)

# ROUGE Score Calculation
def calculate_rouge(reference, generated):
    """Score ``generated`` against ``reference`` with ROUGE-1, ROUGE-2 and ROUGE-L.

    Stemming is enabled on the scorer, and a new scorer is built on every call.

    Args:
        reference: The expected text, passed to the scorer as its first argument.
        generated: The text being evaluated, passed as the second argument.

    Returns:
        The mapping returned by ``RougeScorer.score``, keyed by ``'rouge1'``,
        ``'rouge2'`` and ``'rougeL'``; the reporting code reads ``.fmeasure``
        from each value.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(metric_names, use_stemmer=True).score(reference, generated)

# Download and Evaluate
# Announce which model is being scored, then copy the evaluation JSONL from the
# bucket into the current working directory.
print(f"Evaluating model: {tuned_model_resource_name}\n")
!gsutil cp -pr {EVAL_DATASET_URI} .
print('\n')
# Local file name; it matches the last path component of EVAL_DATASET_URI.
eval_dataset_path = 'cmapss_FD004_test_text.jsonl'

# Score every record of the evaluation file against the tuned model.
# `rouge_scores` collects one ROUGE result per successfully scored record;
# `processed_lines` ends up as the number of lines read from the file.
# A failure on one record is printed and the loop moves on to the next record.
rouge_scores = []
processed_lines = 0
# Read as UTF-8 explicitly so decoding does not depend on the platform default.
with open(eval_dataset_path, 'r', encoding='utf-8') as f:
    for processed_lines, line in enumerate(f, start=1):
        record_text = line.strip()
        if not record_text:
            # A blank line carries no record; skip it instead of reporting a
            # JSON decoding error for it.
            continue
        try:
            print(f"Processing line {processed_lines}: {record_text[:100]}...")
            data = json.loads(record_text)

            # contents[0] holds the prompt, contents[1] the expected completion.
            prompt = data["contents"][0]["parts"][0]["text"]
            completion = data["contents"][1]["parts"][0]["text"]

            # Reject records where either side is empty.
            if not prompt or not completion:
                raise ValueError(f"Empty prompt or completion in line: {record_text}")

            # Ask the model for a prediction through the prediction client.
            # NOTE(review): `endpoint` receives a model resource name
            # (.../models/...); the PredictRequest reference describes an
            # endpoint resource name (.../endpoints/...) - confirm this call
            # succeeds for the tuned model.
            instance = {"content": prompt}  # 'content' is the key used for the prompt
            request = aiplatform.PredictRequest(
                endpoint=tuned_model_resource_name,
                instances=[instance],  # the API takes a list of instances
            )
            response = prediction_client.predict(request=request)

            # Extract prediction text (adjust based on response format)
            generated_text = response.predictions[0]["content"]

            scores = calculate_rouge(completion, generated_text)
            rouge_scores.append(scores)

        except (json.JSONDecodeError, KeyError, TypeError, ValueError) as e:
            # Malformed or incomplete record.
            print(f"Error processing line: {record_text[:100]}..., Error: {e}")
            print(f"Full line: {record_text}")
        except Exception as e:
            # Anything else (for example a failed API call); keep going.
            print(f"Unexpected error processing line: {record_text[:100]}..., Error: {e}")
            print(f"Full line: {record_text}")

# Report the mean F-measure for each ROUGE variant, or say why there is
# nothing to report when no record was scored.
if rouge_scores:
    avg_rouge1 = sum(entry['rouge1'].fmeasure for entry in rouge_scores) / len(rouge_scores)
    avg_rouge2 = sum(entry['rouge2'].fmeasure for entry in rouge_scores) / len(rouge_scores)
    avg_rougeL = sum(entry['rougeL'].fmeasure for entry in rouge_scores) / len(rouge_scores)
    print(f"Average ROUGE-1: {avg_rouge1:.4f}")
    print(f"Average ROUGE-2: {avg_rouge2:.4f}")
    print(f"Average ROUGE-L: {avg_rougeL:.4f}")
else:
    print(f"No ROUGE scores calculated. Processed {processed_lines} lines.")
    print("Check dataset format and model prediction logic.")