<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/gemini_finetune_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install google-cloud-aiplatform -q
!pip install google-cloud-storage -q
!pip install google-cloud-bigquery -q
!pip install google-cloud-bigquery-storage -q
!pip install google-cloud-aiplatform -q
!pip install datasets -q

In [3]:
import json
import time

from datasets import load_dataset
from google.cloud import aiplatform
from google.colab import auth
from sklearn.model_selection import train_test_split
from vertexai.tuning import sft

# 1. Data preparation

# Fetch the flight-plan waypoints dataset through the `datasets` library
dataset_id = "frankmorales2020/flight_plan_waypoints"
dataset = load_dataset(dataset_id)

# Convert to JSONL format with prompt and completion
def convert_to_jsonl(data, filename):
    """Write ``data`` to ``filename`` as JSON Lines, one object per row.

    Args:
        data: Iterable of mappings; each must provide the keys ``"input"``
            and ``"label"``.
        filename: Path of the file to create (an existing file is
            overwritten, since it is opened in ``"w"`` mode).

    Each output line is a JSON object with two keys: ``"prompt"`` (the row's
    ``"input"`` value) and ``"completion"`` (the row's ``"label"`` value
    converted with ``str``).
    """
    with open(filename, "w") as out_file:
        out_file.writelines(
            json.dumps(
                {
                    "prompt": record["input"],
                    # Labels may be non-string (e.g. ints); store them as text
                    "completion": str(record["label"]),
                }
            )
            + "\n"
            for record in data
        )

# Materialise the Hugging Face "train" split as a plain list of row dicts
dataset_list = [row for row in dataset["train"]]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
train_data, eval_data = train_test_split(
    dataset_list,
    test_size=0.2,
    random_state=42,
)

# Write each split to its own JSONL file
for split_rows, jsonl_path in (
    (train_data, "training_data.jsonl"),
    (eval_data, "eval_data.jsonl"),
):
    convert_to_jsonl(split_rows, jsonl_path)

In [4]:
# Display the DatasetDict summary (available splits, feature names, row counts)
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'label', 'distance', 'distance_category', 'waypoints', 'waypoint_names'],
        num_rows: 2000
    })
})

In [5]:
# Peek at the first row of the "train" split to see what each feature looks like
dataset['train'][0]

{'input': 'Calculate the waypoints from SIN to CUN. Departure: 2024-06-19, Aircraft: Airbus A320, Weather: Partly Cloudy',
 'label': 7,
 'distance': 4190.223965150766,
 'distance_category': 'long',
 'waypoints': [[25.0000001, -107.5000001],
  [13.075850948259008, -87.47663616549106],
  [19.819922201759134, -98.80146877121005],
  [9.260386709076178, -81.06960216515675],
  [10.868386703521901, -83.76980061837503],
  [11.748790002704872, -85.24819840125659],
  [13.307169797892762, -87.86507323149776],
  [14.392416690114095, -89.68745009086217],
  [5.1096596, -74.0995854]],
 'waypoint_names': ['SIN',
  'Choluteca',
  'Santo Domingo Aztacameca',
  'Veraguas',
  'San Juan del Norte',
  'Acoyapa',
  'Pasaje La Cruz',
  'El Pito',
  'CUN']}

In [None]:
# 2. Authentication and initialization

auth.authenticate_user()
aiplatform.init(
    project="gen-lang-client-0870511801",  # Replace with your project
    location="us-central1",  # Replace with your location
)

# 3. Create Vertex AI TextDatasets
#
# NOTE(review): the JSONL files are written to local disk earlier in this
# notebook and no visible step uploads them to the bucket referenced below;
# confirm they are copied to GCS before running this cell.
# NOTE(review): the import schema below is the single-label text
# classification schema; confirm that rows shaped as
# {"prompt": ..., "completion": ...} match what that schema accepts.
classification_schema = (
    aiplatform.schema.dataset.ioformat.text.single_label_classification
)

# Training dataset
train_dataset = aiplatform.TextDataset.create(
    display_name="waypoints-train",
    gcs_source=["gs://poc2025/training_data.jsonl"],  # Replace with your GCS bucket and path
    import_schema_uri=classification_schema,
)

# Evaluation dataset
eval_dataset = aiplatform.TextDataset.create(
    display_name="waypoints-eval",
    gcs_source=["gs://poc2025/eval_data.jsonl"],  # Replace with your GCS bucket and path
    import_schema_uri=classification_schema,
)

print(f"Training dataset created: {train_dataset.resource_name}")
print(f"Evaluation dataset created: {eval_dataset.resource_name}")

In [None]:
# Fine-tuning the Gemini model
#
# Gemini is tuned through Vertex AI's managed supervised-tuning service
# (vertexai.tuning.sft), which reads the tuning data straight from GCS and
# provisions its own compute. No training container, machine spec or staging
# bucket is supplied by the caller.
#
# The project/location configured by aiplatform.init() earlier in the notebook
# are reused here.

# Must be a model version that supports supervised tuning; replace with the
# version you want to tune.
model_name = "gemini-2.0-flash-001"

# GCS locations of the tuning data (replace with your bucket and paths).
# NOTE(review): supervised tuning expects each JSONL line in the Gemini tuning
# schema (a "contents" list of user/model turns). The {"prompt", "completion"}
# rows written by convert_to_jsonl likely need converting first - confirm
# against the tuning dataset documentation.
train_dataset_uri = "gs://poc2025/training_data.jsonl"
eval_dataset_uri = "gs://poc2025/eval_data.jsonl"

tuning_job = sft.train(
    source_model=model_name,
    train_dataset=train_dataset_uri,
    validation_dataset=eval_dataset_uri,
    tuned_model_display_name="gemini-fine-tuning-job",
)

# sft.train() returns once the job is submitted, so poll until it has ended.
while not tuning_job.has_ended:
    time.sleep(60)
    tuning_job.refresh()

# Resource name of the tuned model produced by the job
fine_tuned_model = tuning_job.tuned_model_name

print(f"Fine-tuned model: {fine_tuned_model}")