<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/VertexAI_DEMO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Environment

In [None]:
!pip install google-cloud-aiplatform -q
!pip install google-cloud-storage -q
!pip install google-cloud-bigquery -q
!pip install google-cloud-bigquery-storage -q
!pip install google-cloud-aiplatform -q
!pip install datasets -q
!pip install colab-env -q

# Install necessary libraries
!pip install --upgrade --quiet gcsfs==2024.3.1
!pip install --upgrade --quiet accelerate==0.31.0
!pip install --upgrade --quiet transformers==4.45.2
!pip install --upgrade --quiet datasets==2.19.2
!pip install --upgrade google-cloud-aiplatform[all] -q
!pip install vertexai --upgrade -q
!pip install tensorflow_datasets -q

In [4]:
!pip install colab-env -q
import colab_env
import os
from google.cloud import aiplatform, storage
import logging
import subprocess

## Dataset - Vertex AI

In [None]:
import json
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from google.colab import auth
from google.cloud import aiplatform
import colab_env
import os

# Data preparation

# Project settings are read from environment variables; a missing variable
# raises KeyError right here. Authentication itself happens further down in
# this cell, just before the upload to GCS.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]
REGION = os.environ["GOOGLE_CLOUD_REGION"]
BUCKET_NAME = os.environ["GOOGLE_CLOUD_BUCKET_NAME"]

# Load the dataset from the Hugging Face Hub
dataset = load_dataset("frankmorales2020/flight_plan_waypoints")

# Convert to JSONL format with prompt and completion
def convert_to_jsonl(data, filename):
    """Write `data` to `filename` as JSON Lines.

    Each row must provide an "input" and a "label" entry. One JSON object is
    written per line, with the input stored under "prompt" and the label,
    converted to a string, stored under "completion". Any other fields of the
    row are dropped. An existing file is overwritten.
    """
    with open(filename, "w") as out:
        out.writelines(
            json.dumps({"prompt": rec["input"], "completion": str(rec["label"])}) + "\n"
            for rec in data
        )

# Materialise the "train" split as a plain list of dicts so that
# train_test_split can shuffle and slice it.
dataset_list = list(dataset["train"])

# Hold out 20% of the rows for evaluation; random_state makes the split repeatable.
train_data, eval_data = train_test_split(dataset_list, test_size=0.2, random_state=42)

# Write both splits as JSONL files under /content
convert_to_jsonl(train_data, "/content/training_data.jsonl")
convert_to_jsonl(eval_data, "/content/eval_data.jsonl")

# Echo the settings that the upload below will use
print('\n\n')
print(f"Project ID: {PROJECT_ID}")
print(f"Region: {REGION}")
print(f"Bucket Name: {BUCKET_NAME}")
print('\n\n')

# Authenticate this Colab session, then point the Vertex AI SDK at the project
auth.authenticate_user()
aiplatform.init(project=PROJECT_ID, location=REGION)
print('\n\n')
# List the bucket, then copy both JSONL files to the bucket root
!gsutil ls gs://{BUCKET_NAME}/
print('\n\n')
!gsutil cp  /content/training_data.jsonl gs://{BUCKET_NAME}/
!gsutil cp  /content/eval_data.jsonl gs://{BUCKET_NAME}/
print('\n\n')

In [22]:
# Report the size of each split and show its first record.
for split_label, split_rows in (("train_data", train_data), ("eval_data", eval_data)):
    print(f"Length of {split_label}: {len(split_rows)}")
    print(split_rows[0])

Length of train_data: 1600
{'input': 'Calculate the waypoints from JFK to DUB. Departure: 2024-09-14, Aircraft: Boeing 757, Weather: Sunny', 'label': 7, 'distance': 6176.2203486565095, 'distance_category': 'long', 'waypoints': [[40.642947899999996, -73.7793733748521], [45.298594095237384, -17.78586892190225], [42.855448053630305, -47.16961137636025], [43.690065289044455, -37.13166134152348], [42.692446676984524, -49.13003060202848], [44.71641545571105, -24.787737379620857], [44.56731446277484, -26.580976548106896], [45.52410760535687, -15.0736156269691], [47.2956874, 6.2331927]], 'waypoint_names': ['JFK', 'Waypoint', 'Waypoint', 'Waypoint', 'Waypoint', 'Waypoint', 'Waypoint', 'Waypoint', 'DUB']}
Length of eval_data: 400
{'input': 'Calculate the waypoints from SAN to LAS. Departure: 2024-01-23, Aircraft: Boeing 777, Weather: Stormy', 'label': 7, 'distance': 16993.35324438778, 'distance_category': 'longhaul', 'waypoints': [[7.0000085, -73.2500086], [17.469538816793573, 68.81040414006927]

In [None]:
# Create the Vertex AI TextDatasets from the JSONL files in the bucket.
# NOTE(review): the JSONL written earlier in this notebook uses "prompt" /
# "completion" keys; confirm that this is the layout the single-label
# classification import schema expects before relying on these datasets.
_classification_schema = aiplatform.schema.dataset.ioformat.text.single_label_classification

# Training dataset
train_dataset = aiplatform.TextDataset.create(
    display_name="waypoints-train",
    gcs_source=[f"gs://{BUCKET_NAME}/training_data.jsonl"],
    import_schema_uri=_classification_schema,
)

# Evaluation dataset
eval_dataset = aiplatform.TextDataset.create(
    display_name="waypoints-eval",
    gcs_source=[f"gs://{BUCKET_NAME}/eval_data.jsonl"],
    import_schema_uri=_classification_schema,
)

# Show the resource name of each dataset that was created
for _role, _created in (("Training", train_dataset), ("Evaluation", eval_dataset)):
    print(f"{_role} dataset created: {_created.resource_name}")

In [24]:
from itertools import islice
import json

from google.cloud import storage

# Initialize a GCS client
storage_client = storage.Client()

# Object to preview: the training JSONL at the root of the bucket
bucket_name = os.environ["GOOGLE_CLOUD_BUCKET_NAME"]
file_path = "training_data.jsonl"  # Replace if your file path is different

# Get the bucket and blob (file)
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(file_path)

# download_as_text() returns an already-decoded str; download_as_string() is a
# deprecated alias of download_as_bytes() and needed a manual decode.
file_content = blob.download_as_text(encoding="utf-8")

# Print the first five records. Blank lines are skipped so that a stray empty
# line cannot make json.loads fail.
records = (line for line in file_content.splitlines() if line.strip())
for i, line in enumerate(islice(records, 5)):
    data_point = json.loads(line)
    print(f"Record {i + 1}:")
    print(f"  Prompt: {data_point['prompt']}")
    print(f"  Completion: {data_point['completion']}")
    print("-" * 20)  # Separator

Record 1:
  Prompt: Calculate the waypoints from JFK to DUB. Departure: 2024-09-14, Aircraft: Boeing 757, Weather: Sunny
  Completion: 7
--------------------
Record 2:
  Prompt: Calculate the waypoints from ORD to SEA. Departure: 2024-03-14, Aircraft: Boeing 757, Weather: Snowy
  Completion: 4
--------------------
Record 3:
  Prompt: Calculate the waypoints from SEA to KUL. Departure: 2024-03-23, Aircraft: Airbus A330, Weather: Rainy
  Completion: 8
--------------------
Record 4:
  Prompt: Calculate the waypoints from MUC to SCL. Departure: 2024-11-22, Aircraft: Boeing 777, Weather: Overcast
  Completion: 7
--------------------
Record 5:
  Prompt: Calculate the waypoints from DEN to NAS. Departure: 2024-04-08, Aircraft: Boeing 767, Weather: Snowy
  Completion: 5
--------------------


## Docker - Artifact Registry  

In [None]:
!gcloud artifacts repositories create my-repo-tmp \
    --repository-format=docker \
    --location=us-central1 \
    --description="My Docker repository" \
    --project={project_id}

In [None]:
import os
import colab_env

from google.cloud import storage # Import the storage module
storage_client = storage.Client()

# 1. Get Project ID and Region from Environment Variables
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
REGION = os.environ["GOOGLE_CLOUD_REGION"]
BUCKET_NAME = os.environ["GOOGLE_CLOUD_BUCKET_NAME"]

IMAGE_URI = os.environ.get(
    "IMAGE_URI", f"us-central1-docker.pkg.dev/{project_id}/my-repo-tmp/my-pytorch-image:latest"
)

### ---------


# *** Define train.py: multi-GPU training with DistributedDataParallel ***
# The script is kept in a string and written to disk further down in this cell.
# Corrections compared with the previous revision of the script:
#   * the upload target is derived from --staging_bucket (the script used a
#     bucket variable it never defined);
#   * one tokenizer sizes the embedding table and encodes the prompts;
#   * the LSTM is batch-first and reads the last real (non-padding) token;
#   * predictions and targets have the same shape when the loss is computed;
#   * the checkpoint is written to the system temp directory, and every rank
#     downloads its data into its own temp file;
#   * both --local-rank (PyTorch >= 2.0 launchers) and --local_rank are accepted,
#     with the LOCAL_RANK / WORLD_SIZE environment variables as fallback;
#   * the gloo backend is used when no GPU is available.
train_py_content = """
import argparse
import os
import json
import tempfile
from google.cloud import storage
import logging
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
import torch.distributed as dist

# Set up logging
logging.basicConfig(level=logging.INFO)

# A single tokenizer is used everywhere so that the embedding table is sized
# for the same vocabulary that produces the input ids.
TOKENIZER_NAME = "bert-base-uncased"
NUM_EPOCHS = 10


def split_gcs_uri(uri):
    # 'gs://bucket/a/b' -> ('bucket', 'a/b'); a bare bucket name -> ('bucket', '')
    path = uri[len("gs://"):] if uri.startswith("gs://") else uri
    bucket_name, _, blob_name = path.partition("/")
    return bucket_name, blob_name.strip("/")


# CustomDataset definition
class CustomDataset(Dataset):
    # Holds (prompt, completion) pairs read from a JSONL object in GCS.
    def __init__(self, data_path):
        self.data = []
        bucket_name, blob_name = split_gcs_uri(data_path)
        blob = storage.Client().bucket(bucket_name).blob(blob_name)

        # Each process gets its own temp file: with one process per GPU, a
        # fixed shared path would be overwritten and deleted concurrently.
        fd, tmp_file = tempfile.mkstemp(suffix=".jsonl")
        os.close(fd)
        try:
            blob.download_to_filename(tmp_file)
            with open(tmp_file, 'r') as f:
                for line in f:
                    if not line.strip():
                        continue
                    try:
                        example = json.loads(line)
                    except json.JSONDecodeError as e:
                        logging.warning(f"Skipping invalid JSON line: {line}, Error: {e}")
                        continue

                    # Validate the fields and cast 'completion' to int
                    try:
                        completion_value = int(example["completion"])
                        prompt = example["prompt"]
                    except (KeyError, TypeError, ValueError):
                        logging.warning(f"Skipping line with missing or invalid fields: {line}")
                        continue

                    # Flatten and stringify 'prompt'
                    if isinstance(prompt, (list, tuple)):
                        prompt = " ".join([str(item) for item in prompt])
                    elif isinstance(prompt, dict):
                        prompt = json.dumps(prompt)
                    else:
                        prompt = str(prompt)

                    self.data.append((prompt, completion_value))
        finally:
            os.remove(tmp_file)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prompt, completion = self.data[idx]
        return prompt, completion


# MyModel definition
class MyModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_size=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: inputs are (batch, seq), so the recurrence must run
        # over dimension 1 (tokens), not over dimension 0 (samples).
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)  # Output layer for regression

    def forward(self, x, lengths=None):
        # x: (batch, seq) token ids; lengths: optional (batch,) count of real tokens
        embeds = self.embedding(x)
        lstm_out, _ = self.lstm(embeds)
        if lengths is None:
            last = lstm_out[:, -1, :]
        else:
            # Read the state at the last real token instead of at the padding.
            rows = torch.arange(lstm_out.size(0), device=lstm_out.device)
            last = lstm_out[rows, (lengths - 1).clamp(min=0)]
        return self.fc(last)


# Training function
def train_model(model_name, dataset_uri, eval_dataset_uri, staging_bucket, rank, world_size):
    logging.info(f"Model name: {model_name}")
    logging.info(f"Dataset URI: {dataset_uri}")
    logging.info(f"Eval Dataset URI: {eval_dataset_uri}")
    logging.info(f"Staging Bucket: {staging_bucket}")

    use_cuda = torch.cuda.is_available()

    # Initialize process group for distributed training. setdefault keeps a
    # rendezvous address/port that the launcher may already have exported.
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '12355')
    dist.init_process_group("nccl" if use_cuda else "gloo", rank=rank, world_size=world_size)

    device = torch.device(f"cuda:{rank}" if use_cuda else "cpu")
    if use_cuda:
        torch.cuda.set_device(device)

    # Data loading
    train_dataset = CustomDataset(dataset_uri)
    eval_dataset = CustomDataset(eval_dataset_uri)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=world_size, rank=rank
    )
    train_loader = DataLoader(train_dataset, batch_size=32, sampler=train_sampler)
    eval_loader = DataLoader(eval_dataset, batch_size=32)

    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

    # Model, optimizer, and loss
    model = MyModel(tokenizer.vocab_size).to(device)
    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[rank] if use_cuda else None
    )
    optimizer = optim.Adam(model.parameters())
    criterion = nn.MSELoss()  # Or other appropriate loss function

    # Where rank 0 stores the checkpoint: locally first, then under the staging location
    bucket_name, prefix = split_gcs_uri(staging_bucket)
    model_blob_name = f"{prefix}/model.pth" if prefix else "model.pth"
    local_model_path = os.path.join(tempfile.gettempdir(), "model.pth")

    # Training loop
    for epoch in range(NUM_EPOCHS):
        model.train()
        train_sampler.set_epoch(epoch)  # Important for shuffling data across epochs in DDP
        for batch_idx, (prompts, completions) in enumerate(train_loader):
            # Tokenize prompts (padded to the longest prompt of the batch)
            inputs = tokenizer(list(prompts), padding=True, truncation=True, return_tensors="pt")
            input_ids = inputs["input_ids"].to(device)
            lengths = inputs["attention_mask"].sum(dim=1).to(device)

            # The default collate function already returns the labels as a tensor
            targets = completions.to(device=device, dtype=torch.float32)

            optimizer.zero_grad()
            outputs = model(input_ids, lengths).squeeze(-1)  # (batch, 1) -> (batch,)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            if batch_idx % 100 == 0 and rank == 0:  # Only print on rank 0 to avoid duplicate logs
                logging.info(f"Epoch [{epoch + 1}/{NUM_EPOCHS}], Batch [{batch_idx + 1}/{len(train_loader)}], Loss: {loss.item():.4f}")

        # ... (Evaluation on eval_loader) ...
        # Rank 0 saves after every epoch, so the bucket always holds the latest checkpoint
        if rank == 0:
            torch.save(model.module.state_dict(), local_model_path)  # Save only the model parameters
            blob = storage.Client().bucket(bucket_name).blob(model_blob_name)
            blob.upload_from_filename(local_model_path)
            logging.info(f"Uploaded model to gs://{bucket_name}/{model_blob_name}")

    dist.destroy_process_group()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, required=True, help="Name of the base Gemini model to fine-tune.")
    parser.add_argument("--dataset", type=str, required=True, help="URI of the training dataset.")
    parser.add_argument("--eval_dataset", type=str, required=True, help="URI of the evaluation dataset.")
    parser.add_argument("--staging_bucket", type=str, required=True, help="GCS bucket for staging model artifacts.")
    # PyTorch >= 2.0 launchers pass --local-rank, older ones pass --local_rank.
    parser.add_argument("--local-rank", "--local_rank", dest="local_rank", type=int,
                        default=int(os.environ.get("LOCAL_RANK", -1)),
                        help='Local rank for distributed training')
    args = parser.parse_args()

    # Without a launcher there is neither a rank nor WORLD_SIZE: run as a single process.
    local_rank = args.local_rank if args.local_rank >= 0 else 0
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    train_model(args.model, args.dataset, args.eval_dataset, args.staging_bucket, local_rank, world_size)
"""


os.makedirs('/content/drive/myproject/', exist_ok=True)

# Create trainer/train.py
with open('/content/drive/myproject/train.py', 'w') as f:
    f.write(train_py_content)

# Create setup.py
setup_py_content = """
from setuptools import find_packages, setup

setup(
    name='trainer',
    version='0.1',
    packages=find_packages(),
    install_requires=[
        'google-cloud-aiplatform',
        'torch',
        'transformers'
    ]
)
"""
with open('/content/drive/myproject/setup.py', 'w') as f:
    f.write(setup_py_content)

print('\n\n')
print(setup_py_content)
print('\n\n')


# Package and upload training code
!python setup.py sdist --formats=gztar
PACKAGE_NAME = "trainer-0.1.tar.gz"

# **Authenticate explicitly for Google Cloud Storage**
from google.colab import auth
auth.authenticate_user()

# **Then create the storage client**
storage_client = storage.Client()

bucket = storage_client.bucket(BUCKET_NAME)
blob = bucket.blob(f"custom_training_models/{PACKAGE_NAME}")
blob.upload_from_filename(os.path.join("dist", PACKAGE_NAME))


# *** Create and upload cloudbuild.yaml ***
# One build step: build the image tagged IMAGE_URI from the submitted directory
# and push it (via `images`).
cloudbuild_content = f"""
steps:
- name: 'gcr.io/cloud-builders/docker'
  args: ['build', '-t', '{IMAGE_URI}', '.']
images:
- '{IMAGE_URI}'
"""

# Write cloudbuild.yaml into the project directory, next to train.py, because
# that is the directory this notebook later submits to Cloud Build.
cloudbuild_path = '/content/drive/myproject/cloudbuild.yaml'
with open(cloudbuild_path, 'w') as f:
    f.write(cloudbuild_content)

# A blob name is a path inside the bucket, not a local path: the later
# `gsutil` commands expect the file at gs://<bucket>/cloudbuild.yaml.
blob = bucket.blob('cloudbuild.yaml')
blob.upload_from_filename(cloudbuild_path)


# Dockerfile for the distributed training image.
# --nproc_per_node=gpu starts one worker per visible GPU. The previous value
# was the documentation placeholder NUM_GPUS_YOU_HAVE, which the launcher
# rejects at start-up.
dockerfile_content = """
FROM pytorch/pytorch:latest

# Install other dependencies
RUN pip install google-cloud-aiplatform transformers

# Copy your training code
COPY . /trainer

# Set the entry point
ENTRYPOINT ["python", "-m", "torch.distributed.launch", "--nproc_per_node=gpu",  "/trainer/train.py"]
"""
with open('/content/drive/myproject/Dockerfile', 'w') as f:
    f.write(dockerfile_content)

#print('\n\n')
#!gsutil cp -pr trainer gs://{BUCKET_NAME}/
#print('\n\n')



# Create requirements.txt
requirements_content = """
google-cloud-aiplatform
torch
transformers
"""

# Write requirements.txt
with open("/content/drive/myproject/requirements.txt", "w") as f:
    f.write(requirements_content)



# Grant the Cloud Build service account the roles it needs.
# The service account is addressed by project *number*, so look that up first;
# the first output line of the gcloud command is taken as the number.
project_number_output = !gcloud projects describe {project_id} --format="value(projectNumber)"
project_number = project_number_output[0]

# Grant roles/storage.admin on the bucket to the Cloud Build service account.
# NOTE(review): storage.admin is a broad role; confirm a narrower one is not sufficient.
!gsutil iam ch serviceAccount:{project_number}@cloudbuild.gserviceaccount.com:roles/storage.admin gs://{BUCKET_NAME}  # Replace YOUR_BUCKET


# Grant roles/artifactregistry.writer at project level to the same service account
!gcloud projects add-iam-policy-binding {project_id} \
  --member=serviceAccount:{project_number}@cloudbuild.gserviceaccount.com \
  --role=roles/artifactregistry.writer

# You can now print the configuration files or use them in your build process
print('\n')
print("Cloud Build Configuration:")
print('\n')
print("cloudbuild.yaml content:")
print(cloudbuild_content)
print('\n')
print("\nDockerfile content:")
print(dockerfile_content)
print('\n\n')

!gsutil cp gs://{BUCKET_NAME}/cloudbuild.yaml cloudbuild.yaml
print('\n')

!gsutil ls gs://{BUCKET_NAME}/
print('\n')

!gsutil ls gs://{BUCKET_NAME}/trainer
print('\n')

!gsutil ls gs://{BUCKET_NAME}/cloudbuild.yaml
print('\n')

!gsutil ls gs://{BUCKET_NAME}/logs
!gsutil cp -pr gs://{BUCKET_NAME}/logs .
print('\n')


print('\n\n')
print(f"Project ID: {project_id}")
print(f"Region: {REGION}")
print(f"Bucket Name: {BUCKET_NAME}")
print(f"Image URI: {IMAGE_URI}")
print('\n\n')


# Navigate to your project directory in Google Drive
project_path = '/content/drive/myproject'
os.chdir(project_path)
print(f"Current working directory: {os.getcwd()}")

In [None]:
!gcloud builds submit . --config cloudbuild.yaml --project {project_id}

ID                                    CREATE_TIME                DURATION  SOURCE                                                                                                    IMAGES                                                                                    STATUS
ad38b690-XXXXX-4b8d-xxxx-xxxxxxxxx  2025-03-31T02:29:07+00:00  3M39S     gs://LLL-KKKK-MMMMM-XXXXXXXXXX_cloudbuild/source/1743388147.235874-4248758bc9d2411cb5b407d317d2a1c2.tgz  us-central1-docker.pkg.dev/LLL-KKKK-MMMMM-XXXXXXXXXX/my-repo/my-pytorch-image (+1 more)  SUCCESS


## Tuning with CPU

In [None]:
### FINE TUNING ###

import colab_env

import os
from google.cloud import aiplatform, storage
import logging
import vertexai

# Set up logging
logging.basicConfig(level=logging.INFO)

# Google Cloud Project and Region
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]
REGION = os.environ["GOOGLE_CLOUD_REGION"]
BUCKET_NAME = os.environ["GOOGLE_CLOUD_BUCKET_NAME"]
IMAGE_URI = f"us-central1-docker.pkg.dev/{PROJECT_ID}/my-repo/my-pytorch-image:latest"  # Your image URI in Artifact Registry

#print('\n\n')
#print(f"Project ID: {PROJECT_ID}")
#print(f"Region: {REGION}")
#print(f"Bucket Name: {BUCKET_NAME}")
#print(f"Image URI: {IMAGE_URI}")
#print('\n\n')

# Initialize Vertex AI and GCS
vertexai.init(project=PROJECT_ID, location=REGION)
storage_client = storage.Client()

# Create or verify GCS bucket
bucket = storage_client.bucket(BUCKET_NAME)
if not bucket.exists():
    bucket.create(location=REGION)
    logging.info(f"Bucket {BUCKET_NAME} created in {REGION}.")
else:
    logging.info(f"Bucket {BUCKET_NAME} already exists.")

# Configure staging bucket in aiplatform
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=f"gs://{BUCKET_NAME}/staging")

# Print SDK version
logging.info(f"Vertex AI SDK version: {aiplatform.__version__}")


# Path to your saved model in GCS
MODEL_PATH = f"gs://{BUCKET_NAME}/model_output/" # Updated path


# *** Define and create trainer/train.py *** FOR CPU ONLY
# Corrections compared with the previous revision of the script:
#   * the LSTM is batch-first and reads the last real (non-padding) token, so
#     the recurrence runs over the tokens of a prompt and not across samples;
#   * labels are converted with .to() (they already arrive as a tensor);
#   * the epoch log reports the real number of epochs;
#   * a training error is logged and re-raised instead of being swallowed, so
#     a failed run no longer looks successful (the save in `finally` still runs);
#   * the local symlink on gs:// paths is replaced by a second upload to
#     <base_output_dir>/model/model.pth, the location the symlink was aiming at.
train_py_content = """
import argparse
import os
import json
from google.cloud import storage
import logging
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer
import time  # For retry logic

# Set up logging
logging.basicConfig(level=logging.INFO)

NUM_EPOCHS = 10

# CustomDataset definition
class CustomDataset(Dataset):
    def __init__(self, data_path, tokenizer):
        self.data = []
        self.tokenizer = tokenizer
        storage_client = storage.Client()
        bucket_name = data_path.split('/')[2]
        blob_name = '/'.join(data_path.split('/')[3:])
        bucket = storage_client.bucket(bucket_name)
        blob = bucket.blob(blob_name)
        tmp_file = "/tmp/temp_data.jsonl"
        blob.download_to_filename(tmp_file)

        with open(tmp_file, 'r') as f:
            for line in f:
                try:
                    example = json.loads(line)
                    # Validate and cast 'completion' to int
                    completion_value = example["completion"]
                    if not isinstance(completion_value, int):
                        try:
                            completion_value = int(completion_value)
                        except (TypeError, ValueError):
                            logging.warning(
                                f"Skipping invalid 'completion' value: {completion_value} in line: {line}")
                            continue  # Skip this line
                    # Flatten and stringify 'prompt'
                    prompt = example["prompt"]
                    if isinstance(prompt, (list, tuple)):
                        prompt = "".join([str(item) for item in prompt])
                    elif isinstance(prompt, dict):
                        prompt = json.dumps(prompt)
                    else:
                        prompt = str(prompt)
                    self.data.append((prompt, completion_value))
                except json.JSONDecodeError as e:
                    logging.warning(f"Skipping invalid JSON line: {line}, error: {e}")
        os.remove(tmp_file)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Prompts were already normalised to str in __init__.
        prompt, completion = self.data[idx]
        inputs = self.tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt")
        # inputs["input_ids"] has shape [1, sequence_length]; squeeze to [sequence_length]
        return inputs["input_ids"].squeeze(0), completion

# MyModel definition
class MyModel(nn.Module):
    def __init__(self, embedding_dim=128, hidden_size=128, tokenizer=None):
        super(MyModel, self).__init__()
        if tokenizer is None:
            raise ValueError("MyModel needs the tokenizer to size its embedding table")
        self.tokenizer = tokenizer
        self.embedding = nn.Embedding(self.tokenizer.vocab_size, embedding_dim)
        # batch_first=True: inputs are (batch, seq), so the recurrence must run
        # over dimension 1 (tokens), not over dimension 0 (samples).
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)  # Output layer for regression

    def forward(self, x, lengths=None):
        # x: (batch, seq) token ids; lengths: optional (batch,) count of real tokens
        embeds = self.embedding(x)
        lstm_out, _ = self.lstm(embeds)
        if lengths is None:
            last = lstm_out[:, -1, :]
        else:
            # Prompts are padded to max_length, so the final position is
            # normally padding; read the state at the last real token instead.
            rows = torch.arange(lstm_out.size(0), device=lstm_out.device)
            last = lstm_out[rows, (lengths - 1).clamp(min=0)]
        return self.fc(last)

# Training function
def train_model(model_name, dataset_uri, eval_dataset_uri, staging_bucket, bucket_name, base_output_dir):
    logging.info(f"Model name: {model_name}")
    logging.info(f"Dataset URI: {dataset_uri}")
    logging.info(f"Eval Dataset URI: {eval_dataset_uri}")
    logging.info(f"Staging Bucket: {staging_bucket}")
    logging.info(f"Bucket Name: {bucket_name}")
    client = storage.Client()
    bucket = client.bucket(bucket_name)

    # Tokenization (example using AutoTokenizer)
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # Replace if needed
    pad_id = tokenizer.pad_token_id

    # Data loading: the evaluation split is carved out of the training file
    # (eval_dataset_uri is only logged above).
    dataset = CustomDataset(dataset_uri, tokenizer)
    train_size = int(0.8 * len(dataset))  # 80% for training
    eval_size = len(dataset) - train_size  # Remaining 20% for evaluation
    train_dataset, eval_dataset = random_split(dataset, [train_size, eval_size])
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    eval_loader = DataLoader(eval_dataset, batch_size=32)

    # Model, optimizer, and loss
    model = MyModel(tokenizer=tokenizer)
    device = torch.device("cpu")  # Explicitly use CPU
    model.to(device)
    print(f"Using device: {device}")
    optimizer = optim.Adam(model.parameters())
    criterion = nn.MSELoss()  # Or other appropriate loss function

    # Debugging: Print working directory
    logging.info(f"Current working directory: {os.getcwd()}")

    # Training loop
    try:
        for epoch in range(NUM_EPOCHS):
            model.train()
            for batch_idx, (prompts, completions) in enumerate(train_loader):
                prompts = prompts.to(device)
                lengths = (prompts != pad_id).sum(dim=1)  # Real tokens per prompt
                # The default collate function already returns the labels as a tensor
                targets = completions.to(device=device, dtype=torch.float32).unsqueeze(1)
                optimizer.zero_grad()
                outputs = model(prompts, lengths)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()
                if batch_idx % 100 == 0:  # Adjust logging frequency if needed
                    logging.info(f"Epoch [{epoch + 1}/{NUM_EPOCHS}], Batch [{batch_idx}], Loss: {loss.item()}")
    except Exception:
        # Log with traceback and let the job fail; the finally block below
        # still stores whatever state the model reached.
        logging.exception("An error occurred during training")
        raise
    finally:
        # Path of the output "directory" inside the bucket, with a trailing slash
        output_dir = base_output_dir.replace(f"gs://{bucket_name}/", "")
        output_dir = output_dir.rstrip('/') + '/'

        # Create an empty placeholder object for the directory if it is missing
        blob = bucket.blob(output_dir)
        if not blob.exists():
            logging.info(f"Creating directory: gs://{bucket_name}/{output_dir}")
            bucket.blob(output_dir).upload_from_string("")

        model_save_path = f"gs://{bucket_name}/{output_dir}model.pth"
        # Second copy under <base_output_dir>/model/, where the model artifacts are expected
        model_dir_blob = output_dir + 'model/model.pth'
        logging.info(f"Saving model to: {model_save_path}")

        # Retry logic for saving the model
        max_retries = 3
        for attempt in range(max_retries):
            try:
                torch.save(model.state_dict(), "/tmp/model.pth")  # Save locally first
                bucket.blob(output_dir + 'model.pth').upload_from_filename("/tmp/model.pth")
                bucket.blob(model_dir_blob).upload_from_filename("/tmp/model.pth")
                logging.info(f"Model saved to: {model_save_path}")
                logging.info(f"Model copied to: gs://{bucket_name}/{model_dir_blob}")
                break  # Success, exit the loop
            except Exception as e:
                logging.error(f"Error saving model (attempt {attempt + 1}): {e}")
                if attempt < max_retries - 1:
                    time.sleep(5)  # Wait a few seconds before retrying
                else:
                    raise  # Re-raise the exception if retries fail

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, required=True, help="Name of the model")
    parser.add_argument("--dataset", type=str, required=True, help="URI of the training dataset")
    parser.add_argument("--eval_dataset", type=str, required=True, help="URI of the evaluation dataset")
    parser.add_argument("--staging_bucket", type=str, required=True, help="Staging bucket URI")
    parser.add_argument("--bucket_name", type=str, required=True, help="Bucket Name")
    parser.add_argument("--base_output_dir", type=str, required=True, help="Base output directory")
    args = parser.parse_args()
    train_model(args.model, args.dataset, args.eval_dataset, args.staging_bucket, args.bucket_name,
                args.base_output_dir)
"""


# Create trainer/train.py
os.makedirs('trainer', exist_ok=True)
with open('trainer/train.py', 'w') as f:
    f.write(train_py_content)

# Create setup.py
setup_py_content = """
from setuptools import find_packages, setup

setup(
    name='trainer',
    version='0.1',
    packages=find_packages(),
    install_requires=[
        'google-cloud-aiplatform',
        'torch',
        'transformers'
    ]
)
"""
with open('setup.py', 'w') as f:
    f.write(setup_py_content)

# Package and upload training code
!python setup.py sdist --formats=gztar
PACKAGE_NAME = "trainer-0.1.tar.gz"
bucket = storage_client.bucket(BUCKET_NAME)
blob = bucket.blob(f"custom_training_models/{PACKAGE_NAME}")
blob.upload_from_filename(os.path.join("dist", PACKAGE_NAME))

# *** Create and upload cloudbuild.yaml and Dockerfile ***
cloudbuild_content = f"""
steps:
- name: 'gcr.io/cloud-builders/docker'
  args: ['build', '-t', '{IMAGE_URI}', '.']
images:
- '{IMAGE_URI}'
"""

# Write cloudbuild.yaml locally
with open('cloudbuild.yaml', 'w') as f:
    f.write(cloudbuild_content)

blob = bucket.blob('cloudbuild.yaml')
blob.upload_from_filename('cloudbuild.yaml')

# Dockerfile content - Specify requirements and copy necessary files
dockerfile_content = f"""
# Or a GPU-enabled PyTorch image if needed
FROM pytorch/pytorch:1.11-cpu
WORKDIR /app
COPY requirements.txt requirements.txt

# Install the required libraries in the Docker image
RUN pip install --no-cache-dir -r requirements.txt

# Copy the training code into the Docker image
COPY trainer trainer

# Set the entry point for the Docker image
ENTRYPOINT ["python", "trainer/train.py"]
"""
# Write Dockerfile locally
with open('Dockerfile', 'w') as f:
    f.write(dockerfile_content)

blob = bucket.blob('Dockerfile')
blob.upload_from_filename('Dockerfile')

# Create requirements.txt
requirements_content = """
google-cloud-aiplatform
torch
transformers
"""

# Write requirements.txt
with open("requirements.txt", "w") as f:
    f.write(requirements_content)


!gsutil cp -pr trainer gs://{BUCKET_NAME}/
!gsutil cp -pr Dockerfile gs://{BUCKET_NAME}/trainer/
!gsutil cp -pr requirements.txt gs://{BUCKET_NAME}/trainer/


# Define and run the custom training job.
# The two data paths point at the JSONL files in the bucket root.
TRAINING_DATA_PATH = f"gs://{BUCKET_NAME}/training_data.jsonl"
EVAL_DATA_PATH = f"gs://{BUCKET_NAME}/eval_data.jsonl"
BASE_MODEL_NAME = "chat-bison@001"  # Forwarded to the script as --model
BASE_OUTPUT_DIR = f"gs://{BUCKET_NAME}/model_output"  # Passed both to the script and to job.run()

job = aiplatform.CustomTrainingJob(
    display_name="POC-my-custom-training-job",
    script_path="trainer/train.py",

    # The custom image (IMAGE_URI) is left disabled; per the original author's
    # note, Vertex AI did not validate it:
    #container_uri=IMAGE_URI,

    # Prebuilt Vertex AI training container used instead
    container_uri='us-docker.pkg.dev/vertex-ai/training/pytorch-xla.2-4.py310:latest',

    # Extra requirements passed to the job for the training script
    requirements=["google-cloud-aiplatform", "torch", "transformers"],
    model_serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/pytorch-cpu.1-11:latest"
)

# The args list mirrors the argparse options declared in trainer/train.py
model = job.run(
    args=[
        "--model", BASE_MODEL_NAME,
        "--dataset", TRAINING_DATA_PATH,
        "--eval_dataset", EVAL_DATA_PATH,
        "--staging_bucket", f"gs://{BUCKET_NAME}/staging",
        "--bucket_name", BUCKET_NAME,
        "--base_output_dir", BASE_OUTPUT_DIR,
    ],
    replica_count=1,
    machine_type="n1-standard-8",
    model_display_name="my-pytorch-model",
    base_output_dir=BASE_OUTPUT_DIR,
)

# Report the value returned by job.run() in the log and in the cell output
logging.info(f"Fine-tuned model: {model}")
print(f"Fine-tuned model: {model}")

In [71]:
print('Fine tuned model is successfully created')
print(f"Fine-tuned model: {model.display_name}")

Fine tuned model is successfully created
Fine-tuned model: my-pytorch-model


## GCP - VERTEX AI IAM

In [None]:
!gsutil ls gs://{BUCKET_NAME}/model_output

In [None]:
!gsutil cp /content/requirements.txt gs://{BUCKET_NAME}/model_output/model/

In [None]:
!gsutil ls gs://{BUCKET_NAME}/model_output/model

In [None]:
!gsutil ls gs://{BUCKET_NAME}/model_output/

In [None]:
!gsutil acl get gs://{BUCKET_NAME}

In [None]:
!gsutil iam get gs://{BUCKET_NAME}

In [55]:
!gsutil iam ch serviceAccount:{project_number}@cloudbuild.gserviceaccount.com:roles/storage.objectCreator gs://{BUCKET_NAME}