<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/VertexAI_DEMO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install google-cloud-aiplatform -q
!pip install google-cloud-storage -q
!pip install google-cloud-bigquery -q
!pip install google-cloud-bigquery-storage -q
!pip install google-cloud-aiplatform -q
!pip install datasets -q
!pip install colab-env -q

# Install necessary libraries
!pip install --upgrade --quiet gcsfs==2024.3.1
!pip install --upgrade --quiet accelerate==0.31.0
!pip install --upgrade --quiet transformers==4.45.2
!pip install --upgrade --quiet datasets==2.19.2
!pip install --upgrade google-cloud-aiplatform[all] -q
!pip install vertexai --upgrade -q
!pip install tensorflow_datasets -q

In [None]:
!pip install colab-env -q
import colab_env
import os
from google.cloud import aiplatform, storage
import logging
import subprocess

In [None]:
!gcloud artifacts repositories create my-repo-tmp \
    --repository-format=docker \
    --location=us-central1 \
    --description="My Docker repository" \
    --project={project_id}

In [None]:
import os
import colab_env

from google.cloud import storage # Import the storage module
from google.colab import auth

# Authenticate BEFORE building any Google Cloud client so the client is created
# with the user's credentials. Calling authenticate_user() again later is harmless.
auth.authenticate_user()
storage_client = storage.Client()

# 1. Get Project ID, Region and Bucket from Environment Variables
#    (a missing variable raises KeyError here, which surfaces misconfiguration early)
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
REGION = os.environ["GOOGLE_CLOUD_REGION"]
BUCKET_NAME = os.environ["GOOGLE_CLOUD_BUCKET_NAME"]

# 2. Image to build and push; can be overridden through the IMAGE_URI environment variable
IMAGE_URI = os.environ.get(
    "IMAGE_URI", f"us-central1-docker.pkg.dev/{project_id}/my-repo-tmp/my-pytorch-image:latest"
)

### ---------


# *** Define the training script (written to train.py further down) ***
# The script is kept in a plain (non-f) string, so the braces inside it are literal.
# It trains a small embedding + LSTM regressor on (prompt, completion) JSONL data read
# from GCS, using DistributedDataParallel, and uploads the weights to the staging bucket.
train_py_content = """
import argparse
import json
import logging
import os
import tempfile

import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
from google.cloud import storage
from torch.utils.data import DataLoader, Dataset
from torch.utils.data.distributed import DistributedSampler
from transformers import AutoTokenizer  # Import tokenizer (example using transformers)

# Set up logging
logging.basicConfig(level=logging.INFO)

NUM_EPOCHS = 10  # Adjust number of epochs as needed
BATCH_SIZE = 32
LOG_EVERY = 100  # Log the running loss every LOG_EVERY batches
MODEL_FILENAME = "model.pth"
TOKENIZER_NAME = "bert-base-uncased"  # Replace with your tokenizer


def split_gcs_uri(uri):
    '''Split 'gs://bucket/some/object' (scheme optional) into (bucket, object_path).'''
    path = uri[len("gs://"):] if uri.startswith("gs://") else uri
    bucket_name, _, object_path = path.partition("/")
    return bucket_name, object_path


# CustomDataset definition
class CustomDataset(Dataset):
    '''(prompt, completion) pairs loaded from a JSON-lines object stored in GCS.

    Lines that are not valid JSON, lack a 'prompt' or 'completion' field, or whose
    'completion' cannot be converted to int are skipped with a warning.
    '''

    def __init__(self, data_path):
        self.data = []
        bucket_name, blob_name = split_gcs_uri(data_path)
        blob = storage.Client().bucket(bucket_name).blob(blob_name)

        # Unique temp file per process: several ranks on the same machine would
        # otherwise download to, and delete, the very same path.
        fd, tmp_file = tempfile.mkstemp(suffix=".jsonl")
        os.close(fd)
        try:
            blob.download_to_filename(tmp_file)
            with open(tmp_file, "r") as f:
                for line in f:
                    if not line.strip():
                        continue
                    record = self._parse_line(line)
                    if record is not None:
                        self.data.append(record)
        finally:
            os.remove(tmp_file)

    @staticmethod
    def _parse_line(line):
        '''Return (prompt, completion) for one JSONL line, or None if it is unusable.'''
        try:
            example = json.loads(line)
        except json.JSONDecodeError as e:
            logging.warning(f"Skipping invalid JSON line: {line}, Error: {e}")
            return None

        if not isinstance(example, dict) or "prompt" not in example or "completion" not in example:
            logging.warning(f"Skipping line without 'prompt'/'completion' fields: {line}")
            return None

        # Validate and cast 'completion' to int
        completion_value = example["completion"]
        if not isinstance(completion_value, int):
            try:
                completion_value = int(completion_value)
            except (TypeError, ValueError):
                logging.warning(f"Skipping invalid 'completion' value: {completion_value} in line: {line}")
                return None

        # Flatten and stringify 'prompt'
        prompt = example["prompt"]
        if isinstance(prompt, (list, tuple)):
            prompt = " ".join(str(item) for item in prompt)
        elif isinstance(prompt, dict):
            prompt = json.dumps(prompt)
        else:
            prompt = str(prompt)

        return prompt, completion_value

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]


# MyModel definition
class MyModel(nn.Module):
    '''Embedding -> LSTM -> linear head producing one regression value per sequence.'''

    def __init__(self, vocab_size=10000, embedding_dim=128, hidden_size=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: inputs are (batch, seq) token ids, and forward() indexes
        # the time axis as dimension 1.
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)  # Output layer for regression

    def forward(self, x):
        # x: (batch, seq) tensor of token ids
        embeds = self.embedding(x)
        lstm_out, _ = self.lstm(embeds)
        return self.fc(lstm_out[:, -1, :])  # Output at the last time step


def upload_model(model, staging_bucket):
    '''Save the state dict of `model` locally and upload it to the staging bucket.'''
    local_path = os.path.join(tempfile.gettempdir(), MODEL_FILENAME)
    torch.save(model.state_dict(), local_path)  # Save only the model parameters

    bucket_name, prefix = split_gcs_uri(staging_bucket)
    blob_name = "/".join(part for part in (prefix.strip("/"), MODEL_FILENAME) if part)
    storage.Client().bucket(bucket_name).blob(blob_name).upload_from_filename(local_path)
    logging.info(f"Uploaded model to gs://{bucket_name}/{blob_name}")


# Training function
def train_model(model_name, dataset_uri, eval_dataset_uri, staging_bucket, rank, world_size):
    '''Train MyModel with DistributedDataParallel and upload the weights after each epoch.

    `rank` is used both as the process rank and as the CUDA device index, i.e. a
    single-node job is assumed. A negative rank falls back to the LOCAL_RANK
    environment variable set by the launcher (0 if it is absent).
    '''
    logging.info(f"Model name: {model_name}")
    logging.info(f"Dataset URI: {dataset_uri}")
    logging.info(f"Eval Dataset URI: {eval_dataset_uri}")
    logging.info(f"Staging Bucket: {staging_bucket}")

    use_cuda = torch.cuda.is_available()
    if rank < 0:
        rank = int(os.environ.get("LOCAL_RANK", 0))
    world_size = max(int(world_size), 1)

    # Initialize process group for distributed training.
    # setdefault keeps whatever rendezvous address the launcher already exported.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "12355")
    dist.init_process_group("nccl" if use_cuda else "gloo", rank=rank, world_size=world_size)

    try:
        device = torch.device(f"cuda:{rank}" if use_cuda else "cpu")
        if use_cuda:
            torch.cuda.set_device(device)

        # Data loading
        train_dataset = CustomDataset(dataset_uri)
        eval_dataset = CustomDataset(eval_dataset_uri)
        train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=rank)
        train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler)
        eval_loader = DataLoader(eval_dataset, batch_size=BATCH_SIZE)

        # Tokenization (example using AutoTokenizer). Created before the model so the
        # embedding table can be sized to cover every token id the tokenizer emits.
        tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

        # Model, optimizer, and loss
        model = MyModel(vocab_size=len(tokenizer)).to(device)
        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[rank] if use_cuda else None
        )
        optimizer = optim.Adam(model.parameters())
        criterion = nn.MSELoss()  # Or other appropriate loss function

        # Training loop
        for epoch in range(NUM_EPOCHS):
            model.train()
            train_sampler.set_epoch(epoch)  # Important for shuffling data across epochs in DDP
            for batch_idx, (prompts, completions) in enumerate(train_loader):
                # Tokenize prompts and move the token ids to the training device
                inputs = tokenizer(list(prompts), padding=True, truncation=True, return_tensors="pt")
                input_ids = inputs["input_ids"].to(device)

                # The default collate function already yields a tensor of completions
                targets = torch.as_tensor(completions, dtype=torch.float32, device=device)

                optimizer.zero_grad()
                # (batch, 1) -> (batch,) so the loss compares element-wise with targets
                outputs = model(input_ids).squeeze(-1)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

                if batch_idx % LOG_EVERY == 0 and rank == 0:  # Only log on rank 0 to avoid duplicate logs
                    logging.info(f"Epoch [{epoch + 1}/{NUM_EPOCHS}], Batch [{batch_idx + 1}/{len(train_loader)}], Loss: {loss.item():.4f}")

            # ... (Evaluation on eval_loader) ...
            # Save the model if you are on the main process (rank 0)
            if rank == 0:
                upload_model(model.module, staging_bucket)
    finally:
        dist.destroy_process_group()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, required=True, help="Name of the base Gemini model to fine-tune.")
    parser.add_argument("--dataset", type=str, required=True, help="URI of the training dataset.")
    parser.add_argument("--eval_dataset", type=str, required=True, help="URI of the evaluation dataset.")
    parser.add_argument("--staging_bucket", type=str, required=True, help="GCS bucket for staging model artifacts.")
    # Accept both spellings: newer launchers pass --local-rank, older ones --local_rank.
    parser.add_argument("--local_rank", "--local-rank", dest="local_rank", type=int,
                        default=int(os.environ.get("LOCAL_RANK", -1)),
                        help="Local rank for distributed training")
    args = parser.parse_args()

    # Prefer the launcher-provided world size; fall back to the GPU count, then to 1.
    world_size = int(os.environ.get("WORLD_SIZE", 0)) or torch.cuda.device_count() or 1
    train_model(args.model, args.dataset, args.eval_dataset, args.staging_bucket, args.local_rank, world_size)
"""


# Project directory that holds everything setuptools and Cloud Build need.
os.makedirs('/content/drive/myproject/', exist_ok=True)

# Create train.py (a top-level module of the project, not a package)
with open('/content/drive/myproject/train.py', 'w') as f:
    f.write(train_py_content)

# Create setup.py
# `py_modules=['train']` is required: train.py sits next to setup.py rather than inside
# a package directory, so find_packages() alone finds nothing and the sdist would ship
# without the training code. google-cloud-storage is listed because train.py imports it.
setup_py_content = """
from setuptools import find_packages, setup

setup(
    name='trainer',
    version='0.1',
    packages=find_packages(),
    py_modules=['train'],
    install_requires=[
        'google-cloud-aiplatform',
        'google-cloud-storage',
        'torch',
        'transformers'
    ]
)
"""
with open('/content/drive/myproject/setup.py', 'w') as f:
    f.write(setup_py_content)

# Echo the generated setup.py for inspection
print('\n\n')
print(setup_py_content)
print('\n\n')


# Package and upload training code
!python setup.py sdist --formats=gztar
PACKAGE_NAME = "trainer-0.1.tar.gz"

# **Authenticate explicitly for Google Cloud Storage**
from google.colab import auth
auth.authenticate_user()

# **Then create the storage client**
storage_client = storage.Client()

bucket = storage_client.bucket(BUCKET_NAME)
blob = bucket.blob(f"custom_training_models/{PACKAGE_NAME}")
blob.upload_from_filename(os.path.join("dist", PACKAGE_NAME))


# *** Create and upload cloudbuild.yaml ***
# One docker build step tagged with IMAGE_URI; the image is also listed under `images`.
cloudbuild_content = f"""
steps:
- name: 'gcr.io/cloud-builders/docker'
  args: ['build', '-t', '{IMAGE_URI}', '.']
images:
- '{IMAGE_URI}'
"""

# Write cloudbuild.yaml into the project directory: the build is submitted from there
# with `--config cloudbuild.yaml`, so the file has to sit next to the Dockerfile.
cloudbuild_path = '/content/drive/myproject/cloudbuild.yaml'
with open(cloudbuild_path, 'w') as f:
    f.write(cloudbuild_content)

# Blob names are keys relative to the bucket root. The rest of this cell reads the file
# back as gs://<bucket>/cloudbuild.yaml, so the object must be named 'cloudbuild.yaml'
# and not after the local filesystem path.
blob = bucket.blob('cloudbuild.yaml')
blob.upload_from_filename(cloudbuild_path)


# Dockerfile for the training image: PyTorch base image + the project sources,
# started through the distributed launcher.
dockerfile_content = """
FROM pytorch/pytorch:latest

# Install other dependencies
RUN pip install google-cloud-aiplatform transformers

# Copy your training code
COPY . /trainer

# Set the entry point.
# --nproc_per_node=gpu starts one worker per GPU visible in the container; the previous
# value was an unfilled placeholder that the launcher cannot parse.
ENTRYPOINT ["python", "-m", "torch.distributed.launch", "--nproc_per_node=gpu", "/trainer/train.py"]
"""
with open('/content/drive/myproject/Dockerfile', 'w') as f:
    f.write(dockerfile_content)

#print('\n\n')
#!gsutil cp -pr trainer gs://{BUCKET_NAME}/
#print('\n\n')



# Create requirements.txt: one requirement per line, with a leading and a trailing newline.
requirements_content = "\n".join(
    ["", "google-cloud-aiplatform", "torch", "transformers", ""]
)

# Write requirements.txt into the project directory
with open("/content/drive/myproject/requirements.txt", "w") as requirements_file:
    requirements_file.write(requirements_content)



# Verify Artifact Registry Permissions
# Get Project Number (using project_id)
project_number_output = !gcloud projects describe {project_id} --format="value(projectNumber)"
project_number = project_number_output[0]

# Grant Storage Admin role to Cloud Build service account (using project_number)
!gsutil iam ch serviceAccount:{project_number}@cloudbuild.gserviceaccount.com:roles/storage.admin gs://{BUCKET_NAME}  # Replace YOUR_BUCKET


# Grant Artifact Registry Writer role to Cloud Build service account (using project_id and project_number)
!gcloud projects add-iam-policy-binding {project_id} \
  --member=serviceAccount:{project_number}@cloudbuild.gserviceaccount.com \
  --role=roles/artifactregistry.writer

# Echo the generated build configuration so it can be checked in the cell output.
# A single print with sep='\n' emits exactly what one print call per item would.
print(
    '\n',
    "Cloud Build Configuration:",
    '\n',
    "cloudbuild.yaml content:",
    cloudbuild_content,
    '\n',
    "\nDockerfile content:",
    dockerfile_content,
    '\n\n',
    sep='\n',
)

!gsutil cp gs://{BUCKET_NAME}/cloudbuild.yaml cloudbuild.yaml
print('\n')

!gsutil ls gs://{BUCKET_NAME}/
print('\n')

!gsutil ls gs://{BUCKET_NAME}/trainer
print('\n')

!gsutil ls gs://{BUCKET_NAME}/cloudbuild.yaml
print('\n')

!gsutil ls gs://{BUCKET_NAME}/logs
!gsutil cp -pr gs://{BUCKET_NAME}/logs .
print('\n')


# Summarise the resolved configuration, framed by blank lines.
config_summary = "\n".join([
    f"Project ID: {project_id}",
    f"Region: {REGION}",
    f"Bucket Name: {BUCKET_NAME}",
    f"Image URI: {IMAGE_URI}",
])
print('\n\n')
print(config_summary)
print('\n\n')


# Switch the working directory to the project folder on Google Drive
project_path = '/content/drive/myproject'
os.chdir(project_path)
print(f"Current working directory: {os.getcwd()}")

In [None]:
# Submit the current working directory (switched to the project folder by the os.chdir
# at the end of the previous cell) to Cloud Build, using the cloudbuild.yaml found there.
# That config runs a single `docker build` tagged with IMAGE_URI and lists the image
# under `images`.
!gcloud builds submit . --config cloudbuild.yaml --project {project_id}

ID                                    CREATE_TIME                DURATION  SOURCE                                                                                                    IMAGES                                                                                    STATUS
ad38b690-XXXXX-4b8d-xxxx-xxxxxxxxx  2025-03-31T02:29:07+00:00  3M39S     gs://LLL-KKKK-MMMMM-XXXXXXXXXX_cloudbuild/source/1743388147.235874-4248758bc9d2411cb5b407d317d2a1c2.tgz  us-central1-docker.pkg.dev/LLL-KKKK-MMMMM-XXXXXXXXXX/my-repo/my-pytorch-image (+1 more)  SUCCESS
