In [1]:
from azureml.core import Workspace

# Connect to the Azure ML workspace via Workspace.from_config().
# A config.json for the workspace is expected alongside this notebook.
ws = Workspace.from_config()
workspace_name = ws.name
print("Azure ML Workspace loaded:", workspace_name)


Azure ML Workspace loaded: azure_ml


In [2]:
from azureml.core import Environment

# Pinned pip requirements for the training environment.
pip_requirements = ("torch==2.1.0", "torchvision==0.16.0")

# Define the environment and attach each pinned pip package to its
# conda dependency specification.
pytorch_env = Environment("pytorch-env")
conda_deps = pytorch_env.python.conda_dependencies
for requirement in pip_requirements:
    conda_deps.add_pip_package(requirement)

# Register the definition with the workspace loaded earlier (`ws`).
pytorch_env.register(workspace=ws)
print("New PyTorch environment created.")


New PyTorch environment created.


In [3]:
import torch
import torchvision



In [5]:
import kagglehub

In [6]:

import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, models, transforms
import os
from azureml.core import Workspace, Experiment, Model, Run

# Obtain the Azure ML run context for this session.
# NOTE(review): in an interactive notebook (not a submitted run) this is
# presumably an offline run object — confirm against the azureml-core docs.
# `run` is not referenced by any later cell visible in this notebook.
run = Run.get_context()

In [7]:
# Kaggle dataset handle (owner/dataset) for the ants-vs-bees images.
KAGGLE_DATASET = "ajayrana/hymenoptera-data"

# Download the dataset through kagglehub; the returned value is the local
# directory that holds the downloaded files.
path = kagglehub.dataset_download(KAGGLE_DATASET)

print("Path to dataset files:", path)

 79%|███████▊  | 71.0M/90.2M [00:00<00:00, 73.4MB/s]


Path to dataset files: /home/azureuser/.cache/kagglehub/datasets/ajayrana/hymenoptera-data/versions/1


In [9]:
import os

# Show where the dataset landed and what the top level of that folder contains.
print("Downloaded path:", path)
dataset_entries = os.listdir(path)
print("Files in dataset folder:", dataset_entries)


Downloaded path: /home/azureuser/.cache/kagglehub/datasets/ajayrana/hymenoptera-data/versions/1
Files in dataset folder: ['hymenoptera_data']


In [10]:
import shutil

# Working-directory location that receives a copy of the downloaded dataset.
dataset_path = "./hymenoptera_data"

# Copy (not move) the downloaded tree into the working directory; the source
# is left in place. dirs_exist_ok=True lets the cell be re-run when the
# destination already exists. Nothing happens if `path` is not a directory.
source_is_directory = os.path.isdir(path)
if source_is_directory:
    shutil.copytree(src=path, dst=dataset_path, dirs_exist_ok=True)
    print("Dataset moved successfully.")


Dataset moved successfully.


In [12]:
import os

# Root of the image folders inside the copied tree (the dataset is nested
# several "hymenoptera_data" levels deep).
dataset_path = "./hymenoptera_data/hymenoptera_data/hymenoptera_data"

# Derive the train / validation directories from the dataset root.
train_dir, val_dir = (
    os.path.join(dataset_path, split) for split in ("train", "val")
)

print("Train Directory:", train_dir)
print("Validation Directory:", val_dir)

# Fail early, with a readable message, if either split directory is missing.
for split_dir, missing_message in (
    (train_dir, "Train directory not found!"),
    (val_dir, "Validation directory not found!"),
):
    assert os.path.isdir(split_dir), missing_message


Train Directory: ./hymenoptera_data/hymenoptera_data/hymenoptera_data/train
Validation Directory: ./hymenoptera_data/hymenoptera_data/hymenoptera_data/val


In [13]:
# List the class sub-folders found in each split directory.
for split_name, split_dir in (("train", train_dir), ("val", val_dir)):
    print(f"Classes in {split_name} folder:", os.listdir(split_dir))


Classes in train folder: ['ants', 'bees']
Classes in val folder: ['ants', 'bees']


In [14]:
# Preprocessing constants. The mean/std values are the per-channel
# normalization statistics torchvision documents for its ImageNet-pretrained
# models.
INPUT_SIZE = (224, 224)
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]
BATCH_SIZE = 32


def _build_transform(augment):
    """Return the preprocessing pipeline for one split.

    Every split is resized, converted to a tensor and normalized; when
    `augment` is true a random horizontal flip is inserted after the resize.
    """
    steps = [transforms.Resize(INPUT_SIZE)]
    if augment:
        steps.append(transforms.RandomHorizontalFlip())
    steps.append(transforms.ToTensor())
    steps.append(transforms.Normalize(NORMALIZE_MEAN, NORMALIZE_STD))
    return transforms.Compose(steps)


# Only the training split is augmented.
data_transforms = {
    "train": _build_transform(augment=True),
    "val": _build_transform(augment=False),
}

# One ImageFolder dataset per split, each reading from its own directory.
split_dirs = {"train": train_dir, "val": val_dir}
image_datasets = {
    split: datasets.ImageFolder(split_dirs[split], data_transforms[split])
    for split in split_dirs
}

# Batches of 32; only the training loader shuffles.
dataloaders = {
    split: torch.utils.data.DataLoader(
        dataset, batch_size=BATCH_SIZE, shuffle=(split == "train")
    )
    for split, dataset in image_datasets.items()
}

# Class labels as discovered by ImageFolder from the training directory.
class_names = image_datasets["train"].classes


In [15]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load VGG16 with its ImageNet weights. The `pretrained=True` flag is
# deprecated since torchvision 0.13 (this notebook pins 0.16.0); the explicit
# weights enum selects the same checkpoint without the deprecation warning.
model = models.vgg16(weights=models.VGG16_Weights.IMAGENET1K_V1)

# Freeze the convolutional backbone. The optimizer below only receives the
# classifier parameters, so backbone gradients would never be applied (or
# zeroed); disabling them avoids computing and accumulating them on every
# backward pass. The parameter updates performed by the optimizer are unchanged.
for backbone_param in model.features.parameters():
    backbone_param.requires_grad = False

# Replace the final classifier layer so the output size matches the number of
# classes found in the training data.
num_ftrs = model.classifier[6].in_features
model.classifier[6] = nn.Linear(num_ftrs, len(class_names))

model = model.to(device)

# Cross-entropy loss; Adam updates the classifier head only.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.classifier.parameters(), lr=0.001)


Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /home/azureuser/.cache/torch/hub/checkpoints/vgg16-397923af.pth
100%|██████████| 528M/528M [00:02<00:00, 232MB/s]  


In [16]:
num_epochs = 5

# Each epoch runs one training pass followed by one validation pass.
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")

    for phase in ["train", "val"]:
        is_training = phase == "train"
        # train(True) puts the model in training mode; train(False) is what
        # eval() does.
        model.train(is_training)

        # Sample-weighted loss sum and number of correct predictions so far.
        running_loss, correct = 0.0, 0

        for inputs, labels in dataloaders[phase]:
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            # Gradients are tracked only while training.
            with torch.set_grad_enabled(is_training):
                outputs = model(inputs)
                loss = criterion(outputs, labels)

                if is_training:
                    loss.backward()
                    optimizer.step()

            # The criterion returns a batch mean, so weight it by the batch
            # size before accumulating.
            samples_in_batch = inputs.size(0)
            running_loss += loss.item() * samples_in_batch
            predictions = outputs.argmax(1)
            correct += (predictions == labels).sum().item()

        samples_in_split = len(image_datasets[phase])
        epoch_loss = running_loss / samples_in_split
        epoch_acc = correct / samples_in_split
        print(f"{phase} Loss: {epoch_loss:.4f}, Acc: {epoch_acc:.4f}")

print("Training complete.")


Epoch 1/5
train Loss: 2.8378, Acc: 0.8033
val Loss: 1.4083, Acc: 0.8954
Epoch 2/5
train Loss: 1.1733, Acc: 0.8934
val Loss: 2.6172, Acc: 0.9085
Epoch 3/5
train Loss: 0.4943, Acc: 0.9713
val Loss: 2.2651, Acc: 0.9412
Epoch 4/5
train Loss: 0.3815, Acc: 0.9713
val Loss: 1.8385, Acc: 0.9412
Epoch 5/5
train Loss: 0.2831, Acc: 0.9754
val Loss: 1.6123, Acc: 0.9477
Training complete.


In [17]:
# Persist only the model's state dict (not the whole module object) to a
# file in the working directory.
model_path = "vgg16_hymenoptera.pth"
trained_weights = model.state_dict()
torch.save(trained_weights, model_path)
print("Model saved.")


Model saved.


In [18]:
# Register the saved weights file as a model in the Azure ML workspace.
registration_args = {
    "workspace": ws,
    "model_path": model_path,
    "model_name": "vgg16_hymenoptera",
    "description": "Fine-tuned VGG16 model for hymenoptera classification",
}
Model.register(**registration_args)

print("Model registered in Azure ML.")


Registering model vgg16_hymenoptera
Model registered in Azure ML.
