In [18]:
import numpy as np
import os
import tempfile
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from filelock import FileLock
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
from torchvision import models, datasets
from torch.optim import lr_scheduler
from typing import Dict, Optional, Any
import ray
import time
from ray import train, tune
from ray.train import Checkpoint
from ray.tune.schedulers import ASHAScheduler
from tempfile import TemporaryDirectory
from ray.air.integrations.wandb import WandbLoggerCallback, setup_wandb
from ray.tune.search.optuna import OptunaSearch
from ray.tune.search import ConcurrencyLimiter

In [19]:
import wandb

In [10]:
import shutil
source_dir = <DIR_PATH_TO_ASL_TRAIN>
root_dir = <ROOT_PROJECT_PATH>
train_dir = os.path.join(root_dir,'asl_train')
test_dir = os.path.join(root_dir,'asl_test')

if not os.path.exists(train_dir):
    os.mkdir(train_dir)
if not os.path.exists(test_dir):
    os.mkdir(test_dir)

for class_ in os.listdir(source_dir):
    source_folder = os.path.join(source_dir,class_)
    dest_folder = os.path.join(train_dir,class_)
    os.mkdir(dest_folder)
    
    file_list = os.listdir(source_folder)
    ceil_95 = int(len(file_list) * .95)
    for file in file_list[:ceil_95]:
        shutil.copy2(os.path.join(source_folder,file),os.path.join(dest_folder,file))
    dest_folder_test = os.path.join(test_dir,class_)
    os.mkdir(dest_folder_test)
    for file in file_list[ceil_95:]:
        shutil.copy2(os.path.join(source_folder,file),os.path.join(dest_folder_test,file))
    

FileExistsError: [Errno 17] File exists: '/kaggle/working/asl_train/N'

In [20]:
def load_data(data_dir="./data"):
    test_transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
    full_dataset = datasets.ImageFolder(<DATA_TRAIN_PATH>)
    test_dataset = datasets.ImageFolder(<DATA_TEST_PATH>,test_transform)
    train_dataset, val_dataset = random_split(full_dataset, [0.95, 0.05])

    train_dataset.dataset.transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
    
    val_dataset.dataset.transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
    
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4)
    val_loader =  torch.utils.data.DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=4)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=4)
    
    
    class_names = full_dataset.classes

    return train_loader, val_loader, test_loader,{'train': len(train_dataset), 'val': len(val_dataset)}, class_names


In [21]:
def train_model(config):
    print('model loaded')
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model_ft = models.resnet18(weights='IMAGENET1K_V1')
    train_loader, val_loader, _, dataset_sizes, class_names = load_data()
    for param in model_ft.parameters():
        param.requires_grad = False
    num_ftrs = model_ft.fc.in_features
    if config['normalization']:
        model_ft.fc = nn.Sequential(
            nn.BatchNorm1d(num_ftrs),  
            nn.Linear(num_ftrs, len(class_names))  
        )
    else:
        model_ft.fc = nn.Linear(num_ftrs, len(class_names))

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            model_ft = nn.DataParallel(model_ft)

    model_ft = model_ft.to(device)
    criterion = nn.CrossEntropyLoss()
    
    if config['optimizer'] == 'SGD':
        optimizer_ft = optim.SGD(model_ft.parameters(), lr=config['lr'], momentum=config['momentum'])
    elif config['optimizer'] == 'ADAM':
        optimizer_ft = optim.Adam(model_ft.parameters(), lr=config['lr'], betas=(config['beta1'], config['beta2']), weight_decay=config['decay'])
    else:
        optimizer_ft = optim.RMSprop(model_ft.parameters(), lr=config['lr'], alpha=config['alpha'], momentum=config['momentum'])
    
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=config['step_size'], gamma=config['gamma'])
    since = time.time()
    dataloaders = {'train' : train_loader, 'val': val_loader}


    if train.get_checkpoint():
        loaded_checkpoint = train.get_checkpoint()
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
            model_state, optimizer_state, scheduler_state = torch.load(
                os.path.join(loaded_checkpoint_dir, "checkpoint.pt")
            )
            model_ft.load_state_dict(model_state)
            optimizer_ft.load_state_dict(optimizer_state)
            exp_lr_scheduler.load_state_dict(scheduler_state)

    # Create a temporary directory to save training checkpoints
    with TemporaryDirectory() as tempdir:
        temp_checkpoint_dir = os.path.join(tempdir, 'checkpoint.pt')

        for epoch in range(config['epochs']):
            print(f"epoch: {epoch}")
            # Each epoch has a training and validation phase
            for phase in ['train', 'val']:
                if phase == 'train':
                    model_ft.train()  # Set model to training mode
                else:
                    model_ft.eval()   # Set model to evaluate mode

                running_loss = 0.0
                running_corrects = 0

                # Iterate over data.
                iterator = 0
                for inputs, labels in dataloaders[phase]:
                    if iterator % 100 == 0:
                        print(f'Batch {iterator}/{len(dataloaders[phase])}')
                    iterator += 1
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    # zero the parameter gradients
                    optimizer_ft.zero_grad()

                    # forward
                    # track history if only in train
                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model_ft(inputs)
                        _, preds = torch.max(outputs, 1)
                        loss = criterion(outputs, labels)

                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer_ft.step()

                    # statistics
                    running_loss += loss.item() * inputs.size(0)
                    running_corrects += torch.sum(preds == labels.data)
                    
                if phase == 'train':
                    exp_lr_scheduler.step()

                epoch_loss = running_loss / dataset_sizes[phase]
                epoch_acc = running_corrects.double() / dataset_sizes[phase]

                if phase == 'val':
                    torch.save((model_ft.state_dict(),optimizer_ft.state_dict(), exp_lr_scheduler.state_dict()), temp_checkpoint_dir)    
                    checkpoint = Checkpoint.from_directory(tempdir)
                    print(f"Val Loss: {epoch_loss}, Val Acc: {epoch_acc}")
                    train.report(
                    {"loss": float(epoch_loss), "accuracy": float(epoch_acc)},
                    checkpoint=checkpoint)

        time_elapsed = time.time() - since
        print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')

In [22]:
def test_best_model(best_result, smoke_test=False):
    config = best_result.config
    _, _, testloader, dataset_sizes, class_names = load_data()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    best_trained_model = models.resnet18(weights='IMAGENET1K_V1')
    for param in best_trained_model.parameters():
        param.requires_grad = False
    num_ftrs = best_trained_model.fc.in_features
    if config['normalization']:
        best_trained_model.fc = nn.Sequential(
            nn.BatchNorm1d(num_ftrs),  
            nn.Linear(num_ftrs, len(class_names))  
        )
    else:
        best_trained_model.fc = nn.Linear(num_ftrs, len(class_names))
    
    best_trained_model = best_trained_model.to(device)
    criterion = nn.CrossEntropyLoss()
    checkpoint_path = os.path.join(best_result.checkpoint.to_directory(), "checkpoint.pt")

    model_state, optimizer_state, scheduler_state = torch.load(checkpoint_path)
    torch.save((model_state,optimizer_state, scheduler_state), 'model_state.pt') 
    best_trained_model.load_state_dict(model_state)

    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = best_trained_model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()


    print("Best trial test set accuracy: {}".format(correct / total))

In [23]:
def define_by_run_func(trial) -> Optional[Dict[str, Any]]:
    optimizer = trial.suggest_categorical("optimizer", ["SGD", "ADAM",'RMSprop'])

    trial.suggest_float("lr", 1e-5, 1e-2, log=True)
    if optimizer == "SGD":
        trial.suggest_float("momentum", 0, 0.9)
    elif optimizer == 'ADAM':
        trial.suggest_float("beta1", .9, 0.999)
        trial.suggest_float("beta2", .9, 0.999)
        trial.suggest_float("decay", 0, .2)
    else:
        trial.suggest_float("momentum", 0, 0.9)
        trial.suggest_float('alpha',0.9,0.999)

    trial.suggest_categorical("normalization", [False, True])
    trial.suggest_float('gamma',1e-3,1e-1)
    trial.suggest_int("step_size", 5, 15, step=2)
        
    return {"epochs": 4}

In [24]:
! wandb login <YOUR_WAND_API_KEY>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [25]:
def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2, smoke_test=False):
    scheduler = ASHAScheduler(
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2,
        metric='accuracy',
        mode='min')

    algo = OptunaSearch(space=define_by_run_func,metric="accuracy", mode="min")
    algo = ConcurrencyLimiter(algo, max_concurrent=2)
    
    tuner = tune.Tuner(
        tune.with_resources(
            tune.with_parameters(train_model),
            resources={"cpu": 2, "gpu": gpus_per_trial}
        ),
        tune_config=tune.TuneConfig(
            scheduler=scheduler,
            search_alg=algo,
            num_samples=num_samples
        ),
        run_config=train.RunConfig(
            callbacks=[WandbLoggerCallback(project="Sign language final2")]
        )
    )
    results = tuner.fit()
    
    best_result = results.get_best_result("loss", "min")

    print("Best trial config: {}".format(best_result.config))
    print("Best trial final validation loss: {}".format(
        best_result.metrics["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_result.metrics["accuracy"]))

    test_best_model(best_result, smoke_test=smoke_test)

main(num_samples=2, max_num_epochs=10, gpus_per_trial=1)

0,1
Current time:,2025-01-26 22:01:00
Running for:,00:10:23.12
Memory:,3.8/31.4 GiB

Trial name,status,loc,alpha,beta1,beta2,decay,epochs,gamma,lr,momentum,normalization,optimizer,step_size,iter,total time (s),loss,accuracy
train_model_213866e6,TERMINATED,172.19.2.2:1372,,0.993583,0.94833,0.0963138,4,0.00799826,1.79877e-05,,False,ADAM,9,4,615.671,1.80127,0.77759
train_model_bf99453e,TERMINATED,172.19.2.2:1413,0.991016,,,,4,0.0790526,0.000458005,0.80171,False,RMSprop,9,1,237.662,0.204931,0.939739


2025-01-26 21:50:37,621	INFO wandb.py:319 -- Already logged into W&B.


[36m(train_model pid=1372)[0m model loaded
[36m(train_model pid=1372)[0m epoch: 0
[36m(train_model pid=1372)[0m Batch 0/1227


[36m(_WandbLoggingActor pid=1411)[0m wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[36m(_WandbLoggingActor pid=1411)[0m wandb: Currently logged in as: zogfryt. Use `wandb login --relogin` to force relogin
[36m(_WandbLoggingActor pid=1411)[0m wandb: Tracking run with wandb version 0.19.1
[36m(_WandbLoggingActor pid=1411)[0m wandb: Run data is saved locally in /tmp/ray/session_2025-01-26_21-37-50_142043_40/artifacts/2025-01-26_21-50-37/train_model_2025-01-26_21-50-37/driver_artifacts/train_model_213866e6_1_beta1=0.9936,beta2=0.9483,decay=0.0963,epochs=4,gamma=0.0080,lr=0.0000,normalization=False,optimizer=ADAM,s_2025-01-26_21-50-37/wandb/run-20250126_215045-213866e6
[36m(_WandbLoggingActor pid=1411)[0m wandb: Run `wandb offline` to turn off syncing.
[36m(_WandbLoggingActor pid=1411)[0m wandb: Syncing run train_model_213866e6
[36m(_WandbLoggingActor pid=1411)[0m wandb: ⭐️ View project at https://wandb.ai/zogfry

[36m(train_model pid=1413)[0m model loaded
[36m(train_model pid=1413)[0m epoch: 0
[36m(train_model pid=1413)[0m Batch 0/1227


[36m(_WandbLoggingActor pid=1547)[0m wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[36m(_WandbLoggingActor pid=1547)[0m wandb: Currently logged in as: zogfryt. Use `wandb login --relogin` to force relogin
[36m(_WandbLoggingActor pid=1547)[0m wandb: Tracking run with wandb version 0.19.1
[36m(_WandbLoggingActor pid=1547)[0m wandb: Run data is saved locally in /tmp/ray/session_2025-01-26_21-37-50_142043_40/artifacts/2025-01-26_21-50-37/train_model_2025-01-26_21-50-37/driver_artifacts/train_model_bf99453e_2_alpha=0.9910,epochs=4,gamma=0.0791,lr=0.0005,momentum=0.8017,normalization=False,optimizer=RMSprop,step_siz_2025-01-26_21-50-42/wandb/run-20250126_215059-bf99453e
[36m(_WandbLoggingActor pid=1547)[0m wandb: Run `wandb offline` to turn off syncing.
[36m(_WandbLoggingActor pid=1547)[0m wandb: Syncing run train_model_bf99453e
[36m(_WandbLoggingActor pid=1547)[0m wandb: ⭐️ View project at https://wandb.ai/zogfry

[36m(train_model pid=1413)[0m Batch 100/1227[32m [repeated 2x across cluster][0m
[36m(train_model pid=1413)[0m Batch 200/1227[32m [repeated 2x across cluster][0m
[36m(train_model pid=1413)[0m Batch 300/1227[32m [repeated 2x across cluster][0m
[36m(train_model pid=1372)[0m Batch 400/1227
[36m(train_model pid=1413)[0m Batch 400/1227
[36m(train_model pid=1372)[0m Batch 500/1227
[36m(train_model pid=1413)[0m Batch 500/1227
[36m(train_model pid=1372)[0m Batch 600/1227
[36m(train_model pid=1413)[0m Batch 600/1227
[36m(train_model pid=1413)[0m Batch 700/1227[32m [repeated 2x across cluster][0m
[36m(train_model pid=1372)[0m Batch 800/1227
[36m(train_model pid=1413)[0m Batch 800/1227
[36m(train_model pid=1372)[0m Batch 900/1227
[36m(train_model pid=1413)[0m Batch 900/1227
[36m(train_model pid=1372)[0m Batch 1000/1227
[36m(train_model pid=1413)[0m Batch 1000/1227
[36m(train_model pid=1372)[0m Batch 1100/1227
[36m(train_model pid=1413)[0m Batch 1100/122

[36m(train_model pid=1372)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/train_model_2025-01-26_21-50-37/train_model_213866e6_1_beta1=0.9936,beta2=0.9483,decay=0.0963,epochs=4,gamma=0.0080,lr=0.0000,normalization=False,optimizer=ADAM,s_2025-01-26_21-50-37/checkpoint_000000)


[36m(train_model pid=1372)[0m Val Loss: 2.801012395081423, Val Acc: 0.377541142303969




[36m(train_model pid=1372)[0m epoch: 1
[36m(train_model pid=1372)[0m Batch 0/1227
[36m(train_model pid=1413)[0m Val Loss: 0.20493078955953142, Val Acc: 0.9397386253630203


[36m(train_model pid=1413)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/train_model_2025-01-26_21-50-37/train_model_bf99453e_2_alpha=0.9910,epochs=4,gamma=0.0791,lr=0.0005,momentum=0.8017,normalization=False,optimizer=RMSprop,step_siz_2025-01-26_21-50-42/checkpoint_000000)
[36m(_WandbLoggingActor pid=1547)[0m wandb:                                                                                
[36m(_WandbLoggingActor pid=1547)[0m wandb: 
[36m(_WandbLoggingActor pid=1547)[0m wandb: Run history:
[36m(_WandbLoggingActor pid=1547)[0m wandb:                 accuracy ▁
[36m(_WandbLoggingActor pid=1547)[0m wandb: iterations_since_restore ▁
[36m(_WandbLoggingActor pid=1547)[0m wandb:                     loss ▁
[36m(_WandbLoggingActor pid=1547)[0m wandb:       time_since_restore ▁
[36m(_WandbLoggingActor pid=1547)[0m wandb:         time_this_iter_s ▁
[36m(_WandbLoggingActor pid=1547)[0m wandb:             time_total_s ▁
[36m(_W

[36m(train_model pid=1372)[0m Batch 100/1227
[36m(train_model pid=1372)[0m Batch 200/1227
[36m(train_model pid=1372)[0m Batch 300/1227
[36m(train_model pid=1372)[0m Batch 400/1227
[36m(train_model pid=1372)[0m Batch 500/1227
[36m(train_model pid=1372)[0m Batch 600/1227
[36m(train_model pid=1372)[0m Batch 700/1227
[36m(train_model pid=1372)[0m Batch 800/1227
[36m(train_model pid=1372)[0m Batch 900/1227
[36m(train_model pid=1372)[0m Batch 1000/1227
[36m(train_model pid=1372)[0m Batch 1100/1227
[36m(train_model pid=1372)[0m Batch 1200/1227
[36m(train_model pid=1372)[0m Batch 0/65
[36m(train_model pid=1372)[0m Val Loss: 2.352248901780785, Val Acc: 0.6055179090029041
[36m(train_model pid=1372)[0m epoch: 2


[36m(train_model pid=1372)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/train_model_2025-01-26_21-50-37/train_model_213866e6_1_beta1=0.9936,beta2=0.9483,decay=0.0963,epochs=4,gamma=0.0080,lr=0.0000,normalization=False,optimizer=ADAM,s_2025-01-26_21-50-37/checkpoint_000001)


[36m(train_model pid=1372)[0m Batch 0/1227
[36m(train_model pid=1372)[0m Batch 100/1227
[36m(train_model pid=1372)[0m Batch 200/1227
[36m(train_model pid=1372)[0m Batch 300/1227
[36m(train_model pid=1372)[0m Batch 400/1227
[36m(train_model pid=1372)[0m Batch 500/1227
[36m(train_model pid=1372)[0m Batch 600/1227
[36m(train_model pid=1372)[0m Batch 700/1227
[36m(train_model pid=1372)[0m Batch 800/1227
[36m(train_model pid=1372)[0m Batch 900/1227
[36m(train_model pid=1372)[0m Batch 1000/1227
[36m(train_model pid=1372)[0m Batch 1100/1227
[36m(train_model pid=1372)[0m Batch 1200/1227
[36m(train_model pid=1372)[0m Batch 0/65


[36m(train_model pid=1372)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/train_model_2025-01-26_21-50-37/train_model_213866e6_1_beta1=0.9936,beta2=0.9483,decay=0.0963,epochs=4,gamma=0.0080,lr=0.0000,normalization=False,optimizer=ADAM,s_2025-01-26_21-50-37/checkpoint_000002)


[36m(train_model pid=1372)[0m Val Loss: 2.0326963865053043, Val Acc: 0.7166021297192643
[36m(train_model pid=1372)[0m epoch: 3
[36m(train_model pid=1372)[0m Batch 0/1227
[36m(train_model pid=1372)[0m Batch 100/1227
[36m(train_model pid=1372)[0m Batch 200/1227
[36m(train_model pid=1372)[0m Batch 300/1227
[36m(train_model pid=1372)[0m Batch 400/1227
[36m(train_model pid=1372)[0m Batch 500/1227
[36m(train_model pid=1372)[0m Batch 600/1227
[36m(train_model pid=1372)[0m Batch 700/1227
[36m(train_model pid=1372)[0m Batch 800/1227
[36m(train_model pid=1372)[0m Batch 900/1227
[36m(train_model pid=1372)[0m Batch 1000/1227
[36m(train_model pid=1372)[0m Batch 1100/1227
[36m(train_model pid=1372)[0m Batch 1200/1227
[36m(train_model pid=1372)[0m Batch 0/65


2025-01-26 22:01:00,740	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/root/ray_results/train_model_2025-01-26_21-50-37' in 0.0105s.


[36m(train_model pid=1372)[0m Val Loss: 1.801271633164811, Val Acc: 0.7775895450145208
[36m(train_model pid=1372)[0m Training complete in 10m 17s


[36m(train_model pid=1372)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/train_model_2025-01-26_21-50-37/train_model_213866e6_1_beta1=0.9936,beta2=0.9483,decay=0.0963,epochs=4,gamma=0.0080,lr=0.0000,normalization=False,optimizer=ADAM,s_2025-01-26_21-50-37/checkpoint_000003)
[36m(_WandbLoggingActor pid=1411)[0m wandb:                                                                                
2025-01-26 22:01:02,235	INFO tune.py:1041 -- Total run time: 624.64 seconds (623.11 seconds for the tuning loop).
[36m(_WandbLoggingActor pid=1411)[0m wandb: 
[36m(_WandbLoggingActor pid=1411)[0m wandb: Run history:
[36m(_WandbLoggingActor pid=1411)[0m wandb:                 accuracy ▁▅▇█
[36m(_WandbLoggingActor pid=1411)[0m wandb: iterations_since_restore ▁▃▆█
[36m(_WandbLoggingActor pid=1411)[0m wandb:                     loss █▅▃▁
[36m(_WandbLoggingActor pid=1411)[0m wandb:       time_since_restore ▁▃▆█
[36m(_WandbLoggingActor pid

Best trial config: {'optimizer': 'RMSprop', 'lr': 0.00045800507405258556, 'momentum': 0.8017096272982344, 'alpha': 0.9910161904431056, 'normalization': False, 'gamma': 0.07905262424749769, 'step_size': 9, 'epochs': 4}
Best trial final validation loss: 0.20493078955953142
Best trial final validation accuracy: 0.9397386253630203


  model_state, optimizer_state, scheduler_state = torch.load(checkpoint_path)


Best trial test set accuracy: 0.10758620689655173


In [None]:
setup_wandb(config, project="Sign language")