In [2]:
import torch_directml
from torch_directml import torch
from torchvision import datasets
from torchvision import transforms


transform = transforms.Compose([
    transforms.PILToTensor(),
    lambda n: n.float(),
    transforms.Normalize((0.0), (0.5))
])

train_dataset = datasets.FashionMNIST('./data', train=True, download=True, transform=transform)
test_dataset = datasets.FashionMNIST('./data', train=False, download=True, transform=transform)

print(f"# of training data {len(train_dataset)} / # of test data {len(test_dataset)}")
print(f"shape of image {train_dataset[0][0].shape}") 


# of training data 60000 / # of test data 10000
shape of image torch.Size([1, 28, 28])


In [3]:
# define simple network 
import pytorch_lightning as pl
from torch_directml import torch
from torch import nn
from torchvision import transforms
import torchmetrics


class SimpleConvModel(pl.LightningModule):
    def __init__(self, num_classes, lr=1e-4):
        super().__init__()
        self.save_hyperparameters()
        self.lr = lr
        self.network = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=2, stride=1), nn.ReLU(), # (n,1,28,28) => (n, 8, 27, 27)
            nn.Conv2d(16, 32, kernel_size=3, stride=1), nn.ReLU(), # (n,8,27,27) => (n, 16, 25, 25)
            nn.Conv2d(32, 64, kernel_size=3, stride=1), nn.ReLU(), 
            nn.Conv2d(64, 96, kernel_size=3, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d((2,2)),
            nn.Flatten(),
            nn.Linear(384, 256),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(128, num_classes),
        )
        
        self.metric = torchmetrics.Accuracy(task='multiclass', num_classes=num_classes)
        
    def forward(self, input):
        return self.network(input)
    
    
    def training_step(self, batch, batch_idx):
        X, y = batch
        loss = nn.functional.cross_entropy(self.network(X), y)
        self.log("train_loss", loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        X, y = batch
        y_hat = self.network(X)
        loss = nn.functional.cross_entropy(y_hat, y)
        acc = self.metric(y_hat.argmax(dim=1), y)
        self.log("val_loss", loss)
        self.log("val_acc", acc)
        
    
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(),lr=self.lr)
    
model = SimpleConvModel(len(test_dataset.classes))
with torch.no_grad():
    print(model.network(torch.rand(size=(1, 1, 28, 28))).shape)
        
        

torch.Size([1, 10])


In [4]:
from torch.utils import data

batch_size = 128
train_dataset, val_dataset = data.random_split(train_dataset, [0.9, 0.1])
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, num_workers=8)
val_loader = data.DataLoader(val_dataset, batch_size=batch_size, num_workers=8)
test_loader = data.DataLoader(test_dataset, batch_size=batch_size, num_workers=8)



In [6]:
from pytorch_lightning.loggers import WandbLogger
import wandb
from pytorch_lightning import callbacks

ckpt_callback = callbacks.ModelCheckpoint('./models', monitor='val_acc', mode='max', save_top_k=10)

wandb.init(project='minimal_fashion_mnist_classifier')
wandb_logger = WandbLogger()

trainer = pl.Trainer(max_epochs=30, accelerator="gpu", logger=wandb_logger, callbacks=[ckpt_callback])
trainer.fit(model=model, train_dataloaders=train_loader, val_dataloaders=val_loader)

0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train_loss,█▅▃▄▃▃▃▂▂▃▂▃▂▃▃▂▃▂▂▂▂▂▂▁▂▂▁▂▂▂▂▁▁▁▂▂▁▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
val_acc,▁▂▃▄▄▅▅▅▅▆▆▆▆▇▆▇▇▇▇█▇▇████████
val_loss,█▇▆▅▅▄▄▄▃▃▃▃▃▂▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁

0,1
epoch,29.0
train_loss,0.26961
trainer/global_step,12659.0
val_acc,0.91183
val_loss,0.24109


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668304633337055, max=1.0…

  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type               | Params
-----------------------------------------------
0 | network | Sequential         | 211 K 
1 | metric  | MulticlassAccuracy | 0     
-----------------------------------------------
211 K     Trainable params
0         Non-trainable params
211 K     Total params
0.845     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=30` reached.


In [4]:
from torchvision import models
from torchvision import transforms
from torch import nn
import torchmetrics
import pytorch_lightning as pl

class ResNetBasedClassifier(pl.LightningModule):
    def __init__(self, input_image_size, num_class, is_base_frozen=True) -> None:
        super().__init__()
        resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V2)
        if is_base_frozen:
            for param in resnet.parameters():
                param.requires_grad = False
        
        self.network = nn.Sequential(
            transforms.Resize(224),
            nn.Conv2d(1,3, kernel_size=1),
            resnet,
            nn.ReLU(),
            nn.Linear(resnet.fc.out_features, num_class)
        )
        
        self.loss = nn.CrossEntropyLoss()
        self.metric = torchmetrics.Accuracy('multiclass', num_classes=num_class)
        
    def forward(self, X):
        return self.network(X)
    
    def training_step(self, batch, batch_index) -> torch.Tensor:
        X, y = batch
        loss = self.loss(self.network(X), y)
        self.log("train_loss", loss)
        return loss
        
    def validation_step(self, batch, batch_index):
        X, y = batch
        y_hat = self.network(X)
        val_loss = nn.functional.cross_entropy(y_hat, y)
        val_acc = self.metric(y_hat.argmax(dim=1), y)
        self.log("val_loss", val_loss)
        self.log("val_acc", val_acc)
    
    def configure_optimizers(self) -> torch.optim.Optimizer:
        return torch.optim.Adam(self.parameters(), lr=1e-5)
        


In [5]:
model = ResNetBasedClassifier(input_image_size=28, num_class=10, is_base_frozen=False)
with torch.no_grad():
    print(model(torch.rand((1,1,28,28))).shape)

torch.Size([1, 10])


In [6]:
import pytorch_lightning as pl
import wandb
from pytorch_lightning import loggers

wandb.init(project='resnet_fashion_classifier')
wandb_logger = loggers.WandbLogger()


trainer = pl.Trainer(logger=wandb_logger,accelerator='gpu',max_epochs=180)
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668125700000046, max=1.0…

  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type               | Params
-----------------------------------------------
0 | network | Sequential         | 25.6 M
1 | loss    | CrossEntropyLoss   | 0     
2 | metric  | MulticlassAccuracy | 0     
-----------------------------------------------
25.6 M    Trainable params
0         Non-trainable params
25.6 M    Total params
102.268   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
