<a href="https://colab.research.google.com/github/eisbetterthanpi/vision/blob/main/Meta_Pseudo_Labels_down.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

# Meta Pseudo Labels mar 2021 https://arxiv.org/pdf/2003.10580v4.pdf
# https://github.com/kekmodel/MPL-pytorch


In [1]:
# @title torch augment
# https://github.com/facebookresearch/vicreg/blob/main/augmentations.py
import torch
import torchvision.transforms as transforms

class TrainTransform(object):
    def __init__(self):
        # self.transform = transforms.RandomApply([transforms.Compose([
        self.transform = transforms.Compose([
                transforms.RandomPerspective(distortion_scale=0.3, p=0.5), # me
                # transforms.RandomResizedCrop((400,640), scale=(0.7, 1.0), ratio=(0.8, 1.25), interpolation=transforms.InterpolationMode.BICUBIC),
                transforms.RandomResizedCrop((32,32), scale=(0.7, 1.0), ratio=(0.8, 1.25), interpolation=transforms.InterpolationMode.BICUBIC),
                transforms.RandomHorizontalFlip(p=0.5), # 0.5
                transforms.Lambda(lambda x : torch.clamp(x, 0., 1.)), # clamp else ColorJitter will return nan https://discuss.pytorch.org/t/input-is-nan-after-transformation/125455/6
                transforms.RandomApply([transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1)], p=0.8,), # brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1)], p=0.8
                transforms.RandomGrayscale(p=0.2), # 0.2
                # # transforms.RandomChoice(transforms.ColorJitter , transforms.RandomGrayscale(p=1.)
                transforms.RandomApply([transforms.GaussianBlur(kernel_size=5, sigma=(0.1, 2.0)),], p=1.0),
                # # transforms.RandomSolarize(threshold=130, p=0.5)
                transforms.RandomErasing(p=1., scale=(0.1, 0.11), ratio=(1,1), value=(0.485, 0.456, 0.406)),
                # transforms.ToTensor(), # ToTensored at dataset level, no need to ToTensor again
                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), # normalised at dataset level. default 0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225
                ])
            # ], p=1.)

    def __call__(self, sample):
        dims = len(sample.shape)
        if dims==3: x1 = self.transform(sample) # same transforms per minibatch
        elif dims==4: x1 = transforms.Lambda(lambda x: torch.stack([self.transform(x_) for x_ in x]))(sample) # diff transforms per img in minibatch
        # x1 = self.transform(sample)
        return x1

trs=TrainTransform()


In [2]:
# @title utils
# https://github.com/kekmodel/MPL-pytorch/blob/main/utils.py
import os
import torch
from torch import nn
from torch.nn import functional as F

def create_loss_fn():
    label_smoothing = 0 # default 0 / mainargs 0.15
    # if label_smoothing > 0:
    #     criterion = SmoothCrossEntropyV2(alpha=label_smoothing)
    # else:
    criterion = nn.CrossEntropyLoss(label_smoothing=label_smoothing)
    return criterion.to(device)

from collections import OrderedDict
def module_load_state_dict(model, state_dict):
    try:
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            name = k[7:]  # remove `module.`
            new_state_dict[name] = v
        model.load_state_dict(new_state_dict)
    except:
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            name = f'module.{k}'  # add `module.`
            new_state_dict[name] = v
        model.load_state_dict(new_state_dict)

def model_load_state_dict(model, state_dict):
    try: model.load_state_dict(state_dict)
    except: module_load_state_dict(model, state_dict)


def accuracy(output, target, topk=(1,)):
    output = output.to(torch.device('cpu'))
    target = target.to(torch.device('cpu'))
    maxk = max(topk)
    batch_size = target.shape[0]
    _, idx = output.sort(dim=1, descending=True)
    pred = idx.narrow(1, 0, maxk).t()
    correct = pred.eq(target.reshape(1, -1).expand_as(pred))
    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(dim=0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res


class SmoothCrossEntropy(nn.Module):
    def __init__(self, alpha=0.1):
        super(SmoothCrossEntropy, self).__init__()
        self.alpha = alpha

    def forward(self, logits, labels):
        if self.alpha == 0:
            loss = F.cross_entropy(logits, labels)
        else:
            num_classes = logits.shape[-1]
            alpha_div_k = self.alpha / num_classes
            target_probs = F.one_hot(labels, num_classes=num_classes).float() * (1. - self.alpha) + alpha_div_k
            loss = (-(target_probs * torch.log_softmax(logits, dim=-1)).sum(dim=-1)).mean()
        return loss

class SmoothCrossEntropyV2(nn.Module):
    """NLL loss with label smoothing."""
    def __init__(self, label_smoothing=0.1):
        super().__init__()
        assert label_smoothing < 1.0
        self.smoothing = label_smoothing
        self.confidence = 1. - label_smoothing

    def forward(self, x, target):
        if self.smoothing == 0:
            loss = F.cross_entropy(x, target)
        else:
            logprobs = F.log_softmax(x, dim=-1)
            nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
            nll_loss = nll_loss.squeeze(1)
            smooth_loss = -logprobs.mean(dim=-1)
            loss = (self.confidence * nll_loss + self.smoothing * smooth_loss).mean()
        return loss



In [3]:
# @title ModelEMA
# expopnential moving average, smoothen model parameters
# https://github.com/kekmodel/MPL-pytorch/blob/main/models.py
import torch
import torch.nn as nn
from copy import deepcopy

class ModelEMA(nn.Module):
    def __init__(self, model, decay=0.9999, device=None):
        super().__init__()
        self.module = deepcopy(model)
        self.module.eval()
        self.decay = decay
        self.device = device
        if self.device is not None:
            self.module.to(device=device)

    def forward(self, input):
        return self.module(input)

    def _update(self, model, update_fn):
        with torch.no_grad():
            for ema_v, model_v in zip(self.module.parameters(), model.parameters()):
                if self.device is not None:
                    model_v = model_v.to(device=self.device)
                ema_v.copy_(update_fn(ema_v, model_v))
            for ema_v, model_v in zip(self.module.buffers(), model.buffers()):
                if self.device is not None:
                    model_v = model_v.to(device=self.device)
                ema_v.copy_(model_v)

    def update_parameters(self, model):
        self._update(model, update_fn=lambda e, m: self.decay * e + (1. - self.decay) * m)

    def state_dict(self):
        return self.module.state_dict()

    def load_state_dict(self, state_dict):
        self.module.load_state_dict(state_dict)



In [6]:

# @title main down
# https://github.com/kekmodel/MPL-pytorch/blob/main/main.py
import math
import os
import random
import time
import numpy as np
import torch
from torch.cuda import amp
from torch import nn
from torch.nn import functional as F
from torch import optim
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm

ema = 0.995 # default 0 / mainargs 0.995
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)


def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_wait_steps=0, num_cycles=0.5, last_epoch=-1):
    def lr_lambda(current_step):
        if current_step < num_wait_steps:
            return 0.0
        if current_step < num_warmup_steps + num_wait_steps:
            return float(current_step) / float(max(1, num_warmup_steps + num_wait_steps))
        progress = float(current_step - num_warmup_steps - num_wait_steps) / float(max(1, num_training_steps - num_warmup_steps - num_wait_steps))
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
    return LambdaLR(optimizer, lr_lambda, last_epoch)


t_scaler = amp.GradScaler()
s_scaler = amp.GradScaler()
# def train_loop(labeled_loader, unlabeled_loader, test_loader, finetune_dataset,
#                teacher_model, student_model, avg_student_model, criterion,
#                t_optimizer, s_optimizer, t_scheduler, s_scheduler, t_scaler, s_scaler):
def train(labeled_loader, unlabeled_loader, teacher_model, student_model,
        avg_student_model, criterion, t_optimizer, s_optimizer, t_scheduler, s_scheduler):
    labeled_iter = iter(labeled_loader)
    unlabeled_iter = iter(unlabeled_loader)

    # for author's code formula
    # moving_dot_product = torch.empty(1).to(device)
    # limit = 3.0**(0.5)  # 3 = 6 / (f_in + f_out)
    # nn.init.uniform_(moving_dot_product, -limit, limit)

    # eval_step = 10#1000
    # start_step=0
    # for step in range(start_step, total_steps):
    for step in range(len(unlabeled_loader)):
        teacher_model.train()
        student_model.train()

        try:
            images_l, targets = next(labeled_iter)
        except:
            labeled_iter = iter(labeled_loader)
            images_l, targets = next(labeled_iter)
        try:
            # (images_uw, images_us), _ = next(unlabeled_iter)
            images_uw, _ = next(unlabeled_iter)
            images_us = trs(images_uw)
        except:
            unlabeled_iter = iter(unlabeled_loader)
            # (images_uw, images_us), _ = next(unlabeled_iter)
            images_uw, _ = next(unlabeled_iter)
            images_us = trs(images_uw)
        images_l, targets = images_l.to(device), targets.to(device)
        images_uw, images_us = images_uw.to(device), images_us.to(device)

        with amp.autocast():
            batch_size = images_l.shape[0]
            # print(images_l.shape, images_uw.shape, images_us.shape) # [64, 3, 32, 32]. [448, 3, 32, 32], [448, 3, 32, 32]
            t_images = torch.cat((images_l, images_uw, images_us))
            t_logits = teacher_model(t_images)
            t_logits_l = t_logits[:batch_size]
            t_logits_uw, t_logits_us = t_logits[batch_size:].chunk(2)
            del t_logits

            t_loss_l = criterion(t_logits_l, targets)

            temperature = 1 # default 1 / mainargs 0.7
            soft_pseudo_label = torch.softmax(t_logits_uw.detach() / temperature, dim=-1)
            max_probs, hard_pseudo_label = torch.max(soft_pseudo_label, dim=-1)

            threshold = 0.95 # default 0.95 / mainargs 0.6
            mask = max_probs.ge(threshold).float()
            t_loss_u = torch.mean(-(soft_pseudo_label * torch.log_softmax(t_logits_us, dim=-1)).sum(dim=-1) * mask)
            lambda_u = 8 # default 1 / mainargs 8 coefficient of unlabeled loss
            uda_steps = 10 # default 1 / mainargs 5000 warmup steps of lambda-u
            weight_u = lambda_u * min(1., (step + 1) / uda_steps)
            t_loss_uda = t_loss_l + weight_u * t_loss_u

            s_images = torch.cat((images_l, images_us))
            s_logits = student_model(s_images)
            s_logits_l = s_logits[:batch_size]
            s_logits_us = s_logits[batch_size:]
            del s_logits

            s_loss_l_old = F.cross_entropy(s_logits_l.detach(), targets)

            # print("s_logits_us, hard_pseudo_label: ", s_logits_us.shape, hard_pseudo_label.shape) # [448, 10] [224]
            s_loss = criterion(s_logits_us, hard_pseudo_label)

        s_scaler.scale(s_loss).backward()

        # if grad_clip > 0:
        s_scaler.unscale_(s_optimizer)
        nn.utils.clip_grad_norm_(student_model.parameters(), 1e9)

        s_scaler.step(s_optimizer)
        s_scaler.update()
        s_scheduler.step()

        if ema > 0: avg_student_model.update_parameters(student_model)

        with amp.autocast():
            with torch.no_grad():
                s_logits_l = student_model(images_l)
            s_loss_l_new = F.cross_entropy(s_logits_l.detach(), targets)

            dot_product = s_loss_l_old - s_loss_l_new # theoretically correct formula (https://github.com/kekmodel/MPL-pytorch/issues/6)
            # dot_product = s_loss_l_new - s_loss_l_old # author's code formula
            # # moving_dot_product = moving_dot_product * 0.99 + dot_product * 0.01
            # # dot_product = dot_product - moving_dot_product

            _, hard_pseudo_label = torch.max(t_logits_us.detach(), dim=-1)
            t_loss_mpl = dot_product * F.cross_entropy(t_logits_us, hard_pseudo_label)

            # t_loss_mpl = torch.tensor(0.).to(device) # test
            t_loss = t_loss_uda + t_loss_mpl

        t_scaler.scale(t_loss).backward()

        # if grad_clip > 0:
        t_scaler.unscale_(t_optimizer)
        nn.utils.clip_grad_norm_(teacher_model.parameters(), 1e9)

        t_scaler.step(t_optimizer)
        t_scaler.update()
        t_scheduler.step()

        teacher_model.zero_grad()
        student_model.zero_grad()

        # if (step + 1) % eval_step == 0:
        #     # print(s_losses.avg, t_losses.avg, t_losses_l.avg, t_losses_u.avg, t_losses_mpl.avg, mean_mask.avg)

        #     test_model = avg_student_model if avg_student_model is not None else student_model
        #     test_loss, top1, top5 = evaluate(test_loader, test_model, criterion)

    # # finetune
    # ckpt_name = f'{save_path}/{name}_best.pth.tar'
    # gpu=0
    # loc = f'cuda:{gpu}'
    # checkpoint = torch.load(ckpt_name, map_location=loc)
    # if checkpoint['avg_state_dict'] is not None:
    #     model_load_state_dict(student_model, checkpoint['avg_state_dict'])
    # else:
    #     model_load_state_dict(student_model, checkpoint['student_state_dict'])
    # finetune(finetune_dataset, test_loader, student_model, criterion)
    return


def evaluate(test_loader, model, criterion, verbose=True):
    size = len(test_loader.dataset)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for step, (images, targets) in enumerate(test_loader):
            batch_size = images.shape[0]
            images = images.to(device)
            targets = targets.to(device)
            with amp.autocast():
                outputs = model(images)
                loss = criterion(outputs, targets)
            test_loss += loss.item()
        #     acc1, acc5 = accuracy(outputs, targets, (1, 5))
        #     loss.item(), acc1[0], acc5[0]
        # # return losses.avg, top1.avg, top5.avg
            correct += (outputs.argmax(1) == targets).type(torch.float).sum().item()
    test_loss /= size
    correct /= size
    try: wandb.log({"test loss": test_loss})
    except: pass
    if verbose: print(f"Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f}")
    return correct, test_loss



# # def finetune(finetune_dataset, test_loader, model, criterion):
# def finetune(finetune_dataset, model, criterion):
#     # model.drop = nn.Identity()
#     # labeled_loader = DataLoader(finetune_dataset, batch_size=512, num_workers=4, pin_memory=True)
#     # optimizer = optim.SGD(model.parameters(), lr=3e-5, momentum=0.9, weight_decay=0, nesterov=True)
#     # scaler = amp.GradScaler()
#     # for epoch in range(1): #625


# @title train test function
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
scaler = torch.cuda.amp.GradScaler()
# https://github.com/prigoyal/pytorch_memonger/blob/master/models/optimized/resnet_new.py
from torch.utils.checkpoint import checkpoint, checkpoint_sequential

trs=TrainTransform() # for image augmentation during train time
# train function with automatic mixed precision
def strain(dataloader, model, loss_fn, optimizer, scheduler=None, verbose=True):
    size = len(dataloader.dataset)
    model.train()
    loss_list = []
    for batch, (x, y) in enumerate(dataloader):
        x, y = x.to(device), y.to(device)
        with torch.cuda.amp.autocast(): # automatic mixed percision
            x = trs(x) # image augmentation during train time to use gpu
            pred = model(x) # default
            loss = loss_fn(pred, y)
        scaler.scale(loss).backward()
        if ((batch + 1) % 4 == 0) or (batch + 1 == len(dataloader)): # gradient accumulation
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if scheduler is not None:
                scheduler.step()
                # print("### lr: ", optimizer.param_groups[0]["lr"])

        # print(model.state_dict()['_orig_mod.bn1.running_mean'][0])
        train_loss = loss.item()/len(y)
        loss_list.append(train_loss)
        try: wandb.log({"train loss": train_loss})
        except: pass
        if batch % (size//(10* len(y))) == 0:
            current = batch * len(x)
            if verbose: print(f"loss: {train_loss:>7f}  [{current:>5d}/{size:>5d}]")
    return loss_list




In [7]:
# @title wwwwwwwwwww


# labeled_dataset, unlabeled_dataset, test_dataset, finetune_dataset = DATASET_GETTERS[dataset](args)

from torch.utils.data import DataLoader
from torchvision import datasets, transforms
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
train_data = datasets.CIFAR10(root="data", train=True, download=True,transform=transform)
# test_data = datasets.CIFAR10(root="data", train=False, download=True,transform=transform)
labeled_dataset, unlabeled_dataset = torch.utils.data.random_split(train_data, [.1,.9])
test_dataset = datasets.CIFAR10(root="data", train=False, download=True,transform=transform)
finetune_dataset = labeled_dataset


batch_size = 64 # default 64/ mainargs128
train_sampler = RandomSampler
labeled_loader = DataLoader(labeled_dataset, sampler=train_sampler(labeled_dataset), batch_size=batch_size, num_workers=4, drop_last=True)
unlabeled_loader = DataLoader(unlabeled_dataset, sampler=train_sampler(unlabeled_dataset), batch_size=batch_size * 7, num_workers=4, drop_last=True) # mu=7 ,coefficient of unlabeled batch size
test_loader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size, num_workers=4)

num_classes = 10
# if dataset == "cifar10": depth, widen_factor = 28, 2
# elif dataset == 'cifar100': depth, widen_factor = 28, 8
# teacher_model = WideResNet(num_classes=num_classes, depth=depth, widen_factor=widen_factor, dropout=0, dense_dropout=0.2)
# student_model = WideResNet(num_classes=num_classes, depth=depth, widen_factor=widen_factor, dropout=0, dense_dropout=0.2)


from torchvision import models
def get_resnet():
    model = models.resnet152(weights='DEFAULT') # 18 34 50 101 152
    num_ftrs = model.fc.in_features
    model.fc = nn.Sequential( # og (fc): Linear(in_features=2048, out_features=1000, bias=True)
        nn.Linear(num_ftrs, num_classes, bias=False),
        nn.Softmax(dim=1),
        )
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # model = model.to(device)
    model = torch.compile(model.to(device))
    return model

teacher_model = get_resnet()
student_model = get_resnet()


# teacher_model.to(device)
# student_model.to(device)
avg_student_model = None
if ema > 0: avg_student_model = ModelEMA(student_model, ema)


criterion = create_loss_fn()

no_decay = ['bn']
weight_decay = 5e-4 # default 0 / mainargs 5e-4
teacher_parameters = [{'params': [p for n, p in teacher_model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay},
    {'params': [p for n, p in teacher_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
student_parameters = [{'params': [p for n, p in student_model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay},
    {'params': [p for n, p in student_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

# lr default 0.01/ mainargs 0.05
t_optimizer = optim.SGD(teacher_parameters, lr=0.05, momentum=0.9, nesterov=True)
s_optimizer = optim.SGD(student_parameters, lr=0.05, momentum=0.9, nesterov=True)

total_steps=30 # 300000
warmup_steps = 10 # default 0 / mainargs 5000
t_scheduler = get_cosine_schedule_with_warmup(t_optimizer, warmup_steps, total_steps)
student_wait_steps = 3 # default 0 / mainargs 3000
s_scheduler = get_cosine_schedule_with_warmup(s_optimizer, warmup_steps, total_steps, student_wait_steps)


# finetune(finetune_dataset, test_loader, student_model, criterion)
# evaluate(test_loader, student_model, criterion)

# teacher_model.zero_grad()
# student_model.zero_grad()
# train_loop(labeled_loader, unlabeled_loader, test_loader, finetune_dataset,
#             teacher_model, student_model, avg_student_model, criterion,
#             t_optimizer, s_optimizer, t_scheduler, s_scheduler, t_scaler, s_scaler)



# import time
# start = time.time()

# optimizer = bnb.optim.AdamW(model.parameters(), lr=1e-5, betas=(0.9, 0.999), optim_bits=8)
# optimizer = Lamb(model.parameters(), lr=1e-5, betas=(0.9, 0.999), eps=1e-08, weight_decay=3e-6)

# scheduler = torch.optim.lr_scheduler.PolynomialLR(optimizer, total_iters=int(np.ceil(num_batches/4)*3), power=1.0)
pth='/content/mpl.pth' # ty

# scheduler = PolynomialLR(optimizer, total_iters=4, power=1.0)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=10**(-1/2))
# for epoch in range(5):
#     scheduler.step()
epochs = 2
for t in range(0,epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    t_lr=t_optimizer.param_groups[0]["lr"]
    s_lr=s_optimizer.param_groups[0]["lr"]
    print(t_lr,s_lr)
    train(labeled_loader, unlabeled_loader, teacher_model, student_model,
        avg_student_model, criterion, t_optimizer, s_optimizer, t_scheduler, s_scheduler)

    # correct, test_loss = test(test_loader, student_model, loss_fn)
    # correct, test_loss = test(test_loader, avg_student_model, loss_fn)
    # evaluate(test_loader, student_model, criterion)
    evaluate(test_loader, avg_student_model, criterion)

    # train_lst.extend(train_ls)
    # test_lst.append(test_loss)
    # acc_lst.append(correct)

    checkpoint = {
    'epoch': t+1,
    'teacher_model': teacher_model.state_dict(),
    'student_model': student_model.state_dict(),
    'avg_student_model': avg_student_model.state_dict(),
    't_optimizer': t_optimizer.state_dict(),
    's_optimizer': s_optimizer.state_dict(),
    't_scheduler': t_scheduler.state_dict(),
    's_scheduler': s_scheduler.state_dict(),}
    torch.save(checkpoint, pth)



# model = student_model
model = avg_student_model
model.drop = nn.Identity()
labeled_loader = DataLoader(finetune_dataset, batch_size=128, num_workers=4, pin_memory=True) # batch_size=512
optimizer = optim.SGD(model.parameters(), lr=3e-5, momentum=0.9, weight_decay=0, nesterov=True)
# scaler = amp.GradScaler()
for epoch in range(1): #625
    # train_ls = strain(labeled_loader, model, loss_fn, optimizer, scheduler)
    train_ls = strain(labeled_loader, model, criterion, optimizer)
    evaluate(test_loader, student_model, criterion)





Files already downloaded and verified
Files already downloaded and verified




Epoch 1
-------------------------------
0.0 0.0




Accuracy: 14.7%, Avg loss: 0.035975
Epoch 2
-------------------------------
0.025000000000000012 0.001688194264891091




Accuracy: 22.4%, Avg loss: 0.035331
loss: 0.017570  [    0/ 5000]




loss: 0.017508  [  384/ 5000]
loss: 0.017203  [  768/ 5000]
loss: 0.017683  [ 1152/ 5000]
loss: 0.017355  [ 1536/ 5000]
loss: 0.017640  [ 1920/ 5000]
loss: 0.017366  [ 2304/ 5000]
loss: 0.017347  [ 2688/ 5000]
loss: 0.017757  [ 3072/ 5000]
loss: 0.017323  [ 3456/ 5000]
loss: 0.017645  [ 3840/ 5000]
loss: 0.017492  [ 4224/ 5000]
loss: 0.017430  [ 4608/ 5000]




Accuracy: 32.5%, Avg loss: 0.033508
