In [1]:
from os.path import join as pjoin  # pylint: disable=g-importing-member
import time

import numpy as np
import torch
import torchvision as tv
from tensorflow import keras


import bit_pytorch.fewshot as fs
import bit_pytorch.lbtoolbox as lb
import bit_pytorch.models as models

import bit_common
import bit_hyperrule

In [2]:
def topk(output, target, ks=(1,)):
    """Computes top-k hit indicators for a batch of scores.

    For every `k` in `ks`, one boolean vector is produced whose i-th entry says
    whether `target[i]` appears among the `k` highest-scoring columns of
    `output[i]`. The vectors are returned as a list, in the order of `ks`.
    """
    deepest = max(ks)
    # Indices of the `deepest` best classes per row, transposed so that the
    # rank runs along dim 0 and the batch along dim 1.
    ranked = output.topk(deepest, dim=1, largest=True, sorted=True)[1].t()
    hits = ranked.eq(target.view(1, -1).expand_as(ranked))

    per_k = []
    for k in ks:
        # A sample counts as correct if any of its first k ranks is a hit.
        per_k.append(hits[:k].max(0)[0])
    return per_k

In [3]:
def recycle(iterable):
    """Endlessly re-iterates `iterable`, like itertools.cycle without caching.

    Each pass asks `iterable` for a fresh iterator, so nothing is stored and an
    iterable that produces different items per pass (e.g. a shuffling loader)
    is honoured.
    """
    while True:
        yield from iterable

In [4]:
def mktrainval(dataset_name, datadir=None):
    """Builds the train/validation datasets and their data loaders.

    Args:
      dataset_name: One of "cifar10", "cifar100" or "imagenet2012".
      datadir: Optional dataset root directory. When omitted, "cifar100" uses
        the hard-coded local directory below, and the other datasets read
        `args.datadir` from the enclosing (global) scope, as before.

    Returns:
      A tuple `(train_set, valid_set, train_loader, valid_loader)`.

    Raises:
      ValueError: If `dataset_name` is not one of the supported names.
    """
    supported = ("cifar10", "cifar100", "imagenet2012")
    if dataset_name not in supported:
        # Fail early with a clear message instead of hitting an unbound
        # `train_set`/`valid_set` further down.
        raise ValueError(
            f"Unsupported dataset '{dataset_name}', expected one of {supported}.")

    if datadir is None:
        if dataset_name == "cifar100":
            datadir = 'C:/Users/dhkim/Desktop/directory/cifar100_data'
        else:
            # NOTE(review): `args` is not defined in the visible notebook cells;
            # pass `datadir` explicitly unless a global `args` exists.
            datadir = args.datadir

    precrop, crop = bit_hyperrule.get_resolution_from_dataset(dataset_name)
    half = (0.5, 0.5, 0.5)
    train_tx = tv.transforms.Compose([
        tv.transforms.Resize((precrop, precrop)),
        tv.transforms.RandomCrop((crop, crop)),
        tv.transforms.RandomHorizontalFlip(),
        tv.transforms.ToTensor(),
        tv.transforms.Normalize(half, half),
    ])
    val_tx = tv.transforms.Compose([
        tv.transforms.Resize((crop, crop)),
        tv.transforms.ToTensor(),
        tv.transforms.Normalize(half, half),
    ])

    if dataset_name == "cifar10":
        train_set = tv.datasets.CIFAR10(datadir, transform=train_tx, train=True, download=True)
        valid_set = tv.datasets.CIFAR10(datadir, transform=val_tx, train=False, download=True)
    elif dataset_name == "cifar100":
        train_set = tv.datasets.CIFAR100(datadir, transform=train_tx, train=True, download=True)
        valid_set = tv.datasets.CIFAR100(datadir, transform=val_tx, train=False, download=True)
    else:  # "imagenet2012"
        train_set = tv.datasets.ImageFolder(pjoin(datadir, "train"), train_tx)
        valid_set = tv.datasets.ImageFolder(pjoin(datadir, "val"), val_tx)

    micro_batch_size = 16

    valid_loader = torch.utils.data.DataLoader(
        valid_set, batch_size=micro_batch_size, shuffle=False,
        num_workers=0, pin_memory=True, drop_last=False)

    # The sampler draws `micro_batch_size` examples with replacement per pass,
    # so one pass over `train_loader` yields a single batch; callers that want
    # a stream of batches have to re-iterate the loader.
    train_loader = torch.utils.data.DataLoader(
        train_set, batch_size=micro_batch_size, num_workers=0, pin_memory=True,
        sampler=torch.utils.data.RandomSampler(
            train_set, replacement=True, num_samples=micro_batch_size))

    return train_set, valid_set, train_loader, valid_loader

In [71]:
def run_eval(model, data_loader, device, chrono, step):
    """Evaluates `model` on every batch of `data_loader` and prints a summary.

    The model is put in eval mode for the duration of the pass and switched
    back to train mode before returning.

    Args:
      model: Callable module mapping an input batch to logits.
      data_loader: Iterable of `(x, y)` batches.
      device: Device the batches are moved to before the forward pass.
      chrono: Timer object providing `_done(name, seconds)` and the context
        manager `measure(name)`.
      step: Label used in the printed summary line (e.g. a step number).

    Returns:
      A tuple `(all_c, all_top1, all_top5)` of lists holding, per example, the
      cross-entropy loss and the top-1 / top-5 hit indicators.
    """
    # switch to evaluate mode
    model.eval()

    print("Running validation...")

    all_c, all_top1, all_top5 = [], [], []
    end = time.time()
    for x, y in data_loader:
        with torch.no_grad():
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)

            # measure data loading time
            chrono._done("eval load", time.time() - end)

            # compute output, measure accuracy and record loss.
            with chrono.measure("eval fprop"):
                logits = model(x)
                c = torch.nn.CrossEntropyLoss(reduction='none')(logits, y)
                top1, top5 = topk(logits, y, ks=(1, 5))
                all_c.extend(c.cpu())  # Also ensures a sync point.
                all_top1.extend(top1.cpu())
                all_top5.extend(top5.cpu())

        # measure elapsed time
        end = time.time()

    # Summarise only after the whole loader has been consumed; doing this
    # inside the loop would end validation after the first batch.
    model.train()
    print(f"Validation@{step} loss {np.mean(all_c):.5f}, "
          f"top1 {np.mean(all_top1):.2%}, "
          f"top5 {np.mean(all_top5):.2%}")

    return all_c, all_top1, all_top5
            

In [72]:
def mixup_data(x, y, l):
    """Blends each sample of a batch with a randomly chosen partner sample.

    A random permutation of the batch (dim 0 of `x`) selects the partners.
    Returns `(mixed_x, y_a, y_b)` where `mixed_x = l * x + (1 - l) * partner`,
    `y_a` is the original `y`, and `y_b` holds the partners' targets.
    """
    batch = x.shape[0]
    perm = torch.randperm(batch).to(x.device)

    partner_x = x[perm]
    partner_y = y[perm]
    blended = l * x + (1 - l) * partner_x
    return blended, y, partner_y


In [73]:
def mixup_criterion(criterion, pred, y_a, y_b, l):
    """Returns the mixup loss: `criterion` on both target sets, weighted l / (1 - l)."""
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    weighted_a = l * loss_a
    weighted_b = (1 - l) * loss_b
    return weighted_a + weighted_b

In [74]:
pwd

'C:\\Users\\dhkim\\Desktop\\directory\\big_transfer-master'

In [75]:
def main(eval_every=0, save=False, max_steps=2500):
    """Fine-tunes BiT-M-R50x1 on CIFAR-100 and runs a final validation pass.

    Loads the pre-trained weights from "BiT-M-R50x1.npz", optionally resumes
    from the checkpoint at `savename`, then trains with SGD (with mixup when
    `bit_hyperrule.get_mixup` returns a positive value). Training ends when the
    learning-rate schedule returns None, the step counter reaches `max_steps`,
    or `lb.Uninterrupt` reports an interruption. A final evaluation follows.

    Args:
      eval_every: Run validation every this many optimizer steps. 0 (the
        default) disables periodic validation, matching previous behaviour.
      save: When True, write a checkpoint to `savename` after each periodic
        validation. Has no effect while `eval_every` is 0.
      max_steps: Stop the loop once the step counter equals this value; None
        removes the cap.
    """
    # Lets cuDNN benchmark conv implementations and choose the fastest.
    # Only good if sizes stay the same within the main loop!
    torch.backends.cudnn.benchmark = True

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    train_set, valid_set, train_loader, valid_loader = mktrainval('cifar100')

    model = models.KNOWN_MODELS['BiT-M-R50x1'](head_size=len(valid_set.classes), zero_head=True)
    model.load_from(np.load("BiT-M-R50x1.npz"))

    model = torch.nn.DataParallel(model)

    step = 0
    base_lr = 0.003
    # Number of micro-batches accumulated per optimizer step (no accumulation).
    batch_split = 1

    # Note: no weight-decay!
    optim = torch.optim.SGD(model.parameters(), lr=base_lr, momentum=0.9)

    # Resume fine-tuning if we find a saved model.
    # Load it to CPU first as we'll move the model to GPU later.
    # This way, we save a little bit of GPU memory when loading.
    savename = pjoin('C:\\Users\\dhkim\\Desktop\\directory\\big_transfer-master', "bit.pth.tar")

    try:
        print(f"Model will be saved in '{savename}'")
        checkpoint = torch.load(savename, map_location="cpu")
        print(f"Found saved model to resume from at '{savename}'")

        step = checkpoint["step"]
        model.load_state_dict(checkpoint["model"])
        optim.load_state_dict(checkpoint["optim"])
        print(f"Resumed at step {step}")
    except FileNotFoundError:
        print("Fine-tuning from BiT")

    model = model.to(device)
    optim.zero_grad()

    model.train()
    mixup = bit_hyperrule.get_mixup(len(train_set))
    cri = torch.nn.CrossEntropyLoss().to(device)

    chrono = lb.Chrono()
    accum_steps = 0
    mixup_l = np.random.beta(mixup, mixup) if mixup > 0 else 1
    end = time.time()

    with lb.Uninterrupt() as u:
        for x, y in recycle(train_loader):
            # measure data loading time, which is spent in the `for` statement.
            chrono._done("load", time.time() - end)

            if u.interrupted:
                break

            # Schedule sending to GPU(s)
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)

            # Update learning-rate, including stop training if over.
            lr = bit_hyperrule.get_lr(step, len(train_set), base_lr)
            if lr is None:
                break
            for param_group in optim.param_groups:
                param_group["lr"] = lr

            if mixup > 0.0:
                x, y_a, y_b = mixup_data(x, y, mixup_l)

            # compute output
            with chrono.measure("fprop"):
                logits = model(x)
                if mixup > 0.0:
                    c = mixup_criterion(cri, logits, y_a, y_b, mixup_l)
                else:
                    c = cri(logits, y)
                c_num = float(c.item())  # Also ensures a sync point.

            # Accumulate grads
            with chrono.measure("grads"):
                (c / batch_split).backward()
                accum_steps += 1

            accstep = f" ({accum_steps}/{batch_split})"
            print(f"[step {step}/accstep : {accstep}]: loss={c_num:.5f} (lr={lr:.1e})")  # pylint: disable=logging-format-interpolation

            # Update params
            if accum_steps == batch_split:
                with chrono.measure("update"):
                    optim.step()
                    optim.zero_grad()
                step += 1
                accum_steps = 0
                # Sample new mixup ratio for next batch
                mixup_l = np.random.beta(mixup, mixup) if mixup > 0 else 1

                # Run evaluation and save the model (opt-in via eval_every/save).
                if eval_every and step % eval_every == 0:
                    run_eval(model, valid_loader, device, chrono, step)
                    if save:
                        torch.save({
                            "step": step,
                            "model": model.state_dict(),
                            "optim": optim.state_dict(),
                        }, savename)
            end = time.time()
            if step == max_steps:
                break

        # Final eval at end of training.
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        run_eval(model, valid_loader, device, chrono, step='end')

    print(f"Timings:\n{chrono}")
    
    
 
            
            


In [76]:
main()

Files already downloaded and verified
Files already downloaded and verified
Model will be saved in 'C:\Users\dhkim\Desktop\directory\big_transfer-master\bit.pth.tar'
Fine-tuning from BiT
[step 0/accstep :  (1/1)]: loss=4.60517 (lr=0.0e+00)
[step 1/accstep :  (1/1)]: loss=4.60517 (lr=6.0e-06)
[step 2/accstep :  (1/1)]: loss=4.60461 (lr=1.2e-05)
[step 3/accstep :  (1/1)]: loss=4.60279 (lr=1.8e-05)
[step 4/accstep :  (1/1)]: loss=4.60334 (lr=2.4e-05)
[step 5/accstep :  (1/1)]: loss=4.59243 (lr=3.0e-05)
[step 6/accstep :  (1/1)]: loss=4.60260 (lr=3.6e-05)
[step 7/accstep :  (1/1)]: loss=4.60124 (lr=4.2e-05)
[step 8/accstep :  (1/1)]: loss=4.60037 (lr=4.8e-05)
[step 9/accstep :  (1/1)]: loss=4.56253 (lr=5.4e-05)
[step 10/accstep :  (1/1)]: loss=4.57058 (lr=6.0e-05)
[step 11/accstep :  (1/1)]: loss=4.53340 (lr=6.6e-05)
[step 12/accstep :  (1/1)]: loss=4.54870 (lr=7.2e-05)
[step 13/accstep :  (1/1)]: loss=4.58302 (lr=7.8e-05)
[step 14/accstep :  (1/1)]: loss=4.52834 (lr=8.4e-05)
[step 15/accs

[step 150/accstep :  (1/1)]: loss=3.62720 (lr=9.0e-04)
[step 151/accstep :  (1/1)]: loss=2.52214 (lr=9.1e-04)
[step 152/accstep :  (1/1)]: loss=2.46938 (lr=9.1e-04)
[step 153/accstep :  (1/1)]: loss=3.16761 (lr=9.2e-04)
[step 154/accstep :  (1/1)]: loss=2.15886 (lr=9.2e-04)
[step 155/accstep :  (1/1)]: loss=2.71255 (lr=9.3e-04)
[step 156/accstep :  (1/1)]: loss=1.78272 (lr=9.4e-04)
[step 157/accstep :  (1/1)]: loss=2.17160 (lr=9.4e-04)
[step 158/accstep :  (1/1)]: loss=2.52218 (lr=9.5e-04)
[step 159/accstep :  (1/1)]: loss=2.67304 (lr=9.5e-04)
[step 160/accstep :  (1/1)]: loss=1.48138 (lr=9.6e-04)
[step 161/accstep :  (1/1)]: loss=2.07914 (lr=9.7e-04)
[step 162/accstep :  (1/1)]: loss=3.44594 (lr=9.7e-04)
[step 163/accstep :  (1/1)]: loss=2.71082 (lr=9.8e-04)
[step 164/accstep :  (1/1)]: loss=4.63071 (lr=9.8e-04)
[step 165/accstep :  (1/1)]: loss=1.88943 (lr=9.9e-04)
[step 166/accstep :  (1/1)]: loss=2.68162 (lr=1.0e-03)
[step 167/accstep :  (1/1)]: loss=2.83301 (lr=1.0e-03)
[step 168/

[step 300/accstep :  (1/1)]: loss=1.78845 (lr=1.8e-03)
[step 301/accstep :  (1/1)]: loss=4.29195 (lr=1.8e-03)
[step 302/accstep :  (1/1)]: loss=3.07781 (lr=1.8e-03)
[step 303/accstep :  (1/1)]: loss=4.36802 (lr=1.8e-03)
[step 304/accstep :  (1/1)]: loss=2.24074 (lr=1.8e-03)
[step 305/accstep :  (1/1)]: loss=1.68109 (lr=1.8e-03)
[step 306/accstep :  (1/1)]: loss=2.10942 (lr=1.8e-03)
[step 307/accstep :  (1/1)]: loss=2.49856 (lr=1.8e-03)
[step 308/accstep :  (1/1)]: loss=2.15299 (lr=1.8e-03)
[step 309/accstep :  (1/1)]: loss=4.70872 (lr=1.9e-03)
[step 310/accstep :  (1/1)]: loss=2.63390 (lr=1.9e-03)
[step 311/accstep :  (1/1)]: loss=2.57980 (lr=1.9e-03)
[step 312/accstep :  (1/1)]: loss=1.96624 (lr=1.9e-03)
[step 313/accstep :  (1/1)]: loss=2.88015 (lr=1.9e-03)
[step 314/accstep :  (1/1)]: loss=2.28235 (lr=1.9e-03)
[step 315/accstep :  (1/1)]: loss=2.25302 (lr=1.9e-03)
[step 316/accstep :  (1/1)]: loss=2.94083 (lr=1.9e-03)
[step 317/accstep :  (1/1)]: loss=2.29136 (lr=1.9e-03)
[step 318/

[step 450/accstep :  (1/1)]: loss=2.39375 (lr=2.7e-03)
[step 451/accstep :  (1/1)]: loss=2.49055 (lr=2.7e-03)
[step 452/accstep :  (1/1)]: loss=1.66521 (lr=2.7e-03)
[step 453/accstep :  (1/1)]: loss=2.10436 (lr=2.7e-03)
[step 454/accstep :  (1/1)]: loss=2.90610 (lr=2.7e-03)
[step 455/accstep :  (1/1)]: loss=2.22133 (lr=2.7e-03)
[step 456/accstep :  (1/1)]: loss=1.93229 (lr=2.7e-03)
[step 457/accstep :  (1/1)]: loss=2.86369 (lr=2.7e-03)
[step 458/accstep :  (1/1)]: loss=1.81945 (lr=2.7e-03)
[step 459/accstep :  (1/1)]: loss=1.41834 (lr=2.8e-03)
[step 460/accstep :  (1/1)]: loss=2.59692 (lr=2.8e-03)
[step 461/accstep :  (1/1)]: loss=1.41431 (lr=2.8e-03)
[step 462/accstep :  (1/1)]: loss=1.36542 (lr=2.8e-03)
[step 463/accstep :  (1/1)]: loss=1.92526 (lr=2.8e-03)
[step 464/accstep :  (1/1)]: loss=2.59565 (lr=2.8e-03)
[step 465/accstep :  (1/1)]: loss=4.05447 (lr=2.8e-03)
[step 466/accstep :  (1/1)]: loss=2.47030 (lr=2.8e-03)
[step 467/accstep :  (1/1)]: loss=2.69814 (lr=2.8e-03)
[step 468/

[step 600/accstep :  (1/1)]: loss=1.58262 (lr=3.0e-03)
[step 601/accstep :  (1/1)]: loss=2.89970 (lr=3.0e-03)
[step 602/accstep :  (1/1)]: loss=1.82691 (lr=3.0e-03)
[step 603/accstep :  (1/1)]: loss=2.94096 (lr=3.0e-03)
[step 604/accstep :  (1/1)]: loss=2.23150 (lr=3.0e-03)
[step 605/accstep :  (1/1)]: loss=3.53608 (lr=3.0e-03)
[step 606/accstep :  (1/1)]: loss=2.06506 (lr=3.0e-03)
[step 607/accstep :  (1/1)]: loss=2.26299 (lr=3.0e-03)
[step 608/accstep :  (1/1)]: loss=4.03053 (lr=3.0e-03)
[step 609/accstep :  (1/1)]: loss=2.58543 (lr=3.0e-03)
[step 610/accstep :  (1/1)]: loss=2.27389 (lr=3.0e-03)
[step 611/accstep :  (1/1)]: loss=1.71170 (lr=3.0e-03)
[step 612/accstep :  (1/1)]: loss=2.74217 (lr=3.0e-03)
[step 613/accstep :  (1/1)]: loss=2.32069 (lr=3.0e-03)
[step 614/accstep :  (1/1)]: loss=3.65269 (lr=3.0e-03)
[step 615/accstep :  (1/1)]: loss=2.41734 (lr=3.0e-03)
[step 616/accstep :  (1/1)]: loss=1.93204 (lr=3.0e-03)
[step 617/accstep :  (1/1)]: loss=1.37257 (lr=3.0e-03)
[step 618/

[step 750/accstep :  (1/1)]: loss=2.71218 (lr=3.0e-03)
[step 751/accstep :  (1/1)]: loss=1.28430 (lr=3.0e-03)
[step 752/accstep :  (1/1)]: loss=1.70269 (lr=3.0e-03)
[step 753/accstep :  (1/1)]: loss=1.94346 (lr=3.0e-03)
[step 754/accstep :  (1/1)]: loss=2.10947 (lr=3.0e-03)
[step 755/accstep :  (1/1)]: loss=3.51571 (lr=3.0e-03)
[step 756/accstep :  (1/1)]: loss=1.65331 (lr=3.0e-03)
[step 757/accstep :  (1/1)]: loss=1.62953 (lr=3.0e-03)
[step 758/accstep :  (1/1)]: loss=1.10810 (lr=3.0e-03)
[step 759/accstep :  (1/1)]: loss=1.75237 (lr=3.0e-03)
[step 760/accstep :  (1/1)]: loss=4.33605 (lr=3.0e-03)
[step 761/accstep :  (1/1)]: loss=4.12247 (lr=3.0e-03)
[step 762/accstep :  (1/1)]: loss=2.74499 (lr=3.0e-03)
[step 763/accstep :  (1/1)]: loss=2.17716 (lr=3.0e-03)
[step 764/accstep :  (1/1)]: loss=2.26265 (lr=3.0e-03)
[step 765/accstep :  (1/1)]: loss=2.28349 (lr=3.0e-03)
[step 766/accstep :  (1/1)]: loss=1.99484 (lr=3.0e-03)
[step 767/accstep :  (1/1)]: loss=2.39501 (lr=3.0e-03)
[step 768/

[step 901/accstep :  (1/1)]: loss=2.08505 (lr=3.0e-03)
[step 902/accstep :  (1/1)]: loss=2.14092 (lr=3.0e-03)
[step 903/accstep :  (1/1)]: loss=1.60528 (lr=3.0e-03)
[step 904/accstep :  (1/1)]: loss=2.04431 (lr=3.0e-03)
[step 905/accstep :  (1/1)]: loss=1.97718 (lr=3.0e-03)
[step 906/accstep :  (1/1)]: loss=1.70460 (lr=3.0e-03)
[step 907/accstep :  (1/1)]: loss=2.15047 (lr=3.0e-03)
[step 908/accstep :  (1/1)]: loss=1.49729 (lr=3.0e-03)
[step 909/accstep :  (1/1)]: loss=2.31881 (lr=3.0e-03)
[step 910/accstep :  (1/1)]: loss=1.24831 (lr=3.0e-03)
[step 911/accstep :  (1/1)]: loss=2.59057 (lr=3.0e-03)
[step 912/accstep :  (1/1)]: loss=1.68223 (lr=3.0e-03)
[step 913/accstep :  (1/1)]: loss=2.01802 (lr=3.0e-03)
[step 914/accstep :  (1/1)]: loss=2.25340 (lr=3.0e-03)
[step 915/accstep :  (1/1)]: loss=1.84202 (lr=3.0e-03)
[step 916/accstep :  (1/1)]: loss=2.09899 (lr=3.0e-03)
[step 917/accstep :  (1/1)]: loss=2.12248 (lr=3.0e-03)
[step 918/accstep :  (1/1)]: loss=1.71871 (lr=3.0e-03)
[step 919/

[step 1052/accstep :  (1/1)]: loss=1.37616 (lr=3.0e-03)
[step 1053/accstep :  (1/1)]: loss=2.02758 (lr=3.0e-03)
[step 1054/accstep :  (1/1)]: loss=1.87461 (lr=3.0e-03)
[step 1055/accstep :  (1/1)]: loss=2.52253 (lr=3.0e-03)
[step 1056/accstep :  (1/1)]: loss=1.62294 (lr=3.0e-03)
[step 1057/accstep :  (1/1)]: loss=3.11374 (lr=3.0e-03)
[step 1058/accstep :  (1/1)]: loss=1.86312 (lr=3.0e-03)
[step 1059/accstep :  (1/1)]: loss=2.23010 (lr=3.0e-03)
[step 1060/accstep :  (1/1)]: loss=0.80216 (lr=3.0e-03)
[step 1061/accstep :  (1/1)]: loss=1.57412 (lr=3.0e-03)
[step 1062/accstep :  (1/1)]: loss=3.84842 (lr=3.0e-03)
[step 1063/accstep :  (1/1)]: loss=2.07588 (lr=3.0e-03)
[step 1064/accstep :  (1/1)]: loss=2.34979 (lr=3.0e-03)
[step 1065/accstep :  (1/1)]: loss=1.75390 (lr=3.0e-03)
[step 1066/accstep :  (1/1)]: loss=2.71894 (lr=3.0e-03)
[step 1067/accstep :  (1/1)]: loss=1.93121 (lr=3.0e-03)
[step 1068/accstep :  (1/1)]: loss=2.39649 (lr=3.0e-03)
[step 1069/accstep :  (1/1)]: loss=2.17597 (lr=3

[step 1202/accstep :  (1/1)]: loss=1.09559 (lr=3.0e-03)
[step 1203/accstep :  (1/1)]: loss=3.08263 (lr=3.0e-03)
[step 1204/accstep :  (1/1)]: loss=2.26099 (lr=3.0e-03)
[step 1205/accstep :  (1/1)]: loss=2.41379 (lr=3.0e-03)
[step 1206/accstep :  (1/1)]: loss=2.38563 (lr=3.0e-03)
[step 1207/accstep :  (1/1)]: loss=0.99725 (lr=3.0e-03)
[step 1208/accstep :  (1/1)]: loss=1.83176 (lr=3.0e-03)
[step 1209/accstep :  (1/1)]: loss=1.32496 (lr=3.0e-03)
[step 1210/accstep :  (1/1)]: loss=1.81284 (lr=3.0e-03)
[step 1211/accstep :  (1/1)]: loss=2.11524 (lr=3.0e-03)
[step 1212/accstep :  (1/1)]: loss=1.53822 (lr=3.0e-03)
[step 1213/accstep :  (1/1)]: loss=2.56862 (lr=3.0e-03)
[step 1214/accstep :  (1/1)]: loss=2.27720 (lr=3.0e-03)
[step 1215/accstep :  (1/1)]: loss=4.01947 (lr=3.0e-03)
[step 1216/accstep :  (1/1)]: loss=2.41846 (lr=3.0e-03)
[step 1217/accstep :  (1/1)]: loss=1.26658 (lr=3.0e-03)
[step 1218/accstep :  (1/1)]: loss=3.85648 (lr=3.0e-03)
[step 1219/accstep :  (1/1)]: loss=1.83635 (lr=3

[step 1349/accstep :  (1/1)]: loss=1.65755 (lr=3.0e-03)
[step 1350/accstep :  (1/1)]: loss=2.31733 (lr=3.0e-03)
[step 1351/accstep :  (1/1)]: loss=1.39356 (lr=3.0e-03)
[step 1352/accstep :  (1/1)]: loss=1.79053 (lr=3.0e-03)
[step 1353/accstep :  (1/1)]: loss=1.37409 (lr=3.0e-03)
[step 1354/accstep :  (1/1)]: loss=1.33404 (lr=3.0e-03)
[step 1355/accstep :  (1/1)]: loss=2.33832 (lr=3.0e-03)
[step 1356/accstep :  (1/1)]: loss=1.59361 (lr=3.0e-03)
[step 1357/accstep :  (1/1)]: loss=1.20304 (lr=3.0e-03)
[step 1358/accstep :  (1/1)]: loss=1.91167 (lr=3.0e-03)
[step 1359/accstep :  (1/1)]: loss=1.67642 (lr=3.0e-03)
[step 1360/accstep :  (1/1)]: loss=1.44250 (lr=3.0e-03)
[step 1361/accstep :  (1/1)]: loss=1.36555 (lr=3.0e-03)
[step 1362/accstep :  (1/1)]: loss=1.47106 (lr=3.0e-03)
[step 1363/accstep :  (1/1)]: loss=3.70873 (lr=3.0e-03)
[step 1364/accstep :  (1/1)]: loss=3.88177 (lr=3.0e-03)
[step 1365/accstep :  (1/1)]: loss=1.12371 (lr=3.0e-03)
[step 1366/accstep :  (1/1)]: loss=1.38166 (lr=3

[step 1499/accstep :  (1/1)]: loss=1.11164 (lr=3.0e-03)
[step 1500/accstep :  (1/1)]: loss=1.77868 (lr=3.0e-03)
[step 1501/accstep :  (1/1)]: loss=2.29035 (lr=3.0e-03)
[step 1502/accstep :  (1/1)]: loss=1.11137 (lr=3.0e-03)
[step 1503/accstep :  (1/1)]: loss=1.74306 (lr=3.0e-03)
[step 1504/accstep :  (1/1)]: loss=1.62403 (lr=3.0e-03)
[step 1505/accstep :  (1/1)]: loss=1.71723 (lr=3.0e-03)
[step 1506/accstep :  (1/1)]: loss=2.13056 (lr=3.0e-03)
[step 1507/accstep :  (1/1)]: loss=1.63989 (lr=3.0e-03)
[step 1508/accstep :  (1/1)]: loss=1.96282 (lr=3.0e-03)
[step 1509/accstep :  (1/1)]: loss=1.72810 (lr=3.0e-03)
[step 1510/accstep :  (1/1)]: loss=1.78724 (lr=3.0e-03)
[step 1511/accstep :  (1/1)]: loss=2.31704 (lr=3.0e-03)
[step 1512/accstep :  (1/1)]: loss=2.66922 (lr=3.0e-03)
[step 1513/accstep :  (1/1)]: loss=1.70660 (lr=3.0e-03)
[step 1514/accstep :  (1/1)]: loss=2.44375 (lr=3.0e-03)
[step 1515/accstep :  (1/1)]: loss=0.87973 (lr=3.0e-03)
[step 1516/accstep :  (1/1)]: loss=1.28471 (lr=3

[step 1648/accstep :  (1/1)]: loss=1.63666 (lr=3.0e-03)
[step 1649/accstep :  (1/1)]: loss=1.62118 (lr=3.0e-03)
[step 1650/accstep :  (1/1)]: loss=3.50508 (lr=3.0e-03)
[step 1651/accstep :  (1/1)]: loss=4.67949 (lr=3.0e-03)
[step 1652/accstep :  (1/1)]: loss=1.63652 (lr=3.0e-03)
[step 1653/accstep :  (1/1)]: loss=1.75745 (lr=3.0e-03)
[step 1654/accstep :  (1/1)]: loss=1.75430 (lr=3.0e-03)
[step 1655/accstep :  (1/1)]: loss=2.86231 (lr=3.0e-03)
[step 1656/accstep :  (1/1)]: loss=2.38354 (lr=3.0e-03)
[step 1657/accstep :  (1/1)]: loss=1.76648 (lr=3.0e-03)
[step 1658/accstep :  (1/1)]: loss=2.00432 (lr=3.0e-03)
[step 1659/accstep :  (1/1)]: loss=1.78566 (lr=3.0e-03)
[step 1660/accstep :  (1/1)]: loss=2.30196 (lr=3.0e-03)
[step 1661/accstep :  (1/1)]: loss=2.89350 (lr=3.0e-03)
[step 1662/accstep :  (1/1)]: loss=1.59864 (lr=3.0e-03)
[step 1663/accstep :  (1/1)]: loss=2.53849 (lr=3.0e-03)
[step 1664/accstep :  (1/1)]: loss=1.76677 (lr=3.0e-03)
[step 1665/accstep :  (1/1)]: loss=1.67889 (lr=3

[step 1795/accstep :  (1/1)]: loss=1.28428 (lr=3.0e-03)
[step 1796/accstep :  (1/1)]: loss=0.50469 (lr=3.0e-03)
[step 1797/accstep :  (1/1)]: loss=1.18377 (lr=3.0e-03)
[step 1798/accstep :  (1/1)]: loss=1.07570 (lr=3.0e-03)
[step 1799/accstep :  (1/1)]: loss=1.34520 (lr=3.0e-03)
[step 1800/accstep :  (1/1)]: loss=1.62287 (lr=3.0e-03)
[step 1801/accstep :  (1/1)]: loss=1.41613 (lr=3.0e-03)
[step 1802/accstep :  (1/1)]: loss=1.24278 (lr=3.0e-03)
[step 1803/accstep :  (1/1)]: loss=2.07609 (lr=3.0e-03)
[step 1804/accstep :  (1/1)]: loss=3.38634 (lr=3.0e-03)
[step 1805/accstep :  (1/1)]: loss=0.78071 (lr=3.0e-03)
[step 1806/accstep :  (1/1)]: loss=1.11626 (lr=3.0e-03)
[step 1807/accstep :  (1/1)]: loss=1.22563 (lr=3.0e-03)
[step 1808/accstep :  (1/1)]: loss=4.18168 (lr=3.0e-03)
[step 1809/accstep :  (1/1)]: loss=1.58446 (lr=3.0e-03)
[step 1810/accstep :  (1/1)]: loss=1.74646 (lr=3.0e-03)
[step 1811/accstep :  (1/1)]: loss=1.50250 (lr=3.0e-03)
[step 1812/accstep :  (1/1)]: loss=3.61119 (lr=3

[step 1942/accstep :  (1/1)]: loss=1.77442 (lr=3.0e-03)
[step 1943/accstep :  (1/1)]: loss=1.50272 (lr=3.0e-03)
[step 1944/accstep :  (1/1)]: loss=3.40598 (lr=3.0e-03)
[step 1945/accstep :  (1/1)]: loss=0.97586 (lr=3.0e-03)
[step 1946/accstep :  (1/1)]: loss=1.90468 (lr=3.0e-03)
[step 1947/accstep :  (1/1)]: loss=0.78273 (lr=3.0e-03)
[step 1948/accstep :  (1/1)]: loss=1.24576 (lr=3.0e-03)
[step 1949/accstep :  (1/1)]: loss=3.67530 (lr=3.0e-03)
[step 1950/accstep :  (1/1)]: loss=1.26722 (lr=3.0e-03)
[step 1951/accstep :  (1/1)]: loss=2.15248 (lr=3.0e-03)
[step 1952/accstep :  (1/1)]: loss=1.19801 (lr=3.0e-03)
[step 1953/accstep :  (1/1)]: loss=3.32256 (lr=3.0e-03)
[step 1954/accstep :  (1/1)]: loss=2.93247 (lr=3.0e-03)
[step 1955/accstep :  (1/1)]: loss=0.97913 (lr=3.0e-03)
[step 1956/accstep :  (1/1)]: loss=0.73364 (lr=3.0e-03)
[step 1957/accstep :  (1/1)]: loss=0.85020 (lr=3.0e-03)
[step 1958/accstep :  (1/1)]: loss=1.80560 (lr=3.0e-03)
[step 1959/accstep :  (1/1)]: loss=2.60596 (lr=3

[step 2091/accstep :  (1/1)]: loss=1.91169 (lr=3.0e-03)
[step 2092/accstep :  (1/1)]: loss=3.48672 (lr=3.0e-03)
[step 2093/accstep :  (1/1)]: loss=0.69597 (lr=3.0e-03)
[step 2094/accstep :  (1/1)]: loss=4.12178 (lr=3.0e-03)
[step 2095/accstep :  (1/1)]: loss=1.48143 (lr=3.0e-03)
[step 2096/accstep :  (1/1)]: loss=3.48433 (lr=3.0e-03)
[step 2097/accstep :  (1/1)]: loss=2.20089 (lr=3.0e-03)
[step 2098/accstep :  (1/1)]: loss=1.59587 (lr=3.0e-03)
[step 2099/accstep :  (1/1)]: loss=1.99224 (lr=3.0e-03)
[step 2100/accstep :  (1/1)]: loss=2.14954 (lr=3.0e-03)
[step 2101/accstep :  (1/1)]: loss=1.59435 (lr=3.0e-03)
[step 2102/accstep :  (1/1)]: loss=2.45318 (lr=3.0e-03)
[step 2103/accstep :  (1/1)]: loss=3.05461 (lr=3.0e-03)
[step 2104/accstep :  (1/1)]: loss=3.26836 (lr=3.0e-03)
[step 2105/accstep :  (1/1)]: loss=2.62210 (lr=3.0e-03)
[step 2106/accstep :  (1/1)]: loss=1.84492 (lr=3.0e-03)
[step 2107/accstep :  (1/1)]: loss=1.34767 (lr=3.0e-03)
[step 2108/accstep :  (1/1)]: loss=1.30034 (lr=3

[step 2239/accstep :  (1/1)]: loss=1.68349 (lr=3.0e-03)
[step 2240/accstep :  (1/1)]: loss=1.57083 (lr=3.0e-03)
[step 2241/accstep :  (1/1)]: loss=1.70781 (lr=3.0e-03)
[step 2242/accstep :  (1/1)]: loss=1.65426 (lr=3.0e-03)
[step 2243/accstep :  (1/1)]: loss=1.58610 (lr=3.0e-03)
[step 2244/accstep :  (1/1)]: loss=3.05754 (lr=3.0e-03)
[step 2245/accstep :  (1/1)]: loss=1.16120 (lr=3.0e-03)
[step 2246/accstep :  (1/1)]: loss=1.36971 (lr=3.0e-03)
[step 2247/accstep :  (1/1)]: loss=1.53077 (lr=3.0e-03)
[step 2248/accstep :  (1/1)]: loss=1.48128 (lr=3.0e-03)
[step 2249/accstep :  (1/1)]: loss=2.86999 (lr=3.0e-03)
[step 2250/accstep :  (1/1)]: loss=1.56508 (lr=3.0e-03)
[step 2251/accstep :  (1/1)]: loss=2.77701 (lr=3.0e-03)
[step 2252/accstep :  (1/1)]: loss=3.42052 (lr=3.0e-03)
[step 2253/accstep :  (1/1)]: loss=1.82771 (lr=3.0e-03)
[step 2254/accstep :  (1/1)]: loss=1.33887 (lr=3.0e-03)
[step 2255/accstep :  (1/1)]: loss=1.29788 (lr=3.0e-03)
[step 2256/accstep :  (1/1)]: loss=1.65839 (lr=3

[step 2387/accstep :  (1/1)]: loss=1.21105 (lr=3.0e-03)
[step 2388/accstep :  (1/1)]: loss=1.83746 (lr=3.0e-03)
[step 2389/accstep :  (1/1)]: loss=0.72455 (lr=3.0e-03)
[step 2390/accstep :  (1/1)]: loss=1.19292 (lr=3.0e-03)
[step 2391/accstep :  (1/1)]: loss=1.35115 (lr=3.0e-03)
[step 2392/accstep :  (1/1)]: loss=1.01328 (lr=3.0e-03)
[step 2393/accstep :  (1/1)]: loss=0.57754 (lr=3.0e-03)
[step 2394/accstep :  (1/1)]: loss=4.19527 (lr=3.0e-03)
[step 2395/accstep :  (1/1)]: loss=1.99391 (lr=3.0e-03)
[step 2396/accstep :  (1/1)]: loss=0.81982 (lr=3.0e-03)
[step 2397/accstep :  (1/1)]: loss=3.34830 (lr=3.0e-03)
[step 2398/accstep :  (1/1)]: loss=1.80918 (lr=3.0e-03)
[step 2399/accstep :  (1/1)]: loss=1.34247 (lr=3.0e-03)
[step 2400/accstep :  (1/1)]: loss=1.25317 (lr=3.0e-03)
[step 2401/accstep :  (1/1)]: loss=1.31735 (lr=3.0e-03)
[step 2402/accstep :  (1/1)]: loss=0.67932 (lr=3.0e-03)
[step 2403/accstep :  (1/1)]: loss=1.36641 (lr=3.0e-03)
[step 2404/accstep :  (1/1)]: loss=1.86541 (lr=3