In [97]:
import argparse
import os
import sys
import datetime
import time
import math
import json
from pathlib import Path

import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.distributed as dist
import torch.backends.cudnn as cudnn
import torch.nn.functional as F
from torchvision import datasets, transforms
from torchvision import models as torchvision_models

import utils
import vision_transformer as vits
from vision_transformer import DINOHead

import easydict
import torchvision
from tqdm import tqdm

In [98]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [99]:
args = easydict.EasyDict({
     # Model parameters
    "arch" : 'vit_small',
    'patch_size' : 16,
    'out_dim' : 100,
#     'out_dim' : 65536,
    'norm_last_layer' : True,
    'momentum_teacher' : 0.996,
    'use_bn_in_head' : False,
    # Temperature teacher parameters
    'warmup_teacher_temp' : 0.04,
    'teacher_temp' : 0.04,
    'warmup_teacher_temp_epochs' : 0,
    # Training/Optimization parameters
    'use_fp16' : True,
    'weight_decay' : 0.04,
    'weight_decay_end' : 0.4,
    'clip_grad' : 3.0,
    'batch_size_per_gpu' : 64,
    'epochs' : 100,
    'freeze_last_layer' : 1,
    'lr' : 0.0005,
    'warmup_epochs' : 10,
    'min_lr' : 1e-6,
    'optimizer' : 'adamw',
    'drop_path_rate' : 0.1,
    # Multi-crop parameters
    'global_crops_scale': (0.4, 1.),
    'local_crops_number' : 8,
    'local_crops_scale' : (0.05, 0.4),
     # Misc
    'data_path' : 'C:\\Users\\dhkim\\Desktop\\directory\\imagenet_data',
    'output_dir' : 'C:\\Users\\dhkim\\Desktop\\directory\\dino-main/main_pth',
    'saveckp_freq' : 20,
    'seed' : 0,
    'num_workers' : 0,
#     'dist_url' : 'env://'
    'local_rank' : 0,
    'gpu' : device
   })

start_epoch = 0

In [100]:
utils.fix_random_seeds(args.seed)
cudnn.benchmark = True

In [101]:
class DataAugmentationDINO(object):
    def __init__(self, global_crops_scale, local_crops_scale, local_crops_number):
        flip_and_color_jitter = transforms.Compose([
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.RandomApply(
                [transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.2, hue=0.1)],
                p=0.8
            ),
            transforms.RandomGrayscale(p=0.2),
        ])
        normalize = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        ])

        # first global crop
        self.global_transfo1 = transforms.Compose([
            transforms.RandomResizedCrop(224, scale=global_crops_scale, interpolation=Image.BICUBIC),
            flip_and_color_jitter,
            utils.GaussianBlur(1.0),
            normalize,
        ])
        # second global crop
        self.global_transfo2 = transforms.Compose([
            transforms.RandomResizedCrop(224, scale=global_crops_scale, interpolation=Image.BICUBIC),
            flip_and_color_jitter,
            utils.GaussianBlur(0.1),
            utils.Solarization(0.2),
            normalize,
        ])
        # transformation for the local small crops
        self.local_crops_number = local_crops_number
        self.local_transfo = transforms.Compose([
            transforms.RandomResizedCrop(96, scale=local_crops_scale, interpolation=Image.BICUBIC),
            flip_and_color_jitter,
            utils.GaussianBlur(p=0.5),
            normalize,
        ])

    def __call__(self, image):
        crops = []
        crops.append(self.global_transfo1(image))
        crops.append(self.global_transfo2(image))
        for _ in range(self.local_crops_number):
            crops.append(self.local_transfo(image))
        return crops

In [102]:
class DINOLoss(nn.Module):
    def __init__(self, out_dim, ncrops, warmup_teacher_temp, teacher_temp,
                 warmup_teacher_temp_epochs, nepochs, student_temp=0.1,
                 center_momentum=0.9):
        super().__init__()
        self.student_temp = student_temp
        self.center_momentum = center_momentum
        self.ncrops = ncrops
        self.register_buffer("center", torch.zeros(1, out_dim))
        # we apply a warm up for the teacher temperature because
        # a too high temperature makes the training instable at the beginning
        self.teacher_temp_schedule = np.concatenate((
            np.linspace(warmup_teacher_temp,
                        teacher_temp, warmup_teacher_temp_epochs),
            np.ones(nepochs - warmup_teacher_temp_epochs) * teacher_temp
        ))
        

    def forward(self, student_output, teacher_output, epoch, target_labels):
        """
        Cross-entropy between softmax outputs of the teacher and student networks.
        """
        criterion = nn.CrossEntropyLoss()
        
        student_out = student_output / self.student_temp
#         print("student_out :", student_out)
#         print("student_out_shape :", student_out.shape)        
        student_out = student_out.chunk(self.ncrops)
#         for i, chunk in enumerate(student_out):
#             print(f"Chunk {i} shape: {chunk.shape}")
        

        # teacher centering and sharpening
        temp = self.teacher_temp_schedule[epoch]
        teacher_out = F.softmax((teacher_output - self.center) / temp, dim=-1)
#         print("teacher_out : ", teacher_out)
#         print("teacher_out_shape : ", teacher_out.shape)
        teacher_out = teacher_out.detach().chunk(2)

        total_loss = 0
        n_loss_terms = 0
        correct_predictions = 0  # Initialize correct_predictions variable
        total_predictions = 0

        for iq, q in enumerate(teacher_out):
            for v in range(len(student_out)):
                if v == iq:
                    # we skip cases where student and teacher operate on the same view
                    continue
                loss = criterion(student_out[v], target_labels)
                total_loss += loss
                n_loss_terms += 1
        total_loss /= n_loss_terms
        self.update_center(teacher_output)
        
         # Calculate accuracy
        _, predicted_labels = torch.max(student_out[v], dim=1)
        print("predicted_labels :",predicted_labels )
        correct_predictions += (predicted_labels == target_labels).sum().item()
        total_predictions += target_labels.size(0)
        accuracy = correct_predictions / total_predictions *100
        print("accuracy :", accuracy)
        
        return total_loss, accuracy

    @torch.no_grad()
    def update_center(self, teacher_output):
        """
        Update center used for teacher output.
        """
        batch_center = torch.sum(teacher_output, dim=0, keepdim=True)
#         dist.all_reduce(batch_center)
        batch_center = batch_center / (len(teacher_output))

        # ema update
        self.center = self.center * self.center_momentum + batch_center * (1 - self.center_momentum)

In [103]:
transform = DataAugmentationDINO(
    args.global_crops_scale,
    args.local_crops_scale,
    args.local_crops_number,
)

dataset = torchvision.datasets.CIFAR100(root='../data', train=True,
                                download=True, transform=transform)

data_loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=args.batch_size_per_gpu,
    num_workers=args.num_workers,
    pin_memory=True,
    drop_last=True,
)
print(f"Data loaded: there are {len(dataset)} images.")

Files already downloaded and verified
Data loaded: there are 50000 images.


In [104]:
student = vits.__dict__[args.arch](
    patch_size=args.patch_size,
    drop_path_rate=args.drop_path_rate,  # stochastic depth
)
teacher = vits.__dict__[args.arch](patch_size=args.patch_size)
embed_dim = student.embed_dim

In [105]:
student = utils.MultiCropWrapper(
    student, DINOHead(
    embed_dim,
    args.out_dim,
    use_bn=args.use_bn_in_head,
    norm_last_layer=args.norm_last_layer,
))
teacher = utils.MultiCropWrapper(
    teacher,
    DINOHead(embed_dim, args.out_dim, args.use_bn_in_head),
)
# fc_input_dim = 65536  # 이전 레이어의 출력 차원
# fc_output_dim = 100  # 원하는 출력 차원

# # 마지막 FC 레이어 추가
# student.fc = nn.Linear(fc_input_dim, fc_output_dim)
# teacher.fc = nn.Linear(fc_input_dim, fc_output_dim)



# move networks to gpu
student, teacher = student.cuda(), teacher.cuda()

In [106]:
if utils.has_batchnorms(student):
    student = nn.SyncBatchNorm.convert_sync_batchnorm(student)
    teacher = nn.SyncBatchNorm.convert_sync_batchnorm(teacher)

    # we need DDP wrapper to have synchro batch norms working...
    teacher = nn.parallel.DistributedDataParallel(teacher, device_ids=[args.gpu])
    teacher_without_ddp = teacher.module
else:
    # teacher_without_ddp and teacher are the same thing
    teacher_without_ddp = teacher
student.to(args.gpu)
# teacher and student start with the same weights
teacher_without_ddp.load_state_dict(student.state_dict())

<All keys matched successfully>

In [107]:
for p in teacher.parameters():
    p.requires_grad = False
print(f"Student and Teacher are built: they are both {args.arch} network.")

Student and Teacher are built: they are both vit_small network.


In [108]:
dino_loss = DINOLoss(
        args.out_dim,
        args.local_crops_number + 2,  # total number of crops = 2 global crops + local_crops_number
        args.warmup_teacher_temp,
        args.teacher_temp,
        args.warmup_teacher_temp_epochs,
        args.epochs,
    ).cuda()

In [109]:
params_groups = utils.get_params_groups(student)
if args.optimizer == "adamw":
    optimizer = torch.optim.AdamW(params_groups)  # to use with ViTs

In [110]:
# for mixed precision training
fp16_scaler = None
if args.use_fp16:
    fp16_scaler = torch.cuda.amp.GradScaler()

In [111]:
lr_schedule = utils.cosine_scheduler(
    args.lr * (args.batch_size_per_gpu * utils.get_world_size()) / 256.,  # linear scaling rule
    args.min_lr,
    args.epochs, len(data_loader),
    warmup_epochs=args.warmup_epochs,
)
wd_schedule = utils.cosine_scheduler(
    args.weight_decay,
    args.weight_decay_end,
    args.epochs, len(data_loader),
)
# momentum parameter is increased to 1. during training with a cosine schedule
momentum_schedule = utils.cosine_scheduler(args.momentum_teacher, 1, args.epochs, len(data_loader))
print(f"Loss, optimizer and schedulers ready.")

Loss, optimizer and schedulers ready.


In [112]:
print(student)

MultiCropWrapper(
  (backbone): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0): Block(
        (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=384, out_features=1152, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=384, out_features=384, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=384, out_features=1536, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=1536, out_features=384, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
      )
      (1): Block(
       

In [113]:
print(teacher)

MultiCropWrapper(
  (backbone): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 384, kernel_size=(16, 16), stride=(16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0): Block(
        (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=384, out_features=1152, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=384, out_features=384, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=384, out_features=1536, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=1536, out_features=384, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
      )
      (1): Block(
       

In [114]:
print("Starting DINO training !")
for epoch in tqdm(range(start_epoch, args.epochs)):
    for it, (images, target_label) in tqdm(enumerate(data_loader)):         
#         print("Number of images in the list:", len(images))
        
#         for i, image_tensor in enumerate(images):
#             print(f"Shape of image {i}: {image_tensor.shape}")
        
#         print("Target label size:", len(target_label))
        
        if it % 10 == 0:
            print(f"이터레이션: {it}")
        it = len(data_loader) * epoch + it  # global training iteration
        for i, param_group in enumerate(optimizer.param_groups):
            param_group["lr"] = lr_schedule[it]
            if i == 0:  # only the first group is regularized
                param_group["weight_decay"] = wd_schedule[it]
                
        images = [im.cuda(non_blocking=True) for im in images]
#         targets = [tl.cuda(non_blocking=True) for tl in target_label]
        
#         print(len(images))
#         print(len(targets))
        targets = torch.tensor(targets).to(device)
        
        ##muti_view_crop 때문에```
#                 Number of images in the list: 10
#         Shape of image 0: torch.Size([64, 3, 224, 224])
#         Shape of image 1: torch.Size([64, 3, 224, 224])
#         Shape of image 2: torch.Size([64, 3, 96, 96])
#         Shape of image 3: torch.Size([64, 3, 96, 96])
#         Shape of image 4: torch.Size([64, 3, 96, 96])
#         Shape of image 5: torch.Size([64, 3, 96, 96])
#         Shape of image 6: torch.Size([64, 3, 96, 96])
#         Shape of image 7: torch.Size([64, 3, 96, 96])
#         Shape of image 8: torch.Size([64, 3, 96, 96])
#         Shape of image 9: torch.Size([64, 3, 96, 96])
#         Target label size: 64
#         이런구조임
        
         
        with torch.cuda.amp.autocast(fp16_scaler is not None):
            teacher_output = teacher(images[:2])  # only the 2 global views pass through the teacher
            student_output = student(images)
            loss = dino_loss(student_output, teacher_output, epoch, targets)
            print("loss : ", loss)
            
        # student update
        optimizer.zero_grad()
        param_norms = None
        if fp16_scaler is None:
            loss.backward()
            if args.clip_grad:
                param_norms = utils.clip_gradients(student, args.clip_grad)
#             utils.cancel_gradients_last_layer(epoch, student,
#                                               args.freeze_last_layer)
            optimizer.step()
    

    
        with torch.no_grad():
            m = momentum_schedule[it]  # momentum parameter
            #teacher의 가중치를 student에게 가르쳐줌
            for param_q, param_k in zip(student.parameters(), teacher_without_ddp.parameters()):
                param_k.data.mul_(m).add_((1 - m) * param_q.detach().data)
                
    # ============ writing logs ... ============
    save_dict = {
        'student': student.state_dict(),
        'teacher': teacher.state_dict(),
        'optimizer': optimizer.state_dict(),
        'epoch': epoch + 1,
        'args': args,
        'dino_loss': dino_loss.state_dict(),
    }
    
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('Training time {}'.format(total_time_str))





Starting DINO training !


  0%|                                                        | 0/100 [00:00<?, ?it/s]
  targets = torch.tensor(targets).to(device)

1it [00:01,  1.68s/it][A

이터레이션: 0
predicted_labels : tensor([57,  8, 61,  8, 62,  8, 47,  8,  8, 62,  8,  8, 57, 42, 62, 15, 72, 98,
        62, 42,  8, 87,  7, 57, 57, 19,  8, 36,  7, 70, 62, 70,  8, 15, 87,  8,
        37,  8,  8,  8,  6,  8,  8, 39, 61,  8,  8, 44,  8, 62,  8, 72, 57,  8,
         8, 96, 87,  8, 87,  8, 71,  8,  8, 38], device='cuda:0')
accuracy : 0.0
loss :  (tensor(4.8090, device='cuda:0', grad_fn=<DivBackward0>), 0.0)



2it [00:03,  1.65s/it][A

predicted_labels : tensor([96,  6,  5, 61, 38, 96, 93, 25, 93, 71, 49, 52,  8, 42, 87, 62, 62, 62,
        62, 87, 99, 28,  8, 86,  8, 42, 44, 62,  8,  8,  8, 30,  7,  6, 42, 61,
        87, 78, 52, 57,  8, 87, 15, 71, 25, 44, 62, 83,  5,  5, 52, 87, 84, 42,
        62, 61, 84,  8,  7,  5,  8, 52, 78,  5], device='cuda:0')
accuracy : 0.015625
loss :  (tensor(4.7084, device='cuda:0', grad_fn=<DivBackward0>), 0.015625)



3it [00:04,  1.59s/it][A

predicted_labels : tensor([ 5, 49,  8, 87, 49, 92,  8, 52, 87, 78, 16, 84, 85, 93,  8,  6, 84, 19,
        42, 42, 87,  8,  6,  8, 28, 42, 42, 52, 57, 30,  8,  8, 49, 72, 15, 52,
        61, 61,  7,  8,  8,  6,  8, 85, 71,  8,  7,  8, 93, 19, 61, 19,  7,  8,
        88, 78, 27, 28, 42,  8, 44,  8,  8,  8], device='cuda:0')
accuracy : 0.03125
loss :  (tensor(4.7347, device='cuda:0', grad_fn=<DivBackward0>), 0.03125)



4it [00:06,  1.55s/it][A

predicted_labels : tensor([86,  8, 44, 62, 62,  8,  8,  8, 84, 25,  8, 87, 52, 87,  5,  8, 25, 57,
        47, 36,  8,  8,  8, 57, 88, 28, 84,  8,  8, 39, 84,  8, 96, 84, 70,  8,
         8, 19, 62, 44, 15,  8, 57, 52,  8, 42,  7,  8,  6,  8, 44, 72, 87, 27,
        91,  3,  8, 62, 62, 52, 71, 62, 37,  1], device='cuda:0')
accuracy : 0.015625
loss :  (tensor(4.7428, device='cuda:0', grad_fn=<DivBackward0>), 0.015625)



5it [00:07,  1.58s/it][A

predicted_labels : tensor([ 8, 52, 52, 61, 28, 61,  8,  8, 78, 62, 42,  8, 19, 42, 95,  1, 72, 87,
        72, 42,  1, 62, 63,  8, 57,  8,  7,  8,  5, 62, 44,  8,  8, 96,  3, 93,
         8, 52, 24,  8, 15, 62,  8, 56, 84,  7, 72,  5, 30, 19, 52, 61, 42,  8,
        57,  8, 25,  8,  5,  3, 27, 87,  7, 78], device='cuda:0')
accuracy : 0.015625
loss :  (tensor(4.7926, device='cuda:0', grad_fn=<DivBackward0>), 0.015625)



6it [00:09,  1.58s/it][A

predicted_labels : tensor([ 8,  8, 62, 57, 72,  7,  8,  6,  7, 42, 37, 49, 96, 22, 52, 15,  7, 42,
        71, 87, 62,  8, 87, 62,  8, 81,  8, 52, 16, 15,  8, 93, 93, 93, 42, 93,
         7, 39,  8, 24, 66, 45, 66, 87,  8, 30, 91, 87,  8, 84, 52, 79, 57, 52,
         8, 62, 47, 44, 33,  7, 87, 52, 49,  1], device='cuda:0')
accuracy : 0.0625
loss :  (tensor(4.7435, device='cuda:0', grad_fn=<DivBackward0>), 0.0625)



7it [00:11,  1.57s/it][A

predicted_labels : tensor([19, 99,  8,  7,  8,  6, 27, 62, 57, 42, 52, 15, 87, 24,  8, 37, 61, 62,
        39,  8,  8,  8, 45, 49, 85, 62, 93,  7, 74, 30,  6, 52, 27, 49, 36,  8,
         8, 84,  8, 49, 22,  8, 25, 57, 52,  8, 19,  8, 84,  8, 52, 92,  8, 52,
        99, 30,  6, 76, 45, 84, 72, 87,  8, 62], device='cuda:0')
accuracy : 0.015625
loss :  (tensor(4.7885, device='cuda:0', grad_fn=<DivBackward0>), 0.015625)



8it [00:12,  1.54s/it][A

predicted_labels : tensor([ 8, 61, 42, 85, 84, 42,  8,  8, 15, 98,  8, 19, 85,  7, 62, 62,  6, 19,
        96,  8,  8, 93, 44,  6,  6,  8, 74,  8, 25, 90, 70, 62, 47, 39,  8,  8,
        87, 27, 84,  8, 90, 62, 62, 47, 44, 44,  8, 78, 62, 52, 93,  6, 96, 27,
         8, 49, 62,  8,  8, 87, 36,  8, 83,  8], device='cuda:0')
accuracy : 0.03125
loss :  (tensor(4.7789, device='cuda:0', grad_fn=<DivBackward0>), 0.03125)



9it [00:14,  1.53s/it][A

predicted_labels : tensor([62, 57, 19, 24, 84, 28,  8,  8, 38, 57,  6, 42, 61,  8, 52, 15, 76, 38,
        87, 87, 93, 44, 16, 78, 38,  8, 15, 62, 62, 57, 84,  8, 15, 49, 61, 15,
        47, 27, 96,  8, 42, 57, 16, 52,  8,  8,  8, 72, 49, 57, 57, 78,  8, 42,
        61, 30, 84,  8,  8,  8,  8,  8,  8, 38], device='cuda:0')
accuracy : 0.0
loss :  (tensor(4.6963, device='cuda:0', grad_fn=<DivBackward0>), 0.0)



10it [00:15,  1.52s/it][A

predicted_labels : tensor([15, 21, 44, 15,  6, 42, 78,  8, 62, 84, 42,  8, 56, 93,  5, 39, 44, 42,
        19, 52, 30,  7,  8,  6, 91, 87,  7, 57, 19, 52,  8,  8, 25, 52, 62, 87,
        52, 85,  8, 42,  8,  8, 78, 52,  6, 15, 38,  8, 62, 36, 96, 47,  2, 70,
        96,  8, 93, 52,  7,  8,  8, 52, 19,  8], device='cuda:0')
accuracy : 0.03125
loss :  (tensor(4.8046, device='cuda:0', grad_fn=<DivBackward0>), 0.03125)



11it [00:17,  1.52s/it][A

이터레이션: 10
predicted_labels : tensor([ 7, 57, 93, 16, 52,  8,  6, 42, 42, 57, 99, 42,  8, 61, 42, 62, 52, 49,
        15,  8, 52, 52,  6,  8, 52, 38, 19, 62,  8, 92,  8,  5, 75, 61,  8, 42,
        42,  8, 93, 52,  3, 30,  8, 27,  8, 52, 62, 71,  8, 62,  8, 96, 30, 96,
        19, 62,  8,  8, 52, 38,  8, 15, 62,  8], device='cuda:0')
accuracy : 0.015625
loss :  (tensor(4.6950, device='cuda:0', grad_fn=<DivBackward0>), 0.015625)



12it [00:18,  1.50s/it][A

predicted_labels : tensor([27,  1,  8, 85, 62,  8, 30,  8, 87,  7, 87, 36, 52, 52, 57,  8,  8,  8,
        36,  8,  8, 96, 47,  8, 84, 30, 27, 52, 87, 47, 62, 37, 27,  8, 42, 78,
         8,  8, 42,  8, 42,  8, 96, 76, 42, 87, 62,  6,  7,  8, 44, 45, 87,  8,
         8,  8, 96, 71, 79, 44,  8, 47,  8, 62], device='cuda:0')
accuracy : 0.046875
loss :  (tensor(4.7714, device='cuda:0', grad_fn=<DivBackward0>), 0.046875)



13it [00:20,  1.50s/it][A

predicted_labels : tensor([49,  8, 52,  8, 15, 15, 88, 92,  8, 19, 42,  8, 42,  1,  1, 30, 49, 70,
         8, 38, 93, 42, 52, 99, 59, 87,  8,  3, 87,  7, 19, 30, 61, 25,  8, 62,
         8, 87, 24,  8, 62, 62, 87, 25,  8, 27,  8, 30, 87, 61, 90, 24, 85, 93,
        39, 28, 15, 84, 91,  8,  6, 99, 36,  8], device='cuda:0')
accuracy : 0.03125
loss :  (tensor(4.7010, device='cuda:0', grad_fn=<DivBackward0>), 0.03125)



14it [00:21,  1.50s/it][A

predicted_labels : tensor([19, 44, 87,  5,  8, 37,  8, 57,  5, 42,  8, 61, 96, 52, 52, 52,  8, 87,
         8,  8, 84, 61, 42, 19, 57, 78,  8, 57, 42, 57,  8, 62,  8,  8, 42, 52,
        87,  8, 36, 96, 61, 84, 98, 71, 57, 49, 66, 42, 30,  8, 92, 42, 42,  8,
         8,  8,  8,  3, 49, 52, 45, 24,  2, 61], device='cuda:0')
accuracy : 0.0625
loss :  (tensor(4.7523, device='cuda:0', grad_fn=<DivBackward0>), 0.0625)



15it [00:22,  1.49s/it][A

predicted_labels : tensor([96, 78,  8, 49,  8, 84,  8,  8,  8,  8, 93, 30,  8,  6, 44, 87, 57,  8,
        44, 93, 93, 28, 33, 15, 62,  8,  8,  8, 84, 52, 90, 57, 87, 70,  8,  7,
        62, 52,  8, 87, 57,  8,  8, 62,  8,  8,  7, 84, 19,  8, 30, 19, 87, 72,
        57, 62, 42, 71, 62,  8, 85,  8, 52,  6], device='cuda:0')
accuracy : 0.015625
loss :  (tensor(4.7418, device='cuda:0', grad_fn=<DivBackward0>), 0.015625)



16it [00:24,  1.44s/it][A

predicted_labels : tensor([ 8, 39, 71,  6,  8,  8, 36, 44, 87, 37, 62,  5, 42, 80, 33,  8, 30, 42,
         8,  8, 85, 71, 59,  8, 62, 42, 52, 87,  8, 38,  6, 25, 39, 15, 59, 62,
        47, 42,  8, 96, 57, 92, 42,  8, 52, 61, 42,  5, 57,  8, 44,  6, 52,  8,
         6,  8,  8, 36, 42, 52, 87, 62, 20,  8], device='cuda:0')
accuracy : 0.0
loss :  (tensor(4.7876, device='cuda:0', grad_fn=<DivBackward0>), 0.0)



17it [00:25,  1.40s/it][A

predicted_labels : tensor([ 8, 15, 84, 52, 79, 96, 93, 52, 96,  8, 84, 84, 15,  8,  6, 62, 62, 19,
        72,  5, 42, 87, 52,  8, 42,  8, 87, 96, 27,  6, 72,  8, 39,  7, 30,  7,
         7,  2,  8, 15, 91, 30, 47, 87, 42,  3,  6, 72, 30,  8, 57, 62, 52, 52,
         6,  8, 62, 27, 92, 61, 52, 42, 49,  3], device='cuda:0')
accuracy : 0.0
loss :  (tensor(4.8719, device='cuda:0', grad_fn=<DivBackward0>), 0.0)



18it [00:26,  1.37s/it][A

predicted_labels : tensor([92,  8, 98,  8,  8,  8, 85, 98, 62, 84, 44, 52, 57, 49, 78, 52, 78, 44,
        42, 62, 62,  8, 62,  8, 52, 49,  6, 15,  8, 70, 45, 52,  8, 57, 62, 62,
        42,  8, 86, 24, 72, 74,  8, 15, 61,  8,  8, 16, 74,  8, 52, 52,  1,  6,
         8,  8, 52, 42, 78, 42, 96, 57, 96, 62], device='cuda:0')
accuracy : 0.0
loss :  (tensor(4.7498, device='cuda:0', grad_fn=<DivBackward0>), 0.0)



19it [00:28,  1.35s/it][A

predicted_labels : tensor([ 8, 42, 24,  8, 84, 62,  8, 72,  8, 15,  8,  3, 44, 72, 92, 62, 96,  8,
        96,  8,  3,  8, 44, 42, 61, 27, 78, 96, 19, 75, 61, 47,  8, 42,  8, 39,
        57,  8, 42, 47, 61, 42,  8,  8, 37, 19, 42, 93, 87, 78,  8, 36, 61, 44,
         8,  8, 84, 96, 30, 62, 62,  8, 61, 96], device='cuda:0')
accuracy : 0.015625
loss :  (tensor(4.8473, device='cuda:0', grad_fn=<DivBackward0>), 0.015625)



20it [00:29,  1.35s/it][A

predicted_labels : tensor([42, 33,  8, 85, 52, 91, 36,  8, 39,  8,  5, 96, 19,  8, 92, 30, 44, 52,
         8, 62,  8, 52, 57,  3, 15,  8, 96, 30, 57,  8, 52, 42, 36, 33,  8,  7,
        62,  8, 47,  8, 28,  8, 30, 96, 47, 44, 72,  6, 19, 52, 62, 62, 52,  8,
        19, 42,  8, 42, 36, 30, 42, 84, 30, 27], device='cuda:0')
accuracy : 0.015625
loss :  (tensor(4.7906, device='cuda:0', grad_fn=<DivBackward0>), 0.015625)



21it [00:30,  1.35s/it][A

이터레이션: 20
predicted_labels : tensor([52, 52, 84, 52, 61, 27, 15,  6, 19, 62, 57, 57, 96, 96, 85, 42, 96,  7,
        79,  6,  8, 39, 70, 38, 52,  8, 57, 42,  7,  8, 30, 56,  8,  8,  8, 15,
        61, 93, 93, 70, 42,  8, 84,  8, 62, 45,  8, 47, 93,  8, 30, 25, 75, 84,
        62,  8, 70, 96, 62, 87, 57, 84, 49, 96], device='cuda:0')
accuracy : 0.015625
loss :  (tensor(4.7631, device='cuda:0', grad_fn=<DivBackward0>), 0.015625)



22it [00:32,  1.34s/it][A

predicted_labels : tensor([93, 57, 62, 87,  8, 52, 19, 52, 93, 19,  8,  8, 84, 57,  8,  6,  8, 44,
        25, 87, 57, 52, 62,  5, 49, 62,  8, 92,  8, 52, 52, 52, 44, 87, 96, 44,
         7, 44,  8,  8,  8, 42, 61,  8,  8, 57, 84, 52,  2, 52, 61, 84, 44, 47,
        71, 39, 75,  8, 59,  8, 15, 76,  6, 62], device='cuda:0')
accuracy : 0.03125
loss :  (tensor(4.7454, device='cuda:0', grad_fn=<DivBackward0>), 0.03125)



23it [00:33,  1.33s/it][A

predicted_labels : tensor([ 1, 84,  8, 44, 52, 27,  8, 96, 19, 39, 52, 44,  8,  8, 52, 36, 62,  8,
        96, 78, 19, 61, 87, 27, 99,  8,  8, 72,  6,  8, 42,  8, 27,  8, 42, 36,
        62,  8, 57, 44,  8, 96,  6, 62,  8,  8, 48,  8, 96, 91, 62, 52, 36, 61,
         3, 57, 62, 91, 42, 56, 52, 52, 87, 22], device='cuda:0')
accuracy : 0.0
loss :  (tensor(4.8048, device='cuda:0', grad_fn=<DivBackward0>), 0.0)



24it [00:34,  1.33s/it][A

predicted_labels : tensor([96, 59, 70, 25, 52,  8,  8, 87, 96, 57, 62, 96, 39, 49, 36, 19,  8,  8,
        52, 52,  8,  6,  8,  8,  8, 57,  8, 36, 48, 45, 52, 85, 30,  8, 96, 93,
         8, 71, 87, 62, 96,  6, 61, 66, 57, 84, 85, 62, 85, 62,  8, 72,  8, 62,
         5, 15,  5, 52, 52, 93, 62, 45,  8, 30], device='cuda:0')
accuracy : 0.03125
loss :  (tensor(4.7044, device='cuda:0', grad_fn=<DivBackward0>), 0.03125)



25it [00:36,  1.33s/it][A

predicted_labels : tensor([ 8, 42, 62,  7, 84, 87,  8, 93, 36, 66, 42,  8,  8, 19, 37, 72,  8,  3,
        30,  8, 57, 92,  8, 36, 52, 61, 52,  8,  5, 42, 45, 62, 52, 39, 72, 28,
        52, 49, 42,  8,  1,  8, 24, 96,  8, 91,  8, 52, 38, 52,  8, 62, 85, 52,
         8, 76, 91,  5, 99, 96,  8, 27,  8,  8], device='cuda:0')
accuracy : 0.015625
loss :  (tensor(4.7086, device='cuda:0', grad_fn=<DivBackward0>), 0.015625)



26it [00:37,  1.32s/it][A

predicted_labels : tensor([72, 62, 87, 44, 45, 42,  8, 84,  1,  8,  8,  8, 52,  8,  8, 40, 62, 52,
        42, 39,  7, 30, 42, 42, 19, 19,  8, 52, 52, 42, 63,  8,  8, 42,  5,  8,
        44, 47, 91, 96, 92, 62,  6, 87,  8,  7, 81, 93, 49, 52,  8, 42, 62, 71,
        62,  6, 42, 62,  8, 36,  8,  8, 44, 42], device='cuda:0')
accuracy : 0.03125
loss :  (tensor(4.7863, device='cuda:0', grad_fn=<DivBackward0>), 0.03125)



27it [00:38,  1.33s/it][A

predicted_labels : tensor([61, 93, 70, 96, 84, 84, 96, 96, 27, 62, 85,  5, 15, 49,  8, 72, 42,  7,
        97,  6, 84, 52, 70, 84, 91,  8,  8,  8,  8, 87, 27, 38,  7, 84, 19, 38,
        52, 93, 15, 33, 62,  8,  8, 90, 87, 61, 39, 52, 84,  3,  1,  8, 42, 62,
        62,  1, 62,  8,  8, 87,  8, 52, 15, 62], device='cuda:0')
accuracy : 0.015625
loss :  (tensor(4.7751, device='cuda:0', grad_fn=<DivBackward0>), 0.015625)



28it [00:40,  1.33s/it][A

predicted_labels : tensor([57, 52, 52, 87,  8,  8,  8,  8, 76, 87,  8, 38, 93, 52,  8, 44, 52, 44,
        84, 47,  8, 85,  8, 45, 71,  8, 96, 52,  7,  8, 96,  8, 93, 62,  6, 44,
        52, 24, 76, 61, 91,  8, 87,  8,  4, 59, 49,  8, 38,  8, 57, 62, 30, 37,
        90, 61,  7, 90,  8, 62,  8, 44, 38, 87], device='cuda:0')
accuracy : 0.015625
loss :  (tensor(4.8214, device='cuda:0', grad_fn=<DivBackward0>), 0.015625)


28it [00:40,  1.46s/it]
  0%|                                                        | 0/100 [00:40<?, ?it/s]


KeyboardInterrupt: 