In [1]:
from PIL import Image
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
import torchvision
from torchvision import transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import utils
from tqdm import tqdm
import warnings

warnings.simplefilter('ignore', Image.DecompressionBombWarning)
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True


In [2]:
lr = 5e-4
n_epochs = 100
training_path = "/DAS_Storage4/hyungseok/Training"
validation_path = "/DAS_Storage4/hyungseok/Validation"
path = "/DAS_Storage4/hyungseok/2020-02-140.동의보감약초_sample/img"

In [3]:
train_trans = transforms.Compose([                          
                           transforms.ToTensor(),
                           transforms.RandomResizedCrop(224),                          
                           transforms.RandomHorizontalFlip(),
                           transforms.RandomVerticalFlip(),                           
                           transforms.Normalize((0.4455003,0.530581,0.30614495),(0.17718968,0.1856016,0.16673586))                           
                           ])
valid_trans = transforms.Compose([
                           transforms.ToTensor(),
                           transforms.RandomResizedCrop(224),
                           transforms.Normalize((0.4455003,0.530581,0.30614495),(0.17718968,0.1856016,0.16673586))
                           ])
trainset = torchvision.datasets.ImageFolder(root = training_path,
                                           transform = train_trans)
validset = torchvision.datasets.ImageFolder(root = validation_path,
                                           transform = valid_trans)
train_loader = DataLoader(trainset, batch_size = 256, shuffle = True, num_workers = 8)
valid_loader = DataLoader(validset, batch_size = 256, shuffle = False, num_workers = 8)

In [4]:
class PatchEmbed(nn.Module):
    """Split image into patches and then embed them.
    Parameters
    ----------
    img_size : int
        Size of the image (it is a square).
    patch_size : int
        Size of the patch (it is a square).
    in_chans : int
        Number of input channels.
    embed_dim : int
        The emmbedding dimension.
    Attributes
    ----------
    n_patches : int
        Number of patches inside of our image.
    proj : nn.Conv2d
        Convolutional layer that does both the splitting into patches
        and their embedding.
    """
    def __init__(self, img_size, patch_size, in_chans=3, embed_dim=768):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.n_patches = (img_size // patch_size) ** 2


        self.proj = nn.Conv2d(
                in_chans,
                embed_dim,
                kernel_size=patch_size,
                stride=patch_size,
        )

    def forward(self, x):
        """Run forward pass.
        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, in_chans, img_size, img_size)`.
        Returns
        -------
        torch.Tensor
            Shape `(n_samples, n_patches, embed_dim)`.
        """
        x = self.proj(
                x
            )  # (n_samples, embed_dim, n_patches ** 0.5, n_patches ** 0.5)
        x = x.flatten(2)  # (n_samples, embed_dim, n_patches)
        x = x.transpose(1, 2)  # (n_samples, n_patches, embed_dim)

        return x


class Attention(nn.Module):
    """Attention mechanism.
    Parameters
    ----------
    dim : int
        The input and out dimension of per token features.
    n_heads : int
        Number of attention heads.
    qkv_bias : bool
        If True then we include bias to the query, key and value projections.
    attn_p : float
        Dropout probability applied to the query, key and value tensors.
    proj_p : float
        Dropout probability applied to the output tensor.
    Attributes
    ----------
    scale : float
        Normalizing consant for the dot product.
    qkv : nn.Linear
        Linear projection for the query, key and value.
    proj : nn.Linear
        Linear mapping that takes in the concatenated output of all attention
        heads and maps it into a new space.
    attn_drop, proj_drop : nn.Dropout
        Dropout layers.
    """
    def __init__(self, dim, n_heads=12, qkv_bias=True, attn_p=0., proj_p=0.):
        super().__init__()
        self.n_heads = n_heads
        self.dim = dim
        self.head_dim = dim // n_heads
        self.scale = self.head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_p)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_p)

    def forward(self, x):
        """Run forward pass.
        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, n_patches + 1, dim)`.
        Returns
        -------
        torch.Tensor
            Shape `(n_samples, n_patches + 1, dim)`.
        """
        n_samples, n_tokens, dim = x.shape

        if dim != self.dim:
            raise ValueError

        qkv = self.qkv(x)  # (n_samples, n_patches + 1, 3 * dim)
        qkv = qkv.reshape(
                n_samples, n_tokens, 3, self.n_heads, self.head_dim
        )  # (n_smaples, n_patches + 1, 3, n_heads, head_dim)
        qkv = qkv.permute(
                2, 0, 3, 1, 4
        )  # (3, n_samples, n_heads, n_patches + 1, head_dim)

        q, k, v = qkv[0], qkv[1], qkv[2]
        k_t = k.transpose(-2, -1)  # (n_samples, n_heads, head_dim, n_patches + 1)
        dp = (
           q @ k_t
        ) * self.scale # (n_samples, n_heads, n_patches + 1, n_patches + 1)
        attn = dp.softmax(dim=-1)  # (n_samples, n_heads, n_patches + 1, n_patches + 1)
        attn = self.attn_drop(attn)

        weighted_avg = attn @ v  # (n_samples, n_heads, n_patches +1, head_dim)
        weighted_avg = weighted_avg.transpose(
                1, 2
        )  # (n_samples, n_patches + 1, n_heads, head_dim)
        weighted_avg = weighted_avg.flatten(2)  # (n_samples, n_patches + 1, dim)

        x = self.proj(weighted_avg)  # (n_samples, n_patches + 1, dim)
        x = self.proj_drop(x)  # (n_samples, n_patches + 1, dim)

        return x


class MLP(nn.Module):
    """Multilayer perceptron.
    Parameters
    ----------
    in_features : int
        Number of input features.
    hidden_features : int
        Number of nodes in the hidden layer.
    out_features : int
        Number of output features.
    p : float
        Dropout probability.
    Attributes
    ----------
    fc : nn.Linear
        The First linear layer.
    act : nn.GELU
        GELU activation function.
    fc2 : nn.Linear
        The second linear layer.
    drop : nn.Dropout
        Dropout layer.
    """
    def __init__(self, in_features, hidden_features, out_features, p=0.):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(p)

    def forward(self, x):
        """Run forward pass.
        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, n_patches + 1, in_features)`.
        Returns
        -------
        torch.Tensor
            Shape `(n_samples, n_patches +1, out_features)`
        """
        x = self.fc1(
                x
        ) # (n_samples, n_patches + 1, hidden_features)
        x = self.act(x)  # (n_samples, n_patches + 1, hidden_features)
        x = self.drop(x)  # (n_samples, n_patches + 1, hidden_features)
        x = self.fc2(x)  # (n_samples, n_patches + 1, out_features)
        x = self.drop(x)  # (n_samples, n_patches + 1, out_features)

        return x


class Block(nn.Module):
    """Transformer block.
    Parameters
    ----------
    dim : int
        Embeddinig dimension.
    n_heads : int
        Number of attention heads.
    mlp_ratio : float
        Determines the hidden dimension size of the `MLP` module with respect
        to `dim`.
    qkv_bias : bool
        If True then we include bias to the query, key and value projections.
    p, attn_p : float
        Dropout probability.
    Attributes
    ----------
    norm1, norm2 : LayerNorm
        Layer normalization.
    attn : Attention
        Attention module.
    mlp : MLP
        MLP module.
    """
    def __init__(self, dim, n_heads, mlp_ratio=4.0, qkv_bias=True, p=0., attn_p=0.):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim, eps=1e-6)
        self.attn = Attention(
                dim,
                n_heads=n_heads,
                qkv_bias=qkv_bias,
                attn_p=attn_p,
                proj_p=p
        )
        self.norm2 = nn.LayerNorm(dim, eps=1e-6)
        hidden_features = int(dim * mlp_ratio)
        self.mlp = MLP(
                in_features=dim,
                hidden_features=hidden_features,
                out_features=dim,
        )

    def forward(self, x):
        """Run forward pass.
        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, n_patches + 1, dim)`.
        Returns
        -------
        torch.Tensor
            Shape `(n_samples, n_patches + 1, dim)`.
        """
        x = x + self.attn(self.norm1(x))
        x = x + self.mlp(self.norm2(x))

        return x


class VisionTransformer(nn.Module):
    """Simplified implementation of the Vision transformer.
    Parameters
    ----------
    img_size : int
        Both height and the width of the image (it is a square).
    patch_size : int
        Both height and the width of the patch (it is a square).
    in_chans : int
        Number of input channels.
    n_classes : int
        Number of classes.
    embed_dim : int
        Dimensionality of the token/patch embeddings.
    depth : int
        Number of blocks.
    n_heads : int
        Number of attention heads.
    mlp_ratio : float
        Determines the hidden dimension of the `MLP` module.
    qkv_bias : bool
        If True then we include bias to the query, key and value projections.
    p, attn_p : float
        Dropout probability.
    Attributes
    ----------
    patch_embed : PatchEmbed
        Instance of `PatchEmbed` layer.
    cls_token : nn.Parameter
        Learnable parameter that will represent the first token in the sequence.
        It has `embed_dim` elements.
    pos_emb : nn.Parameter
        Positional embedding of the cls token + all the patches.
        It has `(n_patches + 1) * embed_dim` elements.
    pos_drop : nn.Dropout
        Dropout layer.
    blocks : nn.ModuleList
        List of `Block` modules.
    norm : nn.LayerNorm
        Layer normalization.
    """
    def __init__(
            self,
            img_size=224,
            patch_size=16,
            in_chans=3,
            n_classes=128,
            embed_dim=768,
            depth=12,
            n_heads=12,
            mlp_ratio=4.,
            qkv_bias=True,
            p=0.,
            attn_p=0.,
    ):
        super().__init__()

        self.patch_embed = PatchEmbed(
                img_size=img_size,
                patch_size=patch_size,
                in_chans=in_chans,
                embed_dim=embed_dim,
        )
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(
                torch.zeros(1, 1 + self.patch_embed.n_patches, embed_dim)
        )
        self.pos_drop = nn.Dropout(p=p)

        self.blocks = nn.ModuleList(
            [
                Block(
                    dim=embed_dim,
                    n_heads=n_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    p=p,
                    attn_p=attn_p,
                )
                for _ in range(depth)
            ]
        )

        self.norm = nn.LayerNorm(embed_dim, eps=1e-6)
        self.head = nn.Linear(embed_dim, n_classes)


    def forward(self, x):
        """Run the forward pass.
        Parameters
        ----------
        x : torch.Tensor
            Shape `(n_samples, in_chans, img_size, img_size)`.
        Returns
        -------
        logits : torch.Tensor
            Logits over all the classes - `(n_samples, n_classes)`.
        """
        n_samples = x.shape[0]
        x = self.patch_embed(x)

        cls_token = self.cls_token.expand(
                n_samples, -1, -1
        )  # (n_samples, 1, embed_dim)
        x = torch.cat((cls_token, x), dim=1)  # (n_samples, 1 + n_patches, embed_dim)
        x = x + self.pos_embed  # (n_samples, 1 + n_patches, embed_dim)
        x = self.pos_drop(x)

        for block in self.blocks:
            x = block(x)

        x = self.norm(x)

        cls_token_final = x[:, 0]  # just the CLS token
        x = self.head(cls_token_final)

        return x

In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [14]:
model = VisionTransformer()
utils.init_weights(model, init_type='uniform')
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas = (0.9, 0.999), eps = 1e-9, weight_decay = 1e-5)

if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
  
    model = nn.DataParallel(model, device_ids=[0,1,2,3])

model.to(device)

Let's use 7 GPUs!


DataParallel(
  (module): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0): Block(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): MLP(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (act): GELU()
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
      )
      (1): Block(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affin

In [15]:
train_loss_list = []
valid_loss_list = []
train_acc_list = []
valid_acc_list = []

for epoch in range(n_epochs):
    
    running_loss = 0.0
    train_correct = 0
    train_total = 0
    running_val_loss = 0.0
    valid_correct = 0
    valid_total = 0
    model.train()
    
    for idx, (x, y) in tqdm(enumerate(train_loader)):
        
        x, y = x.to(device), y.to(device)
        
        optimizer.zero_grad()
        
        output = model(x)
        
        loss = criterion(output, y)
        
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        
        _, predicted = torch.max(output.data, 1)
        train_total += y.size(0)
        train_correct += (predicted == y).sum().item()
        
    with torch.no_grad():
        model.eval()

        for idx, (x, y) in tqdm(enumerate(valid_loader)):

            x, y = x.to(device), y.to(device)

            valid_output = model(x)

            valid_loss = criterion(valid_output, y)

            running_val_loss += valid_loss.item()

            _, predicted = torch.max(valid_output.data, 1)
            valid_total += y.size(0)
            valid_correct += (predicted == y).sum().item()
            
    torch.save({
        'epoch' : epoch,
        'model_state_dict' : model.state_dict(),
        'optimizer_state_dict' : optimizer.state_dict(),
        'loss' : loss
    }, 'model/VIT/{}-final_model.pt'.format(epoch+1))


            
        
    print('Epoch {}/{}, Train_Acc: {:.3f}, Train_Loss : {:.6f}, valid_Acc : {:.3f}, Valid_Loss : {:.6f}'.format(epoch+1,n_epochs, 
                                                                                                              train_correct/train_total,
                                                                                                              running_loss / len(train_loader),
                                                                                                             valid_correct/valid_total,
                                                                                                              running_val_loss / len(valid_loader)
                                                                                                             ))
    
    train_loss_list.append(running_loss / len(train_loader))
    valid_loss_list.append(running_val_loss / len(valid_loader))
    train_acc_list.append(train_correct/train_total)
    valid_acc_list.append(valid_correct/valid_total)
    
        

2072it [2:11:46,  3.82s/it]
259it [14:06,  3.27s/it]


Epoch 1/100, Train_Acc: 0.220, Train_Loss : 3.507063, valid_Acc : 0.348, Valid_Loss : 2.808020


2072it [1:30:58,  2.63s/it]
259it [14:13,  3.30s/it]


Epoch 2/100, Train_Acc: 0.411, Train_Loss : 2.503517, valid_Acc : 0.455, Valid_Loss : 2.283995


2072it [1:24:55,  2.46s/it]
259it [12:33,  2.91s/it]


Epoch 3/100, Train_Acc: 0.496, Train_Loss : 2.093021, valid_Acc : 0.526, Valid_Loss : 1.960181


2072it [1:18:37,  2.28s/it]
259it [12:36,  2.92s/it]


Epoch 4/100, Train_Acc: 0.555, Train_Loss : 1.816068, valid_Acc : 0.581, Valid_Loss : 1.715572


2072it [1:19:21,  2.30s/it]
259it [12:32,  2.91s/it]


Epoch 5/100, Train_Acc: 0.598, Train_Loss : 1.615415, valid_Acc : 0.610, Valid_Loss : 1.562666


2072it [1:17:20,  2.24s/it]
259it [12:57,  3.00s/it]


Epoch 6/100, Train_Acc: 0.632, Train_Loss : 1.466292, valid_Acc : 0.638, Valid_Loss : 1.438176


2072it [1:17:31,  2.24s/it]
259it [12:42,  2.94s/it]


Epoch 7/100, Train_Acc: 0.660, Train_Loss : 1.340249, valid_Acc : 0.664, Valid_Loss : 1.320112


2072it [1:19:02,  2.29s/it]
259it [12:16,  2.84s/it]


Epoch 8/100, Train_Acc: 0.683, Train_Loss : 1.239073, valid_Acc : 0.687, Valid_Loss : 1.236773


2072it [1:16:54,  2.23s/it]
259it [12:44,  2.95s/it]


Epoch 9/100, Train_Acc: 0.703, Train_Loss : 1.149877, valid_Acc : 0.704, Valid_Loss : 1.152738


2072it [1:18:50,  2.28s/it]
259it [12:27,  2.89s/it]


Epoch 10/100, Train_Acc: 0.723, Train_Loss : 1.068677, valid_Acc : 0.717, Valid_Loss : 1.099785


2072it [1:18:38,  2.28s/it]
259it [12:41,  2.94s/it]


Epoch 11/100, Train_Acc: 0.738, Train_Loss : 1.002810, valid_Acc : 0.735, Valid_Loss : 1.029935


2072it [1:17:44,  2.25s/it]
259it [12:49,  2.97s/it]


Epoch 12/100, Train_Acc: 0.753, Train_Loss : 0.939569, valid_Acc : 0.749, Valid_Loss : 0.964482


2072it [1:19:31,  2.30s/it]
259it [12:36,  2.92s/it]


Epoch 13/100, Train_Acc: 0.767, Train_Loss : 0.884984, valid_Acc : 0.752, Valid_Loss : 0.943816


2072it [1:18:42,  2.28s/it]
259it [12:50,  2.98s/it]


Epoch 14/100, Train_Acc: 0.779, Train_Loss : 0.833190, valid_Acc : 0.772, Valid_Loss : 0.872506


2072it [1:19:11,  2.29s/it]
259it [12:56,  3.00s/it]


Epoch 15/100, Train_Acc: 0.790, Train_Loss : 0.787371, valid_Acc : 0.782, Valid_Loss : 0.827871


2072it [1:19:38,  2.31s/it]
259it [13:05,  3.03s/it]


Epoch 16/100, Train_Acc: 0.800, Train_Loss : 0.746051, valid_Acc : 0.792, Valid_Loss : 0.789144


2072it [1:19:06,  2.29s/it]
259it [12:48,  2.97s/it]


Epoch 17/100, Train_Acc: 0.808, Train_Loss : 0.708490, valid_Acc : 0.794, Valid_Loss : 0.777694


2072it [1:19:30,  2.30s/it]
259it [12:38,  2.93s/it]


Epoch 18/100, Train_Acc: 0.818, Train_Loss : 0.672801, valid_Acc : 0.807, Valid_Loss : 0.726263


2072it [1:18:40,  2.28s/it]
259it [13:42,  3.18s/it]


Epoch 19/100, Train_Acc: 0.825, Train_Loss : 0.644743, valid_Acc : 0.811, Valid_Loss : 0.709844


2072it [1:18:42,  2.28s/it]
259it [12:50,  2.98s/it]


Epoch 20/100, Train_Acc: 0.832, Train_Loss : 0.614087, valid_Acc : 0.815, Valid_Loss : 0.683044


2072it [1:18:52,  2.28s/it]
259it [12:57,  3.00s/it]


Epoch 21/100, Train_Acc: 0.839, Train_Loss : 0.589807, valid_Acc : 0.819, Valid_Loss : 0.672044


2072it [1:19:53,  2.31s/it]
259it [12:15,  2.84s/it]


Epoch 22/100, Train_Acc: 0.844, Train_Loss : 0.566814, valid_Acc : 0.828, Valid_Loss : 0.643351


2072it [1:18:52,  2.28s/it]
259it [12:37,  2.92s/it]


Epoch 23/100, Train_Acc: 0.850, Train_Loss : 0.544925, valid_Acc : 0.837, Valid_Loss : 0.605113


2072it [1:18:29,  2.27s/it]
259it [13:06,  3.04s/it]


Epoch 24/100, Train_Acc: 0.855, Train_Loss : 0.525756, valid_Acc : 0.839, Valid_Loss : 0.597744


2072it [1:19:42,  2.31s/it]
259it [12:48,  2.97s/it]


Epoch 25/100, Train_Acc: 0.860, Train_Loss : 0.507109, valid_Acc : 0.847, Valid_Loss : 0.569125


2072it [1:19:10,  2.29s/it]
259it [12:55,  2.99s/it]


Epoch 26/100, Train_Acc: 0.865, Train_Loss : 0.490388, valid_Acc : 0.848, Valid_Loss : 0.563914


2072it [1:18:50,  2.28s/it]
259it [12:44,  2.95s/it]


Epoch 27/100, Train_Acc: 0.869, Train_Loss : 0.472268, valid_Acc : 0.848, Valid_Loss : 0.566407


2072it [1:18:38,  2.28s/it]
259it [12:33,  2.91s/it]


Epoch 28/100, Train_Acc: 0.872, Train_Loss : 0.460055, valid_Acc : 0.851, Valid_Loss : 0.553807


2072it [1:18:50,  2.28s/it]
259it [12:57,  3.00s/it]


Epoch 29/100, Train_Acc: 0.876, Train_Loss : 0.446706, valid_Acc : 0.860, Valid_Loss : 0.524070


2072it [1:19:09,  2.29s/it]
259it [12:47,  2.96s/it]


Epoch 30/100, Train_Acc: 0.879, Train_Loss : 0.434358, valid_Acc : 0.861, Valid_Loss : 0.514242


2072it [1:18:09,  2.26s/it]
259it [12:33,  2.91s/it]


Epoch 31/100, Train_Acc: 0.882, Train_Loss : 0.424555, valid_Acc : 0.860, Valid_Loss : 0.509479


2072it [1:18:56,  2.29s/it]
259it [12:37,  2.92s/it]


Epoch 32/100, Train_Acc: 0.885, Train_Loss : 0.410279, valid_Acc : 0.867, Valid_Loss : 0.492459


2072it [1:18:58,  2.29s/it]
259it [12:50,  2.98s/it]


Epoch 33/100, Train_Acc: 0.888, Train_Loss : 0.400133, valid_Acc : 0.861, Valid_Loss : 0.512231


2072it [1:18:38,  2.28s/it]
259it [12:54,  2.99s/it]


Epoch 34/100, Train_Acc: 0.891, Train_Loss : 0.389751, valid_Acc : 0.871, Valid_Loss : 0.478950


2072it [1:17:51,  2.25s/it]
259it [12:21,  2.86s/it]


Epoch 35/100, Train_Acc: 0.893, Train_Loss : 0.382640, valid_Acc : 0.874, Valid_Loss : 0.467222


2072it [1:18:44,  2.28s/it]
259it [12:56,  3.00s/it]


Epoch 36/100, Train_Acc: 0.895, Train_Loss : 0.372802, valid_Acc : 0.877, Valid_Loss : 0.458631


2072it [1:20:26,  2.33s/it]
259it [12:38,  2.93s/it]


Epoch 37/100, Train_Acc: 0.897, Train_Loss : 0.363615, valid_Acc : 0.877, Valid_Loss : 0.451089


2072it [1:19:18,  2.30s/it]
259it [12:34,  2.91s/it]


Epoch 38/100, Train_Acc: 0.899, Train_Loss : 0.357281, valid_Acc : 0.882, Valid_Loss : 0.437915


2072it [1:17:43,  2.25s/it]
259it [13:26,  3.11s/it]


Epoch 39/100, Train_Acc: 0.901, Train_Loss : 0.351733, valid_Acc : 0.880, Valid_Loss : 0.441553


2072it [1:19:59,  2.32s/it]
259it [12:26,  2.88s/it]


Epoch 40/100, Train_Acc: 0.903, Train_Loss : 0.343244, valid_Acc : 0.886, Valid_Loss : 0.426929


2072it [1:19:14,  2.29s/it]
259it [12:36,  2.92s/it]


Epoch 41/100, Train_Acc: 0.905, Train_Loss : 0.337634, valid_Acc : 0.888, Valid_Loss : 0.412795


2072it [1:19:20,  2.30s/it]
259it [12:42,  2.94s/it]


Epoch 42/100, Train_Acc: 0.907, Train_Loss : 0.330621, valid_Acc : 0.887, Valid_Loss : 0.417096


2072it [1:19:53,  2.31s/it]
259it [12:34,  2.91s/it]


Epoch 43/100, Train_Acc: 0.908, Train_Loss : 0.324051, valid_Acc : 0.888, Valid_Loss : 0.413570


2072it [1:21:22,  2.36s/it]
259it [14:17,  3.31s/it]


Epoch 44/100, Train_Acc: 0.910, Train_Loss : 0.318004, valid_Acc : 0.889, Valid_Loss : 0.412441


2072it [1:42:01,  2.95s/it]
259it [14:21,  3.32s/it]


Epoch 45/100, Train_Acc: 0.911, Train_Loss : 0.314342, valid_Acc : 0.895, Valid_Loss : 0.392172


2072it [1:25:51,  2.49s/it]
259it [16:03,  3.72s/it]


Epoch 46/100, Train_Acc: 0.912, Train_Loss : 0.309514, valid_Acc : 0.893, Valid_Loss : 0.397440


2072it [1:43:19,  2.99s/it]
259it [15:02,  3.48s/it]


Epoch 47/100, Train_Acc: 0.914, Train_Loss : 0.302593, valid_Acc : 0.892, Valid_Loss : 0.395625


2072it [1:46:08,  3.07s/it]
259it [14:14,  3.30s/it]


Epoch 48/100, Train_Acc: 0.915, Train_Loss : 0.299677, valid_Acc : 0.894, Valid_Loss : 0.389698


2072it [1:39:10,  2.87s/it]
259it [14:19,  3.32s/it]


Epoch 49/100, Train_Acc: 0.916, Train_Loss : 0.293704, valid_Acc : 0.900, Valid_Loss : 0.371903


2072it [1:33:03,  2.69s/it]
259it [12:56,  3.00s/it]


Epoch 50/100, Train_Acc: 0.917, Train_Loss : 0.291600, valid_Acc : 0.901, Valid_Loss : 0.363440


2072it [1:19:21,  2.30s/it]
259it [13:00,  3.01s/it]


Epoch 51/100, Train_Acc: 0.918, Train_Loss : 0.287390, valid_Acc : 0.897, Valid_Loss : 0.377294


2072it [1:20:55,  2.34s/it]
259it [12:22,  2.86s/it]


Epoch 52/100, Train_Acc: 0.919, Train_Loss : 0.283322, valid_Acc : 0.900, Valid_Loss : 0.372095


2072it [1:20:20,  2.33s/it]
259it [13:16,  3.08s/it]


Epoch 53/100, Train_Acc: 0.920, Train_Loss : 0.279771, valid_Acc : 0.899, Valid_Loss : 0.372776


2072it [1:19:09,  2.29s/it]
259it [12:50,  2.98s/it]


Epoch 54/100, Train_Acc: 0.921, Train_Loss : 0.275996, valid_Acc : 0.902, Valid_Loss : 0.361864


2072it [1:19:29,  2.30s/it]
259it [12:06,  2.81s/it]


Epoch 55/100, Train_Acc: 0.923, Train_Loss : 0.270951, valid_Acc : 0.901, Valid_Loss : 0.367319


2072it [1:18:56,  2.29s/it]
259it [12:43,  2.95s/it]


Epoch 56/100, Train_Acc: 0.922, Train_Loss : 0.272765, valid_Acc : 0.905, Valid_Loss : 0.352596


2072it [1:19:43,  2.31s/it]
259it [12:41,  2.94s/it]


Epoch 57/100, Train_Acc: 0.923, Train_Loss : 0.266902, valid_Acc : 0.907, Valid_Loss : 0.345640


1462it [56:04,  2.30s/it]


KeyboardInterrupt: 