In [88]:
# Get Model
#
# Both the feature extractor and the classifier are loaded from the same
# pretrained checkpoint identifier, so it is spelled out only once.

from transformers import AutoFeatureExtractor, AutoModelForImageClassification

MODEL_ID = "farleyknight-org-username/vit-base-mnist"

extractor = AutoFeatureExtractor.from_pretrained(MODEL_ID)
model = AutoModelForImageClassification.from_pretrained(MODEL_ID)

In [89]:
# Display the loaded model's module tree as the cell output.
model

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=7

In [90]:
# Get Data
#
# Builds the project's DataLoader for the 'test' split with the requested
# distortions and loads it. Later cells index the result by 'clean' and by
# each distortion name.

from Deep_Learning_Term_Paper.Data.Data import DataLoader

distortions = ['shot_noise', 'motion_blur']  # desired distortions

test_data_obj = DataLoader('test', distortions)

# TODO: duplicates were encountered in create_dataset(), so the full test
# dataset is used for now. The intended sampling call is kept for reference:
#   test_size = 1000
#   test_ratios = {'clean': 0.2, 'shot_noise': 0.4, 'motion_blur': 0.6}
#   test_dataset = test_data_obj.create_dataset(test_size, test_ratios)

test_data_dict = test_data_obj.load()

In [91]:
# First 10 entries of the clean set, then of each distortion, in that order.
images = [
    sample
    for subset in ('clean', 'shot_noise', 'motion_blur')
    for sample in test_data_dict[subset][:10]
]

In [92]:
# original dimensions
# Copy every sample dict, not just the outer list: list.copy() is shallow, so
# reassigning a sample's 'image'/'label' through `images` would otherwise
# change the entries seen through `og_images` as well.
og_images = [dict(sample) for sample in images]
og_images[0]['image'].shape

TensorShape([28, 28, 1])

In [93]:
# Convert the TensorFlow samples to PyTorch tensors laid out as
# [batch_size, channels, height, width], the input layout documented for
# https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
import torch
import torch.nn.functional as F

MODEL_INPUT_SIZE = (224, 224)  # (height, width) the images are upsampled to
MODEL_INPUT_CHANNELS = 3       # in_channels of the model's patch projection Conv2d


def to_model_input(tf_image):
    """Turn one single-channel TensorFlow image into a model-ready batch.

    Args:
        tf_image: TensorFlow tensor of shape (height, width, 1); the samples
            inspected in this notebook are (28, 28, 1).

    Returns:
        A float32 torch tensor of shape
        (1, MODEL_INPUT_CHANNELS, *MODEL_INPUT_SIZE).

    NOTE(review): the `extractor` loaded with the model is not applied here,
    so whatever rescaling/normalization it is configured with is skipped.
    Confirm that the pixel range produced by the DataLoader matches what the
    checkpoint expects.
    """
    # TensorFlow tensor -> numpy -> torch; bilinear interpolation needs floats.
    pixels = torch.tensor(tf_image.numpy(), dtype=torch.float32)

    # Move the channel dimension to the front (H, W, C -> C, H, W) without
    # hard-coding the image size, then add the batch dimension at index 0.
    batch = pixels.permute(2, 0, 1).unsqueeze(0)

    # Expand the image from its original size (28x28 here) to 224x224.
    batch = F.interpolate(batch, size=MODEL_INPUT_SIZE, mode='bilinear', align_corners=False)

    # Make the tensor have three channels instead of one by repeating it.
    return torch.cat([batch] * MODEL_INPUT_CHANNELS, dim=1)


# Build new sample dicts from the untouched originals instead of overwriting
# entries in place: the raw TensorFlow samples stay intact and this cell can
# be re-run safely.
images = [
    {
        **sample,
        'image': to_model_input(sample['image']),
        # convert the TensorFlow label tensor to a PyTorch tensor
        'label': torch.tensor(sample['label'].numpy()),
    }
    for sample in og_images
]

In [94]:
# New dimensions after conversion: [batch_size, channels, height, width].
# Shown for the first sample only.
images[0]['image'].shape

torch.Size([1, 3, 224, 224])

In [96]:
# Evaluate the model on the prepared samples: print every prediction, then the
# overall accuracy and the mean cross-entropy loss.
import torch.nn as nn

labels = list(range(10))  # class index -> label (identity mapping, 10 classes)

# The model returns per-class logits and each target is a class index, which
# is the input contract of CrossEntropyLoss. MSELoss would compare the logits
# with the raw index value.
loss_fn = nn.CrossEntropyLoss()

model.eval()  # switch the model to evaluation mode before inference

test_loss, correct = 0.0, 0
with torch.no_grad():  # gradients are not needed for evaluation
    for sample in images:
        pred = model(sample['image'])

        # The loss expects targets of shape [batch_size] holding integer class ids.
        target = sample['label'].reshape(1).long()
        predicted_index = pred.logits.argmax(-1).item()

        print(pred)
        print('Predicted Label:', labels[predicted_index], 'Actual Label', sample['label'].numpy())

        test_loss += loss_fn(pred.logits, target).item()
        correct += int(predicted_index == target.item())

# Average over the number of evaluated samples; skip the summary when there
# are none to avoid dividing by zero.
num_samples = len(images)
if num_samples:
    accuracy = correct / num_samples
    avg_loss = test_loss / num_samples
    print(f"Test Error: \n Accuracy: {(100*accuracy):>0.1f}%, Avg loss: {avg_loss:>8f} \n")

ImageClassifierOutput(loss=None, logits=tensor([[ 0.3755,  0.3209, -0.1929, -0.2370, -0.0211, -0.4150,  0.2091,  0.1212,
         -0.1696, -0.0315]]), hidden_states=None, attentions=None)
Predicted Label: 0 Actual Label 2
ImageClassifierOutput(loss=None, logits=tensor([[ 0.3573,  0.3987, -0.1424, -0.3290, -0.1062, -0.4989,  0.2738,  0.1245,
         -0.1657, -0.0062]]), hidden_states=None, attentions=None)
Predicted Label: 1 Actual Label 0
ImageClassifierOutput(loss=None, logits=tensor([[ 0.3519,  0.3689, -0.1204, -0.2553, -0.1134, -0.4423,  0.1906,  0.1455,
         -0.1682, -0.0480]]), hidden_states=None, attentions=None)
Predicted Label: 1 Actual Label 4
ImageClassifierOutput(loss=None, logits=tensor([[ 0.3140,  0.3168, -0.2192, -0.2142,  0.0875, -0.2758,  0.1640,  0.1454,
         -0.1257, -0.1512]]), hidden_states=None, attentions=None)
Predicted Label: 1 Actual Label 8
ImageClassifierOutput(loss=None, logits=tensor([[ 0.3576,  0.3348, -0.1841, -0.2325, -0.0402, -0.4236,  0.1924, 