In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

from torchvision import datasets, transforms
from torchvision.transforms import ToTensor
from torchvision.io import read_image
from torchvision.datasets.mnist import MNIST

import numpy as np
import os
import pandas as pd

import matplotlib.pyplot as plt

from collections import Counter




def imshow(image, ax=None, title=None, normalize=True):
    """Draw a channels-first image tensor on a matplotlib axis.

    Parameters
    ----------
    image : tensor exposing ``.numpy()``
        Rank-3 image; axis 0 is moved to the end before plotting.
    ax : matplotlib axis, optional
        Axis to draw on. A new figure/axis pair is created when omitted.
    title : optional
        Accepted for API compatibility; not used by the body.
    normalize : bool
        When True, undo a per-channel standardisation (``std * image + mean``)
        and clip the result to [0, 1].

    Returns
    -------
    The axis that was drawn on.
    """
    if ax is None:
        _, ax = plt.subplots()

    # matplotlib expects channels last
    pixels = image.numpy().transpose((1, 2, 0))

    if normalize:
        # presumably the ImageNet channel statistics -- confirm against the transform used
        channel_mean = np.array([0.485, 0.456, 0.406])
        channel_std = np.array([0.229, 0.224, 0.225])
        pixels = np.clip(channel_std * pixels + channel_mean, 0, 1)

    ax.imshow(pixels)

    # Strip the frame, tick marks and tick labels so only the picture remains
    for side in ('top', 'right', 'left', 'bottom'):
        ax.spines[side].set_visible(False)
    ax.tick_params(axis='both', length=0)
    ax.set_xticklabels('')
    ax.set_yticklabels('')

    return ax

In [None]:
# Preprocessing: resize, centre-crop to 154x154, then convert to a tensor
transform = transforms.Compose([
    transforms.Resize(154),
    transforms.CenterCrop(154),
    transforms.ToTensor(),
])
dataset = datasets.ImageFolder(
    root='../input/transformer1dataset/birds/birds',
    transform=transform,
)

# Random 80/20 train/test split; the test part absorbs the rounding remainder
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, lengths=[train_size, test_size]
)

In [None]:
# Shuffled loader over the *whole* dataset (not the train split), in batches of 32
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [None]:
# Sanity check for the data loader: pull one batch and display its first image.
images, labels = next(iter(dataloader))
# normalize=False: the transform pipeline above applies no Normalize step,
# so there is no standardisation to undo before plotting.
imshow(images[0], normalize=False)

In [None]:
# Class index of every sample that landed in the training split
train_classes = [dataset.targets[sample_idx] for sample_idx in train_dataset.indices]
# Samples per class in the training split (displayed as the cell output).
# If this doesn't work: Counter(i.item() for i in train_classes)
Counter(train_classes)

In [None]:
    # Training loop (scratch cell).
    # Expects `model`, `train_loader`, `test_loader`, `N_EPOCHS` and `LR` to already
    # exist in the notebook namespace, with the model and the batches on the same device.
    optimizer = Adam(model.parameters(), lr = LR)
    criterion = CrossEntropyLoss()
    model.train()
    for epoch in range(N_EPOCHS):
        train_loss = 0.0
        for batch in train_loader:
            x, y = batch
            y_hat = model(x)
            # CrossEntropyLoss() uses its default reduction, which already averages
            # over the batch, so the loss is back-propagated without an extra
            # division by len(x).
            loss = criterion(y_hat, y)

            # Epoch loss = mean of the per-batch mean losses
            train_loss += loss.item() / len(train_loader)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch + 1}/{N_EPOCHS} loss: {train_loss: .2f}")

    # Test loop
    model.eval()
    correct, total = 0, 0
    test_loss = 0.0
    # No gradients are needed for evaluation; .item() keeps the running loss a plain
    # float instead of a tensor that drags the autograd graph along.
    with torch.no_grad():
        for batch in test_loader:
            x, y = batch
            y_hat = model(x)
            test_loss += criterion(y_hat, y).item() / len(test_loader)

            correct += torch.sum(torch.argmax(y_hat, dim=1) == y).item()
            total += len(x)
    print(f"Test loss: {test_loss:.2f}")
    print(f"Test accuracy: {correct/total*100:.2f}%")

In [2]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader``.

    Each batch is moved to the module-level ``device``, the loss is computed and
    back-propagated, and the optimizer takes one step. The loss of every 100th
    batch (starting with the first) is printed with a running sample count.

    Parameters
    ----------
    dataloader : iterable of ``(inputs, targets)`` batches exposing ``.dataset``
    model : module to train (switched to training mode)
    loss_fn : callable ``(predictions, targets) -> scalar loss``
    optimizer : optimizer bound to the model parameters
    """
    n_samples = len(dataloader.dataset)
    model.train()

    for step, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass and prediction error
        batch_loss = loss_fn(model(inputs), targets)

        # Backward pass and parameter update
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        if step % 100 != 0:
            continue
        loss_value = batch_loss.item()
        seen = step * len(inputs)
        print(f"loss: {loss_value:>7f} [{seen:>5d}/{n_samples:>5d}]")
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    

In [9]:
def main():
    """Load the bird images, train a BumbleBird classifier and evaluate it each epoch.

    Relies on names defined elsewhere in the notebook: ``device``, ``BumbleBird``,
    ``train`` and ``test``. The trained model stays local and is not returned.
    """
    # Data: resize + centre-crop to 154x154, then a random 80/20 split
    preprocessing = transforms.Compose([
        transforms.Resize(154),
        transforms.CenterCrop(154),
        transforms.ToTensor(),
    ])
    dataset = datasets.ImageFolder('../input/transformer1dataset/birds/birds', transform=preprocessing)

    n_train = int(0.8 * len(dataset))
    n_test = len(dataset) - n_train
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [n_train, n_test])

    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=9075)
    test_loader = DataLoader(test_dataset, shuffle=False, batch_size=9075)

    # Model and training options
    model = BumbleBird((3,154,154), n_patches=7, hidden_d=4, n_heads=2, out_d=400).to(device)
    n_epochs = 5
    learning_rate = 0.01

    # Training: one train pass followed by one evaluation pass per epoch
    optimizer = Adam(model.parameters(), lr=learning_rate)
    criterion = CrossEntropyLoss()
    for epoch_idx in range(n_epochs):
        print(f"Epoch {epoch_idx+1}\n-----------------------------------")
        train(train_loader, model, criterion, optimizer)
        test(test_loader, model, criterion)
    print ("Done!")

def get_positional_embeddings(sequence_length, d):
    """Build the sinusoidal positional-embedding table.

    For position ``i`` and column ``j``::

        even j:  sin(i / 10000 ** (j / d))
        odd  j:  cos(i / 10000 ** ((j - 1) / d))

    Parameters
    ----------
    sequence_length : int
        Number of positions (rows).
    d : int
        Embedding width (columns).

    Returns
    -------
    torch.Tensor
        Tensor of shape ``(sequence_length, d)`` placed on the module-level ``device``.
    """
    # Column vector of positions so it broadcasts against the per-column frequencies
    positions = torch.arange(sequence_length, dtype=torch.get_default_dtype()).unsqueeze(1)
    dims = torch.arange(d)
    is_even = dims % 2 == 0

    # Odd columns reuse the exponent of the even column just before them (j - 1)
    exponents = torch.where(is_even, dims, dims - 1).to(positions.dtype) / d
    angles = positions / torch.pow(10000.0, exponents)

    # sin on even columns, cos on odd columns. The previous scalar version wrapped
    # the whole conditional in np.sin(...), so odd columns came out as sin(cos(.)).
    result = torch.where(is_even, torch.sin(angles), torch.cos(angles))
    return result.to(device)

In [10]:
# Pick the compute device once; train(), test(), the model code and
# get_positional_embeddings() all read this module-level name.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

# Train and evaluate the model
main()

In [None]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n") 
    
    
# Predictions
# NOTE(review): `model` below is freshly constructed, so it carries randomly
# initialised weights -- main() keeps its trained model local and nothing is
# loaded here. Load trained weights (or reuse the trained model) before trusting
# the numbers written to output.csv.
transform = transforms.Compose([transforms.Resize(154),
                                transforms.CenterCrop(154),
                                transforms.ToTensor()])
pred_dataset = datasets.ImageFolder('../input/transformer1dataset/submission_test', transform=transform)

# shuffle=False keeps the rows of output.csv in the dataset's own order
pred_loader = DataLoader(pred_dataset, shuffle=False, batch_size=16)
model = BumbleBird((3,154,154), n_patches = 7, hidden_d=4, n_heads=2, out_d=400).to(device)
model.eval()

# Collect every batch; previously `arr` was overwritten on each iteration,
# so only the last batch reached the file.
batch_outputs = []
with torch.no_grad():
    for X, _ in pred_loader:  # folder-derived labels are not needed for inference
        prediction = model(X.to(device))
        batch_outputs.append(prediction.cpu())
arr = torch.cat(batch_outputs).numpy()

# write CSV (one row per image, one column per class score)
np.savetxt('output.csv', arr, delimiter=',')


In [3]:
#Multi-head Self Attention (MSA)
class MSAtron(nn.Module):
    """Multi-head self-attention over a batch of token sequences.

    The feature dimension ``d`` is split into ``n_heads`` contiguous chunks of
    width ``d // n_heads``. Each head applies its own query/key/value linear map
    to its chunk, computes scaled dot-product attention, and the head outputs
    are concatenated back to width ``d``.

    Parameters
    ----------
    d : int
        Token feature width; must be divisible by ``n_heads``.
    n_heads : int
        Number of attention heads.

    Raises
    ------
    AssertionError
        If ``d`` is not divisible by ``n_heads``.
    """
    def __init__(self, d, n_heads=2):
        super(MSAtron, self).__init__()
        self.d = d
        self.n_heads = n_heads

        assert d%n_heads == 0, f"Can't divide dimension {d} into {n_heads} heads"

        d_head = d // n_heads
        # nn.ModuleList registers the per-head layers as sub-modules, so they show up
        # in .parameters() (and therefore get optimised), follow .to(...) and are
        # saved in state_dict(). Plain Python lists provide none of that, which is
        # also why no explicit device placement is needed here any more.
        self.q_mappings = nn.ModuleList([nn.Linear(d_head, d_head) for _ in range(self.n_heads)])
        self.k_mappings = nn.ModuleList([nn.Linear(d_head, d_head) for _ in range(self.n_heads)])
        self.v_mappings = nn.ModuleList([nn.Linear(d_head, d_head) for _ in range(self.n_heads)])
        self.d_head = d_head
        # Normalise attention scores over the key axis (the last one)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, sequences):
        """Apply self-attention.

        Parameters
        ----------
        sequences : torch.Tensor of shape ``(batch, tokens, d)``

        Returns
        -------
        torch.Tensor of shape ``(batch, tokens, d)``
        """
        head_outputs = []
        for head in range(self.n_heads):
            # Feature chunk owned by this head, for the whole batch at once
            chunk = sequences[..., head*self.d_head:(head+1)*self.d_head]
            q = self.q_mappings[head](chunk)
            k = self.k_mappings[head](chunk)
            v = self.v_mappings[head](chunk)

            attention = self.softmax(q @ k.transpose(-2, -1) / (self.d_head**0.5))
            head_outputs.append(attention @ v)
        # Heads are laid side by side along the feature axis
        return torch.cat(head_outputs, dim=-1)

# Model
class BumbleBird(nn.Module):
    """Vision-Transformer-style image classifier with a single encoder block.

    Pipeline: split the image into ``n_patches x n_patches`` patches, flatten and
    linearly embed each patch, prepend a learnable classification token, add
    positional embeddings, run one encoder block (LayerNorm -> multi-head
    self-attention -> residual, LayerNorm -> MLP -> residual) and classify from
    the classification token.

    Parameters
    ----------
    input_shape : tuple ``(channels, height, width)``
    n_patches : int
        Patches per side; height and width must be divisible by it.
    hidden_d : int
        Token embedding width.
    n_heads : int
        Attention heads passed to ``MSAtron``.
    out_d : int
        Number of output classes.

    Raises
    ------
    AssertionError
        If height or width is not divisible by ``n_patches``.
    """
    def __init__(self, input_shape, n_patches=7, hidden_d=8, n_heads=2, out_d=10):
        #Super constructor
        super(BumbleBird, self).__init__()

        channels, height, width = input_shape
        assert height % n_patches == 0 and width % n_patches == 0, \
            f"Input shape {input_shape} can't be divided into {n_patches}x{n_patches} equal patches"

        #Input and patches sizes
        self.input_shape = input_shape
        self.n_patches = n_patches
        self.patch_size = (height // n_patches, width // n_patches)
        self.input_d = channels * self.patch_size[0] * self.patch_size[1]
        self.hidden_d = hidden_d

        #Linear mapper: flattened patch -> token
        self.linear_mapper = nn.Linear(self.input_d, self.hidden_d)

        #Classification token (learned, shared by every image)
        self.class_token = nn.Parameter(torch.rand(1, self.hidden_d))

        #Layer normalization 1 (normalises over the token and feature axes together)
        self.ln1 = nn.LayerNorm((self.n_patches**2+1, self.hidden_d))

        #Multihead Self Attention
        self.msa = MSAtron(self.hidden_d,n_heads)

        #Layer normalization 2
        self.ln2 = nn.LayerNorm((self.n_patches**2+1, self.hidden_d))

        #Encoder MLP
        self.enc_mlp = nn.Sequential(
            nn.Linear(self.hidden_d, self.hidden_d),
            nn.ReLU()
        )

        #Classification MLP
        # NOTE(review): this notebook trains with nn.CrossEntropyLoss, which expects
        # raw logits; feeding it these probabilities applies softmax twice. The
        # Softmax is kept because callers currently receive probabilities --
        # consider returning logits during training.
        self.mlp = nn.Sequential(
            nn.Linear(self.hidden_d, out_d),
            nn.Softmax(dim=-1)
        )

    def _patchify(self, images):
        """Cut ``(n, c, h, w)`` images into flattened spatial patches.

        Returns a tensor of shape ``(n, n_patches**2, c * patch_h * patch_w)`` with
        patches ordered row by row. A bare ``reshape`` of the image would only chop
        the flat pixel buffer into equal runs, which are not image patches.
        """
        n, c = images.shape[0], images.shape[1]
        patch_h, patch_w = self.patch_size
        grid = self.n_patches
        # (n, c, grid_row, patch_h, grid_col, patch_w)
        patches = images.reshape(n, c, grid, patch_h, grid, patch_w)
        # -> (n, grid_row, grid_col, c, patch_h, patch_w)
        patches = patches.permute(0, 2, 4, 1, 3, 5)
        return patches.reshape(n, grid * grid, c * patch_h * patch_w)

    def forward(self, images):
        """Classify a batch of images.

        Parameters
        ----------
        images : torch.Tensor of shape ``(n, channels, height, width)``

        Returns
        -------
        torch.Tensor of shape ``(n, out_d)`` holding per-class probabilities.
        """
        n = images.shape[0]

        #Divide the image into patches
        patches = self._patchify(images)

        #Run linear layer for tokenization
        tokens = self.linear_mapper(patches)

        #Adding a classification token in front of every sequence
        class_tokens = self.class_token.unsqueeze(0).expand(n, -1, -1)
        tokens = torch.cat((class_tokens, tokens), dim=1)

        #Positional embedding: broadcast over the batch and forced onto the tokens'
        #device so the model works wherever it has been moved
        positions = get_positional_embeddings(self.n_patches**2+1, self.hidden_d)
        tokens = tokens + positions.to(tokens.device)

        ####################### Transformer Encoder
        # Running Layer Normalization, MSA and residual connection
        out = tokens + self.msa(self.ln1(tokens))

        #Running Layer Normalization, MLP  and residual connection
        out = out + self.enc_mlp(self.ln2(out))
        ####################### End transformer

        #Getting the classification token only
        out = out[:,0]

        return self.mlp(out)

In [None]:
# Smoke test: push a random batch through the model and check the output shape.
# The model and the input are placed on `device` because MSAtron's per-head layers
# and the positional embeddings are created on `device`; leaving the rest on the
# CPU would mix devices when CUDA is available.
model = BumbleBird(
    input_shape = (3,154,154),
    n_patches = 11,
    hidden_d=8,
    n_heads=2,
    out_d=10
).to(device)
x = torch.rand(16,3,154,154, device=device)
print(model(x).shape)

In [None]:
# Run the full train/evaluate pipeline again (main() is also called in the cell that
# sets `device`); `device`, BumbleBird, train and test must already be defined.
main()

In [11]:
#Imports
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch.nn import CrossEntropyLoss

from torchvision import datasets, transforms
from torchvision.transforms import ToTensor
from torchvision.io import read_image
from torchvision.datasets.mnist import MNIST

import numpy as np
import os
import pandas as pd

import matplotlib.pyplot as plt

from collections import Counter

In [None]:
# Now for real: the modules
################## Patch Embedding
class PatchEmbed(nn.Module):
    """Split an image into patches and embed each one.

    Parameters
    ----------
    img_size : int
        Side length of the (square) input image. Images must be rescaled to a
        square beforehand.

    patch_size : int
        Side length of each (square) patch. ``img_size`` must be divisible by it.

    in_chans : int
        Number of image channels, e.g. 1 for grayscale or 3 for RGB.

    embed_dim : int
        Width of the embedding produced for each patch.

    Attributes
    ----------
    n_patches : int
        Number of patches the image is divided into.

    proj : nn.Conv2d
        Convolution whose kernel size and stride both equal ``patch_size``, so
        each output position embeds exactly one patch.

    Raises
    ------
    AssertionError
        If ``img_size`` is not divisible by ``patch_size``.
    """
    def __init__(self, img_size, patch_size, in_chans=3, embed_dim=768):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size

        # The patch grid must tile the image exactly
        assert img_size%patch_size == 0, f"The size {patch_size} for the patches cant divide image size {img_size} into equal patches"
        patches_per_side = img_size // patch_size
        self.n_patches = patches_per_side**2

        # Non-overlapping convolution: one output position per patch
        self.proj = nn.Conv2d(
            in_channels=in_chans,
            out_channels=embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        """Run the forward pass.

        Parameters
        ----------
        x : torch.Tensor
            Batch of images, shape ``(n_samples, in_chans, img_size, img_size)``.

        Returns
        -------
        torch.Tensor
            Patch embeddings, shape ``(n_samples, n_patches, embed_dim)``.
        """
        # conv:      (n_samples, embed_dim, n_patches ** 0.5, n_patches ** 0.5)
        # flatten:   (n_samples, embed_dim, n_patches)
        # transpose: (n_samples, n_patches, embed_dim)
        return self.proj(x).flatten(2).transpose(1, 2)
        