## Initialization

In [1]:
# Name of the machine/environment the notebook runs on; it selects the base
# folder used for caches and datasets.
storageName = 'university'

# Base working folder for every supported environment.
_STORAGE_FOLDERS = {
    'paperspace': "/notebooks/",
    'colab': "/content/",
    'university': '/vol/scratch/guy/',
}

if storageName not in _STORAGE_FOLDERS:
    # Fail here with a clear message instead of a NameError on `guy_folder` later.
    raise ValueError("Unknown storageName {!r}; expected one of {}".format(
        storageName, sorted(_STORAGE_FOLDERS)))

guy_folder = _STORAGE_FOLDERS[storageName]

# NOTE: every base folder ends with '/', so this path contains "//cache".
# The string is kept exactly as before so already-populated caches keep matching.
cache_dir = guy_folder+"/cache/transformer_cache"

In [None]:
# Remove the `enum34` package from the kernel's environment without prompting (-y).
# NOTE(review): presumably needed because the enum34 backport clashes with the
# standard-library `enum` module on Python 3 — confirm before deleting this cell.
%pip uninstall -y enum34

In [2]:
# %pip install matplotlib seaborn pandas tqdm tensorboard

%cd {guy_folder}
!mkdir cache
!mkdir cache/transformer_cache
# %pip install --no-cache-dir --upgrade torch torchvision
#==1.4.0+cu100 torchvision== -f https://download.pytorch.org/whl/torch_stable.html


/content


In [None]:
%pip install wandb
%cd {guy_folder}/cache/
!git clone https://github.com/huggingface/transformers.git
%pip install ./transformers
%pip install -U nlp

%cd {guy_folder}

## Not working!!!
!setenv TRANSFORMERS_CACHE /vol/scratch/guy/cache/transformer_cache
!setenv CUDA_VISIBLE_DEVICES 0

### Imports

In [4]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torchvision.datasets import ImageNet, ImageFolder, CIFAR10, CIFAR100
from torchvision import transforms
from torchvision.models import resnet101
from transformers import AutoModel, AutoTokenizer
from transformers import AdamW
import wandb



# Sweep

In [13]:
%%writefile bert-vision.py
# Environment switch for the standalone sweep script: True selects the
# "/notebooks/" base folder, False falls back to "/content/".
paperspace = True
guy_folder = "/notebooks/" if paperspace else "/content/"

# Directory passed as `cache_dir` when loading the pretrained transformer.
cache_dir = "{}/cache/transformer_cache".format(guy_folder)
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch import nn
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torchvision.datasets import ImageNet, ImageFolder, CIFAR10, CIFAR100
from torchvision import transforms
from torchvision.models import resnet101, resnet50
from transformers import AutoModel, AutoTokenizer
from transformers import AdamW
import wandb


class DummyLayer(nn.Module):
    """Identity module: returns its input untouched and ignores everything else."""

    def __init__(self, *args, **kwargs):
        # Constructor arguments are accepted for call-compatibility and discarded.
        super().__init__()

    def forward(self, x, *args, **kwargs):
        """Return `x` as-is; extra positional/keyword arguments are ignored."""
        return x

class PlainBERT(nn.Module):
    """Frozen DistilBERT transformer stack that is fed embeddings directly.

    Only two pieces of the pretrained `distilbert-base-uncased` model are kept:
    its transformer layers (frozen via `requires_grad_(False)`) and a copy of the
    first `seqLen` position embeddings, stored as a trainable `nn.Parameter`.

    Args:
        n_tokens: Accepted for interface compatibility; not used by this class.
        min_layer: Must be None. Skipping the first layers is not implemented and
            any other value raises NotImplementedError.
    """

    def __init__(self, n_tokens, min_layer = None):
        super().__init__()
        if min_layer is not None:
            # Checked up front so no pretrained weights are loaded just to fail.
            raise NotImplementedError("min_layer is not supported yet")

        self.nLayers = 6
        self.nHeads = 12
        self.seqLen = 512

        bert = AutoModel.from_pretrained('distilbert-base-uncased', cache_dir = cache_dir)

        # Detached copy of the pretrained position embeddings for positions
        # 0..seqLen-1; as a Parameter it stays trainable while the layers are frozen.
        positions = torch.arange(self.seqLen)
        self.position_embeddings = nn.Parameter(
            bert.embeddings.position_embeddings(positions).detach().clone())

        self.bert = bert.transformer
        self.bert.requires_grad_(False)

    def forward(self, x):
        """Add the position embeddings to `x` and run the frozen transformer.

        Args:
            x: Input embeddings; assumed to be (batch, seqLen, hidden) so that it
                broadcasts against the position embeddings — TODO confirm.

        Returns:
            Whatever `self.bert.forward` returns for these inputs.
        """
        batch = x.size(0)
        # All-ones masks: nothing is masked out. They are created directly on the
        # input's device and sized from `self.seqLen` rather than a literal 512.
        attn_mask = torch.ones(batch, self.seqLen, device = x.device)
        # NOTE(review): this mask has nLayers * batch * nHeads * seqLen**2 elements
        # and is rebuilt on every call — consider whether it is needed at all.
        head_mask = torch.ones(self.nLayers, batch, self.nHeads,
                               self.seqLen, self.seqLen, device = x.device)
        return self.bert.forward(x + self.position_embeddings,
                                 attn_mask = attn_mask,
                                 head_mask = head_mask)

class BertVision(nn.Module):
    """Image classifier: convolutional stem -> PlainBERT -> linear head.

    Args:
        n_classes: Number of output classes of the linear head.
        img_dim: Spatial size of the input images; its product is stored as
            `n_tokens` and sizes the head.
    """

    def __init__(self,  n_classes, img_dim):
        super().__init__()
        self.with_classifier = True
        self.n_tokens = np.prod(img_dim)

        # (in_channels, out_channels, stride) for each 3x3, padding-1 convolution.
        # Only the last one strides, and only along the width.
        stem_spec = [(3, 32, 1), (32, 100, 1), (100, 200, 1), (200, 768, (1, 2))]
        stem_layers = []
        for c_in, c_out, stride in stem_spec:
            stem_layers.append(nn.Conv2d(c_in, c_out, 3, stride = stride, padding = 1))
            stem_layers.append(nn.LeakyReLU(0.2))
        self.top = nn.Sequential(*stem_layers)
        self.top.apply(self._init_top)

        self.bert = PlainBERT(n_tokens = self.n_tokens)
        self.fc = nn.Linear(768 * self.n_tokens//2, n_classes)
        # Not used by forward() at the moment (its only use there is commented out).
        self.layer_norm = nn.LayerNorm((512,))

    def toggleIntermediate(self):
        """Flip `with_classifier`; when it is False, forward() skips the linear head."""
        self.with_classifier = False if self.with_classifier else True

    def _init_top(self, m):
        """Kaiming-uniform weights and 0.01 biases for Conv2d/Linear; others untouched."""
        if not isinstance(m, (nn.Conv2d, nn.Linear)):
            return
        nn.init.kaiming_uniform_(m.weight)
        m.bias.data.fill_(0.01)

    def forward(self, x):
        """Return class scores, or the flattened features when the head is toggled off."""
        feats = self.top(x)
        # Flatten the spatial grid into a sequence: (batch, channels, H*W) -> (batch, H*W, channels).
        tokens = feats.view(feats.size(0), feats.size(1), -1).transpose(1, 2)
        hidden = torch.stack(self.bert(tokens)).squeeze(0)
        flat = hidden.transpose(1, 2).contiguous()
#         flat = self.layer_norm(flat)
#         flat = torch.mean(flat, dim = (-2,))
        flat = flat.view(flat.size(0), -1)
        if self.with_classifier:
            flat = self.fc(flat)
        return flat.squeeze(1)
    
    
    
# Device every model and batch is moved to.
device = 'cuda'

# CIFAR-100 train and test splits, downloaded on first use, images as tensors.
cifar_root = "{}/data/cifar100".format(guy_folder)
train_ds = CIFAR100(cifar_root, download = True, transform=transforms.ToTensor())
test_ds = CIFAR100(cifar_root, download = True, transform=transforms.ToTensor(), train = False)

batch_size = 8

# Candidate learning rates per model; train() selects one with `config.lr_idx`.
lr = {
    'bert-vision': [1e-6, 5e-6, 1e-5, 5e-5],
    'resnet': [3e-4, 1e-3, 3e-3, 1e-4],
}

# Optimizer name (as used in the sweep config) -> optimizer class.
optimizerDict = {
    'adam': torch.optim.Adam,
    'adamw': AdamW,
    'sgd': torch.optim.SGD,  # No momentum
}

def makeModel(modelName):
    """Build the model called `modelName` and move it to `device`.

    Args:
        modelName: 'resnet' for a pretrained ResNet-50 with a fresh linear head,
            or 'bert-vision' for a BertVision model on 32x32 inputs.

    Returns:
        The constructed module, already on `device`.

    Raises:
        ValueError: If `modelName` is not one of the supported names.
    """
    if modelName == 'resnet':
        model = resnet50(pretrained = True)
        # Replace the pretrained head with one sized to the dataset's classes.
        model.fc = nn.Linear(model.fc.in_features, len(train_ds.classes))
        return model.to(device)
    if modelName == 'bert-vision':
        return BertVision(len(train_ds.classes), (32,32)).to(device)
    # The old fallback referenced an undefined `Sequential` and crashed with a
    # NameError; report the real problem instead.
    raise ValueError("Unknown model name {!r}; expected 'resnet' or 'bert-vision'".format(modelName))

def train(config):
    """Train one model for a single pass over `train_ds`, logging to wandb.

    Args:
        config: Object exposing `model`, `optimizer` and `lr_idx` attributes.
            `model` picks the architecture (see makeModel), `optimizer` is a key
            of `optimizerDict`, and `lr_idx` indexes `lr[model]`.

    For 'bert-vision', 'adam' is replaced by AdamW and 'sgd' is skipped: the
    function returns immediately without training or logging.

    Logs `loss` and the running training accuracy `acc` after every batch.
    """
    modelName = config.model
    optimizerAlg = optimizerDict[config.optimizer]
    if modelName == 'bert-vision':
        if config.optimizer == 'adam':
            optimizerAlg = optimizerDict['adamw']
        if config.optimizer == 'sgd':
            return

    model = makeModel(modelName)
    criterion = nn.CrossEntropyLoss()
    optimizer = optimizerAlg(model.parameters(), lr = lr[modelName][config.lr_idx])
    train_dataloader = DataLoader(train_ds, batch_size = batch_size, shuffle = True)
    pbar = tqdm(train_dataloader, leave = True, position = 0)

    # Nothing in the loop switches to eval mode, so set train mode once.
    model.train()
    correct = 0
    seen = 0
    for x, y in pbar:
        optimizer.zero_grad()
        y = y.to(device)
        yhat = model(x.to(device))
        loss = criterion(yhat, y)

        correct += (yhat.argmax(dim = -1) == y).sum().item()
        # Count the samples actually processed so a smaller final batch does not
        # skew the running accuracy.
        seen += y.size(0)
        wandb.log({'loss': loss.item(),
                   'acc': correct / seen})

        loss.backward()
        optimizer.step()

# Script entry point: start a wandb run, read that run's configuration from
# `wandb.config`, and train once with it.
wandb.init()
config = wandb.config
train(config)


Overwriting bert-vision.py


In [8]:
# Values tried for each hyperparameter; the grid method runs every combination.
sweep_parameters = {
    'lr_idx': {'values': [0, 1, 2, 3]},
    'optimizer': {'values': ['adam', 'sgd']},
    'model': {'values': ['resnet', 'bert-vision']},
}

# Sweep definition: the agent launches `bert-vision.py` once per grid point.
config = {
    'method': 'grid',
    'program': 'bert-vision.py',
    'parameters': sweep_parameters,
}
sweep_id = wandb.sweep(config, project = 'bert-vision')

Create sweep with ID: ti2qxgh0
Sweep URL: https://wandb.ai/dar-tau/bert-vision/sweeps/ti2qxgh0


In [14]:
assert(sweep_id.isalnum())
!wandb agent {sweep_id}

[34m[1mwandb[0m: Starting wandb agent 🕵️
2020-09-27 18:32:32,444 - wandb.wandb_agent - INFO - Running runs: []
2020-09-27 18:32:47,832 - wandb.wandb_agent - INFO - Agent received command: run
2020-09-27 18:32:47,832 - wandb.wandb_agent - INFO - Agent starting run with config:
	lr_idx: 1
	model: bert-vision
	optimizer: adam
2020-09-27 18:32:47,833 - wandb.wandb_agent - INFO - About to run command: /usr/bin/env python bert-vision.py --lr_idx=1 --model=bert-vision --optimizer=adam
2020-09-27 18:32:49.331225: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
Files already downloaded and verified
Files already downloaded and verified
[34m[1mwandb[0m: Currently logged in as: [33mdar-tau[0m (use `wandb login --relogin` to force relogin)
2020-09-27 18:32:52,842 - wandb.wandb_agent - INFO - Running runs: ['t8s7v74o']
[34m[1mwandb[0m: Tracking run with wandb version 0.10.2
[34m[1mwandb[0m: Run data is saved locally

# Experiments

In [None]:

class DummyLayer(nn.Module):
    """Pass-through module: forward() hands back its first argument unchanged."""

    def __init__(self, *args, **kwargs):
        # Any constructor arguments are accepted and ignored.
        super().__init__()

    def forward(self, x, *args, **kwargs):
        """Return `x` untouched; additional arguments are ignored."""
        return x

class PlainBERT(nn.Module):
    """Frozen DistilBERT transformer stack that is fed embeddings directly.

    Only two pieces of the pretrained `distilbert-base-uncased` model are kept:
    its transformer layers (frozen via `requires_grad_(False)`) and a copy of the
    first `seqLen` position embeddings, stored as a trainable `nn.Parameter`.

    Args:
        n_tokens: Accepted for interface compatibility; not used by this class.
        min_layer: Must be None. Skipping the first layers is not implemented and
            any other value raises NotImplementedError.
    """

    def __init__(self, n_tokens, min_layer = None):
        super().__init__()
        if min_layer is not None:
            # Checked up front so no pretrained weights are loaded just to fail.
            raise NotImplementedError("min_layer is not supported yet")

        self.nLayers = 6
        self.nHeads = 12
        self.seqLen = 512

        bert = AutoModel.from_pretrained('distilbert-base-uncased', cache_dir = cache_dir)

        # Detached copy of the pretrained position embeddings for positions
        # 0..seqLen-1; as a Parameter it stays trainable while the layers are frozen.
        positions = torch.arange(self.seqLen)
        self.position_embeddings = nn.Parameter(
            bert.embeddings.position_embeddings(positions).detach().clone())

        self.bert = bert.transformer
        self.bert.requires_grad_(False)

    def forward(self, x):
        """Add the position embeddings to `x` and run the frozen transformer.

        Args:
            x: Input embeddings; assumed to be (batch, seqLen, hidden) so that it
                broadcasts against the position embeddings — TODO confirm.

        Returns:
            Whatever `self.bert.forward` returns for these inputs.
        """
        batch = x.size(0)
        # All-ones masks: nothing is masked out. They are created directly on the
        # input's device and sized from `self.seqLen` rather than a literal 512.
        attn_mask = torch.ones(batch, self.seqLen, device = x.device)
        # NOTE(review): this mask has nLayers * batch * nHeads * seqLen**2 elements
        # and is rebuilt on every call — consider whether it is needed at all.
        head_mask = torch.ones(self.nLayers, batch, self.nHeads,
                               self.seqLen, self.seqLen, device = x.device)
        return self.bert.forward(x + self.position_embeddings,
                                 attn_mask = attn_mask,
                                 head_mask = head_mask)

class BertVision(nn.Module):
    """Image classifier: convolutional stem -> PlainBERT -> linear head.

    Args:
        n_classes: Number of output classes of the linear head.
        img_dim: Spatial size of the input images; its product is stored as
            `n_tokens` and sizes the head.
    """

    def __init__(self,  n_classes, img_dim):
        super().__init__()
        self.with_classifier = True
        self.n_tokens = np.prod(img_dim)
        # Four 3x3, padding-1 convolutions; only the last one strides, and only
        # along the width.
        self.top = nn.Sequential(
                                 nn.Conv2d(3, 32, 3, padding = 1 ),
                                 nn.LeakyReLU(0.2),
                                 nn.Conv2d(32, 100, 3, padding = 1),
                                 nn.LeakyReLU(0.2),
                                 nn.Conv2d(100, 200, 3, padding = 1),
                                 nn.LeakyReLU(0.2),
                                 nn.Conv2d(200, 768, 3, stride = (1, 2), padding = 1),
                                 nn.LeakyReLU(0.2)
                                )

        self.top.apply(self._init_top)
        self.bert = PlainBERT(n_tokens = self.n_tokens)
        self.fc = nn.Linear(768 * self.n_tokens//2, n_classes)
        # Not used by forward() at the moment (its only use there is commented out).
        self.layer_norm = nn.LayerNorm((512,))

    def toggleIntermediate(self):
        """Flip `with_classifier`; when it is False, forward() skips the linear head."""
        # `!` is not Python's negation operator (in IPython `= !...` captures the
        # output of a shell command), so the flag has to be flipped with `not`.
        self.with_classifier = not self.with_classifier

    def _init_top(self, m):
        """Kaiming-uniform weights and 0.01 biases for Conv2d/Linear; others untouched."""
        if isinstance(m, (nn.Conv2d, nn.Linear)):
            nn.init.kaiming_uniform_(m.weight)
            m.bias.data.fill_(0.01)

    def forward(self, x):
        """Return class scores, or the flattened features when the head is toggled off."""
        x = self.top(x)
        # Flatten the spatial grid into a sequence: (batch, channels, H*W) -> (batch, H*W, channels).
        x = x.view(x.size(0), x.size(1), -1)
        x = x.transpose(1,2)
        x = self.bert(x)
        x = torch.stack(x).squeeze(0)
        x = x.transpose(1,2).contiguous()
#         x = self.layer_norm(x)
#         x = torch.mean(x, dim = (-2,))
        x = x.view(x.size(0), -1)
        if self.with_classifier:
            x = self.fc(x)
        return x.squeeze(1)

In [None]:
# Device used by the experiment cells below.
device = 'cuda'

# CIFAR-100 train and test splits under the base folder, downloaded on first use.
cifar_root = "{}/data/cifar100".format(guy_folder)
train_ds = CIFAR100(cifar_root, download = True, transform=transforms.ToTensor())
test_ds = CIFAR100(cifar_root, download = True, transform=transforms.ToTensor(), train = False)

0it [00:00, ?it/s]

Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to /notebooks//data/cifar100/cifar-100-python.tar.gz


169009152it [00:09, 18300545.88it/s]                               


Extracting /notebooks//data/cifar100/cifar-100-python.tar.gz to /notebooks//data/cifar100
Files already downloaded and verified


In [None]:
%%script false
# Disabled cell: `%%script false` pipes the cell body to the `false` command
# instead of running it in the kernel, so nothing below is executed.
# After logging in to ImageNet through Firefox
%cd {guy_folder}/data
!wget http://www.image-net.org/archive/stanford/fall11_whole.tar
# !tar -xvf 

In [None]:
%%script false
# Disabled cell (see `%%script false` above): when enabled it would replace
# `train_ds` with an ImageFolder dataset read from the imagenet12 train folder.
train_folder = "{}/data/imagenet12/train".format(guy_folder)

train_ds = ImageFolder(train_folder,
            transform = transforms.Compose([transforms.ToTensor()]))

In [None]:
from collections import defaultdict
from tqdm import tqdm_notebook, tqdm

# NOTE(review): `model` is not created in any cell visible here; it has to exist
# in the kernel before this cell runs — confirm which cell is meant to build it.
batch_size = 8
total = int(len(train_ds)/batch_size )  # number of full batches; a partial last batch is skipped

class_rep = defaultdict(int)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-6)
train_dataloader = DataLoader(train_ds, batch_size = batch_size, shuffle = True)
test_dataloader = DataLoader(test_ds, batch_size = batch_size, shuffle = True)
pbar = tqdm(enumerate(train_dataloader), total = total, leave = True, position = 0)
acc_sum = 0

for i, (x,y) in pbar:
    if i >= total:
        break

    # Quick probe: accuracy on one freshly shuffled test batch. It runs under
    # no_grad so this forward pass does not build an autograd graph.
    model.eval()
    with torch.no_grad():
        x_test, y_test = next(iter(test_dataloader))
        test_pred = model(x_test.to(device)).argmax(dim =  -1)
        test_acc = (test_pred == y_test.to(device)).sum().item()/batch_size
    model.train()

    optimizer.zero_grad()
    y = y.to(device)
    yhat = model(x.to(device))
    loss = criterion(yhat, y)
    acc_sum += (yhat.argmax(dim =  -1) == y).sum()
    # How many times this label has been the first sample of a batch so far.
    class_rep[y[0].item()] += 1 # Incomplete
    # Position of the first sample's true class in the ascending argsort of its
    # scores (a larger index means the class was scored higher).
    topk = np.where(np.argsort(yhat[0].cpu().detach().numpy()) == y[0].detach().cpu().numpy())[0][0] # TODO: Incomplete

    pbar.set_postfix_str("Loss: {:.2f} Test Acc: {:.2f} Acc: {:.2f} Top: {} Class: {}".format(loss.item(),
                                                                                               test_acc,
                                                                             acc_sum.item()/float(batch_size * (i+1)),
                                                                             topk, class_rep[y[0].item()]))
    loss.backward()
    optimizer.step()

In [None]:
from collections import defaultdict
from tqdm import tqdm_notebook, tqdm

# Same loop as the previous cell, but for `vision_model`, with Adam, a larger
# batch, and no per-step test probe.
batch_size = 32
total = int(len(train_ds)/batch_size )

class_rep = defaultdict(int)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(vision_model.parameters(), lr = 1e-2)
train_dataloader = DataLoader(train_ds, batch_size = batch_size, shuffle = True)
pbar = tqdm(enumerate(train_dataloader), total = total, leave = True, position = 0)
acc_sum = 0

for i, (x,y) in pbar:
    if i >= total:
        break
    optimizer.zero_grad()
    y = y.to(device)
    # x = transforms.Normalize(mean=[0.485, 0.456, 0.406],
    #         std=[0.229, 0.224, 0.225],)(x)
    yhat = vision_model(x.to(device))
    loss = criterion(yhat, y)
    acc_sum += (yhat.argmax(dim =  -1) == y).sum()

    # Bookkeeping on the first sample of the batch only (marked incomplete).
    first_label = y[0].item()
    class_rep[first_label] += 1 # Incomplete
    first_scores = yhat[0].cpu().detach().numpy()
    # Position of the true class in the ascending argsort of the scores.
    topk = np.where(np.argsort(first_scores) == y[0].detach().cpu().numpy())[0][0] # TODO: Incomplete

    running_acc = acc_sum.item()/float(batch_size * (i+1))
    status = "Loss: {:.2f} Acc: {:.2f} Top: {} Class: {}".format(
        loss.item(), running_acc, topk, class_rep[first_label])
    pbar.set_postfix_str(status)

    loss.backward()
    optimizer.step()

100%|█████████▉| 1561/1562 [02:02<00:00, 12.76it/s, Loss: 3.81 Acc: 0.07 Top: 93 Class: 12]

In [None]:
# Push every training image through the model one at a time, keeping the raw
# inputs in `z` and the detached model outputs in `y`.
y, z = [], []
for x, _  in tqdm(train_ds):
    z.append(x)
    single = x.to(device).unsqueeze(0)  # add a batch dimension of size 1
    y.append(model(single).detach())

In [None]:
# Turn the two Python lists into single tensors. The names are rebound in place,
# so this cell is meant to run once after the collection cell.
y, z = torch.stack(y), torch.stack(z)

In [None]:
# Ratio between the standard deviation of the raw inputs (`z`) and that of the
# model outputs (`y`) collected above.
z.std()/y.std()

tensor(0.1882, device='cuda:0')

In [None]:
# Compare the model's prediction on a random training image, on its grayscale
# version, and on the next image in the dataset.
i = np.random.choice(len(train_ds))
j = (i + 1) % len(train_ds)  # wrap around so the last index has a neighbour too
img, label = train_ds[i]

# One forward pass per image, so the printed probabilities and the printed
# argmax come from the same output. `dim = -1` makes the class axis explicit
# instead of relying on softmax's deprecated implicit dimension.
logits = model(img.unsqueeze(0).to(device))
print(F.softmax(logits, dim = -1))
print(logits.argmax())
print(label)

t = transforms.Compose([
#     transforms.Grayscale(),
    transforms.ToPILImage(),
    transforms.Grayscale(3),
    transforms.ToTensor(),

    ])(img)
gray_logits = model(t.unsqueeze(0).to(device))
print(F.softmax(gray_logits, dim = -1))
print(gray_logits.argmax())

next_probs = F.softmax(model(train_ds[j][0].unsqueeze(0).to(device)), dim = -1)
print(next_probs)
print(next_probs.argmax())


In [None]:
# NOTE(review): neither `state_dict` nor `model` is defined in any cell visible
# here; both must already exist in the kernel — confirm where they come from.
# `strict = False` asks load_state_dict to tolerate keys that do not match.
model.load_state_dict(state_dict,strict = False)

In [None]:
# Compare softmax-normalised model outputs for a random image (a1), its
# grayscale version (a2) and the next image in the dataset (a3).
i = np.random.choice(len(train_ds))
j = (i + 1) % len(train_ds)  # wrap around so the last index has a neighbour too

to_gray = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Grayscale(3),
#     transforms.ColorJitter(10,10, 10),
#     transforms.RandomRotation(90),
    transforms.ToTensor(),

    ])

# Flip the model's `with_classifier` flag for the comparison (assuming `model`
# is a BertVision, forward() then skips the linear head). try/finally flips it
# back even if one of the forward passes fails.
model.toggleIntermediate()
try:
    # `dim = -1` is explicit to avoid softmax's deprecated implicit dimension.
    a1 = F.softmax(model(train_ds[i][0].unsqueeze(0).to(device)), dim = -1)

    t = to_gray(train_ds[i][0])
    a2 = F.softmax(model(t.unsqueeze(0).to(device)), dim = -1)

    a3 = F.softmax(model(train_ds[j][0].to(device).unsqueeze(0)), dim = -1)

    print(torch.norm(a1-a2, p = 2))
    print(torch.norm(a1-a3, p = 2))
    print(torch.norm(a2-a3, p = 2))
finally:
    model.toggleIntermediate()