In [8]:
import os

import lightning as L
import numpy as np
import torch
from lightning.pytorch.loggers import CometLogger
from torch.utils.data import DataLoader
from torchvision.transforms import Compose, Normalize, ToTensor

from config import Config, load_config
from data import CLEVRSplit, CLEVRTextSplit
from model import Model, TrainingModel
from training import complete_train


# Turn on cuDNN benchmark mode for the whole session (global PyTorch backend flag).
# NOTE(review): this is generally only beneficial when input shapes stay fixed
# between batches — confirm that holds for the datasets used below.
torch.backends.cudnn.benchmark = True

In [9]:
# Use the GPU when CUDA is usable in this environment, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [16]:
# Load the experiment configuration and override the two options used for this run.
config = load_config()
config.display_object_properties = True
config.use_txt_scene = True

# The `use_txt_scene` flag decides which split class builds the datasets;
# both expose the same `build_splits(config)` entry point and return
# (train, test, systematic) in that order.
split_builder = CLEVRTextSplit if config.use_txt_scene else CLEVRSplit
train_dataset, test_dataset, systematic_dataset = split_builder.build_splits(config)

Building vocabulary


  0%|          | 0/699960 [00:00<?, ?it/s]

Building answers index


  0%|          | 0/699960 [00:00<?, ?it/s]

In [20]:
# Number of entries in the vocabulary held by the training split's processor.
len(train_dataset.processor.vocabulary)

95

In [4]:
# Settings shared by every DataLoader built in this cell.
dlkwargs = dict(batch_size=256, num_workers=16, pin_memory=True)

# Only the training loader shuffles; the two evaluation loaders keep dataset order.
train_loader, test_loader, systematic_loader = (
    DataLoader(split, shuffle=do_shuffle, **dlkwargs)
    for split, do_shuffle in (
        (train_dataset, True),
        (test_dataset, False),
        (systematic_dataset, False),
    )
)

In [17]:
# Input/output sizes are taken from the training split's processor.
n_tokens = len(train_dataset.processor.vocabulary)
n_outputs = len(train_dataset.processor.answers_index)

# Network hyperparameters.
d_hidden, n_layers, nhead = 128, 4, 4

# Patch geometry. NOTE(review): 320 and 480 are presumably the image height and
# width in pixels — confirm against the dataset and consider naming them.
patch_height, patch_width = 32, 48
num_patches = (320 // patch_height) * (480 // patch_width)
max_question_size = 45

# Arguments are positional, so their order must match Model's signature.
model = Model(
    n_tokens,
    n_outputs,
    d_hidden,
    nhead,
    n_layers,
    patch_height,
    patch_width,
    num_patches,
    max_question_size,
)
training_model = TrainingModel(model)

In [18]:
# Comet experiment logger. Credentials are read from the environment so that no
# secret is stored in the notebook; `os.environ.get` yields None when a variable
# is unset.
comet_logger = CometLogger(
    api_key=os.environ.get("COMET_API_KEY"),
    workspace=os.environ.get("COMET_WORKSPACE"),
)

# Attach the logger to the trainer so that whatever the training module logs is
# recorded by it; a logger that is only constructed is never used by Lightning.
trainer = L.Trainer(
    max_epochs=1,
    accelerator="gpu",
    devices=1,
    logger=comet_logger,
)

# The same two held-out loaders are used for validation during fit and for the
# final test pass.
eval_loaders = [test_loader, systematic_loader]
trainer.fit(training_model, train_loader, val_dataloaders=eval_loaders)
trainer.test(training_model, dataloaders=eval_loaders)

Using 16bit None Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]

  | Name  | Type  | Params
--------------------------------
0 | model | Model | 3.0 M 
--------------------------------
3.0 M     Trainable params
0         Non-trainable params
3.0 M     Total params
6.018     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]


Testing: 0it [00:00, ?it/s]

[{'test_loss': 2.6154680252075195, 'test_acc': 0.23853431642055511}]

In [19]:
# Inspection only: show the trainer object's repr as the cell output.
trainer

<lightning.pytorch.trainer.trainer.Trainer at 0x7f8b6066a2d0>

In [None]:
# Overlay the training and test loss curves on the current axes.
# NOTE(review): `plt`, `train_losses` and `test_losses` are not defined in any
# visible cell — confirm where they are supposed to come from.
for loss_curve in (train_losses, test_losses):
    plt.plot(loss_curve)
plt.plot()

In [None]:
# Overlay the train, test and systematic accuracy curves on the current axes.
# NOTE(review): `plt`, `train_accs`, `test_accs` and `systematic_accs` are not
# defined in any visible cell — confirm where they are supposed to come from.
for accuracy_curve in (train_accs, test_accs, systematic_accs):
    plt.plot(accuracy_curve)
plt.plot()

In [29]:
# Run the standalone training script in a subprocess via IPython's shell escape.
# NOTE(review): the captured output of this cell ends in a traceback raised
# during the sanity-check stage — investigate before relying on this cell.
!python run_training.py

Loading questions
Building vocabulary
100%|███████████████████████████████| 699960/699960 [00:01<00:00, 466873.68it/s]
Building answers index
100%|██████████████████████████████| 699960/699960 [00:00<00:00, 1088596.25it/s]
Loading questions
Loading questions
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [1]

  | Name        | Type  | Params
--------------------------------------
0 | inner_model | Model | 3.0 M 
--------------------------------------
3.0 M     Trainable params
0         Non-trainable params
3.0 M     Total params
12.037    Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.
Sanity Checking DataLoader 0:   0%|                       | 0/2 [00:00<?, ?it/s]Traceback (most recent call last):
  File "run_training.py"