# Imports

In [1]:
%%time
import pytorch_lightning as pl
import torch as th
import torch.nn as nn

from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, GPUStatsMonitor
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.metrics.functional.classification import accuracy
from pytorch_lightning import seed_everything
import torch.nn.functional as F

import transformers
import os
import pandas as pd

# data module
from src.dataset import EnzymeDataset, DataModule

# model
from src.model import Model

# config file
from src.config import Config

# custom tokenizer
from src.tokenizer import EnzymeTokenizer

# load tensorboard extension
%load_ext tensorboard

CPU times: user 1.13 s, sys: 355 ms, total: 1.48 s
Wall time: 1.19 s


# Data module and setup

In [2]:
# Seed every RNG that seed_everything covers, using the project-wide seed value.
# The trailing semicolon hides the returned value without assigning to `_`,
# which would shadow IPython's "last output" variable for the rest of the session.
seed_everything(seed=Config.seed_val);

Global seed set to 2021


In [3]:
# Flatten Config into a plain dict, skipping every attribute whose name
# contains a double underscore (e.g. __module__, __doc__).
config_dict = {
    name: value
    for name, value in vars(Config).items()
    if '__' not in name
}

In [4]:
%%time

train_df = pd.read_csv(os.path.join(Config.data_dir, 'TrainV1.csv'))

dm = DataModule(config=Config, 
                 train_df=train_df,
                 validation_split=.25,
                 train_frac = 1)
dm.setup()

[INFO] Training on 612525
[INFO] Validating on 204175
CPU times: user 3.61 s, sys: 257 ms, total: 3.87 s
Wall time: 3.87 s


In [5]:
%%time
# Instantiate the network from the plain-dict view of Config built earlier.
model = Model(config=config_dict)
# Bare name as the last expression so the notebook displays the module summary.
model

CPU times: user 27.9 ms, sys: 113 µs, total: 28 ms
Wall time: 27.4 ms


Model(
  (embedding): Embedding(26, 350)
  (encoder): GRU(350, 300, num_layers=3, batch_first=True, dropout=0.2, bidirectional=True)
  (classifier): Linear(in_features=600, out_features=20, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

# Training Setup

In [6]:
%%time

ckpt_cb = ModelCheckpoint(
    monitor='val_acc', 
    mode='max', 
    dirpath=Config.models_dir, 
    filename=f'{Config.base_model}-'+'enzyme_classifier-{val_acc:.5f}-{val_loss:.5f}'
)

gpu_stats = GPUStatsMonitor(
    memory_utilization=True, 
    gpu_utilization=True, 
    fan_speed=True, 
    temperature=True
)
es = EarlyStopping(
    monitor='val_acc', 
    patience=2, 
    mode='max'
)

Logger = TensorBoardLogger(
    save_dir=Config.logs_dir, 
    name='enzyme'
)

Callbacks = [es, ckpt_cb, gpu_stats]

trainer = pl.Trainer(
    gpus=-1, 
    max_epochs=Config.num_epochs, 
    precision=16,
    callbacks=Callbacks,
    logger=Logger,
    # fast_dev_run=True
)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Using native 16bit precision.


CPU times: user 16.7 ms, sys: 24.7 ms, total: 41.5 ms
Wall time: 60.5 ms


# Training phase

In [7]:
# Shape of the 'input_ids' entry of the first training example.
first_example = dm.train_ds[0]
first_example['input_ids'].shape

torch.Size([512])

In [8]:
# Smoke test: push exactly one training batch through the model before the full run.
# next(iter(...)) fetches a single batch without a for/break loop.
data = next(iter(dm.train_dataloader()))
print(data['input_ids'].shape)
# print(data['trg'].shape)

# A shape check needs no gradients, so skip autograd bookkeeping for this forward pass.
# NOTE(review): the recorded run of this cell failed inside model(...) with
# "RuntimeError: input must have 3 dimensions, got 4" for a [512, 512] batch;
# the cause is presumably in src.model's forward -- confirm and fix there.
with th.no_grad():
    out = model(data['input_ids'])

#nn.LogSoftmax(dim=1)(out).argmax(1)

torch.Size([512, 512])


RuntimeError: input must have 3 dimensions, got 4

In [None]:
# Stray keyboard input (an undefined name that raised NameError) removed so that
# Restart & Run All can reach the training cell; this cell intentionally has no code.

In [None]:
%%time
trainer.fit(model=model, datamodule=dm)

In [None]:
%tensorboard --logdir ../logs