In [1]:
import sys

# Make the project modules under ./src importable (config, dataset, model).
# The path is relative, so it is resolved against the kernel's working directory.
# The membership check keeps this cell idempotent: re-running it does not
# append a duplicate entry to sys.path.
SRC_DIR = './src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [2]:
import os
import pandas as pd
import numpy as np

import torch as th
from torchvision import transforms   

from config import Config
from dataset import DataModule
from model import Model

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, GPUStatsMonitor
from pytorch_lightning.loggers import TensorBoardLogger

from tqdm.auto import tqdm

from sklearn.utils.class_weight import compute_class_weight

# Setup dataset

In [3]:
# Per-split image pipelines, keyed by the split name and handed to DataModule.
data_transforms = {
    # Training: centre-crop to the configured shape, then random geometric
    # augmentation (rotation up to +/-35 degrees and random flips).
    'train': th.nn.Sequential(
        transforms.CenterCrop(Config.resize_shape),
        # The `resample` argument is deprecated in torchvision and removed in
        # later releases; omitting it keeps the nearest-neighbour default that
        # `resample=False` selected.
        transforms.RandomRotation(degrees=35, expand=False, center=None, fill=0.0),
        transforms.RandomVerticalFlip(p=0.6),
        transforms.RandomHorizontalFlip(p=0.6),
    ),
    # Validation: deterministic, identical to the test pipeline. A random
    # rotation here would make `val_recall` (the metric monitored by
    # EarlyStopping and ModelCheckpoint) vary from run to run for the same
    # weights and would not reflect how test images are processed.
    "validation": th.nn.Sequential(
        transforms.CenterCrop(Config.resize_shape),
    ),
    # Test: centre-crop only.
    'test': th.nn.Sequential(
        transforms.CenterCrop(Config.resize_shape),
    ),
}

In [4]:
# Load the training table from the configured data directory.
train_csv_path = os.path.join(Config.data_dir, 'train.csv')
train_df = pd.read_csv(train_csv_path)

# Build the project's DataModule on top of the dataframe.
# validation_split=0.25 holds out a quarter of the rows for validation;
# frac=1 presumably keeps the whole dataframe (confirm in src/dataset.py).
dm = DataModule(
    df=train_df,
    frac=1,
    validation_split=0.25,
    train_batch_size=Config.train_batch_size,
    test_batch_size=Config.test_batch_size,
    transform=data_transforms,
)

# Prepare the splits up front so the sample counts are reported here.
dm.setup()

[INFO] Training on 150630 samples
[INFO] Validating on 50210 samples


# Training pipeline
* Model definition : GraphemeClassifier
* Callbacks : pl.callbacks
* Logger : pl.loggers


### Model definition

In [5]:
def balanced_class_weights(labels):
    """Return 'balanced' class weights for ``labels`` in ascending class order.

    ``np.unique`` yields the distinct classes sorted, so ``weights[i]`` belongs
    to the i-th smallest label value. ``Series.unique()`` returns classes in
    order of first appearance instead, which leaves the weight vector
    misaligned with the class indices.

    Parameters
    ----------
    labels : pandas.Series
        Class labels, one per sample.

    Returns
    -------
    numpy.ndarray
        One weight per distinct class, ordered like ``np.unique(labels)``.
    """
    y = labels.values
    return compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)


# classes weights (one vector per prediction head)
vowels_class_weight = balanced_class_weights(train_df.vowel_diacritic)
g_root_class_weight = balanced_class_weights(train_df.grapheme_root)
consonant_class_weight = balanced_class_weights(train_df.consonant_diacritic)

# model definition
# NOTE(review): the weights computed above are not forwarded to the model; all
# three *_class_weight arguments are None. Confirm whether that is intentional
# and which type Model expects before wiring them in.
model = Model(
    base_encoder=Config.base_model,
    arch_from='timm',
    vowels_class_weight=None,
    g_root_class_weight=None,
    consonant_class_weight=None,
    drop=0.3,
    lr=Config.learning_rate,
    pretrained=True,
)

# Display the module tree as the cell output.
model

Model(
  (extractor): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act1): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act1): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (act2): ReLU(inplace=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, m

### Callbacks

In [6]:
# callbacks definitions

# Checkpointing driven by validation recall (higher is better).
# The output directory goes in `dirpath` and only the base name in `filename`:
# passing a full path as `filename` makes Lightning join it onto its default
# checkpoint directory, so files do not reliably end up in Config.models_dir.
model_ckpt = ModelCheckpoint(
    dirpath=Config.models_dir,
    filename=f"bengali_grapheme-{Config.base_model}",
    monitor='val_recall',
    mode="max",
)

# Stop training once validation recall has not improved for 10 checks.
es = EarlyStopping(
    monitor='val_recall',
    patience=10,
    mode="max",
)

# Log GPU memory, utilisation, fan speed and temperature; step timings are off.
gpu_stats = GPUStatsMonitor(
    memory_utilization=True,
    gpu_utilization=True,
    intra_step_time=False,
    inter_step_time=False,
    fan_speed=True,
    temperature=True,
)

callbacks_list = [es, model_ckpt, gpu_stats]

### Logger

In [7]:
# Logger(s) definition
# TensorBoard logger rooted at Config.logs_dir, experiment name
# 'kaggle-bengali-ai', with the default hp_metric placeholder disabled.
tb_logger = TensorBoardLogger(
    Config.logs_dir,
    name='kaggle-bengali-ai',
    default_hp_metric=False,
)


### Trainer

In [8]:
# Single-GPU, 32-bit precision trainer using the logger and callbacks defined
# above; runs at least 2 and at most Config.epochs epochs.
trainer = Trainer(
    gpus=1,
    precision=32,
    min_epochs=2,
    max_epochs=Config.epochs,
    logger=tb_logger,
    callbacks=callbacks_list,
    # Optional toggles, currently disabled:
    # fast_dev_run=True,
    # plugins='deepspeed',
)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores


### Train model(s)

In [9]:
# Launch training; the data loaders are supplied by the DataModule.
trainer.fit(model, datamodule=dm)



  | Name                        | Type       | Params
-----------------------------------------------------------
0 | extractor                   | ResNet     | 21.8 M
1 | encoder                     | Sequential | 21.8 M
2 | dropout_layer               | Dropout    | 0     
3 | grapheme_root_decoder       | Linear     | 168 K 
4 | vowel_diacritic_decoder     | Linear     | 11.0 K
5 | consonant_diacritic_decoder | Linear     | 7.0 K 
-----------------------------------------------------------
22.0 M    Trainable params
0         Non-trainable params
22.0 M    Total params
87.936    Total estimated model params size (MB)


Adjusting learning rate of group 0 to 2.0000e-07.


Validation sanity check: 0it [00:00, ?it/s]

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/zeusdric/miniconda3/envs/deepl/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3437, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-eee476b51974>", line 1, in <module>
    trainer.fit(
  File "/home/zeusdric/miniconda3/envs/deepl/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 513, in fit
    self.dispatch()
  File "/home/zeusdric/miniconda3/envs/deepl/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 553, in dispatch
    self.accelerator.start_training(self)
  File "/home/zeusdric/miniconda3/envs/deepl/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 74, in start_training
    self.training_type_plugin.start_training(trainer)
  File "/home/zeusdric/miniconda3/envs/deepl/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 111, in start_training
    self

TypeError: object of type 'NoneType' has no len()

# Export model to TorchScript (JIT)

In [None]:
# Compile the Lightning module to TorchScript, then serialise it into the
# models directory.
scripted_model = model.to_torchscript()
export_path = os.path.join(Config.models_dir, 'grapheme-classifier-3-in-1.pt')
th.jit.save(scripted_model, export_path)