In [1]:
# Here we take care of paths.
# Make sure root project directory is named 'VESUVIUS_Challenge' for this to work

from pathlib import Path
import os
# Name of the repository's top-level directory. Later cells use paths that are
# relative to it, so the working directory has to end up there.
PROJECT_ROOT_NAME = 'VESUVIUS_Challenge'

print('Starting path:' + os.getcwd())

start_dir = Path().resolve()
# Compare the final path component rather than a fixed-length string suffix, so
# that a directory such as 'old_VESUVIUS_Challenge' is not mistaken for the root.
if start_dir.name != PROJECT_ROOT_NAME:
    # Prefer the nearest ancestor carrying the expected name (works from any
    # depth inside the project). If no ancestor matches, fall back to the
    # immediate parent, which is what this cell did unconditionally before.
    PATH = next(
        (ancestor for ancestor in start_dir.parents if ancestor.name == PROJECT_ROOT_NAME),
        start_dir.parent,
    )
    os.chdir(PATH)

# make sure you are in the root folder of the project
print('Current path:' + os.getcwd())

Starting path:/home/gregorymar577/VESUVIUS_Challenge/jupyter notebooks
Current path:/home/gregorymar577/VESUVIUS_Challenge


In [2]:
import torch
import monai
import segmentation_models_pytorch as smp
import matplotlib.pyplot as plt
#import tempfile
import shutil
import os
import glob
import cv2
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from typing import Tuple, List
import albumentations as A
from albumentations.pytorch import ToTensorV2
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from Data_Modules.Vesuvius_Dataset import Vesuvius_Tile_Datamodule
from lit_models.Vesuvius_Lit_Model import Lit_Model
from pytorch_lightning.callbacks import ModelCheckpoint
import torch.nn as nn
import torch.nn as nn
from functools import partial
import torchvision
import torch.nn.functional as F
from lit_models.scratch_models import FPNDecoder
from Models.PreBackbone_3D import PreBackbone_3D
from Models.PreBackbone_3d_Zdim import PreBackbone_3D_ZDIM

2023-05-24 13:14:04,111 - Created a temporary directory at /tmp/tmpbm9p__4u
2023-05-24 13:14:04,112 - Writing /tmp/tmpbm9p__4u/_remote_module_non_scriptable.py


In [3]:
# Side length of the square patches fed to the model (also used as the resize
# target in the augmentation pipelines).
PATCH_SIZE = 256

# Number of slices taken along the z dimension.
Z_DIM = 24

# Relative path of the competition data.
COMPETITION_DATA_DIR_str = "kaggle/input/vesuvius-challenge-ink-detection/"

# Use the GPU when CUDA is available, otherwise the CPU.
# NOTE: only CUDA is probed here; no other accelerator backend is considered.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
class Model_3dAtt_w_Segformer(nn.Module):
    """Two-stage network: a 3D pre-backbone followed by a 2D SwinUNETR.

    The input is first passed through ``PreBackbone_3D_ZDIM``; its output is
    then fed to MONAI's ``SwinUNETR`` (``spatial_dims=2``), whose output is
    returned.

    NOTE: despite the class name, the 2D stage built here is a SwinUNETR, not a
    SegFormer. The name is kept because other cells refer to the class by it.

    Args:
        z_dim: ``z_dim`` passed to the 3D stage. ``None`` (default) uses the
            notebook-level ``Z_DIM``.
        bridge_channels: Channel count produced by the 3D stage and consumed by
            the 2D stage. A single argument keeps the two in sync.
        img_size: ``img_size`` passed to SwinUNETR. ``None`` (default) uses the
            notebook-level ``PATCH_SIZE``.
        out_channels: Number of output channels of the 2D stage.
    """

    def __init__(self, z_dim=None, bridge_channels=4, img_size=None, out_channels=1):
        super().__init__()

        # Resolve the defaults at call time so the notebook constants are read
        # when an instance is built, not when the class is defined.
        z_dim = Z_DIM if z_dim is None else z_dim
        img_size = PATCH_SIZE if img_size is None else img_size

        self.model_3d = PreBackbone_3D_ZDIM(
            z_dim=z_dim,
            out_channels=bridge_channels,
        ).to(DEVICE)

        # NOTE(review): `depths` and `num_heads` have five entries each, while
        # MONAI's defaults have four. Confirm the fifth entry is actually used
        # by the installed MONAI version before changing either tuple.
        self.model_2d = monai.networks.nets.SwinUNETR(
            img_size=img_size,
            in_channels=bridge_channels,
            out_channels=out_channels,
            depths=(2, 2, 2, 2, 2),
            num_heads=(3, 6, 12, 24, 48),
            feature_size=48,
            norm_name='instance',
            drop_rate=0.0,
            attn_drop_rate=0.1,
            dropout_path_rate=0.0,
            normalize=True,
            use_checkpoint=False,
            spatial_dims=2,
            downsample='mergingv2',
        ).to(DEVICE)

    def forward(self, x):
        """Run the 3D stage on ``x`` and return the 2D stage's output for it."""
        outs_3d = self.model_3d(x)
        logits = self.model_2d(outs_3d)
        return logits












In [5]:


class CFG:
    """Configuration shared by the datamodule, the Lightning module and the trainer cell.

    Every setting is a plain class attribute; the class is passed around as-is
    (``cfg=CFG``) and never instantiated in this notebook. Note that the model
    is built while this class body executes.
    """

    device = DEVICE

    # presumably the cutoff used to binarise predictions -- TODO confirm in Lit_Model
    THRESHOLD = 0.4
    use_wandb = True

    # ---------------------------- Dataset ----------------------------

    # Stage: 'train' or 'test'.
    stage = 'train'

    # Location of the competition data.
    competition_data_dir = COMPETITION_DATA_DIR_str

    # Number of slices in the z dimension: 1 < z_dim < 65.
    z_dim = Z_DIM

    # Fragments used for training; available fragments are [1, 2, 3].
    train_fragment_id = [2, 3]

    # Fragments used for validation.
    val_fragment_id = [1]

    batch_size = 8

    # Patch size and stride used when feeding the model.
    patch_size = PATCH_SIZE
    stride = patch_size // 2

    num_workers = 8
    on_gpu = True

    # ---------------- Model and Lightning-module parameters ----------------

    # The network itself; instantiated (and moved to DEVICE) at class-definition time.
    model = Model_3dAtt_w_Segformer().to(DEVICE)

    checkpoint = None
    save_directory = None

    # Gradient accumulation chosen so that batch_size * accumulate_grad_batches == 128.
    # NOTE(review): an earlier comment here said 192 was found to be optimal,
    # which disagrees with the 128 used in the code -- confirm the intended value.
    accumulate_grad_batches = 128 // batch_size
    learning_rate = 1e-5
    eta_min = 1e-8
    t_max = 50
    max_epochs = 120
    weight_decay = 1e-4
    precision = 16

    # Checkpointing.
    save_top_k = 5
    monitor = "FBETA"
    mode = "max"

    # -------------------------- Augmentations --------------------------

    # Training augmentations.
    train_transforms = [
        A.Resize(patch_size, patch_size),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.RandomBrightnessContrast(p=0.75),
        A.ShiftScaleRotate(p=0.75),
        # Apply at most one of the noise / blur transforms.
        A.OneOf(
            [
                A.GaussNoise(var_limit=[10, 50]),
                A.GaussianBlur(),
                A.MotionBlur(),
            ],
            p=0.4,
        ),
        A.GridDistortion(num_steps=5, distort_limit=0.3, p=0.5),
        A.CoarseDropout(
            max_holes=1,
            max_width=int(patch_size * 0.3),
            max_height=int(patch_size * 0.3),
            mask_fill_value=0,
            p=0.5,
        ),
        # One mean/std entry per z-slice channel.
        A.Normalize(mean=[0] * z_dim, std=[1] * z_dim),
        ToTensorV2(transpose_mask=True),
    ]

    # Validation augmentations: resize, normalise and convert to tensor only.
    val_transforms = [
        A.Resize(patch_size, patch_size),
        A.Normalize(mean=[0] * z_dim, std=[1] * z_dim),
        ToTensorV2(transpose_mask=True),
    ]

    # Test augmentations: same steps as validation.
    test_transforms = [
        A.Resize(patch_size, patch_size),
        A.Normalize(mean=[0] * z_dim, std=[1] * z_dim),
        ToTensorV2(transpose_mask=True),
    ]
        
    
    

In [6]:
# Build the datamodule from the shared configuration; it is handed to
# `trainer.fit(..., datamodule=dataset)` further down.
dataset = Vesuvius_Tile_Datamodule(cfg=CFG)

  0%|          | 0/24 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

  0%|          | 0/24 [00:00<?, ?it/s]

In [7]:
# Checkpoint used to warm-start the module when `Checkpoint` is True.
CHECKPOINT_PATH = 'logs/3dAttn_w_Swin_24_05bce1_05tver50_afterE10_50bce5_50tver70/last-v2.ckpt'

lit_model = Lit_Model(cfg=CFG,).to(DEVICE)

# Set to False to keep the freshly initialised module built above.
Checkpoint = True
if Checkpoint:
    # `load_from_checkpoint` is a classmethod that builds and returns a new
    # module, so it is called on the class: Lightning 2.x raises a TypeError
    # when it is invoked through an instance.
    # Hyperparameters stored in the checkpoint can be overridden by passing
    # them as keyword arguments (e.g. learning_rate=7e-6, t_max=70).
    lit_model = Lit_Model.load_from_checkpoint(CHECKPOINT_PATH)


[34m[1mwandb[0m: Currently logged in as: [33mgmarus[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016670525583333054, max=1.0…

# Changing hyperparameters for resuming or transfer

In [8]:
# Override the loss hyperparameters on the module before training continues.
# The BCE positive-class weight is created on DEVICE instead of a hard-coded
# 'cuda', so this cell follows the same device selection as the rest of the
# notebook and does not fail on a machine without CUDA.
lit_model.loss_bce.pos_weight = torch.tensor(0.5, device=DEVICE)
lit_model.loss_tversky.alpha = 0.7
lit_model.loss_tversky.beta = 0.3


## FREEZING WEIGHTS

def freeze_weights(model):
    """Disable gradient tracking for every parameter of ``model``.

    Args:
        model: Object exposing ``parameters()`` (e.g. an ``nn.Module``).

    Returns:
        None. The parameters are modified in place.
    """
    for weight in model.parameters():
        weight.requires_grad_(False)

        
# Freeze the 2D stage so that only the remaining parameters receive gradients.
# NOTE(review): the model summary recorded in the training cell's output lists
# 0 non-trainable params -- verify that this freeze is still in effect when
# `trainer.fit` starts.
freeze_weights(lit_model.model.model_2d)

In [9]:
# Show the loss settings currently held by the module.
# `print(label, value)` is used on purpose: it prints the tensor's full repr.
for _label, _value in (
    ('bce pos', lit_model.loss_bce.pos_weight),
    ('tver alpha', lit_model.loss_tversky.alpha),
    ('tver beta', lit_model.loss_tversky.beta),
):
    print(_label, _value)

bce pos tensor(0.5000, device='cuda:0')
tver alpha 0.7
tver beta 0.3


In [None]:
# Directory that receives the checkpoints and the CSV logs of this run.
SAVE_DIR = 'logs/3dAttn_w_Swin_24_05bce1_05tver50_afterE10_50bce5_50tver70_cont'

# Keep the best checkpoints according to the monitored metric, plus the latest.
# The selection settings are read from CFG, which already declares them, so a
# change made there is not silently ignored by a second hard-coded copy here.
checkpoint_callback = ModelCheckpoint(
    save_top_k=CFG.save_top_k,
    monitor=CFG.monitor,
    mode=CFG.mode,
    dirpath=SAVE_DIR,
    filename="3dAttn_w_Swin_24_05bce1_05tver50_afterE10_50bce5_50tver70{epoch:02d}{FBETA:.2f}{val_loss:.2f}{recall:.2f}{precision:.2f}",
    save_last=True,
)

# Autograd anomaly detection is a debugging aid: it adds overhead to every
# backward pass. NOTE(review): consider switching it off for long runs once the
# issue being investigated is resolved.
torch.autograd.set_detect_anomaly(True)

# Single-GPU trainer that logs to CSV files under SAVE_DIR.
# NOTE: CFG.precision is defined but not passed here, so the Trainer's default
# precision is used.
trainer = pl.Trainer(
    accelerator='gpu',
    devices=1,
    max_epochs=CFG.max_epochs,
    check_val_every_n_epoch=1,
    accumulate_grad_batches=CFG.accumulate_grad_batches,
    logger=pl.loggers.CSVLogger(save_dir=SAVE_DIR),
    log_every_n_steps=1,
    default_root_dir=SAVE_DIR,
    callbacks=[checkpoint_callback],
)



# Echo the loss settings right before training so they appear in the run's output.
for _label, _value in (
    ('bce pos', lit_model.loss_bce.pos_weight),
    ('tver alpha', lit_model.loss_tversky.alpha),
    ('tver beta', lit_model.loss_tversky.beta),
):
    print(_label, _value)

# No `ckpt_path` is given, so the Trainer itself does not resume from a checkpoint.
trainer.fit(lit_model, datamodule=dataset)

2023-05-24 13:15:53,771 - GPU available: True (cuda), used: True
2023-05-24 13:15:53,772 - TPU available: False, using: 0 TPU cores
2023-05-24 13:15:53,773 - IPU available: False, using: 0 IPUs
2023-05-24 13:15:53,773 - HPU available: False, using: 0 HPUs
bce pos tensor(0.5000, device='cuda:0')
tver alpha 0.7
tver beta 0.3
2023-05-24 13:15:53,971 - LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Adjusting learning rate of group 0 to 1.0000e-05.
2023-05-24 13:15:53,978 - 
  | Name         | Type                    | Params
---------------------------------------------------------
0 | metrics      | ModuleDict              | 0     
1 | model        | Model_3dAtt_w_Segformer | 25.4 M
2 | loss_tversky | TverskyLoss             | 0     
3 | loss_bce     | SoftBCEWithLogitsLoss   | 0     
---------------------------------------------------------
25.4 M    Trainable params
0         Non-trainable params
25.4 M    Total params
101.661   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

In [None]:
# Show the loss settings held by the module once more.
for _label, _value in (
    ('bce pos', lit_model.loss_bce.pos_weight),
    ('tver alpha', lit_model.loss_tversky.alpha),
    ('tver beta', lit_model.loss_tversky.beta),
):
    print(_label, _value)