# Alaska2 Baseline PyTorch

* This notebook is based on the great work of [Alex Shonenkov](https://www.kaggle.com/shonenkov/train-inference-gpu-baseline)
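* I have classified the dataset into 12 classes (3 quality factors for each of the 4 folders). Note: the code in this version of the notebook trains a 2-class model (Cover vs. stego).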

# Dependencies

In [None]:
# Changes:
# Seed = 1
# Fold = 1
# Sampler downsize
# smoothing = 0.05
#      net._fc = nn.Sequential(nn.Linear(1408, 1024),
#                                  nn.ReLU(),
#                                  nn.Dropout(0.2 , inplace = False),
#                                  nn.Linear(1024,512),
#                                  nn.ReLU(),
#                                  nn.Dropout(0.2 , inplace = True),
#                                  nn.Linear(512,256),
#                                  nn.ReLU(),
#                                  nn.Dropout(0.2 , inplace = True),
#                                  nn.Linear(256,2),
#                                  nn.LogSoftmax(dim=1)).to('cuda') 


#model link:
#    https://drive.google.com/file/d/14Z9F-TABfD7-ibuVX1C9DrrPSDrKIo9Y/view?usp=sharing
        

In [None]:
# Run configuration for the whole notebook.
# DCT: when True, the jpegio package is cloned/installed and imported further down.
DCT = False
# TransferLearning: when True, get_net() sets requires_grad=False on every pretrained parameter.
TransferLearning = True
# UpdateLayer*InFitter: replace the model's `_fc` head inside run_training(), after the
# optional checkpoint load (small head vs. deeper head).
UpdateLayerInFitter = False
UpdateLayer2InFitter = False 
# UpdateLayer*InModel: choose the `_fc` head built by get_net() (small head vs. deeper head).
UpdateLayerInModel = False
UpdateLayer2InModel = True
# Load_model: when True, run_training() restores the checkpoint at '../input/' + LoadedFileName.
Load_model = True
LoadedFileName = 'modelalldata-2classes-17-1/modelalldata_2classes_17_1.bin'
# Number of epochs Fitter.fit() iterates over.
n_epochs = 1
# File the final checkpoint is saved to; also reused as the git branch/commit name.
SavedFile = 'modelalldata_2classes_18_1.bin'
QualityNum = 0 # 0 for all qualities, 90,95,75 for certain quality

In [None]:
!pip install -q efficientnet_pytorch > /dev/null

In [None]:
from glob import glob
from sklearn.model_selection import GroupKFold
import cv2
from skimage import io
import torch
from torch import nn
import os
from datetime import datetime
import time
import random
import cv2
import pandas as pd
import numpy as np
import albumentations as A
import matplotlib.pyplot as plt
from albumentations.pytorch.transforms import ToTensorV2
from torch.utils.data import Dataset,DataLoader
from torch.utils.data.sampler import SequentialSampler, RandomSampler
import sklearn
import pdb


SEED = 1 #42

def seed_everything(seed):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes the convolution algorithm and may pick a
    # different one on each run, which works against the deterministic flag.
    torch.backends.cudnn.benchmark = False

seed_everything(SEED)

# GroupKFold splitting

Thanks to [Remi Cogranne](https://www.kaggle.com/remicogranne/jpeg-explanations?scriptVersionId=33893706) for his explanation of JPEG and of the quantization tables that correspond to each quality factor.

In [None]:
# Load the per-image table (the code below reads its 'ImageName' and 'Quality' columns).
df_fn =  pd.read_csv('../input/quality-ds/quality_ds.csv') 
# QualityNum == 0 keeps every row; any other value keeps only rows with that quality factor.
if QualityNum != 0:
    df_fn = df_fn[df_fn['Quality']==QualityNum]
df_fn

In [None]:

%%time
#count =0
dataset = []

# Build one record per (folder, image name): every image name in df_fn is paired
# with each of the four folders. 'Cover' keeps label 0; the three remaining
# folders all collapse to label 1, giving a binary target.
for label, kind in enumerate(['Cover', 'JMiPOD', 'JUNIWARD', 'UERD']):
    if label >1 :
        l = 1
    else:
        l = label
    for i,row in df_fn.iterrows():
        # (A disabled earlier experiment derived the label from row['Quality']
        # instead: 2 for quality 95, 1 for quality 90/75, 0 for Cover.)
        dataset.append({
            'kind': kind,
            'image_name': row['ImageName'], #path.split('/')[-1],
            'label': l
        })
     

# Shuffle the records (uses Python's `random`, seeded by seed_everything) and
# turn them into a DataFrame.
random.shuffle(dataset)
dataset = pd.DataFrame(dataset)

# Grouping by image_name keeps all versions of the same image in the same fold.
gkf = GroupKFold(n_splits=5)

dataset.loc[:, 'fold'] = 0
# Each row gets the number of the fold in which it belongs to the validation part.
for fold_number, (train_index, val_index) in enumerate(gkf.split(X=dataset.index, y=dataset['label'], groups=dataset['image_name'])):    
    dataset.loc[dataset.iloc[val_index].index, 'fold'] = fold_number 

# Simple Augs: Flips

In [None]:
def get_train_transforms():
    """Return the augmentation pipeline applied to training images.

    The pipeline flips horizontally and vertically (each with probability
    0.5), resizes to 512x512 and finally converts the image with ``ToTensorV2``.
    """
    steps = [
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.5),
        A.Resize(height=512, width=512, p=1.0),
        ToTensorV2(p=1.0),
    ]
    return A.Compose(steps, p=1.0)

def get_valid_transforms():
    """Return the deterministic pipeline used for validation and test images.

    Images are resized to 512x512 and converted with ``ToTensorV2``; no random
    augmentation is applied.
    """
    steps = [
        A.Resize(height=512, width=512, p=1.0),
        ToTensorV2(p=1.0),
    ]
    return A.Compose(steps, p=1.0)

# Dataset

In [None]:
if DCT:
    ! git clone https://github.com/dwgoon/jpegio
    !pip install jpegio/.
    import jpegio as jio


In [None]:
DATA_ROOT_PATH = '../input/alaska2-image-steganalysis'

# Define 2D DCT
def dct2(a):
    """Return the 2-D Discrete Cosine Transform of ``a`` with orthonormal scaling.

    The 1-D transform (SciPy's default DCT type) is applied along axis 0 and
    then along axis 1.

    Args:
        a: Array-like with at least two dimensions.

    Returns:
        Array of the same shape as ``a`` holding the DCT coefficients.
    """
    # Imported locally: the notebook's import cell never brings `fftpack` into
    # scope, so the function previously raised NameError when called.
    from scipy import fftpack

    along_rows = fftpack.dct(a, axis=0, norm='ortho')
    return fftpack.dct(along_rows, axis=1, norm='ortho')

def onehot(size, target):
    """Return a float32 vector of length ``size`` with a 1 at index ``target``."""
    encoded = torch.zeros(size, dtype=torch.float32)
    encoded[target] = 1.0
    return encoded

class DatasetRetriever(Dataset):
    """Dataset yielding ``(image, one-hot target)`` pairs for training/validation.

    Each image is read from ``{DATA_ROOT_PATH}/{kind}/{image_name}``, converted
    from BGR to RGB and scaled by 1/255.

    Args:
        kinds: Indexable of folder names, one per sample.
        image_names: Array of file names; its ``shape[0]`` is the dataset length.
        labels: Indexable of integer class indices valid for a length-2 one-hot vector.
        transforms: Optional callable invoked as ``transforms(image=image)`` that
            returns a mapping with an ``'image'`` entry.
    """

    def __init__(self, kinds, image_names, labels, transforms=None):
        super().__init__()
        self.kinds = kinds
        self.image_names = image_names
        self.labels = labels
        self.transforms = transforms

    def __getitem__(self, index: int):
        """Load sample ``index`` and return ``(image, target)``.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        kind, image_name, label = self.kinds[index], self.image_names[index], self.labels[index]

        image_path = f'{DATA_ROOT_PATH}/{kind}/{image_name}'
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread signals a missing/unreadable file by returning None;
            # fail here with the offending path instead of inside cvtColor.
            raise FileNotFoundError(f'Could not read image: {image_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image /= 255.0

        if self.transforms:
            sample = {'image': image}
            sample = self.transforms(**sample)
            image = sample['image']

        target = onehot(2, label)
        return image, target

    def __len__(self) -> int:
        return self.image_names.shape[0]

    def get_labels(self):
        """Return all labels as a list (consumed by the class-balancing sampler)."""
        return list(self.labels)

In [None]:
fold_number = 1 #0


def _retriever_for(rows, transforms):
    """Wrap the given DataFrame rows in a DatasetRetriever."""
    return DatasetRetriever(
        kinds=rows.kind.values,
        image_names=rows.image_name.values,
        labels=rows.label.values,
        transforms=transforms,
    )


# Rows of the chosen fold form the validation set; all other rows are used for training.
_in_validation_fold = dataset['fold'] == fold_number

train_dataset = _retriever_for(dataset[~_in_validation_fold], get_train_transforms())

validation_dataset = _retriever_for(dataset[_in_validation_fold], get_valid_transforms())

In [None]:
# Sanity check: fetch one training sample and display it.
#image, target = train_dataset[79000]
image, target = train_dataset[0]
# Move the first axis last before handing the array to imshow
# (presumably channels-first -> channels-last; confirm against ToTensorV2).
numpy_image = image.permute(1,2,0).cpu().numpy()

fig, ax = plt.subplots(1, 1, figsize=(16, 8))
    
ax.set_axis_off()
# Trailing ';' suppresses the repr of the returned image object.
ax.imshow(numpy_image);

# Metrics

In [None]:
from sklearn import metrics

class AverageMeter(object):
    """Running weighted mean that also remembers the most recent value."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, the weighted total, the weight and the mean."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count
        
        
def alaska_weighted_auc(y_true, y_valid):
    """Weighted AUC used as the competition metric.

    https://www.kaggle.com/anokas/weighted-auc-metric-updated

    The ROC curve is cut into two true-positive-rate bands, [0.0, 0.4] and
    [0.4, 1.0], weighted 2 and 1. The area of each band is computed from the
    ROC vertices strictly inside the band, padded to FPR = 1 at the band's
    upper limit, and the weighted sum is normalised so the result lies in [0, 1].

    Args:
        y_true: Binary ground-truth labels (positive class is 1).
        y_valid: Predicted scores, higher meaning more likely positive.

    Returns:
        The normalised weighted AUC.

    Raises:
        ValueError: If a band has no interior ROC vertex and the curve never
            reaches the band's upper limit (e.g. ``y_true`` has no positives).
    """
    tpr_thresholds = [0.0, 0.4, 1.0]
    weights = [2, 1]

    fpr, tpr, _ = metrics.roc_curve(y_true, y_valid, pos_label=1)

    # size of subsets
    areas = np.array(tpr_thresholds[1:]) - np.array(tpr_thresholds[:-1])

    # The total area is normalized by the sum of weights such that the final weighted AUC is between 0 and 1.
    normalization = np.dot(areas, weights)

    competition_metric = 0
    for idx, weight in enumerate(weights):
        y_min = tpr_thresholds[idx]
        y_max = tpr_thresholds[idx + 1]
        mask = (y_min < tpr) & (tpr < y_max)

        if mask.any():
            padding_start = fpr[mask][-1]
        else:
            # The ROC curve jumps over this band without a vertex inside it
            # (common with few samples or tied scores). The original dropped
            # into pdb here and then failed with IndexError; instead, start the
            # padding at the first vertex that reaches the band's upper limit.
            reached_top = tpr >= y_max
            if not reached_top.any():
                raise ValueError(
                    'ROC curve never reaches the required TPR; '
                    'y_true must contain at least one positive sample'
                )
            padding_start = fpr[reached_top][0]

        x_padding = np.linspace(padding_start, 1, 100)

        x = np.concatenate([fpr[mask], x_padding])
        y = np.concatenate([tpr[mask], [y_max] * len(x_padding)])
        y = y - y_min  # normalize such that curve starts at y=0
        score = metrics.auc(x, y)
        competition_metric += score * weight

    return competition_metric / normalization
        
class RocAucMeter(object):
    """Accumulates labels and scores and keeps the running weighted AUC."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Restart accumulation.

        The buffers start with one sample of each class scored 0.5
        (presumably so the ROC computation always sees both classes).
        """
        self.y_true = np.array([0,1])
        self.y_pred = np.array([0.5,0.5])
        self.score = 0

    def update(self, y_true, y_pred):
        """Append one batch and recompute the score over everything seen so far.

        Args:
            y_true: Tensor of per-class targets; the arg-max along axis 1 is
                clipped to {0, 1} and used as the binary label.
            y_pred: Tensor of model outputs; the score is one minus the
                softmax probability of column 0.
        """
        batch_labels = y_true.cpu().numpy().argmax(axis=1).clip(min=0, max=1).astype(int)
        class_probabilities = nn.functional.softmax(y_pred, dim=1).data.cpu().numpy()
        batch_scores = 1 - class_probabilities[:,0]
        self.y_true = np.concatenate((self.y_true, batch_labels))
        self.y_pred = np.concatenate((self.y_pred, batch_scores))
        self.score = alaska_weighted_auc(self.y_true, self.y_pred)

    @property
    def avg(self):
        """Latest weighted AUC (named ``avg`` to mirror ``AverageMeter``)."""
        return self.score

# Label Smoothing

In [None]:
class LabelSmoothing(nn.Module):
    """Cross-entropy loss with label smoothing for one-hot style targets.

    In training mode the loss is
    ``confidence * NLL + smoothing * mean(-log_softmax)`` averaged over the
    batch, where ``confidence = 1 - smoothing``. Outside training mode it
    defers to ``torch.nn.functional.cross_entropy``.
    """

    def __init__(self, smoothing = 0.05): #0.01):
        super().__init__()
        self.smoothing = smoothing
        self.confidence = 1.0 - smoothing

    def forward(self, x, target):
        # Evaluation mode: plain cross-entropy, no smoothing.
        if not self.training:
            return torch.nn.functional.cross_entropy(x, target)

        log_probs = torch.nn.functional.log_softmax(x.float(), dim = -1)

        # Negative log-likelihood of the target distribution, per sample.
        per_sample_nll = (-log_probs * target.float()).sum(-1)
        # Loss against a uniform distribution over the classes, per sample.
        per_sample_uniform = -log_probs.mean(dim=-1)

        blended = self.confidence * per_sample_nll + self.smoothing * per_sample_uniform
        return blended.mean()

# Fitter

In [None]:
import warnings

warnings.filterwarnings("ignore")

class Fitter:
    """Training/validation driver bundling a model, optimizer, scheduler and loss.

    Args:
        model: Network to optimise.
        device: Device that batches (and the loss module) are moved to.
        config: Object exposing ``lr``, ``SchedulerClass``, ``scheduler_params``,
            ``verbose``, ``verbose_step``, ``step_scheduler`` and
            ``validation_scheduler``.
    """
    
    def __init__(self, model, device, config):
        self.config = config
        self.epoch = 0
        
        self.base_dir = './'
        self.log_path = f'{self.base_dir}/log.txt'
        self.best_summary_loss = 10**5

        self.model = model
        self.device = device

        # NOTE: the original built separate "no weight decay" parameter groups
        # for bias/LayerNorm parameters here but never passed them to the
        # optimizer. That dead code is removed; AdamW keeps a single parameter
        # group, which also keeps previously saved optimizer state loadable.
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=config.lr)
        self.scheduler = config.SchedulerClass(self.optimizer, **config.scheduler_params)
        self.criterion = LabelSmoothing().to(self.device)
        self.log(f'Fitter prepared. Device is {self.device}')

    def fit(self, train_loader, validation_loader):
        """Run the train/validate cycle for ``n_epochs`` epochs.

        ``n_epochs`` is read from the notebook-level variable of that name.
        After every epoch the last checkpoint is written; when the validation
        loss improves, a ``best-checkpoint-XXXepoch.bin`` file is written and
        only the three most recent of those files are kept.
        """
        for e in range(n_epochs):
            if self.config.verbose:
                lr = self.optimizer.param_groups[0]['lr']
                timestamp = datetime.utcnow().isoformat()
                self.log(f'\n{timestamp}\nLR: {lr}')

            t = time.time()
            summary_loss, final_scores = self.train_one_epoch(train_loader)

            self.log(f'[RESULT]: Train. Epoch: {self.epoch}, summary_loss: {summary_loss.avg:.5f}, final_score: {final_scores.avg:.5f}, time: {(time.time() - t):.5f}')
            self.save(f'{self.base_dir}/last-checkpoint.bin')

            t = time.time()
            summary_loss, final_scores = self.validation(validation_loader)

            self.log(f'[RESULT]: Val. Epoch: {self.epoch}, summary_loss: {summary_loss.avg:.5f}, final_score: {final_scores.avg:.5f}, time: {(time.time() - t):.5f}')
            if summary_loss.avg < self.best_summary_loss:
                self.best_summary_loss = summary_loss.avg
                self.model.eval()
                self.save(f'{self.base_dir}/best-checkpoint-{str(self.epoch).zfill(3)}epoch.bin')
                # Zero-padded epoch numbers sort chronologically; drop all but the last three.
                for path in sorted(glob(f'{self.base_dir}/best-checkpoint-*epoch.bin'))[:-3]:
                    os.remove(path)

            if self.config.validation_scheduler:
                self.scheduler.step(metrics=summary_loss.avg)

            self.epoch += 1

    def validation(self, val_loader):
        """Evaluate on ``val_loader`` without gradients.

        Returns:
            ``(summary_loss, final_scores)``: an ``AverageMeter`` of the loss
            and a ``RocAucMeter`` of the weighted AUC.
        """
        self.model.eval()
        summary_loss = AverageMeter()
        final_scores = RocAucMeter()
        t = time.time()
        for step, (images, targets) in enumerate(val_loader):
            if self.config.verbose:
                if step % self.config.verbose_step == 0:
                    print(
                        f'Val Step {step}/{len(val_loader)}, ' + \
                        f'summary_loss: {summary_loss.avg:.5f}, final_score: {final_scores.avg:.5f}, ' + \
                        f'time: {(time.time() - t):.5f}', end='\r'
                    )
            with torch.no_grad():
                targets = targets.to(self.device).float()
                batch_size = images.shape[0]
                images = images.to(self.device).float()
                outputs = self.model(images)
                loss = self.criterion(outputs, targets)
                final_scores.update(targets, outputs)
                summary_loss.update(loss.detach().item(), batch_size)

        return summary_loss, final_scores

    def train_one_epoch(self, train_loader):
        """Train for one pass over ``train_loader``.

        Returns:
            ``(summary_loss, final_scores)``: an ``AverageMeter`` of the loss
            and a ``RocAucMeter`` of the weighted AUC.
        """
        self.model.train()
        summary_loss = AverageMeter()
        final_scores = RocAucMeter()
        t = time.time()
        for step, (images, targets) in enumerate(train_loader):
            if self.config.verbose:
                if step % self.config.verbose_step == 0:
                    print(
                        f'Train Step {step}/{len(train_loader)}, ' + \
                        f'summary_loss: {summary_loss.avg:.5f}, final_score: {final_scores.avg:.5f}, ' + \
                        f'time: {(time.time() - t):.5f}', end='\r'
                    )
            
            targets = targets.to(self.device).float()
            images = images.to(self.device).float()
            batch_size = images.shape[0]

            self.optimizer.zero_grad()
            outputs = self.model(images)
            loss = self.criterion(outputs, targets)
            loss.backward()
            
            final_scores.update(targets, outputs)
            summary_loss.update(loss.detach().item(), batch_size)

            self.optimizer.step()

            if self.config.step_scheduler:
                self.scheduler.step()

        return summary_loss, final_scores
    
    def save(self, path):
        """Write model, optimizer and scheduler state plus bookkeeping to ``path``.

        The model is switched to eval mode before saving.
        """
        self.model.eval()
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict(),
            'best_summary_loss': self.best_summary_loss,
            'epoch': self.epoch,
        }, path)

    def load(self, path):
        """Restore a checkpoint written by ``save`` and resume at the next epoch."""
        # map_location lets a checkpoint saved on a different device be
        # restored onto this fitter's device.
        checkpoint = torch.load(path, map_location=self.device)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        self.best_summary_loss = checkpoint['best_summary_loss']
        self.epoch = checkpoint['epoch'] + 1
        
    def log(self, message):
        """Append ``message`` to the log file; also print it when verbose."""
        if self.config.verbose:
            print(message)
        with open(self.log_path, 'a+') as logger:
            logger.write(f'{message}\n')

# EfficientNet

In [None]:
from efficientnet_pytorch import EfficientNet

def get_net():
    """Build the EfficientNet-B2 classifier selected by the notebook flags.

    * ``TransferLearning``: freeze every pretrained parameter (heads created
      afterwards stay trainable).
    * ``UpdateLayerInModel``: two-layer head (1408 -> 256 -> 2).
    * ``UpdateLayer2InModel``: four-layer head (1408 -> 1024 -> 512 -> 256 -> 2).
    * otherwise: a single linear layer (1408 -> 2).

    Returns:
        The network. It is not moved to any device here; the caller decides
        (the notebook calls ``.cuda()`` on the result).
    """
    net = EfficientNet.from_pretrained('efficientnet-b2')
    if TransferLearning:
        for param in net.parameters():
            param.requires_grad = False
    if UpdateLayerInModel:
        net._fc = nn.Sequential(nn.Linear(1408, 256),
                                 nn.ReLU(),
                                 nn.Dropout(0.2),
                                 nn.Linear(256, 2),
                                 nn.LogSoftmax(dim=1))
    elif UpdateLayer2InModel:
        # Layer order and count must stay as they are: checkpoints address the
        # head's parameters by their position inside the Sequential.
        net._fc = nn.Sequential(nn.Linear(1408, 1024),
                                 nn.ReLU(),
                                 nn.Dropout(0.2 , inplace = False),
                                 nn.Linear(1024,512),
                                 nn.ReLU(),
                                 nn.Dropout(0.2 , inplace = True),
                                 nn.Linear(512,256),
                                 nn.ReLU(),
                                 nn.Dropout(0.2 , inplace = True),
                                 nn.Linear(256,2),
                                 nn.LogSoftmax(dim=1))
    else:
        # Two outputs, matching the 2-class one-hot targets used throughout
        # the notebook (the previous value of 4 could not be combined with
        # those targets in the loss).
        net._fc = nn.Linear(in_features=1408, out_features=2, bias=True)
     
    return net

net = get_net().cuda()

In [None]:
net

# Config

In [None]:
class TrainGlobalConfig:
    """Static hyper-parameters read by ``Fitter`` and ``run_training``."""

    # Data loading
    num_workers = 4
    batch_size = 16

    # Optimiser learning rate (earlier trials: 0.001, 0.0009, 0.002, 0.0002)
    lr = 0.0001

    # Progress reporting
    verbose = True
    verbose_step = 1

    # When the LR scheduler is stepped
    step_scheduler = False  # do scheduler.step after optimizer.step
    validation_scheduler = True  # do scheduler.step after validation stage loss

    # Scheduler class and its keyword arguments
    SchedulerClass = torch.optim.lr_scheduler.ReduceLROnPlateau
    scheduler_params = {
        'mode': 'min',
        'factor': 0.5,
        'patience': 7,
        'verbose': False,
        'threshold': 0.0001,
        'threshold_mode': 'abs',
        'cooldown': 10,
        'min_lr': 1e-12,
        'eps': 1e-12,
    }
    # --------------------

# Class Balance "on fly" from [@CatalystTeam](https://github.com/catalyst-team/catalyst)

In [None]:
from catalyst.data.sampler import BalanceClassSampler

def _replace_head(fitter, head):
    """Install ``head`` as the classifier and rebuild the optimizer and scheduler."""
    fitter.model._fc = head.to('cuda')
    fitter.optimizer = torch.optim.AdamW(fitter.model.parameters(), lr=TrainGlobalConfig.lr)
    # The scheduler created in Fitter.__init__ still wraps the discarded
    # optimizer; without rebuilding it, learning-rate reductions would never
    # reach the optimizer that actually trains the new head.
    fitter.scheduler = TrainGlobalConfig.SchedulerClass(
        fitter.optimizer, **TrainGlobalConfig.scheduler_params
    )


def run_training():
    """Build the data loaders, run the ``Fitter`` and save the final checkpoint.

    The training loader draws class-balanced samples by downsampling; the
    validation loader iterates sequentially. Depending on the notebook flags a
    checkpoint is loaded first and/or the classifier head is replaced after
    loading. The final state is written to ``SavedFile``.
    """
    device = torch.device('cuda')

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        sampler=BalanceClassSampler(labels=train_dataset.get_labels(), mode="downsampling"),
        batch_size=TrainGlobalConfig.batch_size,
        pin_memory=False,
        drop_last=True,
        num_workers=TrainGlobalConfig.num_workers,
    )
    val_loader = torch.utils.data.DataLoader(
        validation_dataset, 
        batch_size=TrainGlobalConfig.batch_size,
        num_workers=TrainGlobalConfig.num_workers,
        shuffle=False,
        sampler=SequentialSampler(validation_dataset),
        pin_memory=False,
    )

    fitter = Fitter(model=net, device=device, config=TrainGlobalConfig)
    if Load_model:
        fitter.load('../input/' + LoadedFileName)

    if UpdateLayerInFitter:
        _replace_head(fitter, nn.Sequential(nn.Linear(1408, 256),
                                            nn.ReLU(),
                                            nn.Dropout(0.2),
                                            nn.Linear(256, 2),
                                            nn.LogSoftmax(dim=1)))
    elif UpdateLayer2InFitter:
        _replace_head(fitter, nn.Sequential(nn.Linear(1408, 1024),
                                            nn.ReLU(),
                                            nn.Dropout(0.2),
                                            nn.Linear(1024,512),
                                            nn.ReLU(),
                                            nn.Dropout(0.2),
                                            nn.Linear(512,256),
                                            nn.ReLU(),
                                            nn.Dropout(0.2),
                                            nn.Linear(256,2),
                                            nn.LogSoftmax(dim=1)))

    fitter.fit(train_loader, val_loader)
    fitter.save(SavedFile)

# Training

I used 1xV100 to train the model; it also works in a Kaggle kernel. You can fork the notebook and check it yourself, but I would like to share my logs with you.

In [None]:
run_training()

In [None]:
branchName = SavedFile

!git config --global user.name "RTCTeam"
!git config --global user.email "tareksherif.courses@gmail.com"

!git checkout -b origin
!git init
!git checkout master
!git add .
!git commit -m  {branchName}
!git remote add origin https://RTCTeam:RTC_Team_2020@github.com/RTCTeam/ALASKA2.git
!git push -u  --force origin master 
 
# delete branch if exist 
!git branch -d {branchName}  --force
!git push origin --delete {branchName}  --force

# create branch
!git checkout -b  {branchName}
!git commit -m  rm{branchName}
!git push origin {branchName}

# Inference

In [None]:
#checkpoint = torch.load('../input/afteralex1epoch/last-checkpoint.bin')
#net.load_state_dict(checkpoint['model_state_dict']);
net.eval();

In [None]:
class DatasetSubmissionRetriever(Dataset):
    """Dataset yielding ``(image_name, image)`` pairs for the test images.

    Each image is read from ``{DATA_ROOT_PATH}/Test/{image_name}``, converted
    from BGR to RGB and scaled by 1/255.

    Args:
        image_names: Array of file names; its ``shape[0]`` is the dataset length.
        transforms: Optional callable invoked as ``transforms(image=image)`` that
            returns a mapping with an ``'image'`` entry.
    """

    def __init__(self, image_names, transforms=None):
        super().__init__()
        self.image_names = image_names
        self.transforms = transforms

    def __getitem__(self, index: int):
        """Load sample ``index`` and return ``(image_name, image)``.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        image_name = self.image_names[index]
        image_path = f'{DATA_ROOT_PATH}/Test/{image_name}'
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread signals a missing/unreadable file by returning None;
            # fail here with the offending path instead of inside cvtColor.
            raise FileNotFoundError(f'Could not read image: {image_path}')
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        image /= 255.0
        
        if self.transforms:
            sample = {'image': image}
            sample = self.transforms(**sample)
            image = sample['image']

        return image_name, image

    def __len__(self) -> int:
        return self.image_names.shape[0]

In [None]:
# The test set gets its own name so that `dataset` (the training DataFrame
# with the fold assignment) is not overwritten and earlier cells stay re-runnable.
# File names are sorted so the submission rows come out in a reproducible order.
test_image_names = np.array(sorted(
    os.path.basename(path) for path in glob(f'{DATA_ROOT_PATH}/Test/*.jpg')
))

test_dataset = DatasetSubmissionRetriever(
    image_names=test_image_names,
    transforms=get_valid_transforms(),
)


data_loader = DataLoader(
    test_dataset,
    batch_size=8,
    shuffle=False,
    num_workers=2,
    drop_last=False,
)

In [None]:
%%time

result = {'Id': [], 'Label': []}

# Inference only: disabling autograd avoids building a graph for every batch,
# which saves memory and time.
with torch.no_grad():
    for step, (image_names, images) in enumerate(data_loader):
        print(step, end='\r')

        y_pred = net(images.cuda())
        # Score = one minus the probability of class 0.
        y_pred = 1 - nn.functional.softmax(y_pred, dim=1).data.cpu().numpy()[:,0]

        result['Id'].extend(image_names)
        result['Label'].extend(y_pred)

In [None]:
# Write the collected Id/Label predictions to submission.csv (without the
# index column) and preview the first rows.
submission = pd.DataFrame(result)
submission.to_csv('submission.csv', index=False)
submission.head()

In [None]:
#FileLink(r'submission95_2.csv')