In [None]:
import os
import gc
import cv2
import math
import copy
import time
import random
import glob
from PIL import Image
from matplotlib import pyplot as plt

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp
import torchvision
from torchvision import transforms



# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

# For Image Models
import timm

# Albumentations for augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2

# For colored terminal text
from colorama import Fore, Back, Style
b_ = Fore.BLUE          # shorthand: switch terminal text to blue
sr_ = Style.RESET_ALL   # shorthand: reset terminal styling

import warnings
# Suppresses every warning for the rest of the session (including pandas/torch deprecation notices).
warnings.filterwarnings("ignore")

# For descriptive error messages
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is documented to make kernel launches synchronous,
# which helps debugging but can slow GPU work — confirm it is wanted for the final run.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


# Auxiliary Functions
import auxiliaryfunctions as af
import imageprocessing as ip
import myclasses as mc
import aefiles as ae

## Set Configuration

In [None]:
# Settings shared by the cells of this notebook.
CONFIG = dict(
    seed=42,  # for reproducibility
    img_size=2048,
    model_name="tf_efficientnetv2_s_in21ft1k",
    num_classes=5,
    valid_batch_size=4,
    # Use the first GPU when one is visible, otherwise fall back to the CPU.
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
)

In [None]:
# Seed via the project helper with the configured seed.
# NOTE(review): presumably seeds python/numpy/torch RNGs — confirm in auxiliaryfunctions.set_seed.
af.set_seed(CONFIG['seed'])

In [None]:
# Competition data: every image directory hangs off the same Kaggle input root.
ROOT_DIR = '/kaggle/input/UBC-OCEAN'
TEST_DIR = f'{ROOT_DIR}/test_thumbnails'
TRAIN_DIR = f'{ROOT_DIR}/train_thumbnails'

# Alternative image directories under the same root.
ALT_TEST_DIR = f'{ROOT_DIR}/test_images'
ALT_TRAIN_DIR = f'{ROOT_DIR}/train_images'

# Placeholders: fill in the label-encoder pickle and the four checkpoint paths.
LABEL_ENCODER_BIN = "...."
BEST_WEIGHT = "...."
BEST_WEIGHT2 = "...."
BEST_WEIGHT3 = "...."
BEST_WEIGHT4 = "...."

## Reading in Data

In [None]:
# Test metadata, extended with each image's file path and a placeholder label column.
df = pd.read_csv(f"{ROOT_DIR}/test.csv").assign(
    file_path=lambda frame: frame['image_id'].apply(af.get_test_file_path),
    label=0,  # dummy
)

# Submission template and the label encoder loaded from LABEL_ENCODER_BIN.
df_sub = pd.read_csv(f"{ROOT_DIR}/sample_submission.csv")
encoder = joblib.load(LABEL_ENCODER_BIN)

In [None]:
# One crop table per test image (no label is available at test time, hence None).
dfs = [
    ip.get_cropped_images(path, img_id, None)
    for path, img_id in zip(df["file_path"], df["image_id"])
]

# Stack the tables, add the placeholder label and keep each crop window once per image.
df_crop = (
    pd.concat(dfs)
    .assign(label=0)  # dummy
    .drop_duplicates(subset=["image_id", "sx", "ex", "sy", "ey"])
    .reset_index(drop=True)
)

In [None]:
# Preprocessing pipeline used for every dataset in this notebook:
# resize to a square of CONFIG['img_size'], normalise per channel, convert to a tensor.
data_transforms = dict(
    valid=A.Compose(
        [
            A.Resize(CONFIG['img_size'], CONFIG['img_size']),
            A.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225],
                max_pixel_value=255.0,
                p=1.0,
            ),
            ToTensorV2(),
        ],
        p=1.0,
    ),
)

## Load Pre-Trained Models

In [None]:
def _load_ubc_model(arch, weight_path):
    """Build a ``mc.UBCModel`` for ``arch``, load its checkpoint and move it to the configured device.

    Args:
        arch: backbone name passed to ``mc.UBCModel``.
        weight_path: path of the state-dict checkpoint to load.

    Returns:
        The model, with weights loaded, on ``CONFIG['device']``.
    """
    model = mc.UBCModel(arch, CONFIG['num_classes'])
    # map_location lets a GPU-saved checkpoint load when CONFIG['device'] fell back to CPU.
    state_dict = torch.load(weight_path, map_location=CONFIG['device'])
    model.load_state_dict(state_dict)
    return model.to(CONFIG['device'])


# Four ensemble members; each one loads its own checkpoint.
model0 = _load_ubc_model('tf_efficientnetv2_s_in21ft1k', BEST_WEIGHT)
model2 = _load_ubc_model('tf_efficientnet_b0_ns', BEST_WEIGHT2)
model3 = _load_ubc_model('tf_efficientnet_b0_ns', BEST_WEIGHT3)
# Fixed: model4 previously re-loaded BEST_WEIGHT3, making it a duplicate of model3.
model4 = _load_ubc_model('tf_efficientnet_b0_ns', BEST_WEIGHT4)

In [None]:
exclude_params = ["model.conv_stem.weight", "model.bn1.weight"]
include_params  = ["linear.weight", "linear.bias"]

# Only the parameters named in include_params keep gradients on model0; all others are frozen.
for param_name, tensor in model0.named_parameters():
    tensor.requires_grad = param_name in include_params

## AutoEncoder Model for Outlier Detection

### Prepare data for training AE

In [None]:
def _crop_table(split_df):
    """Return the de-duplicated crop table for every image listed in ``split_df``.

    ``split_df`` must carry ``file_path``, ``image_id`` and ``label`` columns; each row is
    passed to ``ip.get_cropped_images`` and the resulting frames are stacked.
    """
    crops = [
        ip.get_cropped_images(file_path, image_id, label)
        for file_path, image_id, label in zip(
            split_df["file_path"], split_df["image_id"], split_df["label"]
        )
    ]
    return (
        pd.concat(crops)
        .drop_duplicates(subset=["image_id", "sx", "ex", "sy", "ey", "label"])
        .reset_index(drop=True)
    )


# 80/20 split of the training metadata (train_test_split is imported at the top of the notebook).
df_train = pd.read_csv(f"{ROOT_DIR}/train.csv")
df_train, df_valid = train_test_split(df_train, test_size=0.2, random_state=CONFIG['seed'])
# Copy the splits so the column assignments below write to independent frames.
df_train, df_valid = df_train.copy(), df_valid.copy()

for split_df in (df_train, df_valid):
    split_df['file_path'] = split_df['image_id'].apply(af.get_train_file_path)
    # Replace the string labels with the encoder's integer codes.
    split_df['label'] = encoder.transform(split_df['label'])

df_crop_train = _crop_table(df_train)
df_crop_valid = _crop_table(df_valid)

dataset = mc.UBCDataset(df_crop_train, transforms=data_transforms["valid"])
dataset_valid = mc.UBCDataset(df_crop_valid, transforms=data_transforms["valid"])

train_loader = DataLoader(dataset, batch_size=2,
                          num_workers=2, shuffle=False, pin_memory=False)
valid_loader = DataLoader(dataset_valid, batch_size=2,
                          num_workers=2, shuffle=False, pin_memory=False)

### Train AutoEncoder

In [None]:
# Autoencoder on the configured device, trained with an MSE criterion and Adam.
autoencoder = ae.Autoencoder().to(CONFIG['device'])
criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=1e-3)

ae.train_autoencoder(
    train_loader,
    valid_loader,
    autoencoder,
    criterion,
    optimizer,
    num_epochs=30,
)

## Inference

In [None]:
# Set up DataLoader for the test set with batch size 1
test_dataset = mc.UBCDataset(df_crop, transforms=data_transforms["valid"])
test_loader = DataLoader(test_dataset, batch_size=1,
                          num_workers=2, shuffle=False, pin_memory=True)

# Evaluate the autoencoder on the test set
loaded_model = ae.Autoencoder()
criterion = nn.MSELoss()
# map_location lets a GPU-saved checkpoint load when CONFIG['device'] fell back to CPU.
loaded_model.load_state_dict(torch.load('....pth', map_location=CONFIG['device']))
loaded_model = loaded_model.to(CONFIG['device'])
# Inference only: switch any train-time layers (BatchNorm/Dropout, if present) to eval behaviour.
loaded_model.eval()

# NOTE(review): the trailing 1 presumably mirrors the loader's batch size — confirm in aefiles.
anom_list, ids, probs_test = ae.evaluate_autoencoder(test_loader, loaded_model, df_crop, 1)
anom_dict = af.combine_anoms(ids, anom_list)

In [None]:
# Test crops batched with the configured validation batch size, in a fixed order.
test_dataset = mc.UBCDataset(df_crop, transforms=data_transforms["valid"])
test_loader = DataLoader(
    test_dataset,
    batch_size=CONFIG['valid_batch_size'],
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

In [None]:
# Inference only: put every ensemble member in eval mode so that train-time layers
# (BatchNorm/Dropout, if present in the backbones) behave deterministically.
for member in (model0, model2, model3, model4):
    member.eval()

preds = []
with torch.no_grad():
    for data in tqdm(test_loader, total=len(test_loader)):
        images = data['image'].to(CONFIG["device"], dtype=torch.float)

        outputs1 = model0(images)
        outputs2 = model2(images)
        outputs3 = model3(images)
        outputs4 = model4(images)

        # Weighted blend of the four model outputs, turned into probabilities by model0's softmax.
        # NOTE(review): the weights do not sum to 1 (0.34 + 0.7, 0.66 + 0.322) — confirm they are intended.
        outputs = 0.66 * (0.34 * outputs4 + 0.7 * outputs2) + 0.322 * (0.4 * outputs1 + 0.6 * outputs3)
        outputs = model0.softmax(outputs)

        preds.append(outputs.detach().cpu().numpy())

# One row per crop, one column per class.
preds = np.vstack(preds)

In [None]:
# Attach the per-class scores of every crop to the crop table.
cat_cols = [f"cat{i}" for i in range(preds.shape[-1])]
for col_idx, col_name in enumerate(cat_cols):
    df_crop[col_name] = preds[:, col_idx]

anomaly_label = {}
# Per image: take each class's highest score over all crops, then pick the best class.
dict_label = {
    image_id: np.argmax(gdf[cat_cols].values.max(axis=0))
    for image_id, gdf in df_crop.groupby("image_id")
}

# Class index per test image, in the row order of df.
preds = np.array([dict_label[image_id] for image_id in df["image_id"].values])


In [None]:
# Turn class indices back into label strings and look up each image's entry in anom_dict.
pred_labels = encoder.inverse_transform(preds)
df_sub = df_sub.assign(
    label=pred_labels,
    anomalies=lambda frame: frame['image_id'].map(anom_dict),
)
df_sub

In [None]:
# Let the helper adjust each predicted label using that row's anomaly entry,
# then drop the helper column and write the submission file.
df_sub["label"] = df_sub.apply(
    lambda row: af.update_classes(row["label"], row["anomalies"]),
    axis=1,
)
df_sub = df_sub.drop("anomalies", axis=1)
df_sub.to_csv("submission.csv", index=False)