## 1. Imports

In [None]:
import os
import ast
import yaml
import torch
import shutil
import random
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.model_selection import GroupKFold

from tqdm.notebook import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append('../input/d/cowfrica/yolov5')
sys.path.append('../input/hyperparameters-for-yolov5')
sys.path.append('../input/tensorflow-great-barrier-reef')

import utils
tqdm.pandas()

%pip install -q wandb
%pip install wandb --upgrade
import wandb
wandb.login(key="f04c0b8d3b383666c2518b204435adcb3f9532e9")

## 2. Configuration

In [None]:
def random_seed(seed):
    """Seed every random number generator this notebook relies on.

    Sets PYTHONHASHSEED in the environment and seeds Python's ``random``,
    NumPy, and PyTorch (CPU and current CUDA device). When CUDA is available
    it additionally seeds all CUDA devices and switches cuDNN to
    deterministic mode.

    Args:
        seed: integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # All of these take the seed as their single argument.
    for seeder in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True


random_seed(1702)

In [None]:
# --- Dataset / cross-validation settings ---
FOLD = 4                               # fold index held out for validation
IMAGE_WIDTH, IMAGE_HEIGHT = 1280, 720  # pixel size used to normalise the boxes
REMOVE_NOBBOX = True                   # NOTE(review): not referenced in the visible cells
NUM_BACKGROUND_IMG = 400               # max number of images without boxes to keep

# --- Filesystem locations ---
CWD = '/kaggle/working/'               # where train.txt / val.txt / starfish.yaml are written
IMAGE_DIR, LABEL_DIR = '/kaggle/images', '/kaggle/labels'
ROOT_DIR = '/kaggle/input/tensorflow-great-barrier-reef/'

In [None]:
# Create the output folders for the copied images and the YOLO label files.
# Pure-Python replacement for `!mkdir -p`: no shell interpolation involved,
# and exist_ok=True keeps the cell safe to re-run.
for out_dir in (IMAGE_DIR, LABEL_DIR):
    os.makedirs(out_dir, exist_ok=True)

## 3. Helpers

I did not write the code in this section myself.

In [None]:
def coco2yolo(image_width, image_height, bboxes):
    """Convert boxes from [x, y, width, height] in pixels to normalised
    [x_mid, y_mid, width, height].

    Args:
        image_width: image width in pixels, used to scale x and width.
        image_height: image height in pixels, used to scale y and height.
        bboxes: array-like whose last axis holds [x, y, width, height].
            Lists are accepted as well as NumPy arrays. The input is never
            modified.

    Returns:
        A new float64 ``np.ndarray`` of the same shape as the input. An empty
        input is returned as an empty array instead of raising.
    """
    # np.array(...) always copies, so the caller's data stays untouched.
    boxes = np.array(bboxes, dtype=float)

    # Nothing to convert; indexing the last axis of an empty 1-D array would fail.
    if boxes.size == 0:
        return boxes

    # normalise x/width by the image width and y/height by the image height
    boxes[..., [0, 2]] = boxes[..., [0, 2]] / image_width
    boxes[..., [1, 3]] = boxes[..., [1, 3]] / image_height

    # shift the corner by half the box size to get the box centre
    boxes[..., [0, 1]] = boxes[..., [0, 1]] + boxes[..., [2, 3]] / 2

    return boxes

def get_bbox(annots):
    """Turn a list of annotation dictionaries into a list of value lists.

    Each dictionary becomes a list of its values in the dictionary's own
    order; per the original note the expected format after converting is
    [x, y, width, height].

    Args:
        annots: iterable of dictionaries, one per bounding box.

    Returns:
        A list with one inner list of values per annotation.
    """
    boxes = []
    for entry in annots:
        boxes.append(list(entry.values()))
    return boxes

## 4. Dataset

In [None]:
def get_path(row):
    """Attach source-image, copied-image and label paths to one row.

    Reads ``row.video_id`` and ``row.video_frame`` and writes three entries
    on the row: ``old_image_path`` (under ROOT_DIR), ``image_path`` (under
    IMAGE_DIR) and ``label_path`` (under LABEL_DIR).

    Returns:
        The same row object with the three path entries set.
    """
    video, frame = row.video_id, row.video_frame
    flat_name = f'video_{video}_{frame}'  # shared stem of the image and label files

    row['old_image_path'] = f'{ROOT_DIR}/train_images/video_{video}/{frame}.jpg'
    row['image_path'] = f'{IMAGE_DIR}/{flat_name}.jpg'
    row['label_path'] = f'{LABEL_DIR}/{flat_name}.txt'

    return row

# Load the training table and attach the path columns to every row.
df = pd.read_csv(f'{ROOT_DIR}/train.csv').progress_apply(get_path, axis=1)

# Parse the annotation strings into Python objects, then derive the number of
# boxes and the plain list-of-lists box representation per row.
df['annotations'] = df['annotations'].progress_apply(ast.literal_eval)
df['num_bbox'] = df['annotations'].progress_apply(len)
df['bboxes'] = df['annotations'].progress_apply(get_bbox)

df.head(2)

In [None]:
# Background images: shuffle all rows without any box and keep at most
# NUM_BACKGROUND_IMG of them.
df_empty_bbox = (
    df[df["num_bbox"] == 0]
    .sample(frac=1)
    .reset_index(drop=True)
    .head(NUM_BACKGROUND_IMG)
)

# Final table: every row that has at least one box, followed by the sampled
# background rows, with a fresh index.
df = pd.concat([df[df["num_bbox"] > 0], df_empty_bbox], ignore_index=True)

In [None]:
# Copy each image from its original path to IMAGE_DIR,
# because the model requires a folder that data can be written to.

def make_copy(path):
    """Copy one image into IMAGE_DIR under a flattened file name.

    The last two '/'-separated components of ``path`` (parent folder and
    file name) are joined with an underscore to form the new file name.

    Args:
        path: '/'-separated path of the source image.
    """
    # Only the last two components are needed: <parent folder>/<file name>.
    parts = path.rsplit('/', 2)
    flat_name = f'{parts[-2]}_{parts[-1]}'
    shutil.copy(path, os.path.join(IMAGE_DIR, flat_name))

# Copy the images concurrently with joblib's Parallel (threading backend,
# all available workers) to speed the copy up.
image_paths = df.old_image_path.tolist()
copy_jobs = (delayed(make_copy)(path) for path in tqdm(image_paths))
_ = Parallel(n_jobs=-1, backend='threading')(copy_jobs)

In [None]:
# Assign a validation fold (0-4) to every row with GroupKFold, grouping by
# `sequence` so that a sequence never spans two folds.

df = df.reset_index(drop=True)  # positional indices from split() must match the labels
df['fold'] = -1                 # placeholder until the row is assigned

kf = GroupKFold(n_splits=5)
fold_splits = kf.split(df, y=df.video_id.tolist(), groups=df.sequence)
for fold, (_, val_idx) in enumerate(fold_splits):
    df.loc[val_idx, 'fold'] = fold

df.fold.value_counts()

In [None]:
# Hold out the rows of fold FOLD for validation; train on all other folds.
valid_df = df[df.fold.eq(FOLD)]
train_df = df[df.fold.ne(FOLD)]

len(train_df), len(valid_df)

## 5. Files required for YOLOv5

More details about the requirements can be found [here](https://github.com/ultralytics/yolov5/wiki/Train-Custom-Data).

In [None]:
# dataset.yaml for YOLOv5
#
# Writes three files into CWD:
#   train.txt     - one training image path per line
#   val.txt       - one validation image path per line
#   starfish.yaml - dataset description pointing at the two lists

train_txt = os.path.join(CWD, 'train.txt')
val_txt = os.path.join(CWD, 'val.txt')
yaml_path = os.path.join(CWD, 'starfish.yaml')

for list_path, split_df in ((train_txt, train_df), (val_txt, valid_df)):
    with open(list_path, 'w') as f:
        f.writelines(path + '\n' for path in split_df.image_path.tolist())

data = dict(
    path  = CWD,
    train = train_txt,
    val   = val_txt,
    nc    = 1,
    names = ['cots'],
    )

with open(yaml_path, 'w') as outfile:
    yaml.dump(data, outfile, default_flow_style=False)

# Echo the generated file; the context manager closes the handle, which the
# previous bare open() never did.
with open(yaml_path, 'r') as f:
    print('\nyaml:')
    print(f.read())

In [None]:
# labels for YOLOv5
#
# Writes one label file per row of `df` (to row.label_path). Each line is
# "0 x_mid y_mid width height" with all four values normalised to [0, 1];
# rows without boxes get an empty file.

for row in tqdm(df.itertuples(index=False), total=df.shape[0]):
    label_lines = []

    if row.num_bbox > 0:
        boxes = np.array(row.bboxes).astype(np.float32)

        # Clip the box corners to the image in pixel space BEFORE converting.
        # Clipping the converted (centre, size) values to [0, 1] - as was done
        # previously - leaves the width/height of a box that crosses the image
        # border untouched, so the label still extends past the image.
        x_min = np.clip(boxes[:, 0], 0, IMAGE_WIDTH)
        y_min = np.clip(boxes[:, 1], 0, IMAGE_HEIGHT)
        x_max = np.clip(boxes[:, 0] + boxes[:, 2], 0, IMAGE_WIDTH)
        y_max = np.clip(boxes[:, 1] + boxes[:, 3], 0, IMAGE_HEIGHT)
        bboxes_coco = np.stack([x_min, y_min, x_max - x_min, y_max - y_min], axis=1)

        bboxes_yolo = coco2yolo(IMAGE_WIDTH, IMAGE_HEIGHT, bboxes_coco)

        # single class -> the class id is always 0
        label_lines = ['0 ' + ' '.join(box.astype(str)) for box in bboxes_yolo]

    with open(row.label_path, 'w') as f:
        # newline-separated, no trailing newline and no trailing spaces
        f.write('\n'.join(label_lines))

## 6. Training

In [None]:
%cd /kaggle/working
!rm -r /kaggle/working/yolov5
!cp -r ../input/d/cowfrica/yolov5 /kaggle/working/yolov5
%cd yolov5

!python train.py --img 1280\
--batch 10\
--epochs 20\
--data /kaggle/working/starfish.yaml\
--weights yolov5m.pt --workers 0\
--adam\
--save-period 1