In [None]:
import numpy as np
import pandas as pd
from glob import glob
import shutil
from tqdm.notebook import tqdm
import os
from os import listdir
from os.path import isfile, join
import yaml
import datetime
from ensemble_boxes import *

# Define variables

In [None]:
# --- Local directories ------------------------------------------------------
# Outputs of this notebook are written under WORKING_DIR; the three /tmp
# directories receive the S3 downloads (images + train.csv, YOLO label files,
# and the YOLOv5 source tree respectively).
WORKING_DIR = '/content/result'
DATA_DIR = '/tmp/vinbigdata-chest-xray-resized-png-1024x1024'
LABELS_DIR = '/tmp/vinbigdata-yolo-labels'
YOLO_DIR = '/tmp/vinbigdata-yolov5'

# --- Remote storage ---------------------------------------------------------
# Name of the S3 bucket used for downloads and for uploading trained weights.
S3_BUCKET_NAME = 'default'

# --- Training input ---------------------------------------------------------
# Path passed to train.py through --data. Despite the name, this is the
# dataset description yaml written later in the notebook, not the
# hyperparameter file.
HYPERPARAMS_FILE = '/content/result/vinbigdata.yaml'

In [None]:
# Weighted Boxes Fusion settings used when merging radiologist annotations.
iou_thr, skip_box_thr, sigma = 0.5, 0.001, 0.1

# Training hyperparameters; dumped to YOLOv5's data/hyp.scratch.yaml further down.
hyperparameters = {
    # Optimizer / schedule
    'lr0': 0.005,             # initial learning rate (SGD=1E-2, Adam=1E-3)
    'lrf': 0.032,             # final OneCycleLR learning rate (lr0 * lrf)
    'momentum': 0.937,        # SGD momentum/Adam beta1
    'weight_decay': 0.0005,   # optimizer weight decay 5e-4
    'warmup_epochs': 3.0,     # warmup epochs (fractions ok)
    'warmup_momentum': 0.8,   # warmup initial momentum
    'warmup_bias_lr': 0.1,    # warmup initial bias lr
    # Loss
    'box': 0.1,               # box loss gain
    'cls': 1.0,               # cls loss gain
    'cls_pw': 0.5,            # cls BCELoss positive_weight
    'obj': 2.0,               # obj loss gain (scale with pixels)
    'obj_pw': 0.5,            # obj BCELoss positive_weight
    'iou_t': 0.20,            # IoU training threshold
    'anchor_t': 4.0,          # anchor-multiple threshold
    'anchors': 0,             # anchors per output layer (0 to ignore)
    'fl_gamma': 0.0,          # focal loss gamma (efficientDet default gamma=1.5)
    # Augmentation
    'hsv_h': 0.015,           # image HSV-Hue augmentation (fraction)
    'hsv_s': 0.7,             # image HSV-Saturation augmentation (fraction)
    'hsv_v': 0.4,             # image HSV-Value augmentation (fraction)
    'degrees': 0.1,           # image rotation (+/- deg)
    'translate': 0.2,         # image translation (+/- fraction)
    'scale': 0.6,             # image scale (+/- gain)
    'shear': 0.1,             # image shear (+/- deg)
    'perspective': 0.0,       # image perspective (+/- fraction), range 0-0.001
    'flipud': 0.0,            # image flip up-down (probability)
    'fliplr': 0.5,            # image flip left-right (probability)
    'mosaic': 0.0,            # image mosaic (probability)
    'mixup': 0.0,             # image mixup (probability)
}

In [None]:
# Class names; the list position is used as the class id (0-13) when mapping
# fused boxes back to names and when writing the dataset yaml.
classes = [
    "Aortic_enlargement",
    "Atelectasis",
    "Calcification",
    "Cardiomegaly",
    "Consolidation",
    "ILD",
    "Infiltration",
    "Lung_Opacity",
    "Nodule/Mass",
    "Other_lesion",
    "Pleural_effusion",
    "Pleural_thickening",
    "Pneumothorax",
    "Pulmonary_fibrosis",
]

# Install dependencies and download data

In [None]:
# Use the %pip magic rather than `!pip` so packages are installed into the
# environment of the running kernel, not whichever `pip` is first on PATH.
# NOTE(review): versions are unpinned; pin them once known-good versions are
# established. This cell must run before the import cell on a fresh kernel,
# because `ensemble_boxes` is imported there.
%pip install -U PyYAML
%pip install thop
%pip install ensemble_boxes

In [None]:
print("MAKE SURE AWSCLI IS INSTALLED AND AWS CONFIGURED")

# -p keeps this cell re-runnable: mkdir no longer errors when a directory
# already exists from a previous run.
!mkdir -p $DATA_DIR
!mkdir -p $LABELS_DIR
!mkdir -p $YOLO_DIR

# Pull the images (+ train.csv), the YOLO label files, and the YOLOv5 sources
# from the configured bucket into their local directories.
!aws s3 cp s3://$S3_BUCKET_NAME/vinbigdata-chest-xray-resized-png-1024x1024/ $DATA_DIR --recursive
!aws s3 cp s3://$S3_BUCKET_NAME/vinbigdata-yolo-labels/ $LABELS_DIR --recursive
!aws s3 cp s3://$S3_BUCKET_NAME/vinbigdata-yolov5/ $YOLO_DIR --recursive

# Format data

In [None]:
# ===============================
# Fuse the per-radiologist annotations of every image into one set of boxes
# using Weighted Boxes Fusion (WBF), and collect the fused boxes in `results`.
#
# Values noted by the original author as the default WBF config:
# iou_thr = 0.6
# skip_box_thr = 0.0001
# sigma = 0.1
# NOTE(review): the commented values above are NOT what this cell uses. The
# call below reads the `iou_thr` and `skip_box_thr` variables defined in an
# earlier cell, and `sigma` is never passed to weighted_boxes_fusion.
# ===============================

# Loading the train DF
df = pd.read_csv(f"{DATA_DIR}/train.csv")
# Replace missing values with 0 in place.
df.fillna(0, inplace=True)
# Rows with class_id 14 (named "No finding" further down) get x_max = y_max = 1.0.
# NOTE(review): presumably their coordinates were NaN in the CSV, so after the
# fillna above they end up as a (0, 0, 1, 1) box - confirm against train.csv.
df.loc[df["class_id"] == 14, ['x_max', 'y_max']] = 1.0

# One dict per fused box; converted to a DataFrame after the loop.
results = []
image_ids = df["image_id"].unique()

for image_id in tqdm(image_ids, total=len(image_ids)):

    # All annotations for the current image.
    data = df[df["image_id"] == image_id]    
    data = data.reset_index(drop=True)

    # rad_id -> {"boxes_list", "scores_list", "labels_list"} for this image.
    annotations = {}
    # One weight per distinct radiologist, in first-seen order.
    weights = []

    # WBF expects the coordinates in 0-1 range.
    # NOTE(review): positional slice - this takes the maximum over every column
    # from position 4 onward, so it assumes all of those columns are numeric
    # (the coordinates and presumably width/height); confirm the CSV column order.
    max_value = data.iloc[:, 4:].values.max()
    # Only the four coordinate columns are overwritten with the scaled values.
    data.loc[:, ["x_min", "y_min", "x_max", "y_max"]] = data.iloc[:, 4:] / max_value

    # Loop through all of the annotations
    for idx, row in data.iterrows():

        rad_id = row["rad_id"]

        # First annotation seen from this radiologist: create their bucket.
        if rad_id not in annotations:
            annotations[rad_id] = {
                "boxes_list": [],
                "scores_list": [],
                "labels_list": [],
            }

            # We consider all of the radiologists as equal.
            weights.append(1.0)

        annotations[rad_id]["boxes_list"].append([row["x_min"], row["y_min"], row["x_max"], row["y_max"]])
        # Every annotation is given the same score of 1.0.
        annotations[rad_id]["scores_list"].append(1.0)
        annotations[rad_id]["labels_list"].append(row["class_id"])

    # Re-shape into the parallel list-of-lists layout passed to WBF:
    # one inner list per radiologist, in the same order as `weights`.
    boxes_list = []
    scores_list = []
    labels_list = []

    for annotator in annotations.keys():
        boxes_list.append(annotations[annotator]["boxes_list"])
        scores_list.append(annotations[annotator]["scores_list"])
        labels_list.append(annotations[annotator]["labels_list"])

    # Calculate WBF
    boxes, scores, labels = weighted_boxes_fusion(
        boxes_list,
        scores_list,
        labels_list,
        weights=weights,
        iou_thr=iou_thr,
        skip_box_thr=skip_box_thr
    )

    # Store each fused box, scaling coordinates back by the same max_value.
    for idx, box in enumerate(boxes):
        class_id = int(labels[idx])
        results.append({
            "image_id": image_id,
            "class_id": class_id,
            "rad_id": "wbf",
            "x_min": box[0] * max_value,
            "y_min": box[1] * max_value,
            "x_max": box[2] * max_value,
            "y_max": box[3] * max_value,
            # Ids outside the 14 named classes are reported as "No finding".
            "class_name": "No finding" if class_id >= 14 else classes[class_id],
            # width/height are taken from the image's first annotation row.
            "width": data.iloc[0]['width'],
            "height": data.iloc[0]['height'],
        })

results = pd.DataFrame(results)

In [None]:
# Keep only real findings: class_id 14 is the "No finding" placeholder.
train_df = results[results.class_id != 14].reset_index(drop=True)

# Build each image's path from the configured DATA_DIR instead of repeating the
# directory as a hard-coded literal, so changing DATA_DIR in the config cell
# is enough to relocate the dataset.
train_df['image_path'] = f'{DATA_DIR}/train/' + train_df.image_id + '.png'

print(train_df.shape)
train_df.head()

In [None]:
# Convert box corners to fractions of the image width/height, then derive the
# centre, size and area columns. Plain column arithmetic replaces the row-wise
# `apply(..., axis=1)` calls: same values, no per-row Python call, and it also
# behaves sensibly when `train_df` is empty.
# NOTE: this cell rescales the coordinate columns in place, so run it only once
# per freshly built `train_df`; re-running divides by width/height again.
train_df['x_min'] = train_df['x_min'] / train_df['width']
train_df['y_min'] = train_df['y_min'] / train_df['height']

train_df['x_max'] = train_df['x_max'] / train_df['width']
train_df['y_max'] = train_df['y_max'] / train_df['height']

# Box centre.
train_df['x_mid'] = (train_df['x_max'] + train_df['x_min']) / 2
train_df['y_mid'] = (train_df['y_max'] + train_df['y_min']) / 2

# Box width and height.
train_df['w'] = train_df['x_max'] - train_df['x_min']
train_df['h'] = train_df['y_max'] - train_df['y_min']

train_df['area'] = train_df['w'] * train_df['h']
train_df.head()

In [None]:
# Geometric box columns (X) and their class ids (y), with a quick shape check.
features = [
    'x_min', 'y_min', 'x_max', 'y_max',
    'x_mid', 'y_mid',
    'w', 'h', 'area',
]
X, y = train_df[features], train_df['class_id']
X.shape, y.shape

In [None]:
# Every unique image path goes to the training split; the validation split is
# left empty. The commented line shows how one image could be held out instead.
# val_files += list(train_df.image_path.unique()[:1])
val_files = []
train_files = list(train_df.image_path.unique())
len(train_files), len(val_files)

# Copy files for training

In [None]:
def _copy_split(image_paths, split):
    """Copy images and their label files into the working directory.

    For every path in `image_paths`, the image is copied to
    `{WORKING_DIR}/vinbigdata/images/{split}` and the label file with the same
    base name (`<name>.txt`, looked up in LABELS_DIR) is copied to
    `{WORKING_DIR}/vinbigdata/labels/{split}`. Both destination directories are
    created if missing, so an empty `image_paths` still leaves them in place.
    """
    images_dst = f'{WORKING_DIR}/vinbigdata/images/{split}'
    labels_dst = f'{WORKING_DIR}/vinbigdata/labels/{split}'
    os.makedirs(images_dst, exist_ok=True)
    os.makedirs(labels_dst, exist_ok=True)

    for image_path in tqdm(image_paths):
        shutil.copy(image_path, images_dst)
        # Label files share the image's base name, with a .txt extension.
        stem = os.path.splitext(os.path.basename(image_path))[0]
        # The label directory constant is LABELS_DIR (there is no LABEL_DIR).
        shutil.copy(os.path.join(LABELS_DIR, stem + '.txt'), labels_dst)


# copy files from input (/tmp) directory to working directory
_copy_split(train_files, 'train')
_copy_split(val_files, 'val')

In [None]:
def _write_image_list(split):
    """Write `{WORKING_DIR}/{split}.txt`, one copied image path per line.

    Returns the path of the list file that was written.
    """
    list_path = join(WORKING_DIR, f'{split}.txt')
    with open(list_path, 'w') as list_file:
        for image_path in glob(f'{WORKING_DIR}/vinbigdata/images/{split}/*'):
            list_file.write(image_path + '\n')
    return list_path


# Both list files live in WORKING_DIR, which is where the yaml below points.
# (The validation list was previously written via an undefined `cwd` name.)
data = dict(
    train=_write_image_list('train'),
    val=_write_image_list('val'),
    nc=len(classes),  # number of classes
    names=classes,
)

dataset_yaml_path = join(WORKING_DIR, 'vinbigdata.yaml')
with open(dataset_yaml_path, 'w') as outfile:
    yaml.dump(data, outfile, default_flow_style=False)

# Echo the written file for a visual check; `with` closes the handle afterwards.
with open(dataset_yaml_path, 'r') as written:
    print(written.read())

In [None]:
# Copy the YOLOv5 sources into the working directory and switch into them, so
# the training cell can call `python train.py` with a relative path.
yolo_workdir = f'{WORKING_DIR}/yolov5'

# The destination previously had a stray '$' in front of WORKING_DIR, which
# copied the tree to a different location and made the chdir below fail.
# copytree raises if the destination already exists, so skip it on re-runs.
if not os.path.isdir(yolo_workdir):
    shutil.copytree(YOLO_DIR, yolo_workdir)
os.chdir(yolo_workdir)

# Overwrite the hyperparameter file in the copied tree with the values from
# the `hyperparameters` dict defined in the config cell.
with open(f'{yolo_workdir}/data/hyp.scratch.yaml', 'w') as outfile2:
    yaml.dump(hyperparameters, outfile2, default_flow_style=False)

# Train

In [None]:
# Launch training via train.py, resolved relative to the current directory
# (an earlier cell chdir's into the copied yolov5 folder).
# Despite its name, HYPERPARAMS_FILE is the '/content/result/vinbigdata.yaml'
# path from the config cell, i.e. the dataset description passed to --data.
# NOTE(review): no --hyp flag is given; presumably train.py falls back to the
# data/hyp.scratch.yaml written by the previous cell - confirm for the YOLOv5
# version in use.
!WANDB_MODE="dryrun" python train.py --img 1024 --batch 2 --epochs 30 --multi-scale --data $HYPERPARAMS_FILE --weights yolov5x.pt --cache --notest

In [None]:
# Tag the upload with the current local time so earlier uploads are not overwritten.
timestamp = datetime.datetime.now().strftime("%m_%d_%Y_%H.%M")
print(timestamp)

# Upload the trained weights. The source path is built from WORKING_DIR rather
# than a hard-coded '/content/result', so it follows the config cell.
# NOTE(review): assumes this run wrote to runs/train/exp; confirm the run
# directory name if training has been launched more than once.
!aws s3 cp {WORKING_DIR}/yolov5/runs/train/exp/weights s3://$S3_BUCKET_NAME/vinbigdata-trained-yolov5lmodel-final-{timestamp} --recursive