In [1]:
import torch
import numpy as np
import json
import os
import cv2
import random
import glob
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd

import tqdm

# Colour tuples in B, G, R channel order (blue is (255, 0, 0)).
BLUE = (255, 0, 0)
GREEN = (0, 255, 0)
RED = (0, 0, 255)
YELLOW = (0, 255, 255)
PINK = (255, 0, 255)
BLACK = (0, 0, 0)
ORANGE = (0, 127, 255)
CUSTOM = (255, 170, 170)

# Drawing colour per class index 0..7, in the order listed.
COLOR_CLASS = dict(enumerate((BLUE, GREEN, RED, YELLOW, PINK, BLACK, ORANGE, CUSTOM)))

def bb_intersection_over_union(boxA, boxB):
    """Return the intersection-over-union of two ``[x1, y1, x2, y2]`` boxes.

    Widths and heights are measured inclusively (``x2 - x1 + 1``), so a box
    whose two corners coincide still has an area of 1.
    """
    def _inclusive_area(box):
        return (box[2] - box[0] + 1) * (box[3] - box[1] + 1)

    # Corners of the overlap rectangle.
    left, top = max(boxA[0], boxB[0]), max(boxA[1], boxB[1])
    right, bottom = min(boxA[2], boxB[2]), min(boxA[3], boxB[3])

    # Clamp each side at 0 so disjoint boxes yield an empty intersection.
    overlap_w = max(0, right - left + 1)
    overlap_h = max(0, bottom - top + 1)
    interArea = overlap_w * overlap_h

    union = _inclusive_area(boxA) + _inclusive_area(boxB) - interArea
    return interArea / float(union)

def cocoToAbsoluteBox(cocoBox):
    """Convert an ``[x, y, w, h]`` box into corner form ``[x1, y1, x2, y2]``."""
    left, top = cocoBox[0], cocoBox[1]
    box_w, box_h = cocoBox[2], cocoBox[3]
    return [left, top, left + box_w, top + box_h]


# matplotlib 
def plot(key, list_bbox_):
    """Draw a histogram of ``list_bbox_[key]`` on a new 15x10 figure.

    One bin is used per distinct value; ``key`` is also used for the title
    and the x-axis label.
    """
    plt.figure(figsize=(15, 10))

    values = list_bbox_[key]
    distinct_count = len(set(values))
    plt.hist(values, color='blue', edgecolor='black', bins=distinct_count)

    # Title and axis labels
    plt.title('Histogram of {}'.format(key))
    plt.xlabel(key)
    plt.ylabel('count')
#     plt.savefig("d/{}_distribution.png".format(key))

In [2]:
# Load the original training annotations (one row per image / radiologist / box).
df = pd.read_csv(
    '/home/hana/sonnh/kaggle-vin/dataset/original_data/train.csv'
)

In [3]:
# Preview the first rows of the annotation table.
df.head()

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max
0,50a418190bc3fb1ef1633bf9678929b3,No finding,14,R11,,,,
1,21a10246a5ec7af151081d0cd6d65dc9,No finding,14,R7,,,,
2,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,691.0,1375.0,1653.0,1831.0
3,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,1264.0,743.0,1611.0,1019.0
4,063319de25ce7edb9b1c6b8881290140,No finding,14,R10,,,,


In [4]:
# Box extents in the same units as the corner columns; rows whose corners are
# NaN (no box) stay NaN.
df['width'] = df['x_max'].sub(df['x_min'])
df['height'] = df['y_max'].sub(df['y_min'])

In [5]:
# Pair plot of box width vs. height, written to disk instead of shown inline.
x = df.loc[:, ['width', 'height']]
sns.pairplot(
    x,
    corner=True,
    diag_kind='auto',
    kind='hist',
    diag_kws={'bins': 50},
    plot_kws={'pmax': 0.9},
)
plt.savefig('trainshape_correlogram.jpg', dpi=200)
plt.close()

# kfold

In [6]:
from sklearn.model_selection import GroupKFold

# 5-fold splitter that keeps all rows of one group in the same fold.
gkf = GroupKFold(n_splits=5)

In [7]:
# Build the box-only training table and assign each image to one of the folds.
SEED = 42  # fixed so the shuffle (and therefore the fold split) is reproducible

train_df = pd.read_csv('/home/hana/sonnh/kaggle-vin/dataset/original_data/train.csv')
# Drop class 14 rows ("No finding"): they carry no bounding box.
train_df = train_df[train_df['class_id'] != 14]
# Shuffle the rows; without random_state every run produced a different order.
train_df = train_df.sample(frac=1, random_state=SEED)
train_df = train_df.reset_index(drop=True)

# -1 marks "not assigned yet"; every row is overwritten by the loop below.
train_df['fold'] = -1
# Grouping by image_id keeps all boxes of one image inside the same fold.
for fold, (_train_idx, val_idx) in enumerate(gkf.split(train_df, groups=train_df.image_id.tolist())):
    train_df.loc[val_idx, 'fold'] = fold
train_df.head()


Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,fold
0,1c32170b4af4ce1a3030eb8167753b06,Pleural thickening,11,R9,1780.0,361.0,2047.0,612.0,4
1,e7e8948818352b4d800dfac9a8999300,Pleural thickening,11,R9,599.0,303.0,885.0,366.0,4
2,53e2a10eb9969b0e336a51d11dda17f9,Pleural thickening,11,R8,1838.0,370.0,2416.0,897.0,1
3,5bf3368744630f459a499ccdccc9cdf1,Cardiomegaly,3,R10,1066.0,2026.0,2324.0,2586.0,0
4,0f186e3eba8d9ebd51feed957204ddbf,Cardiomegaly,3,R8,1192.0,1695.0,2321.0,2009.0,1


In [13]:
# train_df.to_csv('train_only_box.csv', index=False)

# train no merge


In [2]:
# Reload the box-only annotations that already carry a `fold` column.
df = pd.read_csv(
    '/home/hana/sonnh/kaggle-vin/dataset/images_only/train_only_box.csv'
)
df.head()

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,fold
0,1c32170b4af4ce1a3030eb8167753b06,Pleural thickening,11,R9,1780.0,361.0,2047.0,612.0,4
1,e7e8948818352b4d800dfac9a8999300,Pleural thickening,11,R9,599.0,303.0,885.0,366.0,4
2,53e2a10eb9969b0e336a51d11dda17f9,Pleural thickening,11,R8,1838.0,370.0,2416.0,897.0,1
3,5bf3368744630f459a499ccdccc9cdf1,Cardiomegaly,3,R10,1066.0,2026.0,2324.0,2586.0,0
4,0f186e3eba8d9ebd51feed957204ddbf,Cardiomegaly,3,R8,1192.0,1695.0,2321.0,2009.0,1


In [3]:
def get_shape(image_name, image_dir='/home/hana/sonnh/kaggle-vin/dataset/images_only/train'):
    """Return ``(image_name, height, width)`` for an image file.

    Parameters
    ----------
    image_name : str
        File name inside ``image_dir``.
    image_dir : str, optional
        Directory holding the images; defaults to the training image folder.

    Returns
    -------
    tuple
        ``(image_name, rows, cols)`` taken from the decoded image, or
        ``(image_name, 1, 1)`` when the image cannot be read.
    """
    unreadable = (image_name, 1, 1)
    try:
        image = cv2.imread('{}/{}'.format(image_dir, image_name))
    except Exception:
        # Best-effort lookup: a failed read must not abort the whole pool.map.
        # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit through.
        return unreadable

    # cv2.imread signals a missing or undecodable file by returning None.
    if image is None:
        return unreadable

    return image_name, image.shape[0], image.shape[1]


import os
from multiprocessing import Pool

# Read every image size in parallel. The context manager shuts the worker
# processes down once map() has returned; the pool was previously never closed.
with Pool() as pool:
    data = pool.map(get_shape, os.listdir('/home/hana/sonnh/kaggle-vin/dataset/images_only/train'))

# Lookup table: image file name -> {'height': rows, 'width': cols}.
data_ = {name: {'height': rows, 'width': cols} for name, rows, cols in data}

In [4]:
# Collect normalised boxes per image, keyed by the radiologist id of each row:
#   data[image_id] = {<rad_id>: [[class_id, x_center, y_center, w, h], ...], 'fold': <fold>}
# Box values are divided by the image size looked up in `data_`.
data = {}

# itertuples() reads each row once; indexing df.iloc[i][...] for every field
# rebuilt the row on each access.
for row in df.itertuples(index=False):
    image_id = row.image_id
    entry = data.setdefault(image_id, {})
    boxes = entry.setdefault(row.rad_id, [])
    entry.setdefault('fold', row.fold)

    class_id = row.class_id
    if class_id == 14:
        # Class 14 rows contribute no box.
        continue

    shape = data_['{}.dicom.png'.format(image_id)]
    width, height = shape['width'], shape['height']
    xmin, ymin, xmax, ymax = row.x_min, row.y_min, row.x_max, row.y_max

    # True division keeps the exact box centre; the former floor division
    # (// 2) snapped it down to a whole pixel before normalising.
    x_center = (xmax + xmin) / 2 / width
    b_width = (xmax - xmin) / width

    y_center = (ymax + ymin) / 2 / height
    b_height = (ymax - ymin) / height
    if b_height == 0:
        # Surface zero-height boxes for manual inspection.
        print(ymax, ymin, height, image_id)

    boxes.append([class_id, x_center, y_center, b_width, b_height])

In [6]:
fold = 2
from shutil import copy
image_folder = '/home/hana/sonnh/kaggle-vin/dataset/images_only/train'
# The output folder is derived from `fold` (fold = 2 -> "fold3", the value that
# used to be hard-coded) so the two cannot drift apart when `fold` is changed.
image_path = '/home/hana/sonnh/kaggle-vin/dataset/yolov5/1/fold{}/images/train_only_box'.format(fold + 1)
label_path = '/home/hana/sonnh/kaggle-vin/dataset/yolov5/1/fold{}/labels/train_only_box'.format(fold + 1)
os.makedirs(image_path, exist_ok=True)
os.makedirs(label_path, exist_ok=True)

for image_id, entry in data.items():
    if entry['fold'] == fold:
        # Images of the selected fold are left out of this training export.
        continue

    for rad_id, annos in entry.items():
        if rad_id == 'fold':
            continue

        # One label file and one image copy per (image, radiologist) pair.
        # `with` closes the file even if a write fails.
        with open(os.path.join(label_path, '{}_{}.txt'.format(image_id, rad_id)), "w") as f:
            for anno in annos:
                f.write("{} {} {} {} {}\n".format(anno[0], anno[1], anno[2], anno[3], anno[4]))

        copy('{}/{}.dicom.png'.format(image_folder, image_id), '{}/{}_{}.png'.format(image_path, image_id, rad_id))

# train merge


In [4]:
# Load the box-only annotations from the "merge_box" CSV.
df = pd.read_csv(
    '/home/hana/sonnh/kaggle-vin/dataset/images_only/train_only_box_merge_box.csv'
)

In [5]:
def get_shape(image_name, image_dir='/home/hana/sonnh/kaggle-vin/dataset/images_only/train'):
    """Return ``(image_name, height, width)`` for an image file.

    Parameters
    ----------
    image_name : str
        File name inside ``image_dir``.
    image_dir : str, optional
        Directory holding the images; defaults to the training image folder.

    Returns
    -------
    tuple
        ``(image_name, rows, cols)`` taken from the decoded image, or
        ``(image_name, 1, 1)`` when the image cannot be read.
    """
    unreadable = (image_name, 1, 1)
    try:
        image = cv2.imread('{}/{}'.format(image_dir, image_name))
    except Exception:
        # Best-effort lookup: a failed read must not abort the whole pool.map.
        # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit through.
        return unreadable

    # cv2.imread signals a missing or undecodable file by returning None.
    if image is None:
        return unreadable

    return image_name, image.shape[0], image.shape[1]


import os
from multiprocessing import Pool

# Read every image size in parallel. The context manager shuts the worker
# processes down once map() has returned; the pool was previously never closed.
with Pool() as pool:
    data = pool.map(get_shape, os.listdir('/home/hana/sonnh/kaggle-vin/dataset/images_only/train'))

# Lookup table: image file name -> {'height': rows, 'width': cols}.
data_ = {name: {'height': rows, 'width': cols} for name, rows, cols in data}
    

In [6]:
# Collect normalised boxes per image in a single list:
#   data[image_id] = {'bbox': [[class_id, x_center, y_center, w, h], ...], 'fold': <fold>}
# Box values are divided by the image size looked up in `data_`.
data = {}

# itertuples() reads each row once; indexing df.iloc[i][...] for every field
# rebuilt the row on each access.
for row in df.itertuples(index=False):
    image_id = row.image_id
    entry = data.setdefault(image_id, {'bbox': []})
    entry.setdefault('fold', row.fold)

    class_id = row.class_id
    if class_id == 14:
        # Class 14 rows contribute no box.
        continue

    shape = data_['{}.dicom.png'.format(image_id)]
    width, height = shape['width'], shape['height']
    xmin, ymin, xmax, ymax = row.x_min, row.y_min, row.x_max, row.y_max

    # True division keeps the exact box centre; the former floor division
    # (// 2) snapped it down to a whole pixel before normalising.
    x_center = (xmax + xmin) / 2 / width
    b_width = (xmax - xmin) / width

    y_center = (ymax + ymin) / 2 / height
    b_height = (ymax - ymin) / height
    if b_height == 0:
        # Surface zero-height boxes for manual inspection.
        print(ymax, ymin, height, image_id)

    entry['bbox'].append([class_id, x_center, y_center, b_width, b_height])

In [7]:
fold = 0
from shutil import copy
image_folder = '/home/hana/sonnh/kaggle-vin/dataset/images_only/train'
# Output folders are named after fold + 1 (fold 0 -> "fold1").
image_path = '/home/hana/sonnh/kaggle-vin/dataset/yolov5/1/fold{}/images/train_only_box_merge_box'.format(fold + 1)
label_path = '/home/hana/sonnh/kaggle-vin/dataset/yolov5/1/fold{}/labels/train_only_box_merge_box'.format(fold + 1)
os.makedirs(image_path, exist_ok=True)
os.makedirs(label_path, exist_ok=True)

for image_id, entry in data.items():
    if entry['fold'] == fold:
        # Images of the selected fold are left out of this training export.
        continue

    # One label file per image holding every annotation list of the entry.
    # `with` closes the file even if a write fails.
    with open(os.path.join(label_path, '{}.txt'.format(image_id)), "w") as f:
        for key, annos in entry.items():
            if key == 'fold':
                continue

            for anno in annos:
                f.write("{} {} {} {} {}\n".format(anno[0], anno[1], anno[2], anno[3], anno[4]))

    copy('{}/{}.dicom.png'.format(image_folder, image_id), '{}/{}.png'.format(image_path, image_id))
        
        

# train/val: split into 3 sets by radiologist (R8, R9, R10)

In [5]:
# Collect normalised boxes per image, keyed by the radiologist id of each row:
#   data[image_id] = {<rad_id>: [[class_id, x_center, y_center, w, h], ...], 'fold': <fold>}
# Box values are divided by the image size looked up in `data_`.
data = {}

# itertuples() reads each row once; indexing df.iloc[i][...] for every field
# rebuilt the row on each access.
for row in df.itertuples(index=False):
    image_id = row.image_id
    entry = data.setdefault(image_id, {})
    boxes = entry.setdefault(row.rad_id, [])
    entry.setdefault('fold', row.fold)

    class_id = row.class_id
    if class_id == 14:
        # Class 14 rows contribute no box.
        continue

    shape = data_['{}.dicom.png'.format(image_id)]
    width, height = shape['width'], shape['height']
    xmin, ymin, xmax, ymax = row.x_min, row.y_min, row.x_max, row.y_max

    # True division keeps the exact box centre; the former floor division
    # (// 2) snapped it down to a whole pixel before normalising.
    x_center = (xmax + xmin) / 2 / width
    b_width = (xmax - xmin) / width

    y_center = (ymax + ymin) / 2 / height
    b_height = (ymax - ymin) / height
    if b_height == 0:
        # Surface zero-height boxes for manual inspection.
        print(ymax, ymin, height, image_id)

    boxes.append([class_id, x_center, y_center, b_width, b_height])

In [11]:
# Number of distinct images collected in `data`.
len(data)

4394

In [14]:
fold = 0
from shutil import copy
image_folder = '/home/hana/sonnh/kaggle-vin/dataset/images_only/train'
# Per-radiologist validation folders; '{}' is filled with the radiologist id.
# NOTE(review): 'fodl0' and the leading '//' look like typos, but both are kept
# byte-for-byte because they may match directories that already exist on disk.
image_path_ = '//home/hana/sonnh/kaggle-vin/dataset/yolov5/2/fodl0/images/{}_val'
label_path_ = '/home/hana/sonnh/kaggle-vin/dataset/yolov5/2/fodl0/labels/{}_val'
main_rads = ['R9', 'R8', 'R10']

for image_id, entry in data.items():
    if entry['fold'] != fold:
        # Only images of the selected fold are exported here; the training-path
        # assignments that used to sit before this `continue` were never used.
        continue

    for rad_id, annos in entry.items():
        if rad_id == 'fold':
            continue

        # Annotations by R8/R9/R10 go to that radiologist's set only; those by
        # any other radiologist are replicated into all three sets.
        rads = [rad_id] if rad_id in main_rads else main_rads

        for rad in rads:
            image_path = image_path_.format(rad)
            label_path = label_path_.format(rad)

            # `with` closes the file even if a write fails.
            with open(os.path.join(label_path, '{}_{}.txt'.format(image_id, rad_id)), "w") as f:
                for anno in annos:
                    f.write("{} {} {} {} {}\n".format(anno[0], anno[1], anno[2], anno[3], anno[4]))

            copy('{}/{}.dicom.png'.format(image_folder, image_id), '{}/{}_{}.png'.format(image_path, image_id, rad_id))

# val merge


In [7]:
# Load the box-only annotations from the "merge_box" CSV.
df = pd.read_csv(
    '/home/hana/sonnh/kaggle-vin/dataset/images_only/train_only_box_merge_box.csv'
)

In [7]:
def get_shape(image_name, image_dir='/home/hana/sonnh/kaggle-vin/dataset/images_only/train'):
    """Return ``(image_name, height, width)`` for an image file.

    Parameters
    ----------
    image_name : str
        File name inside ``image_dir``.
    image_dir : str, optional
        Directory holding the images; defaults to the training image folder.

    Returns
    -------
    tuple
        ``(image_name, rows, cols)`` taken from the decoded image, or
        ``(image_name, 1, 1)`` when the image cannot be read.
    """
    unreadable = (image_name, 1, 1)
    try:
        image = cv2.imread('{}/{}'.format(image_dir, image_name))
    except Exception:
        # Best-effort lookup: a failed read must not abort the whole pool.map.
        # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit through.
        return unreadable

    # cv2.imread signals a missing or undecodable file by returning None.
    if image is None:
        return unreadable

    return image_name, image.shape[0], image.shape[1]


import os
from multiprocessing import Pool

# Read every image size in parallel. The context manager shuts the worker
# processes down once map() has returned; the pool was previously never closed.
with Pool() as pool:
    data = pool.map(get_shape, os.listdir('/home/hana/sonnh/kaggle-vin/dataset/images_only/train'))

# Lookup table: image file name -> {'height': rows, 'width': cols}.
data_ = {name: {'height': rows, 'width': cols} for name, rows, cols in data}

In [10]:
import json
import pickle

# Cache the image-size lookup so later sessions can skip re-reading every image.
with open('data.p', mode='wb') as fp:
    pickle.dump(
        data_,
        fp,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [8]:
# Collect normalised boxes per image in a single list:
#   data[image_id] = {'bbox': [[class_id, x_center, y_center, w, h], ...], 'fold': <fold>}
# Box values are divided by the image size looked up in `data_`.
data = {}

# itertuples() reads each row once; indexing df.iloc[i][...] for every field
# rebuilt the row on each access.
for row in df.itertuples(index=False):
    image_id = row.image_id
    entry = data.setdefault(image_id, {'bbox': []})
    entry.setdefault('fold', row.fold)

    class_id = row.class_id
    if class_id == 14:
        # Class 14 rows contribute no box.
        continue

    shape = data_['{}.dicom.png'.format(image_id)]
    width, height = shape['width'], shape['height']
    xmin, ymin, xmax, ymax = row.x_min, row.y_min, row.x_max, row.y_max

    # True division keeps the exact box centre; the former floor division
    # (// 2) snapped it down to a whole pixel before normalising.
    x_center = (xmax + xmin) / 2 / width
    b_width = (xmax - xmin) / width

    y_center = (ymax + ymin) / 2 / height
    b_height = (ymax - ymin) / height
    if b_height == 0:
        # Surface zero-height boxes for manual inspection.
        print(ymax, ymin, height, image_id)

    entry['bbox'].append([class_id, x_center, y_center, b_width, b_height])

In [10]:
fold = 2
from shutil import copy
image_folder = '/home/hana/sonnh/kaggle-vin/dataset/images_only/train'
# The output folder is derived from `fold` (fold = 2 -> "fold3", the value that
# used to be hard-coded) so the two cannot drift apart when `fold` is changed.
image_path = '/home/hana/sonnh/kaggle-vin/dataset/yolov5/1/fold{}/images/val_no_rad_merge_box'.format(fold + 1)
label_path = '/home/hana/sonnh/kaggle-vin/dataset/yolov5/1/fold{}/labels/val_no_rad_merge_box'.format(fold + 1)
os.makedirs(image_path, exist_ok=True)
os.makedirs(label_path, exist_ok=True)

for image_id, entry in data.items():
    if entry['fold'] != fold:
        # Only images of the selected fold belong to this validation export.
        continue

    # One label file per image holding every annotation list of the entry.
    # `with` closes the file even if a write fails.
    with open(os.path.join(label_path, '{}.txt'.format(image_id)), "w") as f:
        for key, annos in entry.items():
            if key == 'fold':
                continue

            for anno in annos:
                f.write("{} {} {} {} {}\n".format(anno[0], anno[1], anno[2], anno[3], anno[4]))

    copy('{}/{}.dicom.png'.format(image_folder, image_id), '{}/{}.png'.format(image_path, image_id))
        
        

# no box

In [2]:
# train_df = pd.read_csv('/home/hana/sonnh/kaggle-vin/dataset/original_data/train.csv')
# train_df = train_df[train_df['class_id'] == 14]
# train_df = train_df.sample(frac=1)
# train_df = train_df.reset_index(drop = True)

# train_df['fold'] = -1
# for fold, (train_idx, val_idx) in enumerate(gkf.split(train_df, groups = train_df.image_id.tolist())):
#     train_df.loc[val_idx, 'fold'] = fold
# train_df.head()

NameError: name 'gkf' is not defined

In [15]:
# train_df.to_csv('train_no_box.csv', index=False)

In [11]:
# Load the rows from the "no box" CSV, which already carries a `fold` column.
df = pd.read_csv(
    '/home/hana/sonnh/kaggle-vin/dataset/images_only/train_no_box.csv'
)
# Rows in fold 0 divided by 3.
# NOTE(review): presumably three annotation rows per image - confirm.
fold0_rows = df[df['fold'] == 0]
len(fold0_rows) // 3

2122

In [12]:
# Map every image to its fold: data[image_id] = {'fold': <fold>}.
# The first row seen for an image decides its fold.
data = {}

# itertuples() reads each row once; indexing df.iloc[i][...] for every field
# rebuilt the row on each access.
for row in df.itertuples(index=False):
    data.setdefault(row.image_id, {}).setdefault('fold', row.fold)

In [14]:
fold = 2
from shutil import copy
image_folder = '/home/hana/sonnh/kaggle-vin/dataset/images_only/train'
# The output folder is derived from `fold` (fold = 2 -> "fold3", the value that
# used to be hard-coded). The training paths that were assigned after `continue`
# could never execute and are removed, as is the unused label path.
image_path = '/home/hana/sonnh/kaggle-vin/dataset/yolov5/1/fold{}/images/val_no_rad_merge_box'.format(fold + 1)
os.makedirs(image_path, exist_ok=True)

for image_id, entry in data.items():
    if entry['fold'] != fold:
        # Only images of the selected fold belong to this validation export.
        continue

    # Images are copied without a label file.
    copy('{}/{}.dicom.png'.format(image_folder, image_id), '{}/{}.png'.format(image_path, image_id))