In [3]:
import torch
import numpy as np
import json
import os
import cv2
import random
import glob
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd

import tqdm

BLUE=(255, 0, 0)
GREEN = (0, 255, 0)
RED = (0, 0, 255)
YELLOW = (0, 255, 255)
PINK = (255, 0, 255)
BLACK = (0, 0, 0)
ORANGE = (0, 127, 255)
CUSTOM = (255,170,170)
COLOR_CLASS = {0: BLUE, 1:GREEN, 2:RED, 3:YELLOW, 4:PINK, 5:BLACK, 6:ORANGE, 7:CUSTOM}

def bb_intersection_over_union(boxA, boxB):
    """Return the intersection-over-union of two corner-format boxes.

    Both boxes are indexed as ``[x_min, y_min, x_max, y_max]``. Widths and
    heights are measured as ``max - min + 1``, i.e. the coordinates are
    treated as inclusive pixel indices.

    Args:
        boxA: first box, an indexable with at least four numeric entries.
        boxB: second box, same layout as ``boxA``.

    Returns:
        The ratio ``intersection / union``. When the union area is not
        positive (degenerate boxes), ``0.0`` is returned instead of
        dividing by zero.
    """
    # Corners of the overlap rectangle.
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    # Clamp each side at 0 so disjoint boxes yield an empty intersection.
    interArea = max(0, xB - xA + 1) * max(0, yB - yA + 1)

    boxAArea = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    boxBArea = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)

    unionArea = float(boxAArea + boxBArea - interArea)
    if unionArea <= 0:
        # Degenerate input: there is no area to compare, so report no overlap.
        return 0.0

    return interArea / unionArea

def cocoToAbsoluteBox(cocoBox):
    """Convert an ``[x, y, width, height]`` box to ``[x1, y1, x2, y2]`` corners."""
    left, top = cocoBox[0], cocoBox[1]
    width, height = cocoBox[2], cocoBox[3]
    # The far corner is the origin shifted by the box extent.
    return [left, top, left + width, top + height]


# matplotlib 
def plot(key, list_bbox_):
    """Draw a histogram of ``list_bbox_[key]`` on a new 15x10 figure.

    One bin is used per distinct value in the collection. The figure is
    left as the current pyplot figure; nothing is returned or saved.

    Args:
        key: lookup key into ``list_bbox_``; also used for the title and
            the x-axis label.
        list_bbox_: mapping from key to an iterable of hashable values.
    """
    plt.figure(figsize=(15, 10))

    values = list_bbox_[key]
    # One bin per distinct value, but never fewer than one: an empty
    # collection would otherwise request bins=0, which is not a valid
    # bin count for a histogram.
    n_bins = max(1, len(set(values)))

    plt.hist(values, color='blue', edgecolor='black', bins=n_bins)

    # Add labels
    plt.title('Histogram of {}'.format(key))
    plt.xlabel(key)
    plt.ylabel('count')
    # Saving to disk was disabled in the original notebook:
    # plt.savefig("d/{}_distribution.png".format(key))

In [2]:
# Load the original train.csv annotation table into `df`.
# NOTE(review): absolute, machine-specific path — the cell only runs where this directory exists.
df = pd.read_csv('/home/hana/sonnh/kaggle-vin/dataset/original_data/train.csv')

In [3]:
# Preview the first rows of `df`.
df.head()

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max
0,50a418190bc3fb1ef1633bf9678929b3,No finding,14,R11,,,,
1,21a10246a5ec7af151081d0cd6d65dc9,No finding,14,R7,,,,
2,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,691.0,1375.0,1653.0,1831.0
3,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,1264.0,743.0,1611.0,1019.0
4,063319de25ce7edb9b1c6b8881290140,No finding,14,R10,,,,


In [5]:
# Load train_only_box.csv into `df_full`; the per-fold loop later in the notebook filters it on its `fold` and `rad_id` columns.
# NOTE(review): absolute, machine-specific path — the cell only runs where this directory exists.
df_full = pd.read_csv('/home/hana/sonnh/kaggle-vin/dataset/images_only/train_only_box.csv')
# Disabled alternative input (presumably a merged-box variant of the same table — TODO confirm):
# df = pd.read_csv('/home/hana/sonnh/kaggle-vin/dataset/images_only/train_only_box_merge_box.csv')

In [3]:
# Preview the first rows of `df`.
# NOTE(review): the saved output has a `fold` column that the train.csv preview lacks, so `df` was
# presumably rebound to a fold-annotated table before this cell ran — TODO confirm and make the load explicit.
df.head()

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,fold
0,1c32170b4af4ce1a3030eb8167753b06,Pleural thickening,11,R9,1780.0,361.0,2047.0,612.0,4
1,e7e8948818352b4d800dfac9a8999300,Pleural thickening,11,R9,599.0,303.0,885.0,366.0,4
2,53e2a10eb9969b0e336a51d11dda17f9,Pleural thickening,11,R8,1838.0,370.0,2416.0,897.0,1
3,5bf3368744630f459a499ccdccc9cdf1,Cardiomegaly,3,R10,1066.0,2026.0,2324.0,2586.0,0
4,0f186e3eba8d9ebd51feed957204ddbf,Cardiomegaly,3,R8,1192.0,1695.0,2321.0,2009.0,1


In [13]:
# Print, for every fold, how often each class_id occurs among the rows of the
# selected reader(s).
# The per-fold subset gets its own name: rebinding the global `df` here would
# leave it pointing at the last fold only, and any cell that reads `df`
# afterwards would silently work on that subset.
for fold in [0, 1, 2, 3, 4]:
    print('fold ', fold)
    fold_df = df_full[df_full['fold'] == fold]
    for rad in [9]:
        print('RAD ID ', rad)
        # rad_id values are stored as strings such as 'R9'.
        reader_rows = fold_df[fold_df['rad_id'] == 'R{}'.format(rad)]
        print(reader_rows['class_id'].value_counts())

fold  0
RAD ID  9
0     516
11    490
13    382
3     320
10    209
9     206
7     205
8     190
6      91
5      75
2      59
4      32
1      24
12     12
Name: class_id, dtype: int64
fold  1
RAD ID  9
0     514
11    482
3     350
13    331
8     216
7     210
10    205
9     172
6      77
5      61
2      61
4      35
1      14
12     14
Name: class_id, dtype: int64
fold  2
RAD ID  9
0     492
11    486
13    351
3     327
9     203
7     201
10    196
8     131
6      94
2      63
4      54
5      53
12     12
1      11
Name: class_id, dtype: int64
fold  3
RAD ID  9
0     516
11    494
13    342
3     334
10    229
7     214
9     203
8     176
6      78
5      55
2      55
4      38
12     22
1      15
Name: class_id, dtype: int64
fold  4
RAD ID  9
11    514
0     503
13    351
3     337
7     212
10    206
9     187
8     144
6      99
2      60
5      47
4      42
12     15
1      14
Name: class_id, dtype: int64


In [4]:
# Group the rows of `df` per image: each image_id maps to three parallel
# lists holding the boxes, the class ids and the reader ids of its rows.
data = {}

for i in range(len(df)):
    # Fetch the row once and read every field from it.
    row = df.iloc[i]
    image_id = row['image_id']
    class_id = row['class_id']
    rad_id = row['rad_id']

    # Create the per-image record on first sight of this image_id.
    record = data.setdefault(image_id, {'bbox': [], 'class_id': [], 'rad_id': []})

    # Boxes are stored in [x_min, y_min, x_max, y_max] order.
    record['bbox'].append([row['x_min'], row['y_min'], row['x_max'], row['y_max']])
    record['class_id'].append(class_id)
    record['rad_id'].append(rad_id)

In [5]:
# Total number of entries across all per-image records.
sum(len(record['class_id']) for record in data.values())

36096

In [6]:
# Number of distinct image ids collected in `data`.
len(data)

4394

In [12]:
# Count the box pairs inside one image that overlap with IoU above the
# threshold, share a class id and involve at least one of the listed readers.
count = 0
iou = 0.6
# Other reader selection that was tried: ['R9', 'R8', 'R10']
rad = ['R8']
for image_id, record in data.items():
    boxes = record['bbox']
    classes = record['class_id']
    readers = record['rad_id']
    n_boxes = len(boxes)
    # Visit each unordered pair (i, j) with i < j exactly once.
    for i in range(n_boxes):
        for j in range(i + 1, n_boxes):
            if not bb_intersection_over_union(boxes[i], boxes[j]) > iou:
                continue
            if not classes[i] == classes[j]:
                continue
            if readers[i] in rad or readers[j] in rad:
                count += 1

count

8616

In [11]:
# NOTE(review): bare literal left as a scratch cell; nothing visible in this notebook says what it counts — TODO confirm or remove.
12985 

12985

In [14]:
# 36096 equals the total entry count displayed earlier in this notebook.
# NOTE(review): the origin of 11098 is not shown here — presumably a pair count from another run; TODO confirm.
36096 - 11098

24998