In [1]:
# !git clone https://github.com/open-mmlab/mmdetection.git
# !cd mmdetection/ && pip install -e . && cd -

In [2]:
import pandas as pd
from PIL import Image
from tqdm.contrib.concurrent import process_map
from pathlib import Path
import numpy as np

# EDA

In [3]:
# Load the bounding-box annotation table and report how many rows it has
# and how many distinct image files those rows refer to.
BBox_df = pd.read_csv('/kaggle/input/data/BBox_List_2017.csv')
n_rows = len(BBox_df)
n_images = BBox_df["Image Index"].nunique()
print(f'BBox_df dataset len {n_rows}')
print(f'unique images {n_images}')

BBox_df dataset len 984
unique images 880


In [4]:
# Show the first rows of the annotation table to inspect the parsed columns.
BBox_df.head()

Unnamed: 0,Image Index,Finding Label,Bbox [x,y,w,h],Unnamed: 6,Unnamed: 7,Unnamed: 8
0,00013118_008.png,Atelectasis,225.084746,547.019217,86.779661,79.186441,,,
1,00014716_007.png,Atelectasis,686.101695,131.543498,185.491525,313.491525,,,
2,00029817_009.png,Atelectasis,221.830508,317.053115,155.118644,216.949153,,,
3,00014687_001.png,Atelectasis,726.237288,494.95142,141.016949,55.322034,,,
4,00017877_001.png,Atelectasis,660.067797,569.780787,200.677966,78.101695,,,


In [5]:
# Number of rows per finding label, most frequent first.
BBox_df['Finding Label'].value_counts()

Finding Label
Atelectasis     180
Effusion        153
Cardiomegaly    146
Infiltrate      123
Pneumonia       120
Pneumothorax     98
Mass             85
Nodule           79
Name: count, dtype: int64

#### Data is quite unbalanced; will use a corresponding sampler

In [6]:
# Index every PNG under the dataset's image folders by its bare file name,
# then attach the full path to each annotation row.
# If the same file name occurs in more than one folder, the last one found wins.
image2path = {}
for png_file in Path('/kaggle/input/data/').glob('images*/*/*.png'):
    image2path[png_file.name] = str(png_file)
BBox_df['path'] = BBox_df['Image Index'].map(image2path)
def get_size(fp):
    """Return the ``(width, height)`` of the image stored at ``fp``.

    The image is opened inside a ``with`` block so the underlying file
    handle is closed as soon as the size has been read, instead of being
    left open until garbage collection.

    Args:
        fp: Path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        The ``size`` attribute of the opened image, a ``(width, height)`` tuple.
    """
    with Image.open(fp) as img:
        return img.size
# Read the size of every annotated image in parallel (4 worker processes)
# and print the set of distinct sizes encountered.
img_sizes = process_map(get_size, BBox_df['path'], max_workers=4)
unique_sizes = set(img_sizes)
print(f' image size are {unique_sizes}')

  0%|          | 0/984 [00:00<?, ?it/s]

 image size are {(1024, 1024)}


In [7]:
# Split the (width, height) pairs into two parallel sequences and store
# them as separate columns.
widths, heights = zip(*img_sizes)
BBox_df['width'] = widths
BBox_df['height'] = heights

#### All images are the same size (1024×1024)

In [8]:
# Summary statistics of the numeric columns (box coordinates and image size).
BBox_df.describe()

Unnamed: 0,Bbox [x,y,w,h],Unnamed: 6,Unnamed: 7,Unnamed: 8,width,height
count,984.0,984.0,984.0,984.0,0.0,0.0,0.0,984.0,984.0
mean,398.806111,405.425364,256.334708,252.302547,,,,1024.0,1024.0
std,222.700868,166.309995,167.62962,159.443635,,,,0.0,0.0
min,5.417989,12.837934,27.306667,21.617778,,,,1024.0,1024.0
25%,203.093333,293.869045,136.533333,115.674074,,,,1024.0,1024.0
50%,340.249735,412.850794,214.340942,216.949153,,,,1024.0,1024.0
75%,607.959365,521.641995,311.832381,367.90243,,,,1024.0,1024.0
max,905.887831,876.980783,901.12,873.379894,,,,1024.0,1024.0


#### No out-of-image annotations, judging by the per-column min/max (x + w and y + h are not checked here)

In [9]:
# Average bounding-box width for each finding label.
BBox_df['w'].groupby(BBox_df['Finding Label']).mean()

Finding Label
Atelectasis     218.960376
Cardiomegaly    479.834344
Effusion        221.624451
Infiltrate      294.044417
Mass            168.686870
Nodule           71.942670
Pneumonia       276.470519
Pneumothorax    198.881123
Name: w, dtype: float64

In [10]:
# Average bounding-box height for each finding label.
# The height column is literally named 'h]' in this frame (presumably the
# CSV header "Bbox [x,y,w,h]" was split on its commas), so this is not a typo.
BBox_df['h]'].groupby(BBox_df['Finding Label']).mean()

Finding Label
Atelectasis     139.110324
Cardiomegaly    381.118611
Effusion        318.007295
Infiltrate      297.393439
Mass            189.838700
Nodule           70.433862
Pneumonia       304.486942
Pneumothorax    246.010444
Name: h], dtype: float64

# Train

## Fold split

In [11]:
# Assign every unique image to one of NUMBER_OF_FOLDS folds so that all
# annotations of an image end up in the same fold.
# NOTE(review): the split is per image file. File names look like
# "<id>_<nnn>.png"; if the "<id>" prefix identifies a patient, images of one
# patient can land in different folds -- confirm whether that matters here.
NUMBER_OF_FOLDS = 5
np.random.seed(42)  # fixed seed so the shuffle (and thus the split) is repeatable
image_files_names = BBox_df['Image Index'].unique()
np.random.shuffle(image_files_names)
image_files_names = list(image_files_names)
num_unique_image = len(image_files_names)
images_per_fold = num_unique_image // NUMBER_OF_FOLDS
image2fold = {}
# The first NUMBER_OF_FOLDS - 1 folds each take `images_per_fold` names off
# the end of the shuffled list ...
for fold in range(NUMBER_OF_FOLDS - 1):
    fold_images = [image_files_names.pop() for _ in range(images_per_fold)]
    image2fold.update(dict.fromkeys(fold_images, fold))
# ... and whatever is left (including any remainder) goes to the last fold.
image2fold.update(dict.fromkeys(image_files_names, NUMBER_OF_FOLDS - 1))
BBox_df['fold'] = BBox_df['Image Index'].map(image2fold)
print(f'num images pert fold {BBox_df.drop_duplicates("Image Index")["fold"].value_counts()}')
print(f'num annotations pert fold {BBox_df["fold"].value_counts()}')

num images pert fold fold
2    176
0    176
3    176
4    176
1    176
Name: count, dtype: int64
num annotations pert fold fold
2    200
0    199
4    197
3    194
1    194
Name: count, dtype: int64


## Add class_id field

In [12]:
# Distinct finding labels in sorted order.
sorted(BBox_df['Finding Label'].unique())

['Atelectasis',
 'Cardiomegaly',
 'Effusion',
 'Infiltrate',
 'Mass',
 'Nodule',
 'Pneumonia',
 'Pneumothorax']

In [13]:
# Map each finding label to an integer id: labels are sorted and numbered
# consecutively starting at 1, then the id is stored per annotation row.
finding_labels = sorted(BBox_df['Finding Label'].unique())
class_name2class_id = dict(zip(finding_labels, range(1, len(finding_labels) + 1)))
BBox_df['class_id'] = BBox_df['Finding Label'].map(class_name2class_id)

In [14]:
# Row count per class id, as a check on the label -> id mapping.
BBox_df['class_id'].value_counts()

class_id
1    180
3    153
2    146
4    123
7    120
8     98
5     85
6     79
Name: count, dtype: int64

### I will use YOLOX-tiny because of limited compute resources

In [15]:
# Write the train / validation / test subsets to separate CSV files.
# Each output file receives the annotation rows whose fold is in the given set.
split_targets = {
    '/kaggle/working/BBox_List_2017_train.csv': {0, 1, 2},
    '/kaggle/working/BBox_List_2017_val.csv': {3},
    '/kaggle/working/BBox_List_2017_test.csv': {4},
}
for csv_path, folds in split_targets.items():
    in_split = BBox_df['fold'].isin(folds)
    BBox_df[in_split].to_csv(csv_path)

In [16]:
!ls /kaggle/working

BBox_List_2017_test.csv  BBox_List_2017_train.csv  BBox_List_2017_val.csv
