#Brief explanation
We want to create a new Test subset, to evaluate image classification in our model. The subset will consist of 80% "No finding" images and 20% "Consolidation" images. The previously chosen images are selected and then some "No finding" which haven't been previously used for train nor validation are picked, randomly. 

# Mounting and Importing

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install bbox-visualizer
import bbox_visualizer as bbv
import numpy as np
import pandas as pd
import os
from glob import glob # Retrieve files/pathnames matching a specified pattern
import shutil, os #operations on files, operating system dependent functionality

import matplotlib.pyplot as plt
import seaborn as sns
#import bbox_visualizer as bbv

from sklearn.model_selection import GroupKFold
from tqdm.notebook import tqdm #Progress bar

import cv2
from skimage.io import imread
from sklearn.model_selection import train_test_split


Collecting bbox-visualizer
  Downloading https://files.pythonhosted.org/packages/e2/ed/3fee03fcc9913a772a802e9407a49dfb026f78bab4f1385e8b91eb544e4a/bbox_visualizer-0.1.0-py2.py3-none-any.whl
Installing collected packages: bbox-visualizer
Successfully installed bbox-visualizer-0.1.0


In [8]:
#Original VinBigData train set 
ds = pd.read_csv('/content/drive/MyDrive/Quinto_Anio/TESIS_Eugenia_Berrino/Part_II_DS/vinbigdata/train.csv')
print(ds.class_name.value_counts())
all_nof = ds[ds['class_name']== 'No finding']
print(all_nof)

# My dataset after preprocesing, including train - validation - test split. 
fds_ws = pd.read_excel('/content/drive/MyDrive/Quinto_Anio/TESIS_Eugenia_Berrino/Part_II_DS/vinbigdata/fds_withsplit.xlsx')
fds_ws.head()

No finding            31818
Aortic enlargement     7162
Cardiomegaly           5427
Pleural thickening     4842
Pulmonary fibrosis     4655
Nodule/Mass            2580
Lung Opacity           2483
Pleural effusion       2476
Other lesion           2203
Infiltration           1247
ILD                    1000
Calcification           960
Consolidation           556
Atelectasis             279
Pneumothorax            226
Name: class_name, dtype: int64
                               image_id  class_name  ...  width height
0      50a418190bc3fb1ef1633bf9678929b3  No finding  ...   2332   2580
1      21a10246a5ec7af151081d0cd6d65dc9  No finding  ...   2954   3159
4      063319de25ce7edb9b1c6b8881290140  No finding  ...   2540   3072
12     5550a493b1c4554da469a072fdfab974  No finding  ...   3072   3072
13     869f39afbdd8783b531530942eda8bad  No finding  ...   3072   3072
...                                 ...         ...  ...    ...    ...
67905  955f258cc29153f996ee6716218c1196  No finding 

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,width,height,Group
0,0,57877,7b30d37b73be405bfd91ed5e2d46c473,Consolidation,7,R8,1148.0,911.0,1693.0,1482.0,2304,2880,Train
1,1,4860,7acb16c6d6f5cfc41a958e0b41e25106,Consolidation,7,R10,761.0,964.0,976.0,1415.0,2304,2880,Train
2,2,25382,6c79f2551808438721052023e043ab4d,Consolidation,4,R8,803.0,1156.0,1345.0,1496.0,3072,3072,Train
3,3,61581,ecf474d5d4f65d7a3e23370a68b8c6a0,Consolidation,8,R8,675.0,620.0,757.0,706.0,2408,2692,Train
4,4,12091,4b001bab36d94f73c1ead3ab74690dbc,Consolidation,8,R9,1574.0,923.0,1597.0,951.0,1936,2488,Train


In [20]:
# keep all no finding not previously used from the original VinBigData set 
nof = all_nof[~all_nof['image_id'].isin(fds_ws['image_id'])] 
nof

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,width,height
0,50a418190bc3fb1ef1633bf9678929b3,No finding,14,R11,,,,,2332,2580
1,21a10246a5ec7af151081d0cd6d65dc9,No finding,14,R7,,,,,2954,3159
4,063319de25ce7edb9b1c6b8881290140,No finding,14,R10,,,,,2540,3072
12,5550a493b1c4554da469a072fdfab974,No finding,14,R9,,,,,3072,3072
13,869f39afbdd8783b531530942eda8bad,No finding,14,R3,,,,,3072,3072
...,...,...,...,...,...,...,...,...,...,...
67905,955f258cc29153f996ee6716218c1196,No finding,14,R8,,,,,2048,2500
67909,936fd5cff1c058d39817a08f58b72cae,No finding,14,R1,,,,,2444,3200
67910,ca7e72954550eeb610fe22bf0244b7fa,No finding,14,R1,,,,,1994,2430
67911,aa17d5312a0fb4a2939436abca7f9579,No finding,14,R8,,,,,2048,2500


In [11]:
# Test subset
original_test = fds_ws[fds_ws['Group']=='Test']
original_test.class_name.value_counts()

Consolidation    849
No finding        14
Name: class_name, dtype: int64

In [None]:
original_test

In [12]:
# Amount of images belonging to each class in test subset
unique = original_test.drop_duplicates(subset = ["image_id"])
unique.class_name.value_counts()

Consolidation    278
No finding        14
Name: class_name, dtype: int64

In [13]:
# Percentage of No finding in our original Test subset
14/(14+278)

0.04794520547945205

In [14]:
14+278

292

Now, we want to find how many images we need to add.  

$278 __ 20%$

$x+14   __ 80%$


In [19]:
# Amount of images 
(278*80/20)-14

1098.0

In [23]:
new_nf = nof.sample(n=1098)
new_nf.head()

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,width,height
67545,a4843f51fdfdc62754b9c68701f9db61,No finding,14,R4,,,,,1994,2430
19964,160bb47c3022a18a9e106c8794ddd4a0,No finding,14,R16,,,,,2829,3000
25857,19a70899726d73494cdd551ba38882cf,No finding,14,R1,,,,,3072,3072
22935,aaeb1dc488bcadea5a0861e50063ab98,No finding,14,R3,,,,,2880,3003
47027,24a83567c34851d7e8dcdcc6bf2e2833,No finding,14,R3,,,,,1994,2430


In [37]:
fds = pd.concat([original_test,new_nf]).sample(frac=1)
fds

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,width,height,Group
63739,,,dd99e9d412337a3c380957def7ff1479,No finding,14,R1,,,,,2336,2836,
5415,,,4e0a6b7c284703ea4ab306f3a5a8ad84,No finding,14,R2,,,,,1994,2430,
19074,,,0902a07255f28d5d15a24214761c743c,No finding,14,R12,,,,,2517,3028,
250,250.0,3231.0,15c3fc505c414c69ba757cb3be3ed213,Consolidation,8,R8,1974.0,1270.0,2063.0,1342.0,2620,2868,Test
3908,3908.0,65162.0,fdd529400be877bedaf4f2df9176cedf,Consolidation,7,R9,1639.0,1338.0,2071.0,1917.0,2304,2880,Test
...,...,...,...,...,...,...,...,...,...,...,...,...,...
833,833.0,50969.0,53e2a10eb9969b0e336a51d11dda17f9,Consolidation,1,R10,1925.0,574.0,2559.0,1264.0,3072,3072,Test
37858,,,c6ae45cbd57420cf4937b9f9299a5667,No finding,14,R10,,,,,2048,2500,
48877,,,b45657dafdcbd92f294c732983d8f577,No finding,14,R3,,,,,2881,3082,
12359,,,924e69559651654e67727eb1d8ca1b7a,No finding,14,R1,,,,,3072,3072,


In [27]:
fss = pd.concat([unique,new_nf]).sample(frac=1)
fss.class_name.value_counts() 

No finding       1112
Consolidation     278
Name: class_name, dtype: int64

In [29]:
fss.to_csv('/content/drive/MyDrive/Quinto_Anio/TESIS_Eugenia_Berrino/Part_II_DS/vinbigdata/classification_test.csv')

In [None]:
fss = pd.read_csv('/content/drive/MyDrive/Quinto_Anio/TESIS_Eugenia_Berrino/Part_II_DS/vinbigdata/classification_test.csv')

In [30]:
fss[['image_id','class_name']]

Unnamed: 0,image_id,class_name
14820,86bd0cea376e57b1d04ec36ee55f0bed,No finding
27819,1049692f29ed7540074d4b0b623cae6e,No finding
14875,68fb6b382475cacea88d364da1730d34,No finding
57201,01ad4b5b1e69b4fd92ebe4e35d76eeb5,No finding
241,ae9d5b3baccd0f0f32f178b85aa868ff,Consolidation
...,...,...
23610,19735d182fff4091e4ec19e851b1d27a,No finding
5184,4a704557b3d8950d46722e4b410ccf70,No finding
66,f9e722d2706d42998afff41568223a01,Consolidation
10632,2a2619632df6e2c6c696a31d0d223bf0,No finding


# Create Directories and save images and labels

Now we have to prepare the new folders containing images and labels in order to be able to run our model. 

In [31]:
# Creating directories
os.makedirs('/content/drive/MyDrive/Quinto_Anio/TESIS_Eugenia_Berrino/Part_II_DS/binary_classification/labels/test', exist_ok = True)
os.makedirs('/content/drive/MyDrive/Quinto_Anio/TESIS_Eugenia_Berrino/Part_II_DS/binary_classification/images/test', exist_ok = True)

In [32]:
fss_im = fss.image_id.to_numpy()

#Copy images
for file in tqdm(fss_im):
  shutil.copy('/content/drive/MyDrive/Quinto_Anio/TESIS_Eugenia_Berrino/Part_II_DS/vinbigdata/train/'+file+'.jpg','/content/drive/MyDrive/Quinto_Anio/TESIS_Eugenia_Berrino/Part_II_DS/binary_classification/images/test')
  

HBox(children=(FloatProgress(value=0.0, max=1390.0), HTML(value='')))




In [38]:
# Normalizing Annotations 

# BB Normalized Limits
fds['x_min'] = fds.apply(lambda row: (row.x_min)/row.width, axis =1)
fds['y_min'] = fds.apply(lambda row: (row.y_min)/row.height, axis =1)

fds['x_max'] = fds.apply(lambda row: (row.x_max)/row.width, axis =1)
fds['y_max'] = fds.apply(lambda row: (row.y_max)/row.height, axis =1)

# BB Normalized Center
fds['x_mid'] = fds.apply(lambda row: (row.x_max+row.x_min)/2, axis =1)
fds['y_mid'] = fds.apply(lambda row: (row.y_max+row.y_min)/2, axis =1)

# BB Nomalized With & Height
fds['w'] = fds.apply(lambda row: (row.x_max-row.x_min), axis =1)
fds['h'] = fds.apply(lambda row: (row.y_max-row.y_min), axis =1)

# BB as a % area of the image
fds['area'] = fds['w']*fds['h']
fds.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,width,height,Group,x_mid,y_mid,w,h,area
63739,,,dd99e9d412337a3c380957def7ff1479,No finding,14,R1,,,,,2336,2836,,,,,,
5415,,,4e0a6b7c284703ea4ab306f3a5a8ad84,No finding,14,R2,,,,,1994,2430,,,,,,
19074,,,0902a07255f28d5d15a24214761c743c,No finding,14,R12,,,,,2517,3028,,,,,,
250,250.0,3231.0,15c3fc505c414c69ba757cb3be3ed213,Consolidation,8,R8,0.753435,0.442817,0.787405,0.467922,2620,2868,Test,0.77042,0.45537,0.033969,0.025105,0.000853
3908,3908.0,65162.0,fdd529400be877bedaf4f2df9176cedf,Consolidation,7,R9,0.711372,0.464583,0.898872,0.665625,2304,2880,Test,0.805122,0.565104,0.1875,0.201042,0.037695


In [53]:
def create_labels():
  label_dir =  '/content/drive/MyDrive/Quinto_Anio/TESIS_Eugenia_Berrino/Part_II_DS/binary_classification/labels/test/'
  for image in tqdm(fss_im):
    my_list = []
    for index, row in fds.iterrows():
      if(image == row['image_id']):
       if(row['class_name']=='Consolidation'): 
         my_list.append('0' + '\t' + str(row['x_mid']) + '\t' + str(row['y_mid']) + '\t' + str(row['w']) + '\t' + str(row['h']) + '\n')
    filename = image
    file1 = open(os.path.join(label_dir, filename+'.txt'),"w") 
    file1.writelines(np.unique(my_list))
    file1.close()

In [54]:
create_labels()

HBox(children=(FloatProgress(value=0.0, max=1390.0), HTML(value='')))




In [56]:
import os, os.path
DIR = '/content/drive/MyDrive/Quinto_Anio/TESIS_Eugenia_Berrino/Part_II_DS/binary_classification/labels/test/'
print(len([name for name in os.listdir(DIR)]))

1351


In [59]:
len(fss_im) , len(fds)

(1390, 1961)

In [61]:
DIR = '/content/drive/MyDrive/Quinto_Anio/TESIS_Eugenia_Berrino/Part_II_DS/binary_classification/images/test'
print(len([name for name in os.listdir(DIR)]))

1351


In [None]:
with np.printoptions(threshold=np.inf):
    print(fss_im)

In [None]:
for i in 