#Brief explanation 
The aim of this notebook is to select from the entire list of given HIBA validation images, just the classes: "Consolidation", "Nodule/Mass", "Atelectasis", "No Finding", and see how these results compare to the ones obtained previously (including also "Interstitial Pattern" and "Wall Injuries" as these types of images weren't used for training). Therefore, it is expected that we obtain better results. 

# Mounting & Importing 

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import shutil, os
import zipfile
import pandas as pd
import numpy as np
from os.path import isfile, join
from glob import glob
import yaml
from tqdm.notebook import tqdm

**Labels dimension:** 

1.   Nodule/Mass
2.   Consolidation
3.   Interstitial Pattern
4.   Atelectasis
5.   Wall Injuries



# Images with findings
In the first place, we import images which have findings and preprocessed as obtained from [this](https://https://colab.research.google.com/drive/1hMyelskJb7pmy_R4Re045QjPdd8Xtk06?authuser=1#scrollTo=QlKul14VdGaM) notebook, and discard labels associated with interstitial pattern and wall injuries.

In [3]:
# Just images from consolidation class.
df = pd.read_csv('/content/drive/MyDrive/Quinto_Anio/TESIS_Eugenia_Berrino/Part_II_DS/DS_VAL_OD/unzipped_v2/boxes.csv')
df

Unnamed: 0.1,Unnamed: 0,boxes,labels,image,height,width,class
0,0,"[838, 869, 1384, 1627]",2,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,3054,3056.0,Consolidation
1,0,"[489, 1313, 909, 1629]",1,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,2735,2802.0,Consolidation
2,0,"[852, 763, 1493, 1961]",2,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,2544,3056.0,Consolidation
3,1,"[1728, 420, 2511, 1659]",2,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,2544,3056.0,Consolidation
4,0,"[854, 840, 1467, 1764]",2,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,2364,3008.0,Consolidation
...,...,...,...,...,...,...,...
370,2,"[54, 989, 202, 1550]",5,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,2048,2500.0,Consolidation
371,0,"[1371, 711, 2349, 2076]",1,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,3056,2544.0,Consolidation
372,0,"[1803, 594, 2010, 1145]",4,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,2364,2880.0,Consolidation
373,0,"[1598, 889, 1899, 1165]",1,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,2048,2500.0,Consolidation


In [6]:
df['class'].value_counts()

Consolidation    375
Name: class, dtype: int64

In [7]:
df.labels.value_counts()

3    154
1     98
2     64
5     34
4     25
Name: labels, dtype: int64

In [8]:
# Choose the classes we want
fds = df.loc[(df.labels!=3) & (df.labels!=5)]

In [9]:
fds.labels.value_counts()

1    98
2    64
4    25
Name: labels, dtype: int64

In [10]:
len(fds)
# Almost half of our images belonged to the classes our model wasn't trained with 

187

In [11]:
fds

Unnamed: 0.1,Unnamed: 0,boxes,labels,image,height,width,class
0,0,"[838, 869, 1384, 1627]",2,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,3054,3056.0,Consolidation
1,0,"[489, 1313, 909, 1629]",1,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,2735,2802.0,Consolidation
2,0,"[852, 763, 1493, 1961]",2,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,2544,3056.0,Consolidation
3,1,"[1728, 420, 2511, 1659]",2,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,2544,3056.0,Consolidation
4,0,"[854, 840, 1467, 1764]",2,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,2364,3008.0,Consolidation
...,...,...,...,...,...,...,...
368,0,"[298, 830, 850, 1390]",1,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,2048,2500.0,Consolidation
369,1,"[1485, 773, 2005, 1661]",2,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,2048,2500.0,Consolidation
371,0,"[1371, 711, 2349, 2076]",1,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,3056,2544.0,Consolidation
372,0,"[1803, 594, 2010, 1145]",4,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,2364,2880.0,Consolidation


In [13]:
int(fds.boxes[0].strip('][').split(', ')[0])

838

In [14]:
# Normalizing Annotations 

# BB Normalized Limits
fds['x_min'] = fds.apply(lambda row: (int(row.boxes.strip('][').split(', ')[0]))/row.width, axis =1)
fds['y_min'] = fds.apply(lambda row: (int(row.boxes.strip('][').split(', ')[1]))/row.height, axis =1)

fds['x_max'] = fds.apply(lambda row: (int(row.boxes.strip('][').split(', ')[2]))/row.width, axis =1)
fds['y_max'] = fds.apply(lambda row: (int(row.boxes.strip('][').split(', ')[3]))/row.height, axis =1)

# BB Normalized Center
fds['x_mid'] = fds.apply(lambda row: (row.x_max+row.x_min)/2, axis =1)
fds['y_mid'] = fds.apply(lambda row: (row.y_max+row.y_min)/2, axis =1)

# BB Nomalized With & Height
fds['w'] = fds.apply(lambda row: (row.x_max-row.x_min), axis =1)
fds['h'] = fds.apply(lambda row: (row.y_max-row.y_min), axis =1)

# BB as a % area of the image
fds['area'] = fds['w']*fds['h']
fds.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docum

Unnamed: 0.1,Unnamed: 0,boxes,labels,image,height,width,class,x_min,y_min,x_max,y_max,x_mid,y_mid,w,h,area
0,0,"[838, 869, 1384, 1627]",2,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,3054,3056.0,Consolidation,0.274215,0.284545,0.45288,0.532744,0.363547,0.408644,0.178665,0.248199,0.044344
1,0,"[489, 1313, 909, 1629]",1,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,2735,2802.0,Consolidation,0.174518,0.480073,0.324411,0.595612,0.249465,0.537843,0.149893,0.115539,0.017319
2,0,"[852, 763, 1493, 1961]",2,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,2544,3056.0,Consolidation,0.278796,0.299921,0.488547,0.770833,0.383671,0.535377,0.209751,0.470912,0.098774
3,1,"[1728, 420, 2511, 1659]",2,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,2544,3056.0,Consolidation,0.565445,0.165094,0.821662,0.652123,0.693554,0.408608,0.256217,0.487028,0.124785
4,0,"[854, 840, 1467, 1764]",2,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,2364,3008.0,Consolidation,0.28391,0.35533,0.487699,0.746193,0.385805,0.550761,0.20379,0.390863,0.079654


In [25]:
fds.image.values[0]

'/content/drive/MyDrive/Quinto_Anio/TESIS_Eugenia_Berrino/Part_II_DS/DS_VAL_OD/unzipped_v2/masks/b4ef5c3ea6b611ebabb6f48e3885516d.npy'

In [43]:
fds.image.str.split('/').to_list()

['',
 'content',
 'drive',
 'MyDrive',
 'Quinto_Anio',
 'TESIS_Eugenia_Berrino',
 'Part_II_DS',
 'DS_VAL_OD',
 'unzipped_v2',
 'masks',
 'b4f305e3a6b611ebbbb7f48e3885516d.npy']

In [46]:
#f['Date'] =  df['NAME'].str.split(".").str[0].str.split('_').str[-1]
fds['file_name'] = fds.image.str.split('/').str[-1].str.split('.').str[0]
fds.file_name.values[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


'b4ef5c3ea6b611ebabb6f48e3885516d'

In [47]:
fds.head()

Unnamed: 0.1,Unnamed: 0,boxes,labels,image,height,width,class,x_min,y_min,x_max,y_max,x_mid,y_mid,w,h,area,file_name
0,0,"[838, 869, 1384, 1627]",2,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,3054,3056.0,Consolidation,0.274215,0.284545,0.45288,0.532744,0.363547,0.408644,0.178665,0.248199,0.044344,b4ef5c3ea6b611ebabb6f48e3885516d
1,0,"[489, 1313, 909, 1629]",1,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,2735,2802.0,Consolidation,0.174518,0.480073,0.324411,0.595612,0.249465,0.537843,0.149893,0.115539,0.017319,b4f2b7a8a6b611ebac95f48e3885516d
2,0,"[852, 763, 1493, 1961]",2,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,2544,3056.0,Consolidation,0.278796,0.299921,0.488547,0.770833,0.383671,0.535377,0.209751,0.470912,0.098774,b4f2b7a9a6b611ebb8abf48e3885516d
3,1,"[1728, 420, 2511, 1659]",2,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,2544,3056.0,Consolidation,0.565445,0.165094,0.821662,0.652123,0.693554,0.408608,0.256217,0.487028,0.124785,b4f2b7a9a6b611ebb8abf48e3885516d
4,0,"[854, 840, 1467, 1764]",2,/content/drive/MyDrive/Quinto_Anio/TESIS_Eugen...,2364,3008.0,Consolidation,0.28391,0.35533,0.487699,0.746193,0.385805,0.550761,0.20379,0.390863,0.079654,b4f2b7aaa6b611ebb118f48e3885516d


# No finding images
Now that images with findings have been preprocessed, we can add the no finding ones to our table. 

In [16]:
all_hiba = pd.read_csv('/content/drive/MyDrive/Quinto_Anio/TESIS_Eugenia_Berrino/Part_II_DS/DS_VAL_OD/unzipped_v2/test_hibav2.csv')
all_hiba.head()

Unnamed: 0,file_name,class_name,height,width,x1,x2,y1,y2,label_level
0,b4ef5c3ea6b611ebabb6f48e3885516d,Consolidacion,3054,3056,838.0,1384.0,869.0,1627.0,mask
1,b4f2b7a8a6b611ebac95f48e3885516d,NoduloMasa,2735,2802,489.0,909.0,1313.0,1629.0,mask
2,b4f2b7a9a6b611ebb8abf48e3885516d,Consolidacion-Consolidacion,2544,3056,852.0,1493.0,763.0,1961.0,mask
3,b4f2b7aaa6b611ebb118f48e3885516d,Consolidacion,2364,3008,854.0,1467.0,840.0,1764.0,mask
4,b4f2b7aba6b611eb8759f48e3885516d,Consolidacion,2500,2048,349.0,773.0,636.0,1131.0,mask


In [23]:
nof = all_hiba[all_hiba['class_name'].isna()]
nof

Unnamed: 0,file_name,class_name,height,width,x1,x2,y1,y2,label_level
233,b4f305e4a6b611ebb570f48e3885516d,,2403,2404,,,,,nofinding
234,b4f305e5a6b611ebbffbf48e3885516d,,3052,2540,,,,,nofinding
235,b4f305e6a6b611ebb4faf48e3885516d,,2472,2526,,,,,nofinding
236,b4f305e7a6b611eb9d9df48e3885516d,,2801,2802,,,,,nofinding
237,b4f305e8a6b611eb94dcf48e3885516d,,2515,2144,,,,,nofinding
...,...,...,...,...,...,...,...,...,...
1325,b4f7e7e8a6b611eb9da7f48e3885516d,,2342,2928,,,,,nofinding
1326,b4f7e7e9a6b611eb8c44f48e3885516d,,2314,2360,,,,,nofinding
1327,b4f7e7eaa6b611eb8699f48e3885516d,,2540,3056,,,,,nofinding
1328,b4f7e7eba6b611ebb13bf48e3885516d,,2544,3056,,,,,nofinding


In [52]:
nof['class'] = 'No finding'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [53]:
total = pd.concat([fds,nof]).sample(frac=1)
total.head()

Unnamed: 0.1,Unnamed: 0,boxes,labels,image,height,width,class,x_min,y_min,x_max,y_max,x_mid,y_mid,w,h,area,file_name,class_name,x1,x2,y1,y2,label_level
1255,,,,,2735,2802.0,No finding,,,,,,,,,,b4f59dfca6b611ebbca4f48e3885516d,No finding,,,,,nofinding
288,,,,,2484,2252.0,No finding,,,,,,,,,,b4f3061ba6b611ebabedf48e3885516d,No finding,,,,,nofinding
1301,,,,,2580,2404.0,No finding,,,,,,,,,,b4f5c50fa6b611eb84cbf48e3885516d,No finding,,,,,nofinding
1133,,,,,2782,2932.0,No finding,,,,,,,,,,b4f528c0a6b611ebae48f48e3885516d,No finding,,,,,nofinding
508,,,,,2540,3056.0,No finding,,,,,,,,,,b4f35438a6b611eba57ff48e3885516d,No finding,,,,,nofinding


In [54]:
total.to_csv('/content/drive/MyDrive/Quinto_Anio/TESIS_Eugenia_Berrino/Part_II_DS/DS_VAL_OD/unzipped_v2/reduced/total.csv')

# Generate Image and Labels in their corresponding directories

In [55]:
os.makedirs('/content/drive/MyDrive/Quinto_Anio/TESIS_Eugenia_Berrino/Part_II_DS/DS_VAL_OD/unzipped_v2/reduced2/labels/test', exist_ok = True)
os.makedirs('/content/drive/MyDrive/Quinto_Anio/TESIS_Eugenia_Berrino/Part_II_DS/DS_VAL_OD/unzipped_v2/reduced2/images/test', exist_ok = True)

In [57]:
len(total)

1284

In [56]:
unique = total.drop_duplicates(subset = ["file_name"])
unique

Unnamed: 0.1,Unnamed: 0,boxes,labels,image,height,width,class,x_min,y_min,x_max,y_max,x_mid,y_mid,w,h,area,file_name,class_name,x1,x2,y1,y2,label_level
1255,,,,,2735,2802.0,No finding,,,,,,,,,,b4f59dfca6b611ebbca4f48e3885516d,No finding,,,,,nofinding
288,,,,,2484,2252.0,No finding,,,,,,,,,,b4f3061ba6b611ebabedf48e3885516d,No finding,,,,,nofinding
1301,,,,,2580,2404.0,No finding,,,,,,,,,,b4f5c50fa6b611eb84cbf48e3885516d,No finding,,,,,nofinding
1133,,,,,2782,2932.0,No finding,,,,,,,,,,b4f528c0a6b611ebae48f48e3885516d,No finding,,,,,nofinding
508,,,,,2540,3056.0,No finding,,,,,,,,,,b4f35438a6b611eba57ff48e3885516d,No finding,,,,,nofinding
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
988,,,,,2801,2802.0,No finding,,,,,,,,,,b4f4b38ba6b611eb81eaf48e3885516d,No finding,,,,,nofinding
937,,,,,2858,2990.0,No finding,,,,,,,,,,b4f48c80a6b611eb96a5f48e3885516d,No finding,,,,,nofinding
721,,,,,2361,2704.0,No finding,,,,,,,,,,b4f3f03ea6b611eba29bf48e3885516d,No finding,,,,,nofinding
568,,,,,2735,2802.0,No finding,,,,,,,,,,b4f37b2ca6b611eba03ef48e3885516d,No finding,,,,,nofinding


In [58]:
# Every different image
for file in tqdm(unique.file_name.to_numpy()):
  shutil.copy('/content/drive/MyDrive/Quinto_Anio/TESIS_Eugenia_Berrino/Part_II_DS/DS_VAL_OD/unzipped_v2/images/'+ file +'.jpg','/content/drive/MyDrive/Quinto_Anio/TESIS_Eugenia_Berrino/Part_II_DS/DS_VAL_OD/unzipped_v2/reduced2/images/test')

HBox(children=(FloatProgress(value=0.0, max=1232.0), HTML(value='')))




In [59]:
DIR = '/content/drive/MyDrive/Quinto_Anio/TESIS_Eugenia_Berrino/Part_II_DS/DS_VAL_OD/unzipped_v2/reduced2/images/test'
print(len([name for name in os.listdir(DIR)]))

1232


In [60]:
def create_labels():
  label_dir = '/content/drive/MyDrive/Quinto_Anio/TESIS_Eugenia_Berrino/Part_II_DS/DS_VAL_OD/unzipped_v2/reduced2/labels/test/' 
  for image in tqdm(unique.file_name.to_numpy()):
    my_list = []
    for index, row in total.iterrows():
      if(image == row['file_name']):
       if(row['class']=='Consolidation'): 
         my_list.append('0' + '\t' + str(row['x_mid']) + '\t' + str(row['y_mid']) + '\t' + str(row['w']) + '\t' + str(row['h']) + '\n')
    filename = image
    file1 = open(os.path.join(label_dir, filename+'.txt'),"w") 
    file1.writelines(np.unique(my_list))
    file1.close()

In [61]:
create_labels()

HBox(children=(FloatProgress(value=0.0, max=1232.0), HTML(value='')))




In [62]:
DIR = '/content/drive/MyDrive/Quinto_Anio/TESIS_Eugenia_Berrino/Part_II_DS/DS_VAL_OD/unzipped_v2/reduced2/labels/test'
print(len([name for name in os.listdir(DIR)]))

1232
