# **Data reorganization and pre-processing**

In this phase, once the raw directories have been located and the output directories created, we anonymize the data and split it into training, validation and test sets. Once that is done, we proceed to data augmentation.

In our work environment we have two subdirectories with raw data. Each case is made up of 2 pairs of images {photo, mask}:

* 4 folders of control cases (202 cases)
* 1 folder of cancer cases (120 cases)

This data should be rearranged to get the following structure:

- **Procesado**

  * Cancer
    - Case 1C
      + Original
      + Mask
    - ...
    - Case 120C

  * Controles
    - Case 1
      + Original
      + Mask
    - ...
    - Case 202

  * test
    - images
    - masks
    - bboxes*
    - DETR**
  * train
    - images
    - masks
    - bboxes
    - DETR
  * valid
    - images
    - masks
    - bboxes
    - DETR

   *bboxes contains the gold standard bounding box we could obtain by using the gold mask.
   
   ** DETR contains the bounding boxes we have obtained by our DETR model.

- **Data**

  * Gold_Standard (Original data, detections(d) and segmentations(s))
    - train, dtrain, strain
    - test, dtest, stest
    - valid, dvalid, svalid
  * SAM
  * DETR
  * SAM+DETR
  *These last three directories will be generated automatically in subsequent phases.*

## Library

In [None]:
#Directories & base
import os
import errno
from distutils.dir_util import copy_tree
import shutil
import time
import numpy as np
import csv
import json
import random
from tqdm import tqdm

#Augmentation
from skimage.io import imread, imshow
from skimage.transform import resize
import matplotlib.pyplot as plt
from skimage import color
import cv2
import pandas as pd
import albumentations as A

#Bounding_boxes
import torch
from torchvision.ops import masks_to_boxes

In [None]:
['/home/eva/'+data for data in os.listdir('/home/eva/') if 'control' in data]

['/home/eva/01. Primeros_casos controles',
 '/home/eva/03. Tercera seleccion_controles',
 '/home/eva/04. Cuarta seleccion-controles',
 '/home/eva/02. Segundo envio casos controles']

- TYPE="_Balanced": the two classes are augmented at different rates so that the class imbalance is reduced

- TYPE="": both classes are augmented at the same rate, so the original imbalance is kept

Just comment or uncomment the corresponding option.

In [None]:
TYPE="_Balanced"
#TYPE=""

## Raw Directories

In [None]:
#Paths
# sorted(): os.listdir returns entries in arbitrary order, and the control
# folders are numbered consecutively in exactly this order during the
# anonymization step, so an unsorted listing makes the assigned case numbers
# (and therefore the train/valid/test split) differ between machines or runs.
raw_controles = sorted('/home/eva/'+data for data in os.listdir('/home/eva/') if 'control' in data)
raw_cancer = '/home/eva/Cancer'

We’ll need to create the directories so ViT can quickly read them.

In [None]:
# Create the Gold_Standard tree: one folder per split (original images,
# d* = detections, s* = segmentations), each with a Cáncer and a Control folder.
#
# exist_ok=True makes the cell safe to re-run. The previous try/except version
# stopped at the first directory that already existed, so any sibling directory
# listed after it inside the same `try` was silently never created.
gold_root = "./data"+TYPE+"/Gold_Standard"
os.makedirs(gold_root, exist_ok=True)
print("Directorio creado")

names=['train','valid','test','dtrain','dvalid','dtest','strain','svalid','stest']

for name in names:
    for clase in ("Cáncer", "Control"):
        os.makedirs(gold_root+"/"+name+"/"+clase, exist_ok=True)
    print("Directorio", name, "creado")

Directorio creado
Directorio train creado
Directorio valid creado
Directorio test creado
Directorio dtrain creado
Directorio dvalid creado
Directorio dtest creado
Directorio strain creado
Directorio svalid creado
Directorio stest creado


## Anonymization

In [None]:
files_names = [data for data in os.listdir('/home/eva/') if 'control' in data]
print(files_names)

['01. Primeros_casos controles', '03. Tercera seleccion_controles', '04. Cuarta seleccion-controles', '02. Segundo envio casos controles']


In [None]:
# One output folder per class for the anonymized cases (controls, then cancer).
# An already existing folder is tolerated; any other OS error is re-raised.
for clase in ("Controles", "Cancer"):
    try:
        os.makedirs("./Procesado"+TYPE+"/"+clase)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
    else:
        # Only announced when the folder was actually created
        print("Directorio creado")

print(os.listdir("./Procesado"+TYPE))

Directorio creado
Directorio creado
['Controles', 'Cancer']


* **Controls**

In [None]:
raw_controles[0].split('eva/')[1]

'01. Primeros_casos controles'

In [None]:
# Anonymize the control cases: every raw file is resized to 250x250 and stored
# under a zero-padded sequential case number, and the mapping
# {source folder, real file name, assigned name} is written to a CSV file.
inicio = time.time() #Time taken

ruta_imgC="./Procesado"+TYPE+"/Controles/"

width_shape, height_shape = 250, 250
top_len=5 #00000
j=1   # next case number for an original photo
jj=1  # next case number for a mask
towritedics=[] #list of new allocations
for f in raw_controles:
    carpeta = f.split('eva/')[1]
    print('****'+carpeta+'****')
    input_images_path = f

    # Photos and masks are numbered by two independent counters, so the k-th
    # photo and the k-th mask of this listing end up in the same case folder.
    # NOTE(review): this relies on the reverse-sorted file names keeping each
    # photo and its mask in the same relative order - confirm on the raw data.
    imgs_names = sorted(os.listdir(input_images_path), reverse=True)
    for img_name in imgs_names:
        img_rute=input_images_path+"/"+img_name

        # A ')' in the file name marks a mask; anything else is an original photo
        es_mascara = ')' in img_name
        if es_mascara:
            indice, subcarpeta, sufijo = jj, 'Mask', '(1).jpg'
        else:
            indice, subcarpeta, sufijo = j, 'Original', '.jpg'
        paciente_name = str(indice).zfill(top_len)

        towritedics.append({'original_field': carpeta,
                            'real_name': img_name,
                            'assigned_name': paciente_name+sufijo})

        #With the new name we create the directory (both sub-folders, idempotent)
        for sub in ('Original', 'Mask'):
            os.makedirs(ruta_imgC+paciente_name+'/'+sub, exist_ok=True)

        # The original passed cv2.COLOR_BGR2RGB here, which is a cvtColor code,
        # not an imread flag; its numeric value equals IMREAD_ANYCOLOR, so the
        # explicit flag keeps the exact same decoding.
        img = cv2.imread(img_rute, cv2.IMREAD_ANYCOLOR)
        if img is None:
            # cv2.imread returns None instead of raising for unreadable files
            raise ValueError("No se pudo leer la imagen: "+img_rute)
        # cv2.resize takes dsize as (width, height)
        img = cv2.resize(img,(width_shape, height_shape))
        #Save image to desired folder with your name
        cv2.imwrite(ruta_imgC+paciente_name+'/'+subcarpeta+'/'+paciente_name+sufijo, img)

        if es_mascara:
            jj=jj+1
        else:
            j=j+1
        # NOTE(review): fixed 2 s pause after every file; presumably a
        # workaround for a slow or remote filesystem - confirm it is still
        # needed, since it accounts for most of the runtime of this cell.
        time.sleep(2)
    print('****NEXT*****')

#We write the file with the new assignments made
with open(ruta_imgC+'assigned_names.csv', 'w', newline='') as csvfile:
    fieldnames = ['original_field', 'real_name', 'assigned_name']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(towritedics)

print('*****END*****')
print('Tiempo requerido:',round((time.time()-inicio)/60,0))

****01. Primeros_casos controles****
****NEXT*****
****03. Tercera seleccion_controles****
****NEXT*****
****04. Cuarta seleccion-controles****
****NEXT*****
****02. Segundo envio casos controles****
****NEXT*****
*****END*****
Tiempo requerido: 14.0


* **Cancer**

In [None]:
# Anonymize the cancer cases: every raw file is resized to 250x250 and stored
# under a zero-padded sequential case number followed by 'C', and the mapping
# {real file name, assigned name} is written to a CSV file.
#
# The start time used to be stored in a misspelled variable (`inici`) while the
# final report reads `inicio`, so the reported duration was measured from the
# start of whichever earlier cell last set `inicio`.
inicio = time.time()
ruta_imgC="./Procesado"+TYPE+"/Cancer/" #Processed Data
width_shape, height_shape = 250, 250
top_len=5 #00000
j=1   # next case number for an original photo
jj=1  # next case number for a mask
towritedics=[] #list of new allocations

input_images_path=raw_cancer
# Photos and masks are numbered by two independent counters, so the k-th photo
# and the k-th mask of this listing end up in the same case folder.
# NOTE(review): this relies on the reverse-sorted file names keeping each photo
# and its mask in the same relative order - confirm on the raw data.
imgs_names = sorted(os.listdir(input_images_path), reverse=True)

for img_name in imgs_names:
    img_rute=input_images_path+"/"+img_name

    # '_seg' in the file name marks a mask; anything else is an original photo
    es_mascara = '_seg' in img_name
    if es_mascara:
        indice, subcarpeta, sufijo = jj, 'Mask', '(1).jpg'
    else:
        indice, subcarpeta, sufijo = j, 'Original', '.jpg'
    # The trailing 'C' distinguishes cancer cases from control cases
    paciente_name = str(indice).zfill(top_len)+'C'

    towritedics.append({'original_field': "Cancer",
                        'real_name': img_name,
                        'assigned_name': paciente_name+sufijo})

    #With the new name we create the directory (both sub-folders, idempotent)
    for sub in ('Original', 'Mask'):
        os.makedirs(ruta_imgC+paciente_name+'/'+sub, exist_ok=True)

    # The original passed cv2.COLOR_BGR2RGB here, which is a cvtColor code, not
    # an imread flag; its numeric value equals IMREAD_ANYCOLOR, so the explicit
    # flag keeps the exact same decoding.
    img = cv2.imread(img_rute, cv2.IMREAD_ANYCOLOR)
    if img is None:
        # cv2.imread returns None instead of raising for unreadable files
        raise ValueError("No se pudo leer la imagen: "+img_rute)
    # cv2.resize takes dsize as (width, height)
    img = cv2.resize(img,(width_shape, height_shape))
    #Save image to desired folder with your name
    cv2.imwrite(ruta_imgC+paciente_name+'/'+subcarpeta+'/'+paciente_name+sufijo, img)

    if es_mascara:
        jj=jj+1
    else:
        j=j+1
    # NOTE(review): fixed 2 s pause after every file; presumably a workaround
    # for a slow or remote filesystem - confirm it is still needed.
    time.sleep(2)
print('****FIN*****')

#We write the file with the new assignments made
with open(ruta_imgC+'assigned_names.csv', 'w', newline='') as csvfile:
    fieldnames = ['original_field', 'real_name', 'assigned_name']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(towritedics)

print('*****END*****')
print('Tiempo requerido:',round((time.time()-inicio)/60,0))

****FIN*****
*****END*****
Tiempo requerido: 22.0


## Sets division

In [None]:
def _listar_casos(directorio):
    """Return the case folder names inside *directorio*, in reverse order.

    The mapping file assigned_names.csv lives next to the case folders and is
    not a case, so it is dropped (ValueError if it is missing).
    """
    casos = os.listdir(directorio)
    casos.remove("assigned_names.csv")
    return sorted(casos, reverse=True)


cancer_img_ids = _listar_casos("./Procesado"+TYPE+"/Cancer/")
DIR_INPUT = "./Procesado"+TYPE+"/Controles/"
lesion_img_ids = _listar_casos(DIR_INPUT)

In [None]:
# Hold out one third of each class (rounded down) as the test set.
# The seed fixes which cases are drawn.
random.seed(123)
nsplit = len(cancer_img_ids) // 3   #nº cancer test
nnsplit = len(lesion_img_ids) // 3  #nº control test
ts1 = random.sample(list(cancer_img_ids), k=nsplit)
ts2 = random.sample(list(lesion_img_ids), k=nnsplit)
# Cancer ids first, then control ids
test_ids = ts1 + ts2
print(len(set(test_ids)))

107


In [None]:
# Split the cases that are not in the test set into train (2/3) and
# validation (1/3), separately for each class.
random.seed(123)
# Set membership instead of scanning the test list once per candidate id
ids_en_test = set(test_ids)
image_idsL = [elemento for elemento in lesion_img_ids if elemento not in ids_en_test]
image_idsC = [elemento for elemento in cancer_img_ids if elemento not in ids_en_test]
# It would be 2/3 of the total for train and the rest (1/3) for validation
L_bound=round(len(image_idsL)*2/3)
C_bound=round(len(image_idsC)*2/3)
#print(L_bound,C_bound)

# NOTE(review): this split is positional, not random - train takes the last
# *_bound ids of each list (which keep the order of the source listings) and
# validation takes the ids before them. Confirm this is intended.
trainL_ids = image_idsL[-L_bound:]
validL_ids = image_idsL[:-L_bound]

trainC_ids = image_idsC[-C_bound:]
validC_ids = image_idsC[:-C_bound]

train_ids = trainL_ids+trainC_ids
valid_ids = validL_ids+validC_ids

In [None]:
print(len(np.unique(train_ids)))
print(len(np.unique(valid_ids)))

143
72


In [None]:
# Create images/masks/bboxes folders for each split.
#
# exist_ok=True makes the cell safe to re-run. The previous try/except version
# stopped at the first directory that already existed, so the remaining
# sub-folders of that split were silently never created.
for split in ("train", "valid", "test"):
    for sub in ("images", "masks", "bboxes"):
        os.makedirs("./Procesado"+TYPE+"/"+split+"/"+sub, exist_ok=True)
    print("Directorio creado")

print(os.listdir("./Procesado"+TYPE))

Directorio creado
Directorio creado
Directorio creado
['valid', 'Controles', 'train', 'test', 'Cancer']


+ **TEST**

In [None]:
# Copy the photo and the mask of every test case into test/images and test/masks
inicio = time.time()
print("Copiando a Test...")
for fid in test_ids:
    # Cancer ids carry a 'C'; everything else is a control case
    clase = "Cancer" if 'C' in fid else "Controles"
    DIRECTORIO_ORIGEN = "./Procesado"+TYPE+"/"+clase+"/"+fid
    # (source sub-folder, file suffix, destination sub-folder): photo, then mask
    for origen_sub, sufijo, destino_sub in (("Original", ".jpg", "images"),
                                           ("Mask", "(1).jpg", "masks")):
        src=DIRECTORIO_ORIGEN+'/'+origen_sub+'/'+fid+sufijo
        DIRECTORIO_DESTINO = "./Procesado"+TYPE+"/test/"+destino_sub+"/"
        shutil.copy(src,DIRECTORIO_DESTINO)
        time.sleep(2)

print("Copiado en: ",  round((time.time()-inicio)/60,0))

Copiando a Test...
Copiado en:  7.0


+ **TRAIN**

In [None]:
# Copy the photo and the mask of every train case into train/images and train/masks
inicio = time.time()
print("Copiando a Train...")
for fid in train_ids:
    # Cancer ids carry a 'C'; everything else is a control case
    clase = "Cancer" if 'C' in fid else "Controles"
    DIRECTORIO_ORIGEN = "./Procesado"+TYPE+"/"+clase+"/"+fid
    # (source sub-folder, file suffix, destination sub-folder): photo, then mask
    for origen_sub, sufijo, destino_sub in (("Original", ".jpg", "images"),
                                           ("Mask", "(1).jpg", "masks")):
        src=DIRECTORIO_ORIGEN+'/'+origen_sub+'/'+fid+sufijo
        DIRECTORIO_DESTINO = "./Procesado"+TYPE+"/train/"+destino_sub+"/"
        shutil.copy(src,DIRECTORIO_DESTINO)
        time.sleep(2)
print("Copiado en: ", round((time.time()-inicio)/60,0))

Copiando a Train...
Copiado en:  10.0


+ **VALIDATION**

In [None]:
# Copy the photo and the mask of every validation case into valid/images and valid/masks
inicio = time.time()
print("Copiando a Valid...")
for fid in valid_ids:
    # Cancer ids carry a 'C'; everything else is a control case
    clase = "Cancer" if 'C' in fid else "Controles"
    DIRECTORIO_ORIGEN = "./Procesado"+TYPE+"/"+clase+"/"+fid
    # (source sub-folder, file suffix, destination sub-folder): photo, then mask
    for origen_sub, sufijo, destino_sub in (("Original", ".jpg", "images"),
                                           ("Mask", "(1).jpg", "masks")):
        src=DIRECTORIO_ORIGEN+'/'+origen_sub+'/'+fid+sufijo
        DIRECTORIO_DESTINO = "./Procesado"+TYPE+"/valid/"+destino_sub+"/"
        shutil.copy(src,DIRECTORIO_DESTINO)
        time.sleep(2)
print("Copiado en: ", round((time.time()-inicio)/60,0))

Copiando a Valid...
Copiado en:  5.0


## Augmentation

In [None]:
def OralDataset(image_dir):
    """Load every image/mask pair of a split into memory.

    Args:
        image_dir: split folder containing ``images/<id>.jpg`` and
            ``masks/<id>(1).jpg``.

    Returns:
        Tuple ``(ids, images, masks)`` of three parallel lists, ordered by id
        in reverse alphabetical order. Images are min-max normalized to the
        0-255 range; masks are returned as read.

    Raises:
        OSError: if an image or its mask cannot be read.
    """
    # Only .jpg files are cases: other files stored in images/ (for example
    # the annotation JSON files written there later in this notebook) used to
    # be treated as image ids and made the loader fail.
    dir_list = sorted(
        (os.path.splitext(nombre)[0]
         for nombre in os.listdir(image_dir+'/images/')
         if nombre.endswith('.jpg')),
        reverse=True)

    #Results
    images=[]
    masks=[]

    for pid in dir_list:
        Ximg_rute = image_dir+'/images/'+pid+'.jpg'
        Yimg_rute = image_dir+'/masks/'+pid+'(1).jpg'

        # The original passed cv2.COLOR_BGR2RGB, which is a cvtColor code and
        # not an imread flag; its numeric value equals IMREAD_ANYCOLOR, so the
        # explicit flag keeps the exact same decoding.
        Ximg = cv2.imread(Ximg_rute, cv2.IMREAD_ANYCOLOR)
        Yimg = cv2.imread(Yimg_rute, cv2.IMREAD_ANYCOLOR)
        # cv2.imread returns None instead of raising; fail here with the path
        # rather than later with a None in the result lists.
        if Ximg is None:
            raise OSError("No se pudo leer la imagen: "+Ximg_rute)
        if Yimg is None:
            raise OSError("No se pudo leer la máscara: "+Yimg_rute)

        # Image: stretch intensities to the full 0-255 range
        norm_img = np.zeros((250,250))
        image = cv2.normalize(Ximg,norm_img,0,255,cv2.NORM_MINMAX)
        images.append(image)
        # Mask: kept as read
        masks.append(Yimg)
    return dir_list,images,masks

In [None]:
def auto_augmentation(directorio,names,images,masks,n_aug=10,nn_aug=10):
    """Write augmented copies of every image/mask pair next to the originals.

    For each case, the same random transform (flips, brightness/contrast,
    Gaussian noise) is applied jointly to the image and its mask, and the
    results are saved as ``<directorio>/images/<name>_aug0<i>.jpg`` and
    ``<directorio>/masks/<name>_aug0<i>(1).jpg``.

    Args:
        directorio: split folder that already contains ``images/`` and ``masks/``.
        names: case ids; ids containing 'C' are treated as cancer cases.
        images: images aligned with ``names``.
        masks: masks aligned with ``names``.
        n_aug: number of augmented pairs per control case.
        nn_aug: number of augmented pairs per cancer case. Use the same value
            as ``n_aug`` to keep the class ratio, a larger one to reduce the
            imbalance.

    Returns:
        Tuple ``(imgs, maks, nams)`` with the originals and all augmented
        images, masks and names of every case. Previously these lists were
        re-created for each case, so only the last case was returned and an
        empty input raised UnboundLocalError. Note that everything is now kept
        in memory until the function returns.
    """
    # Output size of every augmented image and mask
    H=250
    W=250

    imgs=[]
    maks=[]
    nams=[]

    for n,x,y in zip(names,images,masks):
        # Re-seeding for every case is kept from the original code.
        # NOTE(review): whether albumentations draws from Python's `random`
        # module depends on the installed version - confirm the seed has the
        # intended effect.
        random.seed(42)
        imgs.append(x)
        maks.append(y)
        nams.append(n)
        # Define augmentation transformations
        transform = A.Compose([
            A.HorizontalFlip(p=0.5),
            A.RandomBrightnessContrast(p=0.3),
            A.GaussNoise(p=0.3),
            A.VerticalFlip(p=0.5)
        ])

        # Cancer cases (ids with a 'C') may use a different augmentation rate
        num_augmented_pairs = nn_aug if 'C' in n else n_aug

        for i in range(num_augmented_pairs):
            augmented = transform(image=x, mask=y)
            x1 = augmented['image']
            y1 = augmented['mask']
            n1 = n+'_aug0'+str(i)
            #Resize to keep sizes
            x1=cv2.resize(x1, (W, H))
            y1=cv2.resize(y1, (W, H))

            cv2.imwrite(directorio+"/images/"+n1+'.jpg', x1)
            cv2.imwrite(directorio+"/masks/"+n1+'(1).jpg',y1)
            # NOTE(review): fixed 4 s pause after every pair; presumably a
            # workaround for a slow or remote filesystem - confirm it is still
            # needed, since it accounts for most of the runtime.
            time.sleep(4)

            imgs.append(x1)
            maks.append(y1)
            nams.append(n1)
    print('*****END*****')
    return imgs,maks,nams

In [None]:
DIR_INPUT = "./Procesado"+TYPE+"/train"
n, x, y = OralDataset(DIR_INPUT)

In [None]:
inicio = time.time()

# An empty TYPE keeps the default (equal) rates for both classes; any other
# TYPE passes 8 as n_aug and 14 as nn_aug.
balanceado = TYPE != ""
print("Inicio Aumentación Balanceo" if balanceado else "Inicio Aumentación Desbalanceo")
tasas = (8, 14) if balanceado else ()
imgs,masks,nams=auto_augmentation(DIR_INPUT, n, x, y, *tasas)

print('Tiempo requerrido:',round((time.time()-inicio)/60,0))

Inicio Aumentación Desbalanceo
*****END*****
Tiempo requerrido: 96.0


In [None]:
# Same augmentation as for train, applied to the validation split
DIR_INPUT = "./Procesado"+TYPE+"/valid"
n, x, y = OralDataset(DIR_INPUT)

inicio = time.time()

# An empty TYPE keeps the default (equal) rates for both classes; any other
# TYPE passes 8 as n_aug and 14 as nn_aug.
balanceado = TYPE != ""
print("Inicio Aumentación Balanceo" if balanceado else "Inicio Aumentación Desbalanceo")
tasas = (8, 14) if balanceado else ()
imgs,masks,nams=auto_augmentation(DIR_INPUT, n, x, y, *tasas)

print('Tiempo requerrido:',round((time.time()-inicio)/60,0))

Inicio Aumentación Desbalanceo
*****END*****
Tiempo requerrido: 48.0


## Gold Standard dump

This subphase consists of dumping the data generated so far into the Gold_Standard tree and obtaining the bounding boxes, as well as the annotation files required by the transformers used later (SAM and DETR).

In [None]:
#To generate the annotations as we need
def generate_dict(a_id,img_id,cat_id,bbox):
    """Build one annotation entry from a corner-format bounding box.

    Args:
        a_id: annotation id.
        img_id: id of the image the box belongs to.
        cat_id: category id of the box.
        bbox: ``(xmin, ymin, xmax, ymax)``.

    Returns:
        Dict with ``bbox`` stored as ``[xmin, ymin, width, height]`` (truncated
        to int) and ``area`` as the truncated product of the untruncated width
        and height; ``segmentation`` is empty and ``iscrowd`` is 0.
    """
    left, top, right, bottom = bbox
    box_w = right - left
    box_h = bottom - top

    return {
        "id": a_id,
        "image_id": img_id,
        "category_id": cat_id,
        "bbox": [int(valor) for valor in (left, top, box_w, box_h)],
        "area": int(box_w * box_h),
        "segmentation":[],
        "iscrowd": 0
    }

+ *Test*

In [None]:
# For every test image: copy the photo (test) and its mask (stest) into the
# Gold_Standard tree, derive the bounding box from the mask, save the photo
# blacked out outside the box (dtest and test/bboxes) and collect the
# COCO-style image/annotation entries used by the JSON cell that follows.

inicio = time.time()

ruta_test='./Procesado'+TYPE+'/test'
# Only .jpg files are cases: the annotation JSON files written into this same
# folder by a later cell must not be picked up when this cell is re-run.
test_elems=[os.path.splitext(nombre)[0] for nombre in os.listdir(ruta_test+'/images/')
            if nombre.endswith('.jpg')]

images=[]
annotations=[]
uniannotations=[]

print("Copiando a test...")
# total= lets tqdm show a real progress bar instead of a bare counter
for n, fid in tqdm(enumerate(test_elems), total=len(test_elems)):

    # Cancer ids carry a 'C': cancer is category 0, control is category 1
    if 'C' in fid:
        clase, label = "Cáncer", 0
    else:
        clase, label = "Control", 1

    #We copy the originals to test
    ruta_orig = ruta_test+'/images/'+fid+'.jpg'
    DIRECTORIO_DESTINO = "./data"+TYPE+"/Gold_Standard/test/"+clase
    shutil.copy(ruta_orig,DIRECTORIO_DESTINO)
    time.sleep(2)

    #We copy the masks to stest
    ORIGEN = ruta_test+'/masks/'+fid+'(1).jpg'
    DIRECTORIO_DESTINO = "./data"+TYPE+"/Gold_Standard/stest/"+clase
    shutil.copy(ORIGEN,DIRECTORIO_DESTINO)
    time.sleep(2)

    #We read the mask and transform to bbox
    orig_image = imread(ruta_orig)
    Yimg = imread(ORIGEN,as_gray=True)
    # Binarize with the mask's own shape instead of a hard-coded 250x250 array
    mask = Yimg>0.1
    if not mask.any():
        # An empty mask has no box; fail with the file name instead of an
        # opaque IndexError further down.
        raise ValueError("Máscara vacía, no se puede calcular la bbox: "+ORIGEN)
    # Single binary object -> one (1, H, W) boolean mask -> one box. This also
    # covers a fully-foreground mask, which the previous "drop the first
    # unique value" logic discarded.
    masks_tmp = torch.tensor(mask,dtype=torch.bool)[None]
    boxes = masks_to_boxes(masks_tmp)
    bbox=boxes[0]
    my_mask=np.zeros((orig_image.shape[0],orig_image.shape[1]),dtype=np.uint8)
    my_mask[int(bbox[1]):int(bbox[3]),int(bbox[0]):int(bbox[2])]=255
    my_mask=cv2.cvtColor(my_mask, cv2.COLOR_GRAY2RGB)
    # Apply the mask to the image
    result = cv2.bitwise_and(orig_image, my_mask)
    # skimage reads RGB while cv2.imwrite expects BGR: swap the channels
    # (same conversion code as the COLOR_BGR2RGB used before)
    result = cv2.cvtColor(result, cv2.COLOR_RGB2BGR)

    new_dir='./data'+TYPE+'/Gold_Standard/dtest/'+clase+'/'

    # "height" is the correctly spelled COCO key. The misspelled "heigth" that
    # was written before is kept so existing readers of the old files keep
    # working - TODO drop it once confirmed unused.
    im_data={"id": n,"file_name": str(fid)+".jpg",
             "height": orig_image.shape[0],"heigth" : orig_image.shape[0],
             "width" : orig_image.shape[1]}
    images.append(im_data)
    annotations.append(generate_dict(n,n,label,bbox))
    uniannotations.append(generate_dict(n,n,0,bbox))

    cv2.imwrite(new_dir+fid+'.jpg', result)
    time.sleep(2)
    cv2.imwrite('./Procesado'+TYPE+'/test/bboxes/'+fid+'.jpg',result)
    time.sleep(2)

print("Copiado en: ", round((time.time()-inicio)/60,0))

Copiando a test...


107it [14:18,  8.02s/it]

Copiado en:  14.0





In [None]:
# Fixed parts of the annotation files: dataset info and the category lists
# (two-class version, and single-class "lesion" version)

info={"year" : "2023"}

categories=[{"id": 0,"name": "cancer","supercategory": "none"},
            {"id": 1,"name": "control","supercategory": "none"}]
cat=[{"id": 0,"name": "lesion","supercategory": "none"}]

data={"info":info,"categories":categories,"images":images,"annotations":annotations}
unidata={"info":info,"categories":cat,"images":images,"annotations":uniannotations}

# Two-class file first, then the single-class one; indent=4 for readable output
for nombre_json, contenido in (("annotation.json", data),
                               ("annotation_uniclass.json", unidata)):
    nombre_archivo = ruta_test+"/images/"+nombre_json
    with open(nombre_archivo, 'w') as archivo_json:
        json.dump(contenido, archivo_json, indent=4)

print("Archivos JSON creado exitosamente.")

Archivos JSON creado exitosamente.


+ *Train*

In [None]:
# For every train image (originals and augmented): copy the photo (train) and
# its mask (strain) into the Gold_Standard tree, derive the bounding box from
# the mask, save the photo blacked out outside the box (dtrain and
# train/bboxes) and collect the COCO-style image/annotation entries used by
# the JSON cell that follows.
inicio = time.time()

ruta_train='./Procesado'+TYPE+'/train'
# Only .jpg files are cases: the annotation JSON files written into this same
# folder by a later cell must not be picked up when this cell is re-run.
train_elems= [os.path.splitext(nombre)[0] for nombre in os.listdir(ruta_train+'/images/')
              if nombre.endswith('.jpg')]

images=[]
annotations=[]
uniannotations=[]

print("Copiando a Train...")
# total= lets tqdm show a real progress bar instead of a bare counter
for n, fid in tqdm(enumerate(train_elems), total=len(train_elems)):

    # Cancer ids carry a 'C': cancer is category 0, control is category 1
    if 'C' in fid:
        clase, label = "Cáncer", 0
    else:
        clase, label = "Control", 1

    #We copy the originals to train
    ruta_orig = ruta_train+'/images/'+fid+'.jpg'
    DIRECTORIO_DESTINO = "./data"+TYPE+"/Gold_Standard/train/"+clase
    shutil.copy(ruta_orig,DIRECTORIO_DESTINO)
    time.sleep(2)

    #We copy the masks to strain
    ORIGEN = ruta_train+'/masks/'+fid+'(1).jpg'
    DIRECTORIO_DESTINO = "./data"+TYPE+"/Gold_Standard/strain/"+clase
    shutil.copy(ORIGEN,DIRECTORIO_DESTINO)
    time.sleep(2)

    #We read the mask and transform to bbox
    orig_image = imread(ruta_orig)
    Yimg = imread(ORIGEN,as_gray=True)
    # Binarize with the mask's own shape instead of a hard-coded 250x250 array
    mask = Yimg>0.1
    if not mask.any():
        # An empty mask has no box; fail with the file name instead of an
        # opaque IndexError further down.
        raise ValueError("Máscara vacía, no se puede calcular la bbox: "+ORIGEN)
    # Single binary object -> one (1, H, W) boolean mask -> one box. This also
    # covers a fully-foreground mask, which the previous "drop the first
    # unique value" logic discarded.
    masks_tmp = torch.tensor(mask,dtype=torch.bool)[None]
    boxes = masks_to_boxes(masks_tmp)
    bbox=boxes[0]
    my_mask=np.zeros((orig_image.shape[0],orig_image.shape[1]),dtype=np.uint8)
    my_mask[int(bbox[1]):int(bbox[3]),int(bbox[0]):int(bbox[2])]=255
    my_mask=cv2.cvtColor(my_mask, cv2.COLOR_GRAY2RGB)
    # Apply the mask to the image
    result = cv2.bitwise_and(orig_image, my_mask)
    # skimage reads RGB while cv2.imwrite expects BGR: swap the channels
    # (same conversion code as the COLOR_BGR2RGB used before)
    result = cv2.cvtColor(result, cv2.COLOR_RGB2BGR)

    new_dir='./data'+TYPE+'/Gold_Standard/dtrain/'+clase+'/'

    # "height" is the correctly spelled COCO key. The misspelled "heigth" that
    # was written before is kept so existing readers of the old files keep
    # working - TODO drop it once confirmed unused.
    im_data={"id": n,"file_name": str(fid)+".jpg",
             "height": orig_image.shape[0],"heigth" : orig_image.shape[0],
             "width" : orig_image.shape[1]}
    images.append(im_data)
    annotations.append(generate_dict(n,n,label,bbox))
    uniannotations.append(generate_dict(n,n,0,bbox))

    cv2.imwrite(new_dir+fid+'.jpg', result)
    time.sleep(2)
    cv2.imwrite('./Procesado'+TYPE+'/train/bboxes/'+fid+'.jpg',result)
    time.sleep(2)

print("Copiado en: ", round((time.time()-inicio)/60,0))

Copiando a Train...


1573it [3:32:22,  8.10s/it]

Copiado en:  212.0





In [None]:
# Annotation files for the train split (two-class and single-class versions)
data={"info":info,"categories":categories,"images":images,"annotations":annotations}
unidata={"info":info,"categories":cat,"images":images,"annotations":uniannotations}

for nombre_json, contenido in (("annotation.json", data),
                               ("annotation_uniclass.json", unidata)):
    nombre_archivo = ruta_train+"/images/"+nombre_json
    with open(nombre_archivo, 'w') as archivo_json:
        json.dump(contenido, archivo_json, indent=4)
print("Archivo JSON creado exitosamente.")

Archivo JSON creado exitosamente.


+ *Validation*

In [None]:
#Validation
# For every validation image (originals and augmented): copy the photo (valid)
# and its mask (svalid) into the Gold_Standard tree, derive the bounding box
# from the mask, save the photo blacked out outside the box (dvalid and
# valid/bboxes) and collect the COCO-style image/annotation entries used by
# the JSON cell that follows.
inicio = time.time()

ruta_val='./Procesado'+TYPE+'/valid'
# Only .jpg files are cases: the annotation JSON files written into this same
# folder by a later cell must not be picked up when this cell is re-run.
valid_elems=[os.path.splitext(nombre)[0] for nombre in os.listdir(ruta_val+'/images/')
             if nombre.endswith('.jpg')]

images=[]
annotations=[]
uniannotations=[]

print("Copiando a valid...")
# total= lets tqdm show a real progress bar instead of a bare counter
for n, fid in tqdm(enumerate(valid_elems), total=len(valid_elems)):

    # Cancer ids carry a 'C': cancer is category 0, control is category 1
    if 'C' in fid:
        clase, label = "Cáncer", 0
    else:
        clase, label = "Control", 1

    #We copy the originals to valid
    ruta_orig = ruta_val+'/images/'+fid+'.jpg'
    DIRECTORIO_DESTINO = "./data"+TYPE+"/Gold_Standard/valid/"+clase
    shutil.copy(ruta_orig,DIRECTORIO_DESTINO)
    time.sleep(2)

    #We copy the masks to svalid
    ORIGEN = ruta_val+'/masks/'+fid+'(1).jpg'
    DIRECTORIO_DESTINO = "./data"+TYPE+"/Gold_Standard/svalid/"+clase
    shutil.copy(ORIGEN,DIRECTORIO_DESTINO)
    time.sleep(2)

    #We read the mask and transform to bbox
    orig_image = imread(ruta_orig)
    Yimg = imread(ORIGEN,as_gray=True)
    # Binarize with the mask's own shape instead of a hard-coded 250x250 array
    mask = Yimg>0.1
    if not mask.any():
        # An empty mask has no box; fail with the file name instead of an
        # opaque IndexError further down.
        raise ValueError("Máscara vacía, no se puede calcular la bbox: "+ORIGEN)
    # Single binary object -> one (1, H, W) boolean mask -> one box. This also
    # covers a fully-foreground mask, which the previous "drop the first
    # unique value" logic discarded.
    masks_tmp = torch.tensor(mask,dtype=torch.bool)[None]
    boxes = masks_to_boxes(masks_tmp)
    bbox=boxes[0]
    my_mask=np.zeros((orig_image.shape[0],orig_image.shape[1]),dtype=np.uint8)
    my_mask[int(bbox[1]):int(bbox[3]),int(bbox[0]):int(bbox[2])]=255
    my_mask=cv2.cvtColor(my_mask, cv2.COLOR_GRAY2RGB)
    # Apply the mask to the image
    result = cv2.bitwise_and(orig_image, my_mask)
    # skimage reads RGB while cv2.imwrite expects BGR: swap the channels
    # (same conversion code as the COLOR_BGR2RGB used before)
    result = cv2.cvtColor(result, cv2.COLOR_RGB2BGR)

    new_dir='./data'+TYPE+'/Gold_Standard/dvalid/'+clase+'/'

    # "height" is the correctly spelled COCO key. The misspelled "heigth" that
    # was written before is kept so existing readers of the old files keep
    # working - TODO drop it once confirmed unused.
    im_data={"id": n,"file_name": str(fid)+".jpg",
             "height": orig_image.shape[0],"heigth" : orig_image.shape[0],
             "width" : orig_image.shape[1]}
    images.append(im_data)
    annotations.append(generate_dict(n,n,label,bbox))
    uniannotations.append(generate_dict(n,n,0,bbox))

    cv2.imwrite(new_dir+fid+'.jpg', result)
    time.sleep(2)
    cv2.imwrite('./Procesado'+TYPE+'/valid/bboxes/'+fid+'.jpg', result)
    time.sleep(2)
print("Copiado en: ", round((time.time()-inicio)/60,0))

Copiando a valid...


792it [1:45:50,  8.02s/it]

Copiado en:  106.0





In [None]:
# Annotation files for the validation split (two-class and single-class versions)
data={"info":info,"categories":categories,"images":images,"annotations":annotations}
unidata={"info":info,"categories":cat,"images":images,"annotations":uniannotations}

for nombre_json, contenido in (("annotation.json", data),
                               ("annotation_uniclass.json", unidata)):
    nombre_archivo = ruta_val+"/images/"+nombre_json
    with open(nombre_archivo, 'w') as archivo_json:
        json.dump(contenido, archivo_json, indent=4)
print("Archivo JSON creado exitosamente.")

Archivo JSON creado exitosamente.


## Final Data Review

### **Unbalanced**

In [None]:
def _contar(elems, incluir_aug):
    """Return [cancer, control, total] counts for the ids in *elems*.

    Ids containing 'C' count as cancer, the rest as control. When
    *incluir_aug* is False, ids containing 'aug' are left out.
    """
    considerados = [fid for fid in elems if incluir_aug or 'aug' not in fid]
    cancer = sum("C" in fid for fid in considerados)
    control = len(considerados) - cancer
    return [cancer, control, cancer + control]


# Create DataFrame with data: originals only first, then originals + augmented
data = {
    " ": ["Cáncer", "Control", "Total"],
    "Train (original)": _contar(train_elems, False),
    "Test(original)": _contar(test_elems, False),
    "Valid(original)": _contar(valid_elems, False),
    "Train (aumentado)": _contar(train_elems, True),
    "Test (aumentado)": _contar(test_elems, True),
    "Valid (aumentado)": _contar(valid_elems, True)
}

df = pd.DataFrame(data)

# Show DataFrame with similar format to kable
df

Unnamed: 0,Unnamed: 1,Train (original),Test(original),Valid(original),Train (aumentado),Test (aumentado),Valid (aumentado)
0,Cáncer,53,40,27,583,40,297
1,Control,90,67,45,990,67,495
2,Total,143,107,72,1573,107,792


### **Balanced**

In [None]:
def _contar(elems, incluir_aug):
    """Return [cancer, control, total] counts for the ids in *elems*.

    Ids containing 'C' count as cancer, the rest as control. When
    *incluir_aug* is False, ids containing 'aug' are left out.
    """
    considerados = [fid for fid in elems if incluir_aug or 'aug' not in fid]
    cancer = sum("C" in fid for fid in considerados)
    control = len(considerados) - cancer
    return [cancer, control, cancer + control]


# Create DataFrame with data: originals only first, then originals + augmented
data = {
    " ": ["Cáncer", "Control", "Total"],
    "Train (original)": _contar(train_elems, False),
    "Test(original)": _contar(test_elems, False),
    "Valid(original)": _contar(valid_elems, False),
    "Train (aumentado)": _contar(train_elems, True),
    "Test (aumentado)": _contar(test_elems, True),
    "Valid (aumentado)": _contar(valid_elems, True)
}

df = pd.DataFrame(data)

# Show DataFrame with similar format to kable
df

Unnamed: 0,Unnamed: 1,Train (original),Test(original),Valid(original),Train (aumentado),Test (aumentado),Valid (aumentado)
0,Cáncer,53,40,27,795,40,405
1,Control,90,67,45,810,67,405
2,Total,143,107,72,1605,107,810
