## About this notebook

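**Prototyping the pre-processing of the original images.**

**Prototype code for preprocessing pipeline `pipe001`:** each original image is read in grayscale, resized to 256x256 pixels and saved as a `.npy` array; a metadata CSV describing the transformed images is saved alongside.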

## Import libraries

In [1]:
import glob
import os
import re
import warnings

import cv2
import numpy as np
import pandas as pd

## Config

In [2]:
class cfg:
    """Notebook-wide configuration: notebook identifier and data locations."""

    # Identifier of this notebook; used as a prefix for the files it writes.
    THIS_NOTEBOOK_REF = "FR_t01e01nb02v1"

    # Folder holding the original IQ_OTHNCCD images (one sub-folder per class).
    ROOT_DATA = os.path.join(
        '..', '..', '..',
        'data', '01_original', 'IQ_OTHNCCD_LungCancer', 'IQ_OTHNCCD',
    )

    # Root folder under which the pipeline outputs are saved.
    ROOT_2SAVE = os.path.join('..', '..', '..', 'data')


## Utility functions

### create_img_d_meta()

In [4]:
def create_img_d_meta(ROOT_DATA): 

    """
    Build the image metadata table (d_meta) for the IQ_OTHNCCD dataset.

    Description: 
    ------------
        This function finds every '*.jpg' file located one folder below
            ROOT_DATA and creates a pd.DataFrame() d_meta (one row per image)
            for easy filtering and selection of the images for each
            experiment.
            This function is tailored to the IQ_OTHNCCD dataset.

        Files that cannot be described are skipped with a warning instead of
            aborting the whole scan:
              - images stored in a folder that is not a key of map_dict
              - file names without an integer between parentheses
              - files that cv2.imread() cannot decode (it returns None)

    Parameters
    ----------
        ROOT_DATA: path where the image data is stored. 
                    Expected data structure within the ROOT directory
                    IQ_OTHNCCD
                        ├───Bengin cases
                        ├───Malignant cases
                        └───Normal cases


    Return: 
    --------
        d_meta: pd.DataFrame of metadata of images, sorted by 
            ('cls_id', 'img_n_order'), with columns : 
            'path_original', 'fname', 'img_n_order', 'cls_name', 
            'img_shape_original', 'cls_id', 'img_ID'
                col 'img_n_order' is the integer found between parentheses 
                                 in the file name (the order of the image 
                                 provided by the data creators)
                col 'img_shape_original' is the string "<rows>x<cols>" of
                                 the image read in gray scale
                col 'img_ID' is "cls<cls_id>_order<img_n_order>"

            map_dict used for cls_id 
            map_dict = {'Normal cases': 0, 
                        'Bengin cases': 1, 
                        'Malignant cases': 2, 
                        }
        
    """

    #___________________________
    # Class folder name -> integer label. The keys must match the folder
    # names on disk exactly (including the "Bengin" spelling).
    map_dict = {'Normal cases': 0, 
                'Bengin cases': 1, 
                'Malignant cases': 2
               }

    #___________________________
    # Find all path 
    all_img_path_lst = glob.glob(os.path.join(ROOT_DATA, '*', '*.jpg'))

    #___________________________
    # adding meta to each image path. 
    store_d = []
    for p in all_img_path_lst:

        folder, fname = os.path.split(p)
        cls_name = os.path.basename(folder)

        #__________
        # An unmapped class would turn cls_id into NaN (float) for the whole
        # column and corrupt every img_ID, so such files are left out.
        if cls_name not in map_dict:
            warnings.warn(f"Skipping image in unknown class folder '{cls_name}': {p}",
                          stacklevel=2)
            continue

        #__________
        # The image order is the integer between parentheses in the file name.
        try:
            img_n_order = int(re.findall(r'\((.*?)\)', fname)[0])
        except (IndexError, ValueError):
            warnings.warn(f"Skipping image without an order number in its name: {p}",
                          stacklevel=2)
            continue

        #__________
        # Images in IQ_OTHNCCD are gray scale
        img = cv2.imread(filename=p, flags=0)
        if img is None:
            # cv2.imread() does not raise on failure, it returns None.
            warnings.warn(f"Skipping image that could not be read: {p}",
                          stacklevel=2)
            continue
        n_rows, n_cols = img.shape[:2]

        #------
        # Store data 
        store_d.append([p, 
                        fname, 
                        img_n_order, 
                        cls_name, 
                        f"{n_rows}x{n_cols}",
                       ])

    #_______________________________________________
    # Creating d_meta
    d_meta = pd.DataFrame(store_d, columns=["path_original", "fname", "img_n_order", "cls_name", "img_shape_original"])
    # Explicit integer dtypes keep the schema stable even when no image was kept.
    d_meta["img_n_order"] = d_meta["img_n_order"].astype("int64")
    d_meta['cls_id'] = d_meta['cls_name'].map(map_dict).astype("int64")
    d_meta["img_ID"] = "cls" + d_meta['cls_id'].astype(str) + "_order" + d_meta["img_n_order"].astype(str)
    d_meta = d_meta.sort_values(by=["cls_id", "img_n_order"], ignore_index=True)
    #_______________________________________________
    return d_meta


### apply_clahe()

In [5]:
def apply_clahe(image, clipLimit=2.0, tileGridSize=(8,8)):
    """
    Apply CLAHE (Contrast Limited Adaptive Histogram Equalization) to an image.

    Parameters
    ----------
        image: image array handed to the CLAHE object's apply() method.
               NOTE(review): presumably a single-channel image such as the
               gray scale arrays read with cv2.imread(..., flags=0) - confirm.
        clipLimit: forwarded to cv2.createCLAHE(clipLimit=...).
        tileGridSize: forwarded to cv2.createCLAHE(tileGridSize=...).

    Return
    ------
        The array returned by the CLAHE object's apply() method.
    """
    return cv2.createCLAHE(clipLimit=clipLimit, tileGridSize=tileGridSize).apply(image)

## Create Meta

In [6]:
# Scan the original dataset folder and build the per-image metadata table.
d_meta = create_img_d_meta(ROOT_DATA=cfg.ROOT_DATA)
# Report column dtypes / non-null counts, then preview the first 3 rows.
d_meta.info()
d_meta.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1097 entries, 0 to 1096
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   path_original       1097 non-null   object
 1   fname               1097 non-null   object
 2   img_n_order         1097 non-null   int64 
 3   cls_name            1097 non-null   object
 4   img_shape_original  1097 non-null   object
 5   cls_id              1097 non-null   int64 
 6   img_ID              1097 non-null   object
dtypes: int64(2), object(5)
memory usage: 60.1+ KB


Unnamed: 0,path_original,fname,img_n_order,cls_name,img_shape_original,cls_id,img_ID
0,../../../data/01_original/IQ_OTHNCCD_LungCance...,Normal case (1).jpg,1,Normal cases,512x512,0,cls0_order1
1,../../../data/01_original/IQ_OTHNCCD_LungCance...,Normal case (2).jpg,2,Normal cases,512x512,0,cls0_order2
2,../../../data/01_original/IQ_OTHNCCD_LungCance...,Normal case (3).jpg,3,Normal cases,512x512,0,cls0_order3


## Pipe001

In [7]:
%%time

pipeID="pipe001"
path2save_pipeID = os.path.join(cfg.ROOT_2SAVE, pipeID)
img_size = 256 # pixels 

#________________________________________________________
# Create folder
os.makedirs(path2save_pipeID, exist_ok=True)


#________________________________________________________

store_meta_lst = []
for r_id, data in d_meta.iterrows(): 



    #______________________________________________________________
    # create class folder if needed 
    path = os.path.join(path2save_pipeID, f"{pipeID}_imgs", data["cls_name"])
    os.makedirs(path, exist_ok=True)

    #______________________________________________________________
    # pipe >> READ > Resize 
    img = cv2.imread(filename=data["path_original"], flags=0)
    img_resized = cv2.resize(img, (img_size, img_size)) # out_shape 256x256


    #______________________________________________________________
    # Store 2D numpy array (image with 1 channel, gray scale)
    arr_name = f"{pipeID}_{data['img_ID']}_.npy"
    path_arr_name = os.path.join(path, arr_name)
    
    np.save(path_arr_name, img_resized)

    #______________________________________________________________
    store_meta_lst.append([
        data["path_original"],
        data["img_ID"], 
        path_arr_name,
        data["img_shape_original"],
        f"{img_size}x{img_size}",
        data["img_n_order"],
        data["cls_name"],
        data["cls_id"],
    ])

    #______________________________________________________________
    # break
#________________________

CPU times: user 3.01 s, sys: 1.44 s, total: 4.45 s
Wall time: 35.1 s


In [8]:
# Column names, in the same order as the fields appended to store_meta_lst.
pipe_meta_columns = [
    "path_original_img",
    "img_ID",
    f"path_img_{pipeID}",
    "img_shape_original",
    f"img_shape_{pipeID}",
    "img_n_order",
    "cls_name",
    "cls_id",
]

# One row per array written by the pipeline.
d_meta_DataPipe = pd.DataFrame(store_meta_lst, columns=pipe_meta_columns)
d_meta_DataPipe.info()
d_meta_DataPipe.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1097 entries, 0 to 1096
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   path_original_img   1097 non-null   object
 1   img_ID              1097 non-null   object
 2   path_img_pipe001    1097 non-null   object
 3   img_shape_original  1097 non-null   object
 4   img_shape_pipe001   1097 non-null   object
 5   img_n_order         1097 non-null   int64 
 6   cls_name            1097 non-null   object
 7   cls_id              1097 non-null   int64 
dtypes: int64(2), object(6)
memory usage: 68.7+ KB


Unnamed: 0,path_original_img,img_ID,path_img_pipe001,img_shape_original,img_shape_pipe001,img_n_order,cls_name,cls_id
0,../../../data/01_original/IQ_OTHNCCD_LungCance...,cls0_order1,../../../data/pipe001/pipe001_imgs/Normal case...,512x512,256x256,1,Normal cases,0
1,../../../data/01_original/IQ_OTHNCCD_LungCance...,cls0_order2,../../../data/pipe001/pipe001_imgs/Normal case...,512x512,256x256,2,Normal cases,0
2,../../../data/01_original/IQ_OTHNCCD_LungCance...,cls0_order3,../../../data/pipe001/pipe001_imgs/Normal case...,512x512,256x256,3,Normal cases,0
3,../../../data/01_original/IQ_OTHNCCD_LungCance...,cls0_order4,../../../data/pipe001/pipe001_imgs/Normal case...,512x512,256x256,4,Normal cases,0
4,../../../data/01_original/IQ_OTHNCCD_LungCance...,cls0_order5,../../../data/pipe001/pipe001_imgs/Normal case...,512x512,256x256,5,Normal cases,0


In [10]:
# SAVE METADATA OF PIPELINE TRANSFORMATION
# The CSV file name combines this notebook's reference and the pipeline ID;
# the file is written in the pipeline output folder, without the index column.
fname = f"{cfg.THIS_NOTEBOOK_REF}_metadata_{pipeID}.csv"
path_DATAPIPE_d_meta = os.path.join(path2save_pipeID, fname)
print(f"Storing metadata of {pipeID} here: \n{path_DATAPIPE_d_meta}")
d_meta_DataPipe.to_csv(path_DATAPIPE_d_meta, index=False) 

Storing metadata of pipe001 here: 
../../../data/pipe001/FR_t01e01nb02v1_metadata_pipe001.csv
