In [1]:
import numpy as np
import pandas as pd 
from math import sqrt
import os
import sys
from PIL import Image
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow 
from skimage import morphology, measure
from skimage.draw import polygon, polygon_perimeter
from scipy.spatial.distance import cdist
from scipy.stats import kurtosis
from itertools import cycle

from random import randint
from random import sample

import xgboost as xgb
import utils.shapeFeatures_extraction 
import csv

In [2]:
# Image width / height constants (presumably in pixels -- TODO confirm against the dataset).
IMG_W, IMG_H = 1376, 1020

# Bethesda class name -> integer label.
Bethesda_classes = {
    'Normal': 0,
    'ASC-US': 1,
    'LSIL': 2,
    'ASC-H': 3,
    'HSIL': 4,
    'Invasive Carcinoma': 5,
}
# Integer label -> Bethesda class name (inverse of the mapping above).
Bethesda_idx_classes = {label: class_name for class_name, label in Bethesda_classes.items()}

# Segmentation CSVs are looked up in the `base` folder under the current working directory.
base_dir = os.path.join(os.getcwd(), 'base')
dataset_file_nuclei = os.path.join(base_dir, 'nucleus-segmentations.csv')
dataset_file_cytoplasm = os.path.join(base_dir, 'cytoplasm-segmentations.csv')

### For Shape features extraction exclude above comments

In [3]:
### Read the nucleus / cytoplasm segmentation files from the `base` folder and build the
### per-cell masks that are needed before feature extraction.
# list_cells has to run first: it produces the four dataframes that make_masks_DF takes as
# arguments. With this call commented out, the cell fails with a NameError on a fresh kernel.
df_nucleos, df_cyto, df_nucleos_full, df_cyto_full = utils.shapeFeatures_extraction.list_cells(
    dataset_file_nuclei, dataset_file_cytoplasm)

count_cells, df = utils.shapeFeatures_extraction.make_masks_DF(
    df_nucleos, df_cyto, df_nucleos_full, df_cyto_full)

# Show the counts returned by make_masks_DF together with their sum.
print(count_cells, np.sum(count_cells))


In [4]:
# Save an intermediate file with the mask data per cell, containing the IDs (id_image, id_cell)
#    and the associated CRIC image_filename.
#    DataFrame.to_pickle is used here since the dataframe contains columns of both numeric and
#    string type (the associated CRIC image_filename).
# NOTE: despite its `.csv` extension the file written here is a pickle, so it must be read back
#    with pd.read_pickle. The file name is kept unchanged because the next cell reads it.
files_dir = os.path.join(os.getcwd(), 'files')
# to_pickle does not create missing directories, so make sure the output folder exists.
os.makedirs(files_dir, exist_ok=True)
df.to_pickle(os.path.join(files_dir, 'CRIC_data_mask_cells.csv'))

In [5]:
## Load the intermediate mask file back from the `files` folder.
## It is read with pd.read_pickle: the file is a pickle even though its name ends in `.csv`.
mask_file = os.path.join(os.getcwd(), 'files', 'CRIC_data_mask_cells.csv')
df_cell_masks = pd.read_pickle(mask_file)
df_cell_masks

In [6]:
## Execute the shape feature extraction (all shape features included).
## See the TODO further down regarding the fractal dimension.
shape_features = utils.shapeFeatures_extraction
count_cells, df_stats = shape_features.make_stats(df_cell_masks)

In [7]:
# Display the extracted features, the counts returned by make_stats, and the sum of those counts.
total_cells = np.sum(count_cells)
df_stats, count_cells, total_cells

In [8]:
# Feature extraction takes time, so the result is saved to a CSV file in the `files` folder.
features_path = os.path.join(os.getcwd(), 'files', 'features.csv')
df_stats.to_csv(features_path)

In [10]:
## Save a file listing each cell with its IDs and its CRIC image file name
#df_cells = df_cell_masks[['image_id', 'cell_id', 'bethesda', 'image_filename']]
#df_cells = df_cells.astype(str)
#df_cells.to_csv(os.path.join(os.getcwd(), 'files', 'cells_ids.csv'))

In [36]:
# index_col=0 makes the first CSV column the index, so the dataframe is not loaded
# with an extra "Unnamed" column.
features_path = os.path.join(os.getcwd(), 'files', 'features.csv')
df_cell_feat = pd.read_csv(features_path, index_col=0)

In [37]:
# Display the feature dataframe (the last expression of a cell is rendered as its output).
df_cell_feat

Unnamed: 0,bethesda,image_id,cell_id,areaN,perimeterN,major_axisN,minor_axisN,equivalent_diameterN,eccentricityN,circularityN,...,Use_curv2C,Use_curv3C,major_axis_angleC,area_NC,perimetro_NC,major_axis_NC,minor_axis_NC,nucleus_position,sub_major_axis_angle_NC,convexity_NC
0,0,1,14796,72.0,77.798990,34.923416,28.487003,9.574615,0.578476,0.149484,...,5.149067,20.731059,0.827775,0.119403,0.114612,0.116392,0.135984,0.039793,-1.376747,1.058727
1,0,1,14797,88.0,90.071068,34.858045,30.680528,10.585135,0.474684,0.136308,...,10.439867,20.506915,-0.640090,0.103044,0.095068,0.088509,0.106911,0.065996,-0.457342,0.962255
2,0,1,14798,79.0,81.278175,31.952591,30.355122,10.029253,0.312235,0.150276,...,9.592713,9.763902,0.528126,0.080285,0.076722,0.072026,0.090581,0.036181,0.402702,0.999603
3,0,1,14799,94.0,95.449747,35.201144,32.238268,10.940042,0.401566,0.129655,...,28.229228,30.796864,0.823562,0.183594,0.170863,0.146818,0.168981,0.067635,-1.715415,0.907952
4,0,1,14801,88.0,89.449747,33.247761,32.304044,10.585135,0.236565,0.138208,...,12.092488,19.350183,-0.425716,0.149660,0.137763,0.115146,0.171066,0.037058,0.775961,0.964228
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3228,5,399,11539,116.0,128.426407,59.944404,36.563744,12.153016,0.792431,0.088381,...,0.557920,1.639469,0.577861,0.157395,0.152888,0.157546,0.759736,0.275916,0.166601,1.141373
3229,5,399,11540,119.0,126.905592,61.346618,32.418162,12.309163,0.848969,0.092853,...,4.263561,10.939456,1.150439,0.269841,0.256025,0.243062,0.683621,0.089508,0.127024,0.977785
3230,5,400,11535,129.0,133.627417,55.381255,27.533199,12.815924,0.867660,0.090784,...,6.716690,8.376891,1.136939,0.570796,0.522693,0.490038,0.309698,0.170330,-2.337638,0.915215
3231,5,400,11536,123.0,132.526912,63.341183,26.769533,12.514330,0.906305,0.088005,...,0.000429,23.121492,1.454043,0.080183,0.080541,0.073120,0.383281,0.289559,-2.773659,0.996937


In [38]:
## Normalize the feature columns.
## Fractal dimension is still a TODO (see list_exclude_columns below).
id_columns = ['bethesda', 'image_id', 'cell_id']
fractal_columns = ['FDN', 'FDC']
df_cell_feat_norm = utils.shapeFeatures_extraction.normalize_features(
    df_cell_feat,
    list_skip_columns=id_columns,
    list_exclude_columns=fractal_columns,
)

In [39]:
# Display the normalized feature dataframe (the last expression of a cell is rendered as its output).
df_cell_feat_norm

Unnamed: 0,bethesda,image_id,cell_id,areaN,perimeterN,major_axisN,minor_axisN,equivalent_diameterN,eccentricityN,circularityN,...,Use_curv2C,Use_curv3C,major_axis_angleC,area_NC,perimetro_NC,major_axis_NC,minor_axis_NC,nucleus_position,sub_major_axis_angle_NC,convexity_NC
0,0,1,14796,0.023697,0.023501,0.035161,0.078621,0.037181,0.480013,0.676141,...,0.056802,0.039770,1.529886,0.033987,0.033975,0.040241,0.055168,0.020220,0.562239,0.188348
1,0,1,14797,0.048973,0.040395,0.034936,0.089215,0.072776,0.374363,0.609861,...,0.115168,0.039340,0.593580,0.024449,0.021862,0.017784,0.031389,0.033953,0.857376,0.121445
2,0,1,14798,0.034755,0.028290,0.024947,0.087643,0.053195,0.209007,0.680125,...,0.105822,0.018731,1.338749,0.011179,0.010492,0.004508,0.018033,0.018327,1.133457,0.147346
3,0,1,14799,0.058452,0.047800,0.036116,0.096738,0.085277,0.299937,0.576391,...,0.311411,0.059079,1.527198,0.071414,0.068837,0.064746,0.082156,0.034811,0.453524,0.083786
4,0,1,14801,0.048973,0.039540,0.029400,0.097056,0.072776,0.131983,0.619420,...,0.133398,0.037121,0.730322,0.051628,0.048323,0.039237,0.083861,0.018787,1.253276,0.122814
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3228,5,399,11539,0.093207,0.093197,0.121189,0.117629,0.128003,0.697798,0.368765,...,0.006155,0.003145,1.370473,0.056138,0.057697,0.073387,0.565323,0.143966,1.057667,0.245663
3229,5,399,11540,0.097946,0.091103,0.126010,0.097607,0.133503,0.755347,0.391260,...,0.047034,0.020986,1.735703,0.121700,0.121617,0.142263,0.503070,0.046274,1.044962,0.132215
3230,5,400,11535,0.113744,0.100357,0.105500,0.074014,0.151354,0.774373,0.380852,...,0.074095,0.016070,1.727092,0.297172,0.286888,0.341181,0.197245,0.088631,0.253785,0.088823
3231,5,400,11536,0.104265,0.098842,0.132868,0.070326,0.140730,0.813709,0.366872,...,0.000005,0.044355,1.929363,0.011120,0.012859,0.005389,0.257427,0.151116,0.113819,0.145497


In [41]:
# Save all normalized features to a CSV file in the `files` folder.
# NOTE(review): the features were described as normalized in [0,1], but the output displayed
# earlier in this notebook shows values above 1 in some columns (e.g. major_axis_angleC,
# sub_major_axis_angle_NC) -- confirm the intended range.
normalized_path = os.path.join(os.getcwd(), 'files', 'normalized_features.csv')
df_cell_feat_norm.to_csv(normalized_path)