In [1]:
import numpy as np
import pandas as pd 
from math import sqrt
import os
import sys
from PIL import Image
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow 
from skimage import morphology, measure
from skimage.draw import polygon, polygon_perimeter
from scipy.spatial.distance import cdist
from scipy.stats import kurtosis
from itertools import cycle

from random import randint
from random import sample

import xgboost as xgb
import utils.shapeFeatures_extraction 
import csv

In [2]:
# Image dimensions, in pixels (width x height).
IMG_W, IMG_H = 1376, 1020

# Bethesda class name -> integer label.
Bethesda_classes = {
    'Normal': 0,
    'ASC-US': 1,
    'LSIL': 2,
    'ASC-H': 3,
    'HSIL': 4,
    'Invasive Carcinoma': 5,
}
# Reverse lookup (integer label -> class name), derived from the mapping above
# so the two dictionaries cannot drift apart.
Bethesda_idx_classes = {label: name for name, label in Bethesda_classes.items()}

# Segmentation CSVs are expected in the `base` folder of the current working directory.
_base_dir = os.path.join(os.getcwd(), 'base')
dataset_file_nuclei = os.path.join(_base_dir, 'nucleus-segmentations.csv')
dataset_file_cytoplasm = os.path.join(_base_dir, 'cytoplasm-segmentations.csv')

### For Shape features extraction exclude above comments

In [3]:
# Read the nucleus and cytoplasm segmentation files from the `base` folder.
# list_cells returns four objects, which are handed unchanged to make_masks_DF
# to build the per-cell mask table used later for feature extraction.
cell_tables = utils.shapeFeatures_extraction.list_cells(
    dataset_file_nuclei, dataset_file_cytoplasm
)
df_nucleos, df_cyto, df_nucleos_full, df_cyto_full = cell_tables

count_cells, df = utils.shapeFeatures_extraction.make_masks_DF(
    df_nucleos, df_cyto, df_nucleos_full, df_cyto_full
)

# Show the cell counts (presumably one entry per Bethesda class) and their total.
print(count_cells, np.sum(count_cells))


[862 286 598 536 874  77] 3233


In [4]:
# Save an intermediate file with the per-cell mask data, including the IDs
# (image_id, cell_id) and the associated CRIC image filename.
# DataFrame.to_pickle is used because the frame mixes numeric and string columns.
# NOTE: despite its '.csv' extension this file is a pickle, so it must be read
# back with pd.read_pickle rather than pd.read_csv.
mask_cells_path = os.path.join(os.getcwd(), 'files', 'CRIC_data_mask_cells.csv')
df.to_pickle(mask_cells_path)

In [5]:
# Reload the per-cell mask table written by the previous step.
# The file is a pickle even though its name ends in '.csv', hence read_pickle.
mask_cells_path = os.path.join(os.getcwd(), 'files', 'CRIC_data_mask_cells.csv')
df_cell_masks = pd.read_pickle(mask_cells_path)
df_cell_masks

Unnamed: 0,image_id,cell_id,bethesda,image_filename,nparray_points_segm_Nucleus,nparray_points_segm_Cyto
0,1,14796,0,be340ee72689dfe3f8dc9c24de6127f4.png,"[[398, 157], [398, 156], [398, 155], [398, 155...","[[467, 232], [468, 232], [469, 232], [470, 232..."
1,1,14797,0,be340ee72689dfe3f8dc9c24de6127f4.png,"[[474, 379], [474, 378], [474, 378], [473, 378...","[[526, 418], [526, 417], [526, 416], [526, 415..."
2,1,14798,0,be340ee72689dfe3f8dc9c24de6127f4.png,"[[545, 983], [544, 983], [544, 983], [544, 982...","[[442, 870], [441, 870], [440, 870], [439, 870..."
3,1,14799,0,be340ee72689dfe3f8dc9c24de6127f4.png,"[[278, 124], [278, 124], [279, 124], [279, 124...","[[242, 44], [241, 44], [240, 44], [240, 44], [..."
4,1,14801,0,be340ee72689dfe3f8dc9c24de6127f4.png,"[[140, 217], [139, 217], [139, 217], [138, 217...","[[238, 197], [238, 196], [238, 195], [238, 194..."
...,...,...,...,...,...,...
3228,399,11539,5,dc2df7c3f88649ded343b13b9486cddf.png,"[[405, 562], [405, 563], [405, 563], [404, 563...","[[360, 542], [360, 543], [360, 543], [361, 543..."
3229,399,11540,5,dc2df7c3f88649ded343b13b9486cddf.png,"[[510, 498], [509, 498], [509, 498], [509, 499...","[[496, 439], [496, 440], [496, 440], [495, 441..."
3230,400,11535,5,9ae8a4edde40219bad6303cebc672ee4.png,"[[454, 782], [454, 783], [454, 783], [453, 783...","[[429, 737], [428, 737], [428, 737], [428, 738..."
3231,400,11536,5,9ae8a4edde40219bad6303cebc672ee4.png,"[[672, 596], [672, 597], [672, 597], [672, 597...","[[688, 492], [689, 492], [689, 492], [689, 493..."


In [6]:
# Show the empty feature dictionary: every feature name maps to an empty list.
empty_feature_dict = utils.shapeFeatures_extraction.create_dictionary_features()
empty_feature_dict

{'bethesda': [],
 'image_id': [],
 'cell_id': [],
 'areaN': [],
 'perimeterN': [],
 'major_axisN': [],
 'minor_axisN': [],
 'equivalent_diameterN': [],
 'eccentricityN': [],
 'circularityN': [],
 'convexityN': [],
 'solidityN': [],
 'extentN': [],
 'radial_distance_maxN': [],
 'radial_distance_meanN': [],
 'radial_distance_sdN': [],
 'RAN': [],
 'RIN': [],
 'radial_distance_EN': [],
 'radial_distance_kurtoseN': [],
 'FDN': [],
 'Use_curv1N': [],
 'Use_curv2N': [],
 'Use_curv3N': [],
 'Use_curv4N': [],
 'Use_curv5N': [],
 'Use_curv6N': [],
 'Use_curv7N': [],
 'Use_curv8N': [],
 'major_axis_angleN': [],
 'fdN0': [],
 'fdN1': [],
 'fdN2': [],
 'fdN3': [],
 'fdN4': [],
 'fdN5': [],
 'fdN6': [],
 'fdN7': [],
 'fdN8': [],
 'fdN9': [],
 'fdN10': [],
 'fdN11': [],
 'fdN12': [],
 'fdN13': [],
 'fdN14': [],
 'fdN15': [],
 'fdN16': [],
 'fdN17': [],
 'fdN18': [],
 'fdN19': [],
 'fdN20': [],
 'fdN21': [],
 'fdN22': [],
 'fdN23': [],
 'fdN24': [],
 'fdN25': [],
 'fdN26': [],
 'fdN27': [],
 'fdN28':

In [7]:
# Run the shape feature extraction over every cell mask (all shape features
# included). See the TODO further below regarding the fractal dimension.
count_cells, df_stats = utils.shapeFeatures_extraction.make_stats(
    df_cell_masks
)

In [22]:
# Cell counts and their total.
print(count_cells, np.sum(count_cells))

# Inspect a subset of the extracted features: the eight cytoplasm curvature
# columns plus the first six 'fdN*' (nucleus) and 'fdC*' (cytoplasm) columns.
# To also inspect the nucleus curvature columns, prepend 'Use_curv1N' ... 'Use_curv8N'.
curvature_cyto_cols = ['Use_curv%dC' % k for k in range(1, 9)]
fd_nucleus_cols = ['fdN%d' % k for k in range(6)]
fd_cyto_cols = ['fdC%d' % k for k in range(6)]
df_stats[curvature_cyto_cols + fd_nucleus_cols + fd_cyto_cols]

[862 286 598 536 874  77] 3233


Unnamed: 0,Use_curv1C,Use_curv2C,Use_curv3C,Use_curv4C,Use_curv5C,Use_curv6C,Use_curv7C,Use_curv8C,fdN0,fdN1,fdN2,fdN3,fdN4,fdN5,fdC0,fdC1,fdC2,fdC3,fdC4,fdC5
0,1.643333,1.787298,1.986517,2.289334,4.032746,4.448283,5.744767,6.319493,0.026018,0.057460,0.027537,0.028677,0.003322,0.005922,0.044544,0.126583,0.021460,0.010005,0.016182,0.015148
1,1.205595,3.019750,3.188392,4.225552,4.941684,6.900952,7.941445,8.463917,0.023052,0.053551,0.010465,0.026997,0.010459,0.008724,0.042765,0.120374,0.046103,0.015505,0.010006,0.025724
2,3.807582,4.151939,5.475083,9.158027,10.503415,11.335936,12.230499,18.169032,0.025276,0.019972,0.021093,0.029548,0.010678,0.008202,0.022193,0.101964,0.079066,0.029653,0.004841,0.016760
3,1.750649,1.970800,2.446674,2.939819,3.066192,3.066192,4.450172,4.508393,0.046842,0.041232,0.016892,0.021267,0.003549,0.005692,0.031090,0.073437,0.030070,0.026050,0.014373,0.011782
4,0.414334,1.097352,1.359601,1.509612,2.281318,2.284132,3.287301,3.794501,0.017213,0.014622,0.042330,0.026752,0.015976,0.012920,0.029914,0.133810,0.018626,0.022402,0.008641,0.009973
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3228,0.120537,0.698503,0.783233,1.861372,2.231867,2.556804,2.648593,3.451221,0.010256,0.154742,0.018864,0.012967,0.014871,0.009318,0.033702,0.342082,0.084448,0.044444,0.018979,0.025756
3229,1.044020,1.329923,1.933830,5.492380,9.167825,9.395714,9.668120,11.244330,0.035799,0.174106,0.015563,0.019152,0.018242,0.010674,0.014537,0.370449,0.029009,0.035330,0.017213,0.027784
3230,0.113150,0.278206,0.278206,0.327624,0.327624,0.991538,1.408226,1.475312,0.132478,0.240385,0.071296,0.042484,0.033015,0.013607,0.004982,0.071394,0.027665,0.010791,0.006869,0.008994
3231,0.045238,0.055304,0.393786,5.495320,6.014015,6.057313,7.981260,10.076392,0.007492,0.257566,0.015806,0.041346,0.008096,0.011883,0.011153,0.378830,0.012003,0.018909,0.010824,0.027059


In [23]:
# Feature extraction is slow, so keep its result on disk.
features_path = os.path.join(os.getcwd(), 'files', 'features.csv')
df_stats.to_csv(features_path)

In [24]:
# Write one row per cell with its IDs, Bethesda label and CRIC image filename,
# with every value converted to a string.
id_columns = ['image_id', 'cell_id', 'bethesda', 'image_filename']
df_cells = df_cell_masks[id_columns].astype(str)
cells_ids_path = os.path.join(os.getcwd(), 'files', 'cells_ids.csv')
df_cells.to_csv(cells_ids_path)

In [25]:
# Reload the saved features. index_col=0 uses the first CSV column as the index,
# which avoids ending up with an extra 'Unnamed: 0' column.
features_path = os.path.join(os.getcwd(), 'files', 'features.csv')
df_cell_feat = pd.read_csv(features_path, index_col=0)

In [27]:
# Sanity check: (rows, columns) of the feature table reloaded from features.csv.
df_cell_feat.shape

(3233, 144)

In [28]:
# Normalize the feature table.
# - list_skip_columns: label/ID columns; they show up un-normalized in the result.
# - list_exclude_columns: TODO - the fractal dimension features ('FDN', 'FDC') are
#   not handled yet, so they are passed here (presumably dropped by normalize_features; confirm).
df_cell_feat_norm = utils.shapeFeatures_extraction.normalize_features(
    df_cell_feat,
    list_skip_columns=['bethesda', 'image_id', 'cell_id'],
    list_exclude_columns=['FDN', 'FDC'],
)

In [29]:
# Display the normalized feature table (the rendered output elides the middle rows and columns).
df_cell_feat_norm

Unnamed: 0,bethesda,image_id,cell_id,areaN,perimeterN,major_axisN,minor_axisN,equivalent_diameterN,eccentricityN,circularityN,...,fdC37,fdC38,fdC39,area_NC,perimetro_NC,major_axis_NC,minor_axis_NC,nucleus_position,sub_major_axis_angle_NC,convexity_NC
0,0,1,14796,0.023697,0.023501,0.035161,0.078621,0.037181,0.480013,0.676141,...,0.072694,0.035297,0.110236,0.033987,0.033975,0.040241,0.055168,0.020220,0.562239,0.188348
1,0,1,14797,0.048973,0.040395,0.034936,0.089215,0.072776,0.374363,0.609861,...,0.095733,0.054821,0.023745,0.024449,0.021862,0.017784,0.031389,0.033953,0.857376,0.121445
2,0,1,14798,0.034755,0.028290,0.024947,0.087643,0.053195,0.209007,0.680125,...,0.020435,0.113171,0.083899,0.011179,0.010492,0.004508,0.018033,0.018327,1.133457,0.147346
3,0,1,14799,0.058452,0.047800,0.036116,0.096738,0.085277,0.299937,0.576391,...,0.094803,0.110122,0.052611,0.071414,0.068837,0.064746,0.082156,0.034811,0.453524,0.083786
4,0,1,14801,0.048973,0.039540,0.029400,0.097056,0.072776,0.131983,0.619420,...,0.112478,0.194277,0.106398,0.051628,0.048323,0.039237,0.083861,0.018787,1.253276,0.122814
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3228,5,399,11539,0.093207,0.093197,0.121189,0.117629,0.128003,0.697798,0.368765,...,0.391862,0.164132,0.188642,0.056138,0.057697,0.073387,0.565323,0.143966,1.057667,0.245663
3229,5,399,11540,0.097946,0.091103,0.126010,0.097607,0.133503,0.755347,0.391260,...,0.100154,0.059482,0.190277,0.121700,0.121617,0.142263,0.503070,0.046274,1.044962,0.132215
3230,5,400,11535,0.113744,0.100357,0.105500,0.074014,0.151354,0.774373,0.380852,...,0.097788,0.057434,0.346744,0.297172,0.286888,0.341181,0.197245,0.088631,0.253785,0.088823
3231,5,400,11536,0.104265,0.098842,0.132868,0.070326,0.140730,0.813709,0.366872,...,0.038296,0.008662,0.079041,0.011120,0.012859,0.005389,0.257427,0.151116,0.113819,0.145497


In [30]:
# Save the normalized features.
# NOTE(review): the features are intended to lie in [0, 1], but the displayed table
# shows 'sub_major_axis_angle_NC' values above 1 - confirm the behavior of normalize_features.
normalized_features_path = os.path.join(os.getcwd(), 'files', 'normalized_features.csv')
df_cell_feat_norm.to_csv(normalized_features_path)