# Generation of the datasets from sliced data

In [1]:
import numpy as np
import pandas as pd
#import seaborn as sns
import random
import math

%matplotlib inline

In [2]:
data_sliced_path = 'data/liver_tumors_slices.csv'

df = pd.read_csv(data_sliced_path)
df.drop(['Unnamed: 0'], axis = 1, inplace = True)
df.dropna(axis=1, inplace = True)

for col in df:
    if 'diagnostics_' in col:
        df.drop([col], axis = 1, inplace = True)

df_CHC = df.loc[df['classe_name']=='CHC']
df_CCK = df.loc[df['classe_name']=='CCK']
df_Mixtes = df.loc[df['classe_name']=='Mixtes']

In [3]:
def build_sliced_dataset(dataframe, max_n_patient = 223, id_shift = 0):
    new_dataframe = []
    for n_patient in range(max_n_patient):
        phases = []
        
        a = dataframe.loc[dataframe['patient_num']==n_patient]
        slice_nums = set(a['slice_num'])
        for index0, row0 in a.iterrows():
            phases.append(row0['temps_inj'])

        if 'ART' in phases and 'PORT' in phases and 'TARD' in phases:
            for num in slice_nums:
                new_row = {}
                a_slice = dataframe.loc[(dataframe['patient_num']==n_patient) & ((dataframe['slice_num']==num))]
                for index, row in a_slice.iterrows():
                    for label, content in dataframe.items():
                        #On ajoute les données de chaque phase à la ligne du patient
                        b = row['temps_inj']
                        if b != 'VEIN':
                            # On drop la phase veineuse
                            new_row[f'{label}_{b}'] = row[label]
                new_dataframe.append(new_row)
    slices_df = pd.DataFrame(new_dataframe)
    slices_df.dropna(axis=0, inplace = True)

    
    for col in slices_df:
            if 'temps_inj' in col:
                slices_df.drop([col], axis = 1, inplace = True)
            if 'slice_num' in col:
                slices_df.drop([col], axis = 1, inplace = True)
            if 'patient_num' in col and 'ART' not in col:
                slices_df.drop([col], axis = 1, inplace = True)
            if 'patient_num' in col and 'ART' in col: 
                slices_df.rename(columns={'patient_num_ART': 'patient_num'}, inplace=True)

            if 'classe_name' in col and 'ART' not in col:
                slices_df.drop([col], axis = 1, inplace = True)
            if 'classe_name' in col and 'ART' in col: 
                slices_df.rename(columns={'classe_name_ART': 'classe_name'}, inplace=True)
    labels_df = slices_df[['classe_name', 'patient_num']]
    slices_df.drop(['classe_name'], axis = 1, inplace = True)
    
    labels_df.loc[:,'patient_num']+=id_shift
    slices_df.loc[:,'patient_num']+=id_shift

    return slices_df,labels_df
    
slices_CCK, labels_CCK = build_sliced_dataset(df_CCK)
slices_CHC, labels_CHC = build_sliced_dataset(df_CHC, id_shift= 26)
slices_Mixtes, labels_Mixtes = build_sliced_dataset(df_Mixtes, id_shift=251)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labels_df.loc[:,'patient_num']+=id_shift
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labels_df.loc[:,'patient_num']+=id_shift
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labels_df.loc[:,'patient_num']+=id_shift


In [5]:
p_train = 0.7
p_val = 0.15
p_test = 0.15

pat_nums_CCK = list(set(labels_CCK['patient_num']))
pat_nums_CHC = list(set(labels_CHC['patient_num']))

n_CHC = len(pat_nums_CHC)
n_CCK = len(pat_nums_CCK)



random.shuffle(pat_nums_CCK)
random.shuffle(pat_nums_CHC)



nums_CCK_train = pat_nums_CCK[:math.floor(p_train*n_CCK)]
nums_CCK_val = pat_nums_CCK[math.floor(p_train*n_CCK):math.floor((p_train+ p_val)*n_CCK) ]
nums_CCK_test = pat_nums_CCK[math.floor((p_train+ p_val)*n_CCK): ]

nums_CHC_train = pat_nums_CHC[:math.floor(p_train*n_CHC)]
nums_CHC_val = pat_nums_CHC[math.floor(p_train*n_CHC):math.floor((p_train+ p_val)*n_CHC) ]
nums_CHC_test = pat_nums_CHC[math.floor((p_train+ p_val)*n_CHC): ]


slices_CCK_train = slices_CCK[slices_CCK['patient_num'].isin(nums_CCK_train)]
slices_CCK_val = slices_CCK[slices_CCK['patient_num'].isin(nums_CCK_val) ]
slices_CCK_test = slices_CCK[slices_CCK['patient_num'].isin(nums_CCK_test)]

slices_CHC_train = slices_CHC[slices_CHC['patient_num'].isin(nums_CHC_train)]
slices_CHC_val = slices_CHC[slices_CHC['patient_num'].isin(nums_CHC_val)]
slices_CHC_test = slices_CHC[slices_CHC['patient_num'].isin(nums_CHC_test)]


labels_CCK_train = labels_CCK[labels_CCK['patient_num'].isin(nums_CCK_train)]
labels_CCK_val = labels_CCK[labels_CCK['patient_num'].isin(nums_CCK_val)]
labels_CCK_test = labels_CCK[labels_CCK['patient_num'].isin(nums_CCK_test)]

labels_CHC_train = labels_CHC[labels_CHC['patient_num'].isin(nums_CHC_train)]
labels_CHC_val = labels_CHC[labels_CHC['patient_num'].isin(nums_CHC_val)]
labels_CHC_test = labels_CHC[labels_CHC['patient_num'].isin(nums_CHC_test)]

slices_train = pd.concat([slices_CCK_train, slices_CHC_train], ignore_index=True)
slices_val = pd.concat([slices_CCK_val, slices_CHC_val], ignore_index=True)
slices_test = pd.concat([slices_CCK_test, slices_CHC_test], ignore_index=True)

labels_train = pd.concat([labels_CCK_train, labels_CHC_train], ignore_index=True)
labels_val = pd.concat([labels_CCK_val, labels_CHC_val], ignore_index=True)
labels_test = pd.concat([labels_CCK_test, labels_CHC_test], ignore_index=True)


In [6]:
slices_train.to_csv('data/slices_train.csv', index=False)
slices_val.to_csv('data/slices_val.csv', index=False)
slices_test.to_csv('data/slices_test.csv', index=False)

labels_train.to_csv('data/labels_train.csv', index=False)
labels_val.to_csv('data/labels_val.csv', index=False)
labels_test.to_csv('data/labels_test.csv', index=False)

In [7]:
slices_train

Unnamed: 0,original_firstorder_10Percentile_PORT,original_firstorder_90Percentile_PORT,original_firstorder_Energy_PORT,original_firstorder_Entropy_PORT,original_firstorder_InterquartileRange_PORT,original_firstorder_Kurtosis_PORT,original_firstorder_Maximum_PORT,original_firstorder_Mean_PORT,original_firstorder_MeanAbsoluteDeviation_PORT,original_firstorder_Median_PORT,...,original_glszm_SmallAreaLowGrayLevelEmphasis_ART,original_glszm_ZoneEntropy_ART,original_glszm_ZonePercentage_ART,original_glszm_ZoneVariance_ART,original_ngtdm_Busyness_ART,original_ngtdm_Coarseness_ART,original_ngtdm_Complexity_ART,original_ngtdm_Contrast_ART,original_ngtdm_Strength_ART,patient_num
0,588.3,630.5,8144173.0,1.741684,27.75,3.971938,648.0,608.045455,16.867769,607.0,...,0.106260,2.842371,0.636364,0.387755,0.415138,0.243094,4.683577,0.099181,1.855799,1.0
1,612.1,665.0,9640975.0,1.735773,25.75,2.619244,671.0,633.458333,17.201389,626.5,...,0.167042,2.604807,0.791667,0.299169,0.564433,0.219178,3.661820,0.072157,1.111111,1.0
2,627.8,680.6,10689884.0,1.516148,34.00,1.768019,692.0,653.600000,17.536000,654.0,...,0.315359,2.498078,0.680000,0.602076,4.321429,0.206612,1.554714,0.108715,0.560000,1.0
3,616.6,685.4,11680234.0,2.204118,34.00,2.559860,700.0,657.185185,21.497942,663.0,...,0.109871,2.804597,0.814815,0.175620,0.157343,0.300000,7.394236,0.053041,3.540741,1.0
4,603.4,693.0,15838472.0,2.075016,33.00,2.377959,698.0,653.513514,25.415632,660.0,...,0.112452,3.484184,0.540541,0.827500,0.210526,0.289062,3.564984,0.060099,2.423714,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
734,616.6,685.4,11680234.0,2.204118,34.00,2.559860,700.0,657.185185,21.497942,663.0,...,0.109871,2.804597,0.814815,0.175620,0.157343,0.300000,7.394236,0.053041,3.540741,243.0
735,603.4,693.0,15838472.0,2.075016,33.00,2.377959,698.0,653.513514,25.415632,660.0,...,0.112452,3.484184,0.540541,0.827500,0.210526,0.289062,3.564984,0.060099,2.423714,243.0
736,594.5,672.0,14699812.0,2.202292,43.50,2.531900,710.0,638.277778,24.287037,641.0,...,0.138894,3.003620,0.750000,0.370370,0.536900,0.123711,11.851026,0.091903,1.723077,243.0
737,580.6,667.4,15439418.0,2.222483,45.00,2.603712,709.0,628.410256,25.548981,628.0,...,0.105652,3.238921,0.717949,0.452806,0.302846,0.174497,10.065896,0.070465,2.465409,243.0


In [8]:
slices_test

Unnamed: 0,original_firstorder_10Percentile_PORT,original_firstorder_90Percentile_PORT,original_firstorder_Energy_PORT,original_firstorder_Entropy_PORT,original_firstorder_InterquartileRange_PORT,original_firstorder_Kurtosis_PORT,original_firstorder_Maximum_PORT,original_firstorder_Mean_PORT,original_firstorder_MeanAbsoluteDeviation_PORT,original_firstorder_Median_PORT,...,original_glszm_SmallAreaLowGrayLevelEmphasis_ART,original_glszm_ZoneEntropy_ART,original_glszm_ZonePercentage_ART,original_glszm_ZoneVariance_ART,original_ngtdm_Busyness_ART,original_ngtdm_Coarseness_ART,original_ngtdm_Complexity_ART,original_ngtdm_Contrast_ART,original_ngtdm_Strength_ART,patient_num
0,588.3,630.5,8144173.0,1.741684,27.75,3.971938,648.0,608.045455,16.867769,607.0,...,0.106260,2.842371,0.636364,0.387755,0.415138,0.243094,4.683577,0.099181,1.855799,6.0
1,612.1,665.0,9640975.0,1.735773,25.75,2.619244,671.0,633.458333,17.201389,626.5,...,0.167042,2.604807,0.791667,0.299169,0.564433,0.219178,3.661820,0.072157,1.111111,6.0
2,627.8,680.6,10689884.0,1.516148,34.00,1.768019,692.0,653.600000,17.536000,654.0,...,0.315359,2.498078,0.680000,0.602076,4.321429,0.206612,1.554714,0.108715,0.560000,6.0
3,616.6,685.4,11680234.0,2.204118,34.00,2.559860,700.0,657.185185,21.497942,663.0,...,0.109871,2.804597,0.814815,0.175620,0.157343,0.300000,7.394236,0.053041,3.540741,6.0
4,603.4,693.0,15838472.0,2.075016,33.00,2.377959,698.0,653.513514,25.415632,660.0,...,0.112452,3.484184,0.540541,0.827500,0.210526,0.289062,3.564984,0.060099,2.423714,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181,616.6,685.4,11680234.0,2.204118,34.00,2.559860,700.0,657.185185,21.497942,663.0,...,0.109871,2.804597,0.814815,0.175620,0.157343,0.300000,7.394236,0.053041,3.540741,244.0
182,603.4,693.0,15838472.0,2.075016,33.00,2.377959,698.0,653.513514,25.415632,660.0,...,0.112452,3.484184,0.540541,0.827500,0.210526,0.289062,3.564984,0.060099,2.423714,244.0
183,594.5,672.0,14699812.0,2.202292,43.50,2.531900,710.0,638.277778,24.287037,641.0,...,0.138894,3.003620,0.750000,0.370370,0.536900,0.123711,11.851026,0.091903,1.723077,244.0
184,580.6,667.4,15439418.0,2.222483,45.00,2.603712,709.0,628.410256,25.548981,628.0,...,0.105652,3.238921,0.717949,0.452806,0.302846,0.174497,10.065896,0.070465,2.465409,244.0
