# Project MDS #

This notebook contains a part of the work made by Grégoire Gissot for the project MDS.
In this notebook, the only thing done is a preprocess of the data to store it as tensor for purposes of predictions in a multiway paradigm.

Two types of data are important here : 
- Global data gives aggregation of the IRM data on a patient level for different times (which makes it a 3D tensor individual x features x time)
- Slice data gives the same data but for each slice of a patient for different times. Every patient do not have the same number of slices depending on the size of the tumor (which makes it a 4D tensor individual x features x nb slice x time)

If the reader has the raw data and want to reuse the code in the file tensor_analysis.ipynb, he or she must run these cells because it will create the file used in the other file. 
The raw data used are the one provided by Laurent Le Brusquet in the context of AMDA lectures.

In [2]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd


### Load the data ###

In [3]:
PATH_TO_DATA_SLICE = './Données-20240401/radiomiques_multislice.xlsx'
PATH_TO_DATA_GLOBAL = './Données-20240401/radiomiques_global.xlsx'
PATH_TO_DATA_PATIENT = './Données-20240401/Descriptif_patients.xlsx'

### Global Data (Naturally 3D-tensor) ###

In [4]:
# Load the data
df = pd.read_excel(PATH_TO_DATA_GLOBAL, sheet_name='liver_tumors_all_LLB')
df

Unnamed: 0,classe_name,temps_inj,patient_num,original_firstorder_10Percentile,original_firstorder_90Percentile,original_firstorder_Energy,original_firstorder_Entropy,original_firstorder_InterquartileRange,original_firstorder_Kurtosis,original_firstorder_Maximum,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,Mixtes,VEIN,9,175.0,324.0,354376998.0,3.013456,74.00,4.780095,504.0,...,20.803398,0.025307,6.664670,0.124815,284.093281,5.160969,0.002559,59.816065,0.023803,0.482011
1,Mixtes,TARD,9,156.0,271.0,277383044.0,2.735473,56.00,4.454940,414.0,...,12.453376,0.029698,6.641130,0.098596,361.311999,7.259237,0.002152,34.917062,0.018849,0.226643
2,Mixtes,PORT,9,174.0,289.0,297110913.0,2.730577,51.00,4.103634,441.0,...,14.353061,0.034081,6.529717,0.111934,286.446393,6.781975,0.002379,36.571451,0.018595,0.249370
3,Mixtes,ART,9,180.7,416.3,553036349.0,3.716371,115.00,3.639202,671.0,...,50.323983,0.019024,6.964388,0.204971,59.313141,1.936159,0.003218,146.176224,0.031472,0.950978
4,Mixtes,VEIN,8,130.0,148.1,1189941.0,0.620400,8.25,4.121147,159.0,...,2.629845,0.279624,2.250000,0.133333,62.750000,0.417342,0.345594,0.448039,0.006297,0.473381
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,CCK,ART,11,260.1,894.0,732384007.0,5.230583,408.75,2.690676,1591.0,...,437.528095,0.009052,6.051931,0.807651,0.372228,0.240971,0.004572,5314.309551,0.312577,5.425102
546,CCK,VEIN,10,264.0,359.0,338852314.0,2.628112,55.00,4.110808,403.0,...,33.413109,0.009836,6.090384,0.166814,87.980503,1.770498,0.003704,39.277987,0.015188,0.332041
547,CCK,TARD,10,253.0,341.0,307968612.0,2.540563,50.00,5.047944,384.0,...,39.320701,0.010246,6.049510,0.144376,135.393353,1.626557,0.003709,38.563334,0.013078,0.379284
548,CCK,PORT,10,256.0,362.0,335671262.0,2.830569,61.00,5.944696,461.0,...,55.721126,0.006331,5.962496,0.217659,52.197973,1.103165,0.003508,78.669903,0.014902,0.534732


In [5]:
# Remove columns that started  by diagnostic
df = df.loc[:, ~df.columns.str.startswith('diagnostic')]
df = df.loc[:, ~df.columns.str.startswith('Unnamed')]
df = df.loc[:, ~df.columns.str.startswith('Column')]
df[df["patient_num"]==209]

Unnamed: 0,classe_name,temps_inj,patient_num,original_firstorder_10Percentile,original_firstorder_90Percentile,original_firstorder_Energy,original_firstorder_Entropy,original_firstorder_InterquartileRange,original_firstorder_Kurtosis,original_firstorder_Maximum,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
382,CHC,VEIN,209,113.0,241.0,57694164.0,3.026888,64.75,3.38104,305.0,...,34.122267,0.019503,5.662129,0.266139,30.801575,1.412162,0.006326,50.312115,0.039555,0.414733
383,CHC,TARD,209,121.0,211.0,36482241.0,2.629101,47.0,3.903166,263.0,...,29.120744,0.017169,5.414737,0.253629,43.788894,1.588336,0.006942,35.743607,0.029739,0.326541
384,CHC,PORT,209,128.0,254.0,82522386.0,2.933232,53.0,4.24594,330.0,...,37.666164,0.022469,5.705455,0.264812,40.912977,1.349063,0.004839,62.779333,0.03477,0.410418
385,CHC,ART,209,70.0,99.0,13880944.0,1.232328,15.0,5.085917,123.0,...,5.774008,0.046536,4.737571,0.067491,816.012237,10.07355,0.004066,3.834792,0.010559,0.049786


In [6]:
# Modify patient_num column to add 1000 if patient has CCK as classe_name and 2000 if patient has Mixtes as classe_name to make it a real ID 

for index, row in df.iterrows():
    if row['classe_name'] == 'CCK':
        df.at[index, 'patient_num'] = row['patient_num'] + 1000
    if row['classe_name'] == 'Mixtes':
        df.at[index, 'patient_num'] = row['patient_num'] + 2000

df


Unnamed: 0,classe_name,temps_inj,patient_num,original_firstorder_10Percentile,original_firstorder_90Percentile,original_firstorder_Energy,original_firstorder_Entropy,original_firstorder_InterquartileRange,original_firstorder_Kurtosis,original_firstorder_Maximum,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,Mixtes,VEIN,2009,175.0,324.0,354376998.0,3.013456,74.00,4.780095,504.0,...,20.803398,0.025307,6.664670,0.124815,284.093281,5.160969,0.002559,59.816065,0.023803,0.482011
1,Mixtes,TARD,2009,156.0,271.0,277383044.0,2.735473,56.00,4.454940,414.0,...,12.453376,0.029698,6.641130,0.098596,361.311999,7.259237,0.002152,34.917062,0.018849,0.226643
2,Mixtes,PORT,2009,174.0,289.0,297110913.0,2.730577,51.00,4.103634,441.0,...,14.353061,0.034081,6.529717,0.111934,286.446393,6.781975,0.002379,36.571451,0.018595,0.249370
3,Mixtes,ART,2009,180.7,416.3,553036349.0,3.716371,115.00,3.639202,671.0,...,50.323983,0.019024,6.964388,0.204971,59.313141,1.936159,0.003218,146.176224,0.031472,0.950978
4,Mixtes,VEIN,2008,130.0,148.1,1189941.0,0.620400,8.25,4.121147,159.0,...,2.629845,0.279624,2.250000,0.133333,62.750000,0.417342,0.345594,0.448039,0.006297,0.473381
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,CCK,ART,1011,260.1,894.0,732384007.0,5.230583,408.75,2.690676,1591.0,...,437.528095,0.009052,6.051931,0.807651,0.372228,0.240971,0.004572,5314.309551,0.312577,5.425102
546,CCK,VEIN,1010,264.0,359.0,338852314.0,2.628112,55.00,4.110808,403.0,...,33.413109,0.009836,6.090384,0.166814,87.980503,1.770498,0.003704,39.277987,0.015188,0.332041
547,CCK,TARD,1010,253.0,341.0,307968612.0,2.540563,50.00,5.047944,384.0,...,39.320701,0.010246,6.049510,0.144376,135.393353,1.626557,0.003709,38.563334,0.013078,0.379284
548,CCK,PORT,1010,256.0,362.0,335671262.0,2.830569,61.00,5.944696,461.0,...,55.721126,0.006331,5.962496,0.217659,52.197973,1.103165,0.003508,78.669903,0.014902,0.534732


Separate the different phases 

In [7]:
df_art = df[df['temps_inj']=='ART']
df_port = df[df['temps_inj']=='PORT']
df_vein = df[df['temps_inj']=='VEIN']   
df_tard = df[df['temps_inj']=='TARD']

print(len(df_art))
print(len(df_port))
print(len(df_vein))
print(len(df_tard))

142
140
133
135


We do not have the same number of patients for each time so we need to investigate the different intersections.

In [8]:
# Looking for the intersection between the 4 datasets

set_art = set(df_art['patient_num'])
set_port = set(df_port['patient_num'])
set_vein = set(df_vein['patient_num'])
set_tard = set(df_tard['patient_num'])

print(len(set_art & set_port & set_vein & set_tard))
print(len(set_art & set_port & set_vein))
print(len(set_art & set_port & set_tard))
print(len(set_art & set_vein & set_tard))
print(len(set_port & set_vein & set_tard))


117
122
126
121
121


Considering the number of the different intersections, let's keep the patients with the four times.

In [9]:
df_art = df_art[df_art['patient_num'].isin(set_art & set_port & set_vein & set_tard)]
df_port = df_port[df_port['patient_num'].isin(set_art & set_port & set_vein & set_tard)]
df_vein = df_vein[df_vein['patient_num'].isin(set_art & set_port & set_vein & set_tard)]
df_tard = df_tard[df_tard['patient_num'].isin(set_art & set_port & set_vein & set_tard)]

print(len(df_art))
print(len(df_port))
print(len(df_vein))
print(len(df_tard))

117
117
117
117


In [10]:
# Sorting the values to be sure of the order when we will merge the datasets

df_art.sort_values(by=['patient_num'], inplace=True)
df_port.sort_values(by=['patient_num'], inplace=True)
df_vein.sort_values(by=['patient_num'], inplace=True)
df_tard.sort_values(by=['patient_num'], inplace=True)

In [14]:
# These columns are useless for the next steps
 
df_art_mod = df_art.drop(columns=['temps_inj', 'patient_num'])
df_port_mod = df_port.drop(columns=['temps_inj', 'patient_num'])
df_vein_mod = df_vein.drop(columns=['temps_inj', 'patient_num'])
df_tard_mod = df_tard.drop(columns=['temps_inj', 'patient_num'])


array_art = df_art_mod.to_numpy()
array_port = df_port_mod.to_numpy()
array_vein = df_vein_mod.to_numpy()
array_tard = df_tard_mod.to_numpy()

#array_art.shape # 117,108

array_tot = np.array([array_art, array_port, array_vein, array_tard])
array_tot = np.moveaxis(array_tot, 0, -1)
print(array_tot.shape)

array_tot

(117, 108, 4)


array([[['CHC', 'CHC', 'CHC', 'CHC'],
        [158.5, 249.6, 225.4, 196.0],
        [260.5, 295.4, 282.0, 243.0],
        ...,
        [21.704557492833366, 6.770914206971624, 7.189070439828896,
         8.00671786304326],
        [0.0765875260949968, 0.0201577921280927, 0.0287080541190117,
         0.0244292870857662],
        [1.0944021634513812, 0.5648420133724993, 0.3659540828090517,
         0.459584546626976]],

       [['CHC', 'CHC', 'CHC', 'CHC'],
        [75.0, 71.0, 73.0, 74.0],
        [257.0, 284.0, 276.0, 233.0],
        ...,
        [104.00506496123708, 204.28553168980105, 183.97120045041376,
         74.75877749670286],
        [0.0458451619247605, 0.0371437518611409, 0.0397445198417052,
         0.0414182530421824],
        [0.1121345058667184, 0.2588203288070583, 0.2140829122563279,
         0.0760727763333614]],

       [['CHC', 'CHC', 'CHC', 'CHC'],
        [59.0, 78.0, 85.0, 84.0],
        [85.0, 130.0, 130.0, 124.0],
        ...,
        [2.7605659144518104, 10.7913

In [221]:
# Save the array
np.save('global_tensored_data.npy', array_tot)


### Slice Data (Naturally 4D-tensor) ###

In [65]:
df_slice = pd.read_excel(PATH_TO_DATA_SLICE, sheet_name='liver_tumors_slices_LLB')
df_slice

Unnamed: 0,slice_num,classe_name,temps_inj,patient_num,original_firstorder_10Percentile,original_firstorder_90Percentile,original_firstorder_Energy,original_firstorder_Entropy,original_firstorder_InterquartileRange,original_firstorder_Kurtosis,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,54.0,CCK,ART,1,405.8,784.6,108013119.0,4.423730,236.50,2.044339,...,177.411029,0.020461,4.807961,0.906355,0.114786,0.141803,0.031819,481.985606,0.238792,6.686087
1,55.0,CCK,ART,1,374.6,877.0,217855694.0,4.842085,295.50,2.146068,...,344.967045,0.011439,5.303537,0.897959,0.109814,0.070190,0.025677,972.510179,0.200256,10.582781
2,56.0,CCK,ART,1,350.0,909.7,316932436.0,5.024121,356.00,2.095717,...,443.780551,0.009793,5.502238,0.893275,0.128115,0.075007,0.019048,1318.449846,0.241414,9.740356
3,57.0,CCK,ART,1,318.6,946.4,342297374.0,5.179079,400.00,2.042928,...,471.219561,0.012116,5.638428,0.888738,0.140052,0.076645,0.019024,1537.946771,0.262199,11.378192
4,58.0,CCK,ART,1,292.0,954.7,380468266.0,5.255932,445.00,2.011028,...,537.628159,0.009071,5.703871,0.901699,0.126742,0.064824,0.018696,1834.491369,0.236669,13.638704
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9399,52.0,Mixtes,VEIN,38,121.1,252.0,25630731.0,3.052709,70.00,2.764641,...,43.815541,0.023964,4.243184,0.700893,0.567352,0.692321,0.014601,54.899149,0.042281,0.879847
9400,53.0,Mixtes,VEIN,38,128.0,246.0,24900027.0,2.852691,62.25,2.554900,...,31.645868,0.032856,4.148539,0.686728,0.711001,0.897469,0.015630,34.150679,0.040393,0.635830
9401,54.0,Mixtes,VEIN,38,135.0,245.0,19515066.0,2.804114,56.00,3.007619,...,31.082282,0.032314,4.162378,0.656566,0.889467,0.700445,0.018500,35.414902,0.040218,0.810971
9402,55.0,Mixtes,VEIN,38,162.8,249.0,9604550.0,2.424573,46.00,2.659171,...,11.633240,0.110633,3.965590,0.614679,0.995099,1.009248,0.042057,10.105152,0.046395,0.669801


In [66]:
# Remove columns that started  by diagnostic
df_slice = df_slice.loc[:, ~df_slice.columns.str.startswith('diagnostic')]
df_slice = df_slice.loc[:, ~df_slice.columns.str.startswith('Unnamed')]
df_slice = df_slice.loc[:, ~df_slice.columns.str.startswith('Column')]

df_slice

Unnamed: 0,slice_num,classe_name,temps_inj,patient_num,original_firstorder_10Percentile,original_firstorder_90Percentile,original_firstorder_Energy,original_firstorder_Entropy,original_firstorder_InterquartileRange,original_firstorder_Kurtosis,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,54.0,CCK,ART,1,405.8,784.6,108013119.0,4.423730,236.50,2.044339,...,177.411029,0.020461,4.807961,0.906355,0.114786,0.141803,0.031819,481.985606,0.238792,6.686087
1,55.0,CCK,ART,1,374.6,877.0,217855694.0,4.842085,295.50,2.146068,...,344.967045,0.011439,5.303537,0.897959,0.109814,0.070190,0.025677,972.510179,0.200256,10.582781
2,56.0,CCK,ART,1,350.0,909.7,316932436.0,5.024121,356.00,2.095717,...,443.780551,0.009793,5.502238,0.893275,0.128115,0.075007,0.019048,1318.449846,0.241414,9.740356
3,57.0,CCK,ART,1,318.6,946.4,342297374.0,5.179079,400.00,2.042928,...,471.219561,0.012116,5.638428,0.888738,0.140052,0.076645,0.019024,1537.946771,0.262199,11.378192
4,58.0,CCK,ART,1,292.0,954.7,380468266.0,5.255932,445.00,2.011028,...,537.628159,0.009071,5.703871,0.901699,0.126742,0.064824,0.018696,1834.491369,0.236669,13.638704
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9399,52.0,Mixtes,VEIN,38,121.1,252.0,25630731.0,3.052709,70.00,2.764641,...,43.815541,0.023964,4.243184,0.700893,0.567352,0.692321,0.014601,54.899149,0.042281,0.879847
9400,53.0,Mixtes,VEIN,38,128.0,246.0,24900027.0,2.852691,62.25,2.554900,...,31.645868,0.032856,4.148539,0.686728,0.711001,0.897469,0.015630,34.150679,0.040393,0.635830
9401,54.0,Mixtes,VEIN,38,135.0,245.0,19515066.0,2.804114,56.00,3.007619,...,31.082282,0.032314,4.162378,0.656566,0.889467,0.700445,0.018500,35.414902,0.040218,0.810971
9402,55.0,Mixtes,VEIN,38,162.8,249.0,9604550.0,2.424573,46.00,2.659171,...,11.633240,0.110633,3.965590,0.614679,0.995099,1.009248,0.042057,10.105152,0.046395,0.669801


In [67]:
# Modify patient_num column to add 1000 if patient has CCK as classe_name and 2000 if patient has Mixtes as classe_name to make it a real ID 

for index, row in df_slice.iterrows():
    if row['classe_name'] == 'CCK':
        df_slice.at[index, 'patient_num'] = row['patient_num'] + 1000
    if row['classe_name'] == 'Mixtes':
        df_slice.at[index, 'patient_num'] = row['patient_num'] + 2000

df_slice


Unnamed: 0,slice_num,classe_name,temps_inj,patient_num,original_firstorder_10Percentile,original_firstorder_90Percentile,original_firstorder_Energy,original_firstorder_Entropy,original_firstorder_InterquartileRange,original_firstorder_Kurtosis,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,54.0,CCK,ART,1001,405.8,784.6,108013119.0,4.423730,236.50,2.044339,...,177.411029,0.020461,4.807961,0.906355,0.114786,0.141803,0.031819,481.985606,0.238792,6.686087
1,55.0,CCK,ART,1001,374.6,877.0,217855694.0,4.842085,295.50,2.146068,...,344.967045,0.011439,5.303537,0.897959,0.109814,0.070190,0.025677,972.510179,0.200256,10.582781
2,56.0,CCK,ART,1001,350.0,909.7,316932436.0,5.024121,356.00,2.095717,...,443.780551,0.009793,5.502238,0.893275,0.128115,0.075007,0.019048,1318.449846,0.241414,9.740356
3,57.0,CCK,ART,1001,318.6,946.4,342297374.0,5.179079,400.00,2.042928,...,471.219561,0.012116,5.638428,0.888738,0.140052,0.076645,0.019024,1537.946771,0.262199,11.378192
4,58.0,CCK,ART,1001,292.0,954.7,380468266.0,5.255932,445.00,2.011028,...,537.628159,0.009071,5.703871,0.901699,0.126742,0.064824,0.018696,1834.491369,0.236669,13.638704
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9399,52.0,Mixtes,VEIN,2038,121.1,252.0,25630731.0,3.052709,70.00,2.764641,...,43.815541,0.023964,4.243184,0.700893,0.567352,0.692321,0.014601,54.899149,0.042281,0.879847
9400,53.0,Mixtes,VEIN,2038,128.0,246.0,24900027.0,2.852691,62.25,2.554900,...,31.645868,0.032856,4.148539,0.686728,0.711001,0.897469,0.015630,34.150679,0.040393,0.635830
9401,54.0,Mixtes,VEIN,2038,135.0,245.0,19515066.0,2.804114,56.00,3.007619,...,31.082282,0.032314,4.162378,0.656566,0.889467,0.700445,0.018500,35.414902,0.040218,0.810971
9402,55.0,Mixtes,VEIN,2038,162.8,249.0,9604550.0,2.424573,46.00,2.659171,...,11.633240,0.110633,3.965590,0.614679,0.995099,1.009248,0.042057,10.105152,0.046395,0.669801


In [68]:
df_slice_art = df_slice[df_slice['temps_inj']=='ART']
df_slice_port = df_slice[df_slice['temps_inj']=='PORT']
df_slice_vein = df_slice[df_slice['temps_inj']=='VEIN']
df_slice_tard = df_slice[df_slice['temps_inj']=='TARD']


In [69]:
df_slice_art = df_slice_art[df_slice_art['patient_num'].isin(set_art & set_port & set_vein & set_tard)]
df_slice_port = df_slice_port[df_slice_port['patient_num'].isin(set_art & set_port & set_vein & set_tard)]
df_slice_vein = df_slice_vein[df_slice_vein['patient_num'].isin(set_art & set_port & set_vein & set_tard)]
df_slice_tard = df_slice_tard[df_slice_tard['patient_num'].isin(set_art & set_port & set_vein & set_tard)]

print(len(df_slice_art))
print(len(df_slice_port))
print(len(df_slice_vein))
print(len(df_slice_tard))

2036
2015
2004
1978


Same as global, there is a need to investigate the intersection between the different datasets. Moreover, we have to look at the number of slices per patient.

In [70]:
# Number of slices per patient
df_slice_art['patient_num'].value_counts()
df_slice_port['patient_num'].value_counts()
df_slice_vein['patient_num'].value_counts()
df_slice_tard['patient_num'].value_counts()

patient_num
2030    91
13      65
104     63
62      57
2020    50
        ..
1        3
2016     3
212      2
213      2
2008     2
Name: count, Length: 117, dtype: int64

We chose to keep the patients with at least 3 slices which is a good trade-off for us.

In [71]:
# Keep the patients with at least 3 slices for ART, PORT, VEIN and TARD

df_slice_art = df_slice_art[df_slice_art['patient_num'].isin(df_slice_art['patient_num'].value_counts()[df_slice_art['patient_num'].value_counts()>=3].index)]
df_slice_port = df_slice_port[df_slice_port['patient_num'].isin(df_slice_port['patient_num'].value_counts()[df_slice_port['patient_num'].value_counts()>=3].index)]
df_slice_vein = df_slice_vein[df_slice_vein['patient_num'].isin(df_slice_vein['patient_num'].value_counts()[df_slice_vein['patient_num'].value_counts()>=3].index)]
df_slice_tard = df_slice_tard[df_slice_tard['patient_num'].isin(df_slice_tard['patient_num'].value_counts()[df_slice_tard['patient_num'].value_counts()>=3].index)]

print(len(df_slice_art['patient_num'].unique()))
print(len(df_slice_port['patient_num'].unique()))
print(len(df_slice_vein['patient_num'].unique()))
print(len(df_slice_tard['patient_num'].unique()))

116
116
114
114


In [72]:
set_art_slice = set(df_slice_art['patient_num'])
set_port_slice = set(df_slice_port['patient_num'])
set_vein_slice = set(df_slice_vein['patient_num'])
set_tard_slice = set(df_slice_tard['patient_num'])

print(len(set_art_slice & set_port_slice & set_vein_slice & set_tard_slice))    
print(len(set_art_slice & set_port_slice & set_vein_slice))
print(len(set_art_slice & set_port_slice & set_tard_slice))
print(len(set_art_slice & set_vein_slice & set_tard_slice))
print(len(set_port_slice & set_vein_slice & set_tard_slice))



114
114
114
114
114


Finally, we got 114 patients in the dataset.

In [73]:
df_slice_art = df_slice_art[df_slice_art['patient_num'].isin(set_art_slice & set_port_slice & set_vein_slice & set_tard_slice)]
df_slice_port = df_slice_port[df_slice_port['patient_num'].isin(set_art_slice & set_port_slice & set_vein_slice & set_tard_slice)]
df_slice_vein = df_slice_vein[df_slice_vein['patient_num'].isin(set_art_slice & set_port_slice & set_vein_slice & set_tard_slice)]
df_slice_tard = df_slice_tard[df_slice_tard['patient_num'].isin(set_art_slice & set_port_slice & set_vein_slice & set_tard_slice)]



When we have the choice, we keep the 3 most central slices because we hope it gives more insights on the nature of the tumor.

In [74]:
# Keep the 3 middle slices for each patient


df_slice_art['median_slice'] = df_slice_art.groupby('patient_num')['slice_num'].transform('median').astype(int)
df_slice_art['median_minus'] = df_slice_art['median_slice'] - 1
df_slice_art['median_plus'] = df_slice_art['median_slice'] + 1

df_slice_port['median_slice'] = df_slice_port.groupby('patient_num')['slice_num'].transform('median').astype(int)
df_slice_port['median_minus'] = df_slice_port['median_slice'] - 1
df_slice_port['median_plus'] = df_slice_port['median_slice'] + 1

df_slice_vein['median_slice'] = df_slice_vein.groupby('patient_num')['slice_num'].transform('median').astype(int)
df_slice_vein['median_minus'] = df_slice_vein['median_slice'] - 1
df_slice_vein['median_plus'] = df_slice_vein['median_slice'] + 1

df_slice_tard['median_slice'] = df_slice_tard.groupby('patient_num')['slice_num'].transform('median').astype(int)
df_slice_tard['median_minus'] = df_slice_tard['median_slice'] - 1
df_slice_tard['median_plus'] = df_slice_tard['median_slice'] + 1

In [75]:
df_slice_art_med = df_slice_art[(df_slice_art['slice_num'] == df_slice_art['median_slice']) ]
df_slice_art_minus = df_slice_art[(df_slice_art['slice_num'] == df_slice_art['median_minus']) ]
df_slice_art_plus = df_slice_art[(df_slice_art['slice_num'] == df_slice_art['median_plus']) ]

df_slice_port_med = df_slice_port[(df_slice_port['slice_num'] == df_slice_port['median_slice']) ]
df_slice_port_minus = df_slice_port[(df_slice_port['slice_num'] == df_slice_port['median_minus']) ]
df_slice_port_plus = df_slice_port[(df_slice_port['slice_num'] == df_slice_port['median_plus']) ]

df_slice_vein_med = df_slice_vein[(df_slice_vein['slice_num'] == df_slice_vein['median_slice']) ]
df_slice_vein_minus = df_slice_vein[(df_slice_vein['slice_num'] == df_slice_vein['median_minus']) ]
df_slice_vein_plus = df_slice_vein[(df_slice_vein['slice_num'] == df_slice_vein['median_plus']) ]

df_slice_tard_med = df_slice_tard[(df_slice_tard['slice_num'] == df_slice_tard['median_slice']) ]
df_slice_tard_minus = df_slice_tard[(df_slice_tard['slice_num'] == df_slice_tard['median_minus']) ]
df_slice_tard_plus = df_slice_tard[(df_slice_tard['slice_num'] == df_slice_tard['median_plus']) ]

In [76]:
df_slice_art_med = df_slice_art_med.drop(columns=['median_slice', 'median_minus', 'median_plus','slice_num','temps_inj'])
df_slice_art_minus = df_slice_art_minus.drop(columns=['median_slice', 'median_minus', 'median_plus','slice_num','temps_inj'])
df_slice_art_plus = df_slice_art_plus.drop(columns=['median_slice', 'median_minus', 'median_plus','slice_num','temps_inj'])

df_slice_port_med = df_slice_port_med.drop(columns=['median_slice', 'median_minus', 'median_plus','slice_num','temps_inj'])
df_slice_port_minus = df_slice_port_minus.drop(columns=['median_slice', 'median_minus', 'median_plus','slice_num','temps_inj'])
df_slice_port_plus = df_slice_port_plus.drop(columns=['median_slice', 'median_minus', 'median_plus','slice_num','temps_inj'])

df_slice_vein_med = df_slice_vein_med.drop(columns=['median_slice', 'median_minus', 'median_plus','slice_num','temps_inj'])
df_slice_vein_minus = df_slice_vein_minus.drop(columns=['median_slice', 'median_minus', 'median_plus','slice_num','temps_inj'])
df_slice_vein_plus = df_slice_vein_plus.drop(columns=['median_slice', 'median_minus', 'median_plus','slice_num','temps_inj'])

df_slice_tard_med = df_slice_tard_med.drop(columns=['median_slice', 'median_minus', 'median_plus','slice_num','temps_inj'])
df_slice_tard_minus = df_slice_tard_minus.drop(columns=['median_slice', 'median_minus', 'median_plus','slice_num','temps_inj'])
df_slice_tard_plus = df_slice_tard_plus.drop(columns=['median_slice', 'median_minus', 'median_plus','slice_num','temps_inj'])



In [77]:
df_slice_art_med.sort_values(by=['patient_num'], inplace=True)
df_slice_art_minus.sort_values(by=['patient_num'], inplace=True)
df_slice_art_plus.sort_values(by=['patient_num'], inplace=True)

df_slice_port_med.sort_values(by=['patient_num'], inplace=True)
df_slice_port_minus.sort_values(by=['patient_num'], inplace=True)
df_slice_port_plus.sort_values(by=['patient_num'], inplace=True)

df_slice_tard_med.sort_values(by=['patient_num'], inplace=True)
df_slice_tard_minus.sort_values(by=['patient_num'], inplace=True)
df_slice_tard_plus.sort_values(by=['patient_num'], inplace=True)

df_slice_vein_med.sort_values(by=['patient_num'], inplace=True)
df_slice_vein_minus.sort_values(by=['patient_num'], inplace=True)
df_slice_vein_plus.sort_values(by=['patient_num'], inplace=True)





In [78]:
array_art_med = df_slice_art_med.to_numpy()
array_art_minus = df_slice_art_minus.to_numpy()
array_art_plus = df_slice_art_plus.to_numpy()

# Concatenate the 3 array on a new axis
array_art = np.array([array_art_minus, array_art_med, array_art_plus])
array_art = np.moveaxis(array_art, 0, 2)

array_port_med = df_slice_port_med.to_numpy()
array_port_minus = df_slice_port_minus.to_numpy()
array_port_plus = df_slice_port_plus.to_numpy()

# Concatenate the 3 array on a new axis
array_port = np.array([array_port_minus, array_port_med, array_port_plus])
array_port = np.moveaxis(array_port, 0, 2)

array_vein_med = df_slice_vein_med.to_numpy()
array_vein_minus = df_slice_vein_minus.to_numpy()
array_vein_plus = df_slice_vein_plus.to_numpy()

# Concatenate the 3 array on a new axis
array_vein = np.array([array_vein_minus, array_vein_med, array_vein_plus])
array_vein = np.moveaxis(array_vein, 0, 2)

array_tard_med = df_slice_tard_med.to_numpy()
array_tard_minus = df_slice_tard_minus.to_numpy()
array_tard_plus = df_slice_tard_plus.to_numpy()

# Concatenate the 3 array on a new axis
array_tard = np.array([array_tard_minus, array_tard_med, array_tard_plus])
array_tard = np.moveaxis(array_tard, 0, 2)

In [79]:
array_final = np.array([array_art, array_port, array_vein, array_tard])
array_final = np.moveaxis(array_final, 0, 3)

print(array_final.shape)
array_final


(114, 95, 3, 4)


array([[[['CHC', 'CHC', 'CHC', 'CHC'],
         ['CHC', 'CHC', 'CHC', 'CHC'],
         ['CHC', 'CHC', 'CHC', 'CHC']],

        [[1, 1, 1, 1],
         [1, 1, 1, 1],
         [1, 1, 1, 1]],

        [[183.4, 258.0, 233.6, 201.8],
         [184.8, 261.0, 231.0, 194.0],
         [187.0, 258.4, 231.5, 190.6]],

        ...,

        [[4.5814814814814815, 2.795913637434733, 1.1548500881834214,
          3.2481721563460697],
         [20.946367829701163, 4.601851200828104, 4.616158411685088,
          7.383861661512116],
         [11.850729131979133, 1.3263507283791258, 5.591705211825014,
          3.355005593568697]],

        [[0.1337448559670782, 0.0331772847547323, 0.0529052143135362,
          0.0423411065386374],
         [0.1049588356401342, 0.0268152489717886, 0.0540304627265523,
          0.0687362145189408],
         [0.1085575810185185, 0.0358189300411522, 0.0523526752691707,
          0.0330080907795644]],

        [[1.606837606837607, 0.838235294117647, 0.6666666666666666,
     

Important to notice that we keep the slice number which is not a real feature and must not be present when doing prediction (second feature)

In [273]:
# Save the array in a npy file
np.save('./Données-20240401/tensored_data_3slices.npy', array_final)