# Data processing for machine learning



## Import libraries

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder

### Function folder creation

In [2]:
def createFolder(folder_path):
    """Create the directory ``folder_path`` together with any missing parent directories.

    Parameters
    ----------
    folder_path : str or pathlib.Path
        Directory to create. No error is raised if the directory already exists.
    """
    Path(folder_path).mkdir(parents=True, exist_ok=True)


## Data importation and pre-processing

### Load data from different experiments
Each experiment's .csv file is loaded into its own dataframe and tagged with its experiment ID. The dataframes of a given cell type are then concatenated into a single dataframe, so every row remains traceable to its experiment.

In [3]:
# Folder containing raw data of PDX and A673 cell type experiments
folder_rawdata_path = Path("data/raw_data")

# List the .csv files of the raw data folder once, sorted by name.
# Names and paths are derived from the same listing, so a file name can never be paired
# with the content of another file, and the row order of the result is reproducible.
csv_file_paths = sorted(f for f in folder_rawdata_path.glob("*.csv") if f.is_file())
csv_file_names = [f.name for f in csv_file_paths]
if not csv_file_paths:
    raise FileNotFoundError(f"No .csv file found in '{folder_rawdata_path}'")

# One list of dataframes per cell type
df_A673_list = []
df_PDX_list = []

# File names follow the pattern Celltype_exp1234_X.csv where Celltype is A673 or PDX and
# 1234 is the four-digit experiment id. The suffix _X (A or B) is optional and only present
# when several experiments share the same id.
# The pattern captures everything between 'exp' and '.csv', i.e. '1234' or '1234_X'.
pattern_exp = r"exp(.*?)\.csv"

for csv_file_path in csv_file_paths:
    csv_file_name = csv_file_path.name

    # Fail with an explicit message instead of an AttributeError on None
    exp_id = re.search(pattern_exp, csv_file_name)
    if exp_id is None:
        raise ValueError(
            f"Cannot extract the experiment ID from '{csv_file_name}': "
            "expected a name like 'Celltype_exp1234_X.csv'"
        )

    # The raw files use ';' as field separator and ',' as decimal mark
    df = pd.read_csv(csv_file_path, sep=';', decimal=',')

    # Add a column 'Experiment ID' (first position) with the id of the experiment
    df.insert(0, 'Experiment ID', exp_id.group(1))

    # Route the dataframe to the list of its cell type.
    # Files whose name contains neither 'A673' nor 'PDX' are ignored.
    if "A673" in csv_file_name:
        df_A673_list.append(df)
    elif "PDX" in csv_file_name:
        df_PDX_list.append(df)


# Concatenate the dataframes of the different experiments, one dataframe per cell type
df_concat_A673 = pd.concat(df_A673_list, ignore_index=True)
df_concat_PDX = pd.concat(df_PDX_list, ignore_index=True)


In [4]:
# Preview of the concatenated dataframes.
# NOTE: a notebook cell only renders its last expression, so only df_concat_PDX is displayed.
df_concat_A673
df_concat_PDX

Unnamed: 0,Experiment ID,day,drug concentration (uM),spheroIndex,Name,Index,Wells,multiple,Area (pix2),Centroid,Perimeter (pix),Solidity,Equivalent Diameter (pix),Circularity,Aspect ratio,Mean grey value,Homogeneity,Energy,Correlation,viability score
0,1305_A,0,0.0,3,3_0905_0,0,905,False,1718.5,"(73, 76)",168.811182,0.940098,46.776727,0.757804,0.973806,452.462699,0.964388,0.917504,0.928949,110.042697
1,1305_A,0,0.0,4,4_1205_0,0,1205,False,1778.5,"(82, 70)",194.468036,0.880228,47.586306,0.590973,0.827615,692.847931,0.961579,0.914106,0.896839,107.184445
2,1305_A,0,0.0,5,5_1504_0,0,1504,False,2285.0,"(65, 68)",252.024385,0.820909,53.938413,0.452075,0.663747,603.939982,0.948965,0.889628,0.913529,84.847736
3,1305_A,0,0.0,7,7_1101_0,0,1101,False,1362.5,"(70, 69)",146.811182,0.952464,41.650797,0.794380,0.914499,541.460982,0.970697,0.934297,0.912357,81.036734
4,1305_A,0,0.0,8,8_0801_0,0,801,False,1354.5,"(80, 72)",171.154328,0.880976,41.528339,0.581049,0.961336,651.215484,0.971015,0.933683,0.918272,89.188045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1442,2406_B,2,20.0,12,12_0210_0,0,210,False,2251.5,"(72, 64)",289.923879,0.798121,53.541562,0.336600,0.617927,979.383469,0.948824,0.890408,0.899373,7.289499
1443,2406_B,2,50.0,1,1_0605_0,0,605,False,2813.0,"(89, 60)",358.308656,0.683846,59.846661,0.275337,0.438507,865.428460,0.939658,0.862436,0.894758,-0.384724
1444,2406_B,2,50.0,6,6_1101_0,0,1101,False,2733.0,"(73, 76)",406.592926,0.668379,58.989522,0.207745,0.771601,860.249515,0.949003,0.864529,0.891042,0.674954
1445,2406_B,2,50.0,7,7_0801_0,0,801,False,1984.0,"(86, 74)",247.681239,0.814450,50.260395,0.406411,0.823776,1066.632205,0.963622,0.903435,0.884403,5.349007


### Remove unnecessary columns, convert columns to integers
The dataframes contain data that is not needed for the machine learning: image names, wells, centroids...

In [5]:
df_concat_A673.columns

Index(['Experiment ID', 'day', 'drug concentration (uM)', 'spheroIndex',
       'Name', 'Index', 'Wells', 'multiple', 'Area (pix2)', 'Centroid',
       'Perimeter (pix)', 'Solidity', 'Equivalent Diameter (pix)',
       'Circularity', 'Aspect ratio', 'Mean grey value', 'Homogeneity',
       'Energy', 'Correlation', 'viability score'],
      dtype='object')

In [6]:
def dropColumns(df, columns_to_drop):
    """Return a new dataframe without the columns listed in ``columns_to_drop``.

    Names that are not columns of ``df`` are skipped instead of raising an error.
    The input dataframe is left unchanged.
    """
    # errors="ignore" makes pandas skip the labels that are absent from the dataframe
    return df.drop(columns=list(columns_to_drop), errors="ignore")

def convertValues_toInt(df, columns):
    """Return a copy of ``df`` in which the listed columns are cast to ``int``.

    The input dataframe is not modified, so the function can be called again on
    the same input with the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding the columns to convert.
    columns : iterable of str
        Names of the columns to cast. Every name must be a column of ``df``
        (a missing name raises ``KeyError``).

    Returns
    -------
    pandas.DataFrame
        New dataframe with the same columns, the listed ones having an integer dtype.
    """
    # astype with a {column: dtype} mapping converts only the listed columns and returns a new frame
    return df.astype({column: int for column in columns})

In [7]:

# Columns that are not used as machine learning features
columns_to_drop = ['Index', 'Name', 'Wells', 'Centroid', 'Aspect ratio']
# 'day' is the day of the spheroid in the experiment (0 to 2);
# 'spheroIndex' is the position of the spheroid in the tube, which allows us to track it
columns_to_convert = ['day', 'spheroIndex']

# Same two-step cleaning for both cell types: drop the columns, then cast to integers
df_concat_clean_A673 = convertValues_toInt(
    dropColumns(df_concat_A673, columns_to_drop),
    columns_to_convert,
)
df_concat_clean_PDX = convertValues_toInt(
    dropColumns(df_concat_PDX, columns_to_drop),
    columns_to_convert,
)

df_concat_clean_A673.columns

Index(['Experiment ID', 'day', 'drug concentration (uM)', 'spheroIndex',
       'multiple', 'Area (pix2)', 'Perimeter (pix)', 'Solidity',
       'Equivalent Diameter (pix)', 'Circularity', 'Mean grey value',
       'Homogeneity', 'Energy', 'Correlation', 'viability score'],
      dtype='object')

### Add a column 'Spheroid ID'
Each spheroid can be identified by a unique combination: experiment ID, spheroid index and drug concentration. This ID is added to the dataframe for each cell type.

In [8]:
def _addSpheroidID(df):
    """Return ``df`` with a 'spheroid ID' column inserted in second position.

    The ID is the string "(Experiment ID, spheroIndex, drug concentration (uM))" built
    from the values of each row. A dataframe that already has the column is returned
    unchanged, which makes the cell safe to re-run.
    """
    if 'spheroid ID' in df.columns:
        return df

    spheroid_id = df.apply(
        lambda row: f"({row['Experiment ID']}, {row['spheroIndex']}, {row['drug concentration (uM)']})",
        axis=1,
    )
    df = df.copy()
    df.insert(1, 'spheroid ID', spheroid_id)
    return df


# For A673
df_concat_clean_A673 = _addSpheroidID(df_concat_clean_A673)

# For PDX
df_concat_clean_PDX = _addSpheroidID(df_concat_clean_PDX)

# Remove the 'A' or 'B' suffix from 'Experiment ID'. This must happen AFTER the spheroid ID
# is built: the ID keeps the suffix, so spheroids of experiments 1234_A and 1234_B stay distinct.
# NOTE(review): only PDX is stripped here; confirm that A673 file names never carry a suffix.
df_concat_clean_PDX['Experiment ID'] = df_concat_clean_PDX['Experiment ID'].str.split('_').str[0]


### Save .csv file as pre-processed data

In [9]:
# Path relative to the working directory, like the raw and processed data folders.
# Colab's default working directory is /content, so the files land in the same place there.
pre_processed_folder = Path('data/pre_processed_data')
createFolder(pre_processed_folder)

# index=False: the row index carries no information and is not written to the files
df_concat_clean_A673.to_csv(pre_processed_folder / 'A673_pre_processed_data.csv', index=False)
df_concat_clean_PDX.to_csv(pre_processed_folder / 'PDX_pre_processed_data.csv', index=False)

## Transformation of data

### Deal with multiple spheroids

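Some spheroids are split into several smaller spheroids (especially on the first day). The features of these fragments are therefore combined into a single set of values per spheroid: the perimeters are summed; the area and the equivalent diameter are combined so that the total volume is conserved (see the derivation below); circularity, mean grey level, solidity, homogeneity, correlation and energy are averaged.

Derivation for the equivalent diameter and the area, treating each fragment as a sphere and adding up the volumes: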
$$ V_f = V_1 + V_2 $$
$$ \frac{4}{3} \pi r^3_f =  \frac{4}{3} \pi r^3_1 +  \frac{4}{3} \pi r^3_2 $$
$$ r^3_f = r^3_1 + r^3_2 $$
$$ d^3_f = d^3_1 + d^3_2 $$

$$ d_f = (d^3_1 + d^3_2)^{1/3} $$


$$ \left(\frac{4 A_f}{\pi}\right)^{3/2} = \left(\frac{4 A_1}{\pi}\right)^{3/2} + \left(\frac{4 A_2}{\pi}\right)^{3/2} $$
$$ \frac{4 A_f}{\pi} = \left(\left(\frac{4 A_1}{\pi}\right)^{3/2} + \left(\frac{4 A_2}{\pi}\right)^{3/2}\right)^{2/3} $$

$$ A_f = (A_1^{3/2} + A_2^{3/2})^{2/3} $$

In [10]:
def multipleSpheroidManagement(df):
    """Merge the fragments of droplets flagged as 'multiple' into one row per day and spheroid.

    Rows with ``multiple == False`` are kept as they are. Rows with ``multiple == True``
    are grouped by ('day', 'spheroid ID') and each group of at least two rows is
    replaced by a single aggregated row:

    * 'Area (pix2)': ``(sum(A ** 1.5)) ** (2/3)`` and 'Equivalent Diameter (pix)':
      ``(sum(d ** 3)) ** (1/3)``, i.e. the values of a sphere with the summed volume;
    * 'Perimeter (pix)': sum;
    * 'Solidity', 'Circularity', 'Mean grey value', 'Homogeneity', 'Energy',
      'Correlation': mean;
    * 'viability score': first non-missing value of the group (NaN if there is none);
    * 'multiple', 'spheroIndex', 'drug concentration (uM)', 'Experiment ID': value of
      the first row of the group;
    * any other column: NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per detected object, with the columns listed above plus 'day' and
        'spheroid ID'.

    Returns
    -------
    pandas.DataFrame
        The single-spheroid rows followed by the aggregated rows, with a fresh
        0..n-1 index and the column order of ``df``.
    """
    # Split the dataframe on the 'multiple' flag
    df_mean = df[df['multiple'] == False]           # Single spheroids, kept as is
    df_multiple = df[df['multiple'] == True]        # Fragments to aggregate

    rows = []
    for (day, spheroid_id), group in df_multiple.groupby(['day', 'spheroid ID']):
        # NOTE(review): a flagged group made of a single row is not emitted at all, so that
        # row disappears from the result. Confirm this is intended (e.g. a segmentation
        # artefact) rather than an accidental loss of data.
        if len(group) <= 1:
            continue

        viability = group['viability score'].dropna()

        row = {
            # Group keys
            'day': day,
            'spheroid ID': spheroid_id,
            # Attributes shared by the fragments: take them from the first row
            'multiple': group['multiple'].iloc[0],
            'spheroIndex': group['spheroIndex'].iloc[0],
            'drug concentration (uM)': group['drug concentration (uM)'].iloc[0],
            'Experiment ID': group['Experiment ID'].iloc[0],
            'viability score': viability.iloc[0] if not viability.empty else np.nan,
            # Size features: volume-conserving combination for area and diameter, sum for perimeter
            'Area (pix2)': (group['Area (pix2)']**(3/2)).sum()**(2/3),
            'Perimeter (pix)': group['Perimeter (pix)'].sum(),
            'Equivalent Diameter (pix)': (group['Equivalent Diameter (pix)']**3).sum()**(1/3),
            # Shape and texture features: average over the fragments
            'Solidity': group['Solidity'].mean(),
            'Circularity': group['Circularity'].mean(),
            'Mean grey value': group['Mean grey value'].mean(),
            'Homogeneity': group['Homogeneity'].mean(),
            'Energy': group['Energy'].mean(),
            'Correlation': group['Correlation'].mean(),
        }

        # Fill any remaining column with NaN
        for col in df_multiple.columns:
            row.setdefault(col, np.nan)

        rows.append(row)

    # Nothing was merged: return the single spheroids directly. Concatenating with an empty,
    # column-less dataframe relies on deprecated pandas behaviour for the resulting dtypes.
    if not rows:
        return df_mean.reset_index(drop=True)

    # Single spheroids first, aggregated multiple spheroids after
    rows_df = pd.DataFrame(rows)
    return pd.concat([df_mean, rows_df], ignore_index=True)


In [11]:
# Merge the fragments of multiple spheroids into one row per (day, spheroid ID), for A673
df_concat_clean_multi_A673 = multipleSpheroidManagement(df_concat_clean_A673)

# Same processing for PDX
df_concat_clean_multi_PDX = multipleSpheroidManagement(df_concat_clean_PDX)


#### Multiple spheroid count

In [12]:
def _countMultipleSpheroids(df, cell_type):
    """Print and return the number of day-0 droplets with multiple vs single spheroids.

    Counting the flags directly (instead of indexing ``value_counts()`` with True/False)
    also works when one of the two classes is absent from the data.

    Returns
    -------
    tuple
        (n_multiple, n_single, p_multiple, p_single); the proportions are NaN when
        there is no day-0 row.
    """
    is_multiple = df.loc[df['day'] == 0, 'multiple'].astype(bool)

    n_multiple = int(is_multiple.sum())
    n_single = int((~is_multiple).sum())
    total = n_multiple + n_single
    p_multiple = n_multiple / total if total else float('nan')
    p_single = n_single / total if total else float('nan')

    print(f"{cell_type}: {n_multiple} multiples spheroids, {n_single} single spheroids")
    print(f"{cell_type}: {p_multiple:.2%} multiples spheroids, {p_single:.2%} single spheroids")

    return n_multiple, n_single, p_multiple, p_single


# For A673 count the number of cases (droplets) with multiple spheroids vs single spheroids
n_multiple_A673, n_single_A673, p_multiple, p_single = _countMultipleSpheroids(df_concat_clean_multi_A673, "A673")
print()

# For PDX count the number of cases (droplets) with multiple spheroids vs single spheroids
n_multiple_PDX, n_single_PDX, p_multiple, p_single = _countMultipleSpheroids(df_concat_clean_multi_PDX, "PDX")



A673: 25 multiples spheroids, 307 single spheroids
A673: 7.53% multiples spheroids, 92.47% single spheroids

PDX: 37 multiples spheroids, 443 single spheroids
PDX: 7.71% multiples spheroids, 92.29% single spheroids


### Transform units: pix to µm

In [13]:
# Size of one pixel in µm, based on the plate reader device at 4X magnification.
# Lengths in pixels are multiplied by pix, and areas by pix**2, to convert them to µm and µm².
pix = 3.26

In [14]:
df_concat_clean_multi_A673.columns

Index(['Experiment ID', 'spheroid ID', 'day', 'drug concentration (uM)',
       'spheroIndex', 'multiple', 'Area (pix2)', 'Perimeter (pix)', 'Solidity',
       'Equivalent Diameter (pix)', 'Circularity', 'Mean grey value',
       'Homogeneity', 'Energy', 'Correlation', 'viability score'],
      dtype='object')

In [15]:
def _convertPixToMicrons(df, pix_size):
    """Return a copy of ``df`` with the size features converted from pixels to µm.

    Lengths (equivalent diameter, perimeter) are multiplied by ``pix_size`` and the area by
    ``pix_size ** 2``; the converted columns are renamed with their new unit. The input
    dataframe is left unchanged and the column order is preserved.
    """
    length_columns = {
        'Equivalent Diameter (pix)': 'Equivalent Diameter (um)',
        'Perimeter (pix)': 'Perimeter (um)',
    }
    area_columns = {
        'Area (pix2)': 'Area (um2)',
    }

    converted = df.copy()
    for column in length_columns:
        converted[column] = converted[column] * pix_size
    for column in area_columns:
        converted[column] = converted[column] * (pix_size**2)

    return converted.rename(columns={**length_columns, **area_columns})


# For A673 convert equivalent diameter, perimeter and area from pixels to µm
df_concat_clean_multi_convert_A673 = _convertPixToMicrons(df_concat_clean_multi_A673, pix)

# For PDX convert equivalent diameter, perimeter and area from pixels to µm
df_concat_clean_multi_convert_PDX = _convertPixToMicrons(df_concat_clean_multi_PDX, pix)

### Concatenate features of the same spheroid at different days on the same line

In [16]:
def fuseLines(df, columns_to_transform):
    """Reshape the table from one row per (spheroid, day) to one row per spheroid.

    For every feature of ``columns_to_transform`` the result holds one column per day,
    named ``"<feature>_day<N>"``. The per-spheroid attributes 'Experiment ID',
    'drug concentration (uM)', 'spheroIndex' and 'viability score' are added once.

    Returns
    -------
    pandas.DataFrame
        Columns in this order: 'spheroid ID', 'Experiment ID', 'drug concentration (uM)',
        'spheroIndex', the per-day feature columns, 'viability score'. Rows follow the
        order in which each spheroid first appears in ``df``.
    """
    id_col = 'spheroid ID'
    leading_cols = [id_col, 'Experiment ID', 'drug concentration (uM)', 'spheroIndex']

    # Start from the list of unique spheroids
    fused = df[[id_col]].drop_duplicates()

    for feature in columns_to_transform:
        # One row per spheroid, one column per day. pivot_table aggregates with the mean
        # (its default) if a spheroid has several rows for the same day.
        per_day = df.pivot_table(index=[id_col], columns='day', values=feature)

        # Put the feature name and the day number in the column names
        per_day.columns = [f"{feature}_day{round(day)}" for day in per_day.columns]

        # Bring the spheroid ID back as a column and attach the feature to its spheroid
        fused = fused.merge(per_day.reset_index(), on=[id_col], how='left')

    # Per-spheroid attributes: first() keeps the first non-missing value of each column
    per_spheroid = df.groupby([id_col], as_index=False).first()
    per_spheroid = per_spheroid[leading_cols + ['viability score']]
    fused = fused.merge(per_spheroid, on=[id_col], how='left')

    # Move the identification columns right after 'spheroid ID'; the other columns keep their order
    remaining_cols = [col for col in fused.columns if col not in leading_cols]
    return fused[leading_cols + remaining_cols]


In [17]:
df_concat_clean_multi_convert_PDX.columns

Index(['Experiment ID', 'spheroid ID', 'day', 'drug concentration (uM)',
       'spheroIndex', 'multiple', 'Area (um2)', 'Perimeter (um)', 'Solidity',
       'Equivalent Diameter (um)', 'Circularity', 'Mean grey value',
       'Homogeneity', 'Energy', 'Correlation', 'viability score'],
      dtype='object')

In [18]:
# Work on copies of the converted dataframes
df_A673 = df_concat_clean_multi_convert_A673.copy()
df_PDX = df_concat_clean_multi_convert_PDX.copy()

# List of the features to spread over one column per day
columns_to_transform = [
    'Area (um2)',
    'Mean grey value',
    'Homogeneity',
    'Energy',
    'Correlation',
    'Solidity',
    'Circularity',
    'Equivalent Diameter (um)',
    'Perimeter (um)',
]

# One row per spheroid, for each cell type
df_transformed_A673 = fuseLines(df_A673, columns_to_transform)
df_transformed_PDX = fuseLines(df_PDX, columns_to_transform)

# Only the PDX result is displayed (last expression of the cell)
df_transformed_PDX

Unnamed: 0,spheroid ID,Experiment ID,drug concentration (uM),spheroIndex,Area (um2)_day0,Area (um2)_day1,Area (um2)_day2,Mean grey value_day0,Mean grey value_day1,Mean grey value_day2,...,Circularity_day0,Circularity_day1,Circularity_day2,Equivalent Diameter (um)_day0,Equivalent Diameter (um)_day1,Equivalent Diameter (um)_day2,Perimeter (um)_day0,Perimeter (um)_day1,Perimeter (um)_day2,viability score
0,"(1305_A, 3, 0.0)",1305,0.0,3,18263.530600,22317.9600,,452.462699,743.934581,,...,0.757804,0.724489,,152.492129,168.570784,,550.324453,622.180169,,110.042697
1,"(1305_A, 4, 0.0)",1305,0.0,4,18901.186600,23380.7200,34497.1896,692.847931,723.634531,736.379985,...,0.590973,0.573600,0.216512,155.131358,172.537698,209.578591,633.965798,715.697479,1414.998991,107.184445
2,"(1305_A, 5, 0.0)",1305,0.0,5,24284.066000,20654.7406,32849.9116,603.939982,482.042587,609.314213,...,0.452075,0.730784,0.525697,175.839225,162.167915,204.513585,821.599496,595.964454,886.144201,84.847736
3,"(1305_A, 7, 0.0)",1305,0.0,7,14480.105000,23189.4232,22631.4742,541.460982,479.448646,484.823966,...,0.794380,0.458641,0.647482,135.781598,171.830412,169.750664,478.604453,797.101512,662.746470,81.036734
4,"(1305_A, 8, 0.0)",1305,0.0,8,14395.084200,23518.8788,18784.2830,651.215484,755.764029,723.974596,...,0.581049,0.592149,0.770526,135.382386,173.046718,154.650871,557.963109,706.476806,553.488487,89.188045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,"(2406_B, 5, 0.0)",2406,0.0,5,,18640.8104,19421.9390,,527.415994,484.763118,...,,0.540808,0.583024,,154.059135,157.253874,,658.136134,647.005797,84.841667
493,"(2406_B, 20, 0.0)",2406,0.0,20,13049.020701,14687.3432,20335.9126,340.145456,613.133927,509.934624,...,0.649721,0.447248,0.136717,128.897359,136.749794,160.911429,1028.137896,642.395463,1367.177900,104.415321
494,"(2406_B, 15, 1.0)",2406,1.0,15,13126.941394,12200.4848,11493.7494,493.838503,584.754559,904.353863,...,0.586117,0.704941,0.617735,129.281634,124.636029,120.972295,840.040840,466.355463,483.542436,46.774284
495,"(2406_B, 7, 5.0)",2406,5.0,7,15672.611311,18199.7650,20984.1962,439.134062,941.414720,1046.240905,...,0.609730,0.446087,0.567516,141.262127,152.225689,163.456136,875.013864,716.025126,681.651176,22.189100


### Add   $\Delta$ Area,  $\Delta$ Grey level,   $\Delta$ Correlation

In [19]:
def substractCols(df, col1, col2, new_col):
    """Add difference columns to ``df`` and return it.

    The three lists are read in parallel: for each position, the column named in
    ``new_col`` is set to ``df[col1] - df[col2]``. ``df`` is modified in place.
    """
    triples = zip(col1, col2, new_col)
    for minuend, subtrahend, target in triples:
        df[target] = df[minuend].sub(df[subtrahend])
    return df

def divideCols(df, col1, col2, new_col):
    """Add ratio columns to ``df`` and return it.

    The three lists are read in parallel: for each position, the column named in
    ``new_col`` is set to ``df[col1] / df[col2]``. ``df`` is modified in place.
    """
    triples = zip(col1, col2, new_col)
    for numerator, denominator, target in triples:
        df[target] = df[numerator].div(df[denominator])
    return df

In [20]:
# Variations of area, grey intensity and correlation between time points.
# Each entry lists, position by position: the later value (col1), the earlier value (col2)
# and the name of the column receiving col1 - col2 (new_cols).
transformations = [
    {
        "col1": ['Area (um2)_day1', 'Area (um2)_day2', 'Area (um2)_day2'],
        "col2": ['Area (um2)_day0', 'Area (um2)_day0', 'Area (um2)_day1'],
        "new_cols": ['Growth1-0', 'Growth2-0', 'Growth2-1'],
        "description": "Growth",
    },
    {
        "col1": ['Mean grey value_day1', 'Mean grey value_day2', 'Mean grey value_day2'],
        "col2": ['Mean grey value_day0', 'Mean grey value_day0', 'Mean grey value_day1'],
        "new_cols": ['Grey1-0', 'Grey2-0', 'Grey2-1'],
        "description": "Delta Grey Intensity",
    },
    {
        "col1": ['Correlation_day1', 'Correlation_day2', 'Correlation_day2'],
        "col2": ['Correlation_day0', 'Correlation_day0', 'Correlation_day1'],
        "new_cols": ['DCorrelation_1-0', 'DCorrelation_2-0', 'DCorrelation_2-1'],
        "description": "Delta Correlation",
    },
]

# Add the difference columns to the dataframe of each cell type
for transform in transformations:
    later, earlier, targets = transform["col1"], transform["col2"], transform["new_cols"]
    df_transformed_A673 = substractCols(df_transformed_A673, later, earlier, targets)
    df_transformed_PDX = substractCols(df_transformed_PDX, later, earlier, targets)

# Ensure 'viability score' is the last column
order_A673 = [col for col in df_transformed_A673.columns if col != 'viability score'] + ['viability score']
df_transformed_A673 = df_transformed_A673.reindex(columns=order_A673)

order_PDX = [col for col in df_transformed_PDX.columns if col != 'viability score'] + ['viability score']
df_transformed_PDX = df_transformed_PDX.reindex(columns=order_PDX)


df_transformed_A673


Unnamed: 0,spheroid ID,Experiment ID,drug concentration (uM),spheroIndex,Area (um2)_day0,Area (um2)_day1,Area (um2)_day2,Mean grey value_day0,Mean grey value_day1,Mean grey value_day2,...,Growth1-0,Growth2-0,Growth2-1,Grey1-0,Grey2-0,Grey2-1,DCorrelation_1-0,DCorrelation_2-0,DCorrelation_2-1,viability score
0,"(1803, 2, 0.1)",1803,0.1,2,21010.765200,,40799.3564,273.563070,,363.215620,...,,19788.591200,,,89.652550,,,0.014911,,78.738935
1,"(1803, 3, 0.1)",1803,0.1,3,18284.785800,30485.2706,41872.7440,446.811893,477.670048,357.425043,...,12200.4848,23587.958200,11387.4734,30.858155,-89.386850,-120.245005,0.008844,0.027155,0.018311,81.441218
2,"(1803, 4, 0.1)",1803,0.1,4,15930.772400,27041.9282,,233.473857,386.175899,,...,11111.1558,,,152.702042,,,0.007834,,,103.152664
3,"(1803, 5, 0.1)",1803,0.1,5,18412.317000,27653.0152,42882.3660,316.993179,370.756607,350.027196,...,9240.6982,24470.049000,15229.3508,53.763428,33.034017,-20.729411,0.028224,0.028457,0.000233,76.813170
4,"(1803, 6, 0.1)",1803,0.1,6,15771.358400,26930.3384,41112.8706,311.382544,334.841864,466.537552,...,11158.9800,25341.512200,14182.5322,23.459320,155.155008,131.695688,0.005906,-0.002487,-0.008393,73.893462
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
347,"(2901, 6, 20.0)",2901,20.0,6,18838.827310,,23247.8750,346.617821,,1069.836070,...,,4409.047690,,,723.218249,,,-0.060875,,42.281003
348,"(2901, 11, 20.0)",2901,20.0,11,19408.304872,,27647.7014,242.379733,,1047.332597,...,,8239.396528,,,804.952864,,,-0.054548,,34.153426
349,"(2901, 18, 20.0)",2901,20.0,18,17977.776798,,21930.0526,306.838603,,924.446711,...,,3952.275802,,,617.608107,,,-0.039745,,44.481610
350,"(2901, 12, 50.0)",2901,50.0,12,14365.307914,,16345.2488,305.499310,,1027.062685,...,,1979.940886,,,721.563375,,,-0.061309,,4.012441


### Data encoding

#### Encoding of the 'viability score' into categorical classes

In [21]:
def encoding(df, num_class, bins, labels):
    """Turn the 'viability score' into ``num_class`` categorical classes.

    Two columns are added: ``"<num_class> classes"`` with the class name of each row and
    ``"<num_class> classes encoded"`` with its integer code.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with a 'viability score' column. Rows whose score is missing are
        left out of the result.
    num_class : int
        Number of classes, only used to name the two new columns.
    bins : list of float
        Inner class limits, in increasing order (e.g. [50] gives two classes). A score
        equal to a limit falls in the lower class, as pd.cut intervals are closed on the
        right by default.
    labels : list of str
        Class names, from the lowest scores to the highest; one more than ``bins``.

    Returns
    -------
    (pandas.DataFrame, dict)
        The encoded dataframe and the mapping {class name: integer code}. The codes are
        assigned by LabelEncoder, i.e. following the sorted class names
        (e.g. 'alive' -> 0, 'dead' -> 1), not the order of ``labels``.
    """
    class_col = f'{num_class} classes'
    code_col = f'{num_class} classes encoded'

    # Keep only the rows that have a viability score
    scored = df[df['viability score'].notna()].copy()

    # Open the outer classes with -inf and +inf so that every score falls in a class
    edges = [-float('inf')] + bins + [float('inf')]
    scored.loc[:, class_col] = pd.cut(scored['viability score'], bins=edges, labels=labels)

    # Integer code of each class name
    encoder = LabelEncoder()
    scored.loc[:, code_col] = encoder.fit_transform(scored[class_col])

    dict_labels = {class_label: index for index, class_label in enumerate(encoder.classes_)}

    return scored.copy(), dict_labels

In [22]:
# Class limits on the viability score and class names, from the lowest scores to the highest
bins_2classes = [50]
labels_2classes = ['dead', 'alive']

bins_3classes = [25, 75]
labels_3classes = ['dead', 'middle', 'alive']

# Encoding 2 classes, starting from copies of the transformed dataframes
num_class = 2
df_A673_encoded, class_A673_2 = encoding(df_transformed_A673.copy(), num_class, bins_2classes, labels_2classes)
df_PDX_encoded, class_PDX_2 = encoding(df_transformed_PDX.copy(), num_class, bins_2classes, labels_2classes)

# Encoding 3 classes, added to the dataframes that already hold the 2-class columns
num_class = 3
df_A673_encoded, class_A673_3 = encoding(df_A673_encoded, num_class, bins_3classes, labels_3classes)
df_PDX_encoded, class_PDX_3 = encoding(df_PDX_encoded, num_class, bins_3classes, labels_3classes)

df_A673_encoded


Unnamed: 0,spheroid ID,Experiment ID,drug concentration (uM),spheroIndex,Area (um2)_day0,Area (um2)_day1,Area (um2)_day2,Mean grey value_day0,Mean grey value_day1,Mean grey value_day2,...,Grey2-0,Grey2-1,DCorrelation_1-0,DCorrelation_2-0,DCorrelation_2-1,viability score,2 classes,2 classes encoded,3 classes,3 classes encoded
0,"(1803, 2, 0.1)",1803,0.1,2,21010.765200,,40799.3564,273.563070,,363.215620,...,89.652550,,,0.014911,,78.738935,alive,0,alive,0
1,"(1803, 3, 0.1)",1803,0.1,3,18284.785800,30485.2706,41872.7440,446.811893,477.670048,357.425043,...,-89.386850,-120.245005,0.008844,0.027155,0.018311,81.441218,alive,0,alive,0
2,"(1803, 4, 0.1)",1803,0.1,4,15930.772400,27041.9282,,233.473857,386.175899,,...,,,0.007834,,,103.152664,alive,0,alive,0
3,"(1803, 5, 0.1)",1803,0.1,5,18412.317000,27653.0152,42882.3660,316.993179,370.756607,350.027196,...,33.034017,-20.729411,0.028224,0.028457,0.000233,76.813170,alive,0,alive,0
4,"(1803, 6, 0.1)",1803,0.1,6,15771.358400,26930.3384,41112.8706,311.382544,334.841864,466.537552,...,155.155008,131.695688,0.005906,-0.002487,-0.008393,73.893462,alive,0,middle,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
347,"(2901, 6, 20.0)",2901,20.0,6,18838.827310,,23247.8750,346.617821,,1069.836070,...,723.218249,,,-0.060875,,42.281003,dead,1,middle,2
348,"(2901, 11, 20.0)",2901,20.0,11,19408.304872,,27647.7014,242.379733,,1047.332597,...,804.952864,,,-0.054548,,34.153426,dead,1,middle,2
349,"(2901, 18, 20.0)",2901,20.0,18,17977.776798,,21930.0526,306.838603,,924.446711,...,617.608107,,,-0.039745,,44.481610,dead,1,middle,2
350,"(2901, 12, 50.0)",2901,50.0,12,14365.307914,,16345.2488,305.499310,,1027.062685,...,721.563375,,,-0.061309,,4.012441,dead,1,dead,1


## Save processed dataframes

In [23]:
# Final dataframes to save.
# NOTE: df_A673 and df_PDX were bound earlier to the unit-converted dataframes; from here on
# they refer to the encoded ones.
df_A673 = df_A673_encoded.copy()
df_PDX = df_PDX_encoded.copy()

In [24]:
processed_folder = 'data/processed_data'
createFolder(processed_folder)

# The row index is written as the first (unnamed) column of both files
processed_dir = Path(processed_folder)
df_A673.to_csv(processed_dir / 'A673_processed_data.csv')
df_PDX.to_csv(processed_dir / 'PDX_processed_data.csv')

In [25]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.
