### Define dataset pre-processing functions

In [1]:
import numpy as np
import pandas as pd

# Pre-process data
import scipy.stats as stats
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Preprocess features
from sklearn.preprocessing import LabelEncoder

types_of_cells = 'All'
mol_profiles = 'All'
therapies = 'All'

features = ['GlobalCellType','cycif.slide','TIM3','pSTAT1','CD45RO','CD20','CD11c','CD207','GranzymeB','CD163','CD4','CD3d','CD8a','FOXP3','PD1','CD15','PDL1_488','Ki67','Vimentin','MHCII','MHCI','ECadherin','aSMA','CD31','pTBK1','CK7','yH2AX','cPARP1',
            'Area','Eccentricity','Roundness','CD11c.MY','CD15.MY','CD163.MP','CD207.MY','CD31.stromal','CD4.T.cells','CD68.MP','CD8.T.cells','Cancer','Other','Other.MY','Stromal','T.regs','B.cells','Other.immune',
            'Molecular.profile2','therapy_sequence','patient']

transformation = 'LOG2' 
features_to_transform = ['TIM3','pSTAT1','CD45RO','CD20','CD11c','CD207','GranzymeB','CD163','CD4','CD3d','CD8a','FOXP3','PD1','CD15','PDL1_488','Ki67','Vimentin','MHCII','MHCI','ECadherin','aSMA','CD31','pTBK1','CK7','yH2AX','cPARP1']

operation = 'remove'
features_for_outliers = features_to_scale = ['TIM3','pSTAT1','CD45RO','CD20','CD11c','CD207','GranzymeB','CD163','CD4','CD3d','CD8a','FOXP3','PD1','CD15','PDL1_488','Ki67','Vimentin','MHCII','MHCI','ECadherin','aSMA','CD31','pTBK1','CK7','yH2AX','cPARP1',
            'Area','Eccentricity','Roundness']
        
# Scale data
scaler_dict = {'MinMax': MinMaxScaler(),'Standard': StandardScaler()}
# MinMaxScaler scales the data to be within a specific range, usually between 0 and 1
# StandardScaler scales the data to have a mean of zero and a standard deviation of one
scaler_type = 'Standard' # Scaling type
scale_by = 'slide' # Define how to scale. Available options: whole, patient, slide

# Variables for machine learning step
categorical_variables = ['Molecular.profile2', 'therapy_sequence']

def choose_features(df):
        df = df.loc[:,features]
        
        return df
    
def transform_df(df):

    try:
        if transformation == 'LOG':
            df.loc[:, features_to_transform] = np.log(df.loc[:, features_to_transform] + 1)

        elif transformation == 'LOG2':
            df.loc[:, features_to_transform] = np.log2(df.loc[:, features_to_transform] + 1)
            
        elif transformation == 'BOXCOX':
            # transform data & save lambda value
            for feature in features_to_transform:
                df[feature], _ = stats.boxcox(df[feature].values)

        else:
            raise ValueError("Invalid transformation specified.")
        
        return df
        
    except Exception as e:
        print(f"\nAn error occurred data transformation: {e}", exc_info=True)

    
            
def remove_outliers(df):
    
    try:
        if operation == 'trim_by_slide':
            slides = df['cycif.slide'].unique()
        
            for slide in slides:
                # Create a mask to filter data belonging to the current slide
                slide_mask = df['cycif.slide'] == slide
                
                for feature in features_for_outliers:
                    percentiles = np.percentile(df.loc[slide_mask, feature], [1, 99])
                    
                    # Replace values below the 1st percentile with the 1st percentile value
                    df.loc[slide_mask & (df[feature] < percentiles[0]), feature] = percentiles[0]
                    # Replace values above the 99th percentile with the 99th percentile value
                    df.loc[slide_mask & (df[feature] > percentiles[1]), feature] = percentiles[1]

        elif operation == 'remove':

            df_sub = df.loc[:, features_for_outliers]

            # Identify outliers using the 1st (0.01) and 99th (0.99) percentiles for each feature
            # For each data point, 'lim' will be True if the value is within the range [0.01, 0.99], otherwise False
            lim = np.logical_and(df_sub < df_sub.quantile(0.99, numeric_only=False),
                            df_sub > df_sub.quantile(0.01, numeric_only=False))

            # Data points outside the range [0.01, 0.99] will be replaced with NaN
            df.loc[:, features_for_outliers] = df_sub.where(lim, np.nan)
            
            # Drop rows with NaN in numerical columns
            df.dropna(subset=features_for_outliers, inplace=True)
                    
        else: 
            raise ValueError("Invalid operation specified.")
        
        return df
    
    except Exception as e:
        print(f"\nAn error occurred  outliers removal: {e}", exc_info=True)
    
def scaler(df):
    
        # Get a scaler from the dictionary of supported scaler types
        scaler = scaler_dict.get(scaler_type)

        return scaler.fit_transform(df)

def scaling(df):
    
    try:
        if scaler_type in scaler_dict:
            
            if scale_by not in ['whole','patient', 'slide']:
                raise ValueError(f"Invalid order: {scale_by}")
            
            if scale_by == 'patient':
                patients = df['patient'].unique()
                # Iterate through each unique patient ID and scale the specified features for each patient separately
                for patient in patients:
                    df.loc[df['patient'] == patient, features_to_scale] = scaler(df.loc[df['patient'] == patient, features_to_scale])
            
            elif scale_by == 'slide':
                slides = df['cycif.slide'].unique()
                # Iterate through each unique slide and scale the specified features for each slide separately
                for slide in slides:
                    df.loc[df['cycif.slide'] == slide, features_to_scale] = scaler(df.loc[df['cycif.slide'] == slide, features_to_scale])
            
            elif scale_by == 'whole':
                # Scale the specified features on the entire DataFrame
                df.loc[features_to_scale] = scaler(df.loc[features_to_scale])
            
            return df
    
    except Exception as e:
        print(f"\nAn error occurred  scaling: {e}", exc_info=True)

def choose_types_of_cells(df):
        
    # Remove Others and group immune and stromal cells
    df = df[~df["GlobalCellType"].isin(['Others','Other'])]
    df["GlobalCellType"] = df["GlobalCellType"].replace(to_replace = ['CD8.T.cells', 'B.cells', 'T.regs', 'CD4.T.cells'], value = 'Lymphocytes')
    df["GlobalCellType"] = df["GlobalCellType"].replace(to_replace = ['CD11c.MY', 'Other.MY', 'CD163.MP', 'CD207.MY', 'CD68.MP', 'CD15.MY'], value = 'Myeloids')
    df["GlobalCellType"] = df["GlobalCellType"].replace(to_replace = ['CD31.stromal'], value = 'Stromal')
    
    try:
        # Leave only chosen cell types in df
        if types_of_cells == "Non-cancer":
            df = df[~df["GlobalCellType"].isin(['Cancer'])]
            
        elif types_of_cells == "Lymphocytes-stromal":
            df = df[df["GlobalCellType"].isin(['Lymphocytes', 'Stromal'])]
            
        elif types_of_cells == "Myeloid-stromal":
            df = df[df["GlobalCellType"].isin(['Myeloids', 'Stromal'])]
            
        elif types_of_cells == "Immune":
            df = df[df["GlobalCellType"].isin(['Myeloids', 'Lymphocytes', 'Other.immune'])]

        elif types_of_cells in ["Cancer", "Stromal", "Myeloids"]:
            df = df[df["GlobalCellType"] == types_of_cells]
                
        elif types_of_cells not in ["All"]:
            raise ValueError("Invalid cell type(s) specified.")
        
    except Exception as e:
        print(f"\nAn error occurred while choosing cell type: {e}", exc_info=True)
    
    print(f"\nChose successfully cell types: {df['GlobalCellType'].unique()}.")
    
    df["GlobalCellType"] = df["GlobalCellType"].replace(to_replace = ['Myeloids', 'Lymphocytes', 'Other.immune'], value = 'Immune')

    cell_types = df["GlobalCellType"].unique()

    unique_patients = df['patient'].unique()
    
    # Remove patients with too few cells for each chosen cell type
    for cell_type in cell_types:
        # Group df by patient and get the size of each group
        grouped_df = df[df['GlobalCellType'] == cell_type].groupby('patient').size()

        # Filter patients with 100 or fewer cells and store them
        selected_patients = grouped_df[grouped_df <= 100].index

        # Remove patients with 100 and fewer cells
        if len(selected_patients) > 0:
            df = df[~df['patient'].isin(selected_patients)]

    unique_patients = df['patient'].unique()
    print(f"\nAfter thresholding number of patients: {len(unique_patients)}")
    
    # Remove column GlobalCellType
    df = df.drop(columns='GlobalCellType')

    return df
        
def choose_mol_profiles(df):
    
    if mol_profiles != 'All':
        df = df[df['Molecular.profile2'].isin(mol_profiles)]
    else:
        df = df[df['Molecular.profile2'].isin(['BRCAmutmet','HRD','HRP','CCNE1amp'])]
        
    return df

def choose_therapy_sequences(df):
    if therapies != 'All':
        df = df[df['therapy_sequence'] == therapies]
    
    return df

def prepare_categorical_inputs(df):

    # Encode categorical variables 
    label_encoder_dict = {}
    for variable in categorical_variables:
        
        label_encoder = LabelEncoder()
        
        # Convert the values of the current categorical variable to strings and encode them
        # The encoded values will replace the original values in the DataFrame
        df[variable] = label_encoder.fit_transform(df[variable].astype('str'))
        
        # Save the encoding results for the current variable in 'label_encoder_dict'
        # The dictionary will store the unique class labels and their corresponding encoded values
        label_encoder_dict[variable] = {
            'label_codes': label_encoder.classes_.tolist(), 
            'label_values': label_encoder.transform(label_encoder.classes_).tolist()
        }
        
        print(label_encoder_dict)
        
    
    return df

def prep_train_test_data(df):

    # To receive same number of patients independent of runs
    np.random.seed(33)
    
    # Group df by Molecular profile and therapy
    grouped = df.groupby(['Molecular.profile2', 'therapy_sequence'])

    # Assign 80% of patients from each unique group of Molecular profile and therapy to training and rest to test sets
    X_train_full = grouped.apply(lambda x: x.loc[x['patient'].isin(np.random.choice(x['patient'].unique(), size=int(0.8*len(x['patient'].unique())), replace=False))])
    X_test_full = grouped.apply(lambda x: x.loc[x['patient'].isin(np.setdiff1d(x['patient'].unique(), X_train_full['patient']))])
    
    X_train_full.reset_index(drop=True, inplace=True)
    X_test_full.reset_index(drop=True, inplace=True)
    
    # Assign target columns to y_train and y_test
    y_train=X_train_full['Molecular.profile2']
    y_test=X_test_full['Molecular.profile2']
    
    # Drop target columns
    X_train = X_train_full.drop(columns = ['Molecular.profile2', 'patient', 'cycif.slide'])
    X_test = X_test_full.drop(columns = ['Molecular.profile2', 'patient', 'cycif.slide'])
    
    return X_train_full.drop(columns = ['patient', 'cycif.slide']),X_test_full.drop(columns = ['patient', 'cycif.slide']),X_train,X_test,y_train,y_test

### Load dataset

In [2]:
df = pd.read_csv("final_dataset_202403.csv")
df['Molecular.profile2'] = df['Molecular.profile2'].replace('BRCAmut/met', 'BRCAmutmet')

### Characterise dataset

In [3]:
print(df['Molecular.profile2'].unique())
print(df['therapy_sequence'].unique())
print(len(df['patient'].unique()))

['Other' 'BRCAmutmet' 'HRD' 'HRP' 'CCNE1amp']
['PDS' 'IDS']
233


### Pre-process dataset

In [4]:
df = choose_features(df)
print(len(df['patient'].unique()))
df = transform_df(df)
print(len(df['patient'].unique()))
df = remove_outliers(df)
print(len(df['patient'].unique()))
df = scaling(df)
print(len(df['patient'].unique()))
df = choose_types_of_cells(df)
print(len(df['patient'].unique()))
df = choose_mol_profiles(df)
print(len(df['patient'].unique()))
df = choose_therapy_sequences(df)
print(len(df['patient'].unique()))

df = prepare_categorical_inputs(df)
print(len(df['patient'].unique()))
X_train_full,X_test_full,X_train,X_test,y_train,y_test = prep_train_test_data(df)

233
233
233
233


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["GlobalCellType"] = df["GlobalCellType"].replace(to_replace = ['CD8.T.cells', 'B.cells', 'T.regs', 'CD4.T.cells'], value = 'Lymphocytes')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["GlobalCellType"] = df["GlobalCellType"].replace(to_replace = ['CD11c.MY', 'Other.MY', 'CD163.MP', 'CD207.MY', 'CD68.MP', 'CD15.MY'], value = 'Myeloids')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pa


Chose successfully cell types: ['Cancer' 'Stromal' 'Lymphocytes' 'Myeloids' 'Other.immune'].


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["GlobalCellType"] = df["GlobalCellType"].replace(to_replace = ['Myeloids', 'Lymphocytes', 'Other.immune'], value = 'Immune')



After thresholding number of patients: 224
224
157
157
{'Molecular.profile2': {'label_codes': ['BRCAmutmet', 'CCNE1amp', 'HRD', 'HRP'], 'label_values': [0, 1, 2, 3]}}
{'Molecular.profile2': {'label_codes': ['BRCAmutmet', 'CCNE1amp', 'HRD', 'HRP'], 'label_values': [0, 1, 2, 3]}, 'therapy_sequence': {'label_codes': ['IDS', 'PDS'], 'label_values': [0, 1]}}
157


In [5]:
print(X_train_full.head(5))

       TIM3    pSTAT1    CD45RO      CD20     CD11c     CD207  GranzymeB  \
0  0.856119  0.444584  2.022115  3.584032  1.704338  1.265360   0.903239   
1  1.759301  2.009022  1.929934  1.920628  2.629491  2.370827   3.202361   
2  2.331201 -0.251148  0.229573  1.336762 -0.138185  1.880480   0.795485   
3  1.963668 -0.040149  0.640188  1.639086  0.205153  1.789590  -0.062243   
4  1.631058  1.235478  1.616692  1.150106  2.582299  1.903677   0.711585   

      CD163       CD4      CD3d  ...  CD8.T.cells    Cancer  Other  Other.MY  \
0  0.392078  4.417961  2.625805  ...     0.095238  0.333333    0.0       0.0   
1  1.085341  2.623066  2.927081  ...     0.187500  0.562500    0.0       0.0   
2 -0.380442  0.653997  1.238564  ...     0.050000  0.550000    0.0       0.0   
3 -0.163565  0.998075  1.086835  ...     0.055556  0.611111    0.0       0.0   
4  0.759488  3.059049  2.155379  ...     0.117647  0.000000    0.0       0.0   

   Stromal    T.regs   B.cells  Other.immune  Molecular.profil

In [6]:
print(X_train_full.shape)
print(X_test_full.shape)

(1555200, 46)
(405232, 46)


In [7]:
X_train_full.index

RangeIndex(start=0, stop=1555200, step=1)

In [8]:
full_df = pd.concat([X_train_full, X_test_full], ignore_index=True)
print(full_df.shape)

(1960432, 46)


In [9]:
print(full_df.head(5))

       TIM3    pSTAT1    CD45RO      CD20     CD11c     CD207  GranzymeB  \
0  0.856119  0.444584  2.022115  3.584032  1.704338  1.265360   0.903239   
1  1.759301  2.009022  1.929934  1.920628  2.629491  2.370827   3.202361   
2  2.331201 -0.251148  0.229573  1.336762 -0.138185  1.880480   0.795485   
3  1.963668 -0.040149  0.640188  1.639086  0.205153  1.789590  -0.062243   
4  1.631058  1.235478  1.616692  1.150106  2.582299  1.903677   0.711585   

      CD163       CD4      CD3d  ...  CD8.T.cells    Cancer  Other  Other.MY  \
0  0.392078  4.417961  2.625805  ...     0.095238  0.333333    0.0       0.0   
1  1.085341  2.623066  2.927081  ...     0.187500  0.562500    0.0       0.0   
2 -0.380442  0.653997  1.238564  ...     0.050000  0.550000    0.0       0.0   
3 -0.163565  0.998075  1.086835  ...     0.055556  0.611111    0.0       0.0   
4  0.759488  3.059049  2.155379  ...     0.117647  0.000000    0.0       0.0   

   Stromal    T.regs   B.cells  Other.immune  Molecular.profil

In [10]:
full_df.index

RangeIndex(start=0, stop=1960432, step=1)

### Set up ML experiment

In [11]:
import mlflow 
mlflow.set_tracking_uri("http://127.0.0.1:8080")

In [12]:
import pandas as pd
from pycaret.classification import *

# Init setup
clf1 = setup(X_train_full, target='Molecular.profile2', log_experiment = 'mlflow', experiment_name = 'Choice_of_model_202403', test_data=X_test_full, index = False, preprocess=False)

Unnamed: 0,Description,Value
0,Session id,3565
1,Target,Molecular.profile2
2,Target type,Multiclass
3,Original data shape,"(1960432, 46)"
4,Transformed data shape,"(1960432, 46)"
5,Transformed train set shape,"(1555200, 46)"
6,Transformed test set shape,"(405232, 46)"
7,Numeric features,45


### Compare nine models on training set

In [14]:
best_model = compare_models(include = ['lr','nb','dt','svm','rf','ada','gbc','lightgbm','lda'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.5101,0.6797,0.5101,0.497,0.4731,0.2412,0.2567,80.31
lightgbm,Light Gradient Boosting Machine,0.4683,0.6532,0.4683,0.444,0.4381,0.1924,0.2002,659.712
gbc,Gradient Boosting Classifier,0.4629,0.6363,0.4629,0.4202,0.4108,0.1573,0.1724,928.207
svm,SVM - Linear Kernel,0.45,0.0,0.45,0.3743,0.3592,0.0961,0.1142,1.828
lr,Logistic Regression,0.4322,0.5797,0.4322,0.3577,0.3606,0.1035,0.1139,14.235
lda,Linear Discriminant Analysis,0.4317,0.5843,0.4317,0.3665,0.3653,0.1088,0.1195,1.419
ada,Ada Boost Classifier,0.4227,0.5961,0.4227,0.3905,0.3818,0.1102,0.1185,44.216
dt,Decision Tree Classifier,0.3994,0.5742,0.3994,0.4063,0.3994,0.1463,0.1473,12.799
nb,Naive Bayes,0.3158,0.5858,0.3158,0.3923,0.3114,0.0845,0.0925,0.455
