In [1]:
import numpy as np
import pandas as pd

# Pre-process data
import scipy.stats as stats
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Preprocess features
from sklearn.preprocessing import LabelEncoder

types_of_cells = 'All'
mol_profiles = 'All'
therapies = 'All'

features = ['GlobalCellType','cycif.slide','TIM3','pSTAT1','CD45RO','CD20','CD11c','CD207','GranzymeB','CD163','CD4','CD3d','CD8a','FOXP3','PD1','CD15','PDL1_488','Ki67','Vimentin','MHCII','MHCI','ECadherin','aSMA','CD31','pTBK1','CK7','yH2AX','cPARP1',
            'Area','Eccentricity','Roundness','CD11c.MY','CD15.MY','CD163.MP','CD207.MY','CD31.stromal','CD4.T.cells','CD68.MP','CD8.T.cells','Cancer','Other','Other.MY','Stromal','T.regs','B.cells','Other.immune',
            'Molecular.profile2','therapy_sequence','patient']

transformation = 'LOG2' 
features_to_transform = ['TIM3','pSTAT1','CD45RO','CD20','CD11c','CD207','GranzymeB','CD163','CD4','CD3d','CD8a','FOXP3','PD1','CD15','PDL1_488','Ki67','Vimentin','MHCII','MHCI','ECadherin','aSMA','CD31','pTBK1','CK7','yH2AX','cPARP1']

operation = 'remove'
features_for_outliers = features_to_scale = ['TIM3','pSTAT1','CD45RO','CD20','CD11c','CD207','GranzymeB','CD163','CD4','CD3d','CD8a','FOXP3','PD1','CD15','PDL1_488','Ki67','Vimentin','MHCII','MHCI','ECadherin','aSMA','CD31','pTBK1','CK7','yH2AX','cPARP1',
            'Area','Eccentricity','Roundness']
        
# Scale data
scaler_dict = {'MinMax': MinMaxScaler(),'Standard': StandardScaler()}
# MinMaxScaler scales the data to be within a specific range, usually between 0 and 1
# StandardScaler scales the data to have a mean of zero and a standard deviation of one
scaler_type = 'Standard' # Scaling type
scale_by = 'slide' # Define how to scale. Available options: whole, patient, slide

# Variables for machine learning step
categorical_variables = ['Molecular.profile2', 'therapy_sequence']

def choose_features(df):
        df = df.loc[:,features]
        
        return df
    
def transform_df(df):

    try:
        if transformation == 'LOG':
            df.loc[:, features_to_transform] = np.log(df.loc[:, features_to_transform] + 1)

        elif transformation == 'LOG2':
            df.loc[:, features_to_transform] = np.log2(df.loc[:, features_to_transform] + 1)
            
        elif transformation == 'BOXCOX':
            # transform data & save lambda value
            for feature in features_to_transform:
                df[feature], _ = stats.boxcox(df[feature].values)

        else:
            raise ValueError("Invalid transformation specified.")
        
        return df
        
    except Exception as e:
        print(f"\nAn error occurred data transformation: {e}", exc_info=True)

    
            
def remove_outliers(df):
    
    try:
        if operation == 'trim_by_slide':
            slides = df['cycif.slide'].unique()
        
            for slide in slides:
                # Create a mask to filter data belonging to the current slide
                slide_mask = df['cycif.slide'] == slide
                
                for feature in features_for_outliers:
                    percentiles = np.percentile(df.loc[slide_mask, feature], [1, 99])
                    
                    # Replace values below the 1st percentile with the 1st percentile value
                    df.loc[slide_mask & (df[feature] < percentiles[0]), feature] = percentiles[0]
                    # Replace values above the 99th percentile with the 99th percentile value
                    df.loc[slide_mask & (df[feature] > percentiles[1]), feature] = percentiles[1]

        elif operation == 'remove':

            df_sub = df.loc[:, features_for_outliers]

            # Identify outliers using the 1st (0.01) and 99th (0.99) percentiles for each feature
            # For each data point, 'lim' will be True if the value is within the range [0.01, 0.99], otherwise False
            lim = np.logical_and(df_sub < df_sub.quantile(0.99, numeric_only=False),
                            df_sub > df_sub.quantile(0.01, numeric_only=False))

            # Data points outside the range [0.01, 0.99] will be replaced with NaN
            df.loc[:, features_for_outliers] = df_sub.where(lim, np.nan)
            
            # Drop rows with NaN in numerical columns
            df.dropna(subset=features_for_outliers, inplace=True)
                    
        else: 
            raise ValueError("Invalid operation specified.")
        
        return df
    
    except Exception as e:
        print(f"\nAn error occurred  outliers removal: {e}", exc_info=True)
    
def scaler(df):
    
        # Get a scaler from the dictionary of supported scaler types
        scaler = scaler_dict.get(scaler_type)

        return scaler.fit_transform(df)

def scaling(df):
    
    try:
        if scaler_type in scaler_dict:
            
            if scale_by not in ['whole','patient', 'slide']:
                raise ValueError(f"Invalid order: {scale_by}")
            
            if scale_by == 'patient':
                patients = df['patient'].unique()
                # Iterate through each unique patient ID and scale the specified features for each patient separately
                for patient in patients:
                    df.loc[df['patient'] == patient, features_to_scale] = scaler(df.loc[df['patient'] == patient, features_to_scale])
            
            elif scale_by == 'slide':
                slides = df['cycif.slide'].unique()
                # Iterate through each unique slide and scale the specified features for each slide separately
                for slide in slides:
                    df.loc[df['cycif.slide'] == slide, features_to_scale] = scaler(df.loc[df['cycif.slide'] == slide, features_to_scale])
            
            elif scale_by == 'whole':
                # Scale the specified features on the entire DataFrame
                df.loc[features_to_scale] = scaler(df.loc[features_to_scale])
            
            return df
    
    except Exception as e:
        print(f"\nAn error occurred  scaling: {e}", exc_info=True)

def choose_types_of_cells(df):

    # Replace all immune cells as Immune
    df["GlobalCellType"] = df["GlobalCellType"].replace(to_replace = ['CD8.T.cells', 'CD4.T.cells'], value = 'Immune')
    
    cell_types = df["GlobalCellType"].unique()
    
    # Remove patients with too few cells for each chosen cell type
    for cell_type in cell_types:
        # Group df by patient and get the size of each group
        grouped_df = df[df['GlobalCellType'] == cell_type].groupby('patient').size()

        # Filter patients with 100 or fewer cells and store them
        selected_patients = grouped_df[grouped_df <= 100].index

        # Remove patients with 100 and fewer cells
        if len(selected_patients) > 0:
            df = df[~df['patient'].isin(selected_patients)]

    print("Number of unique patients after thresholding: ", len(df['patient'].unique()))

    try:
        # Leave only chosen cell types in df
        if types_of_cells == "Non-cancer":
            df = df[~df["GlobalCellType"].isin(['Cancer'])]

        elif types_of_cells in ["Cancer", "Stromal", "Immune"]:
            df = df[df["GlobalCellType"] == types_of_cells]

        elif types_of_cells not in ["All"]:
            raise ValueError("Invalid cell type(s) specified.")
    except Exception as e:
        print(f"\nAn error occurred  while choosing cell type: {e}", exc_info=True)
    
    # Remove column GlobalCellType
    df = df.drop(columns='GlobalCellType')
    
    return df
        
def choose_mol_profiles(df):
    
    if mol_profiles != 'All':
        df = df[df['Molecular.profile2'].isin(mol_profiles)]
    else:
        df = df[df['Molecular.profile2'].isin(['BRCAmutmet','HRD','HRP','CCNE1amp'])]
        
    return df

def choose_therapy_sequences(df):
    if therapies != 'All':
        df = df[df['therapy_sequence'] == therapies]
    
    return df

def prepare_categorical_inputs(df):

    # Encode categorical variables 
    label_encoder_dict = {}
    for variable in categorical_variables:
        
        label_encoder = LabelEncoder()
        
        # Convert the values of the current categorical variable to strings and encode them
        # The encoded values will replace the original values in the DataFrame
        df[variable] = label_encoder.fit_transform(df[variable].astype('str'))
        
        # Save the encoding results for the current variable in 'label_encoder_dict'
        # The dictionary will store the unique class labels and their corresponding encoded values
        label_encoder_dict[variable] = {
            'label_codes': label_encoder.classes_.tolist(), 
            'label_values': label_encoder.transform(label_encoder.classes_).tolist()
        }
        
        print(label_encoder_dict)
        
    
    return df

def prep_train_test_data(df):

    # To receive same number of patients independent of runs
    np.random.seed(33)
    
    # Group df by Molecular profile and therapy
    grouped = df.groupby(['Molecular.profile2', 'therapy_sequence'])

    # Assign 80% of patients from each unique group of Molecular profile and therapy to training and rest to test sets
    X_train_full = grouped.apply(lambda x: x.loc[x['patient'].isin(np.random.choice(x['patient'].unique(), size=int(0.8*len(x['patient'].unique())), replace=False))])
    X_test_full = grouped.apply(lambda x: x.loc[x['patient'].isin(np.setdiff1d(x['patient'].unique(), X_train_full['patient']))])
    
    X_train_full.reset_index(drop=True, inplace=True)
    X_test_full.reset_index(drop=True, inplace=True)
    
    # Assign target columns to y_train and y_test
    y_train=X_train_full['Molecular.profile2']
    y_test=X_test_full['Molecular.profile2']
    
    # Drop target columns
    X_train = X_train_full.drop(columns = ['Molecular.profile2', 'patient', 'cycif.slide'])
    X_test = X_test_full.drop(columns = ['Molecular.profile2', 'patient', 'cycif.slide'])
    
    return X_train_full.drop(columns = ['patient', 'cycif.slide']),X_test_full.drop(columns = ['patient', 'cycif.slide']),X_train,X_test,y_train,y_test

In [2]:
df = pd.read_csv("/Users/alex/Desktop/Laboratory/Projects/Data/final_dataset_202310.csv")
df['Molecular.profile2'] = df['Molecular.profile2'].replace('BRCAmut/met', 'BRCAmutmet')

In [3]:
print(df['Molecular.profile2'].unique())
print(df['therapy_sequence'].unique())
print(len(df['patient'].unique()))

['Other' 'BRCAmutmet' 'HRD' 'HRP' 'CCNE1amp']
['PDS' 'IDS']
233


In [4]:
df = choose_features(df)
print(len(df['patient'].unique()))
df = transform_df(df)
print(len(df['patient'].unique()))
df = remove_outliers(df)
print(len(df['patient'].unique()))
df = scaling(df)
print(len(df['patient'].unique()))
df = choose_types_of_cells(df)
print(len(df['patient'].unique()))
df = choose_mol_profiles(df)
print(len(df['patient'].unique()))
df = choose_therapy_sequences(df)
print(len(df['patient'].unique()))

df = prepare_categorical_inputs(df)
print(len(df['patient'].unique()))
X_train_full,X_test_full,X_train,X_test,y_train,y_test = prep_train_test_data(df)

233
233
233
233
Number of unique patients after thresholding:  178
178
126
126
{'Molecular.profile2': {'label_codes': ['BRCAmutmet', 'CCNE1amp', 'HRD', 'HRP'], 'label_values': [0, 1, 2, 3]}}
{'Molecular.profile2': {'label_codes': ['BRCAmutmet', 'CCNE1amp', 'HRD', 'HRP'], 'label_values': [0, 1, 2, 3]}, 'therapy_sequence': {'label_codes': ['IDS', 'PDS'], 'label_values': [0, 1]}}
126


In [5]:
print(X_train_full.head(5))

       TIM3    pSTAT1    CD45RO      CD20     CD11c     CD207  GranzymeB  \
0  2.521771 -0.158645  0.352831  1.628848  0.123093  2.128489   1.009294   
1  2.143067  0.062037  0.783278  1.970874  0.603620  2.032706   0.070582   
2  1.759250  0.545668  1.147808  1.754337  0.924718  1.481162  -0.180398   
3  1.585871 -0.298474 -0.122487  3.143137  2.441250  1.400469   0.658624   
4  2.143382  0.721063  1.266073  1.520861  0.973570  1.697329   0.555848   

      CD163       CD4      CD3d  ...  CD8.T.cells    Cancer  Other  Other.MY  \
0 -0.229116  0.907544  1.379350  ...     0.050000  0.550000    0.0  0.000000   
1  0.076709  1.291857  1.221643  ...     0.055556  0.611111    0.0  0.000000   
2 -0.015344  1.356749  1.727427  ...     0.000000  0.722222    0.0  0.000000   
3  0.056790  1.562355  2.027515  ...     0.035714  0.464286    0.0  0.035714   
4  0.455264  1.703434  1.564869  ...     0.000000  0.739130    0.0  0.000000   

    Stromal    T.regs   B.cells  Other.immune  Molecular.profi

In [6]:
print(X_train_full.shape)
print(X_test_full.shape)

(1083621, 46)
(336620, 46)


In [7]:
X_train_full.index

RangeIndex(start=0, stop=1083621, step=1)

In [8]:
full_df = pd.concat([X_train_full, X_test_full], ignore_index=True)
print(full_df.shape)

(1420241, 46)


In [9]:
print(full_df.head(5))

       TIM3    pSTAT1    CD45RO      CD20     CD11c     CD207  GranzymeB  \
0  2.521771 -0.158645  0.352831  1.628848  0.123093  2.128489   1.009294   
1  2.143067  0.062037  0.783278  1.970874  0.603620  2.032706   0.070582   
2  1.759250  0.545668  1.147808  1.754337  0.924718  1.481162  -0.180398   
3  1.585871 -0.298474 -0.122487  3.143137  2.441250  1.400469   0.658624   
4  2.143382  0.721063  1.266073  1.520861  0.973570  1.697329   0.555848   

      CD163       CD4      CD3d  ...  CD8.T.cells    Cancer  Other  Other.MY  \
0 -0.229116  0.907544  1.379350  ...     0.050000  0.550000    0.0  0.000000   
1  0.076709  1.291857  1.221643  ...     0.055556  0.611111    0.0  0.000000   
2 -0.015344  1.356749  1.727427  ...     0.000000  0.722222    0.0  0.000000   
3  0.056790  1.562355  2.027515  ...     0.035714  0.464286    0.0  0.035714   
4  0.455264  1.703434  1.564869  ...     0.000000  0.739130    0.0  0.000000   

    Stromal    T.regs   B.cells  Other.immune  Molecular.profi

In [10]:
full_df.index

RangeIndex(start=0, stop=1420241, step=1)

In [11]:
import mlflow 
mlflow.set_tracking_uri("http://127.0.0.1:5000")

In [12]:
import pandas as pd
from pycaret.classification import *

# Init setup
clf1 = setup(full_df, target='Molecular.profile2', log_experiment = True, experiment_name = 'Choice_of_model1', train_size=len(X_train_full)/len(full_df), test_data=X_test_full, index = False, preprocess=False)

Unnamed: 0,Description,Value
0,Session id,4941
1,Target,Molecular.profile2
2,Target type,Multiclass
3,Original data shape,"(1756861, 46)"
4,Transformed data shape,"(1756861, 46)"
5,Transformed train set shape,"(1420241, 46)"
6,Transformed test set shape,"(336620, 46)"
7,Numeric features,45


In [13]:
# best_model = compare_models()

In [14]:
best_model = compare_models(include = ['lr','nb','dt','svm','rf','ada','gbc','lightgbm','lda'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.5497,0.7141,0.5497,0.5549,0.5132,0.2866,0.3126,144.442
lightgbm,Light Gradient Boosting Machine,0.5119,0.6791,0.5119,0.4997,0.479,0.2386,0.2547,10.387
gbc,Gradient Boosting Classifier,0.5087,0.6685,0.5087,0.4789,0.4521,0.2087,0.2339,982.867
ada,Ada Boost Classifier,0.4615,0.6316,0.4615,0.4265,0.4093,0.145,0.1621,53.579
lr,Logistic Regression,0.4519,0.6018,0.4519,0.3883,0.3842,0.1197,0.1356,10.527
lda,Linear Discriminant Analysis,0.4508,0.6029,0.4508,0.3951,0.3876,0.1233,0.1386,1.574
svm,SVM - Linear Kernel,0.4505,0.0,0.4505,0.3812,0.3613,0.0831,0.1026,2.572
dt,Decision Tree Classifier,0.43,0.5869,0.43,0.4393,0.4277,0.1748,0.1769,16.46
nb,Naive Bayes,0.3236,0.6035,0.3236,0.4126,0.3081,0.0961,0.1083,0.748
