In [1]:
from pycaret.utils import version
version()

'3.2.0'

In [2]:
import pandas as pd
from typing import List
from pathlib import Path
import os
from pycaret.classification import *

In [3]:
patients = ["9_2","9_3","9_14", "9_15"]
radius = 120
COLUMNS_OF_INTEREST = ['pRB', 'CD45', 'CK19', 'Ki67', 'aSMA', 'Ecad', 'PR', 'CK14', 'HER2', 'AR', 'CK17', 'p21', 'Vimentin',
                  'pERK', 'EGFR', 'ER', "Treatment"]


In [4]:
save_folder = Path("v2", str(radius))

if not save_folder.exists():
    save_folder.mkdir(parents=True)

# Data Loader

In [5]:
def load_patient(patient: str) -> {}:
    data_path = Path("..","data_2", "mapped_data", f"{radius}")

    data_frames: dict = {}

    for root, dirs, files in os.walk(data_path):
        for file in files:

            if Path(file).suffix != ".csv":
                continue

            if patient not in file:
                continue

            print(file)

            data_frames[Path(file).stem] = pd.read_csv(Path(data_path, file))

    return data_frames

In [6]:
def load_files(patient_to_be_excluded: str) -> (pd.DataFrame, List):
    if not patient_to_be_excluded:
        raise ValueError("Patient to be excluded needs to be specified.")

    data_path = Path("..", "data_2", "mapped_data",  f"{radius}" )

    data_frames: [pd.DataFrame] = []
    loaded_files = []
    for root, dirs, files in os.walk(data_path):
        for file in files:

            if Path(file).suffix != ".csv":
                continue

            if patient_to_be_excluded in file:
                continue

            print(file)

            data_frames.append(pd.read_csv(Path(data_path, file)))
            loaded_files.append(file)

    data_frames = pd.concat(data_frames, axis=0)
    return data_frames, loaded_files

In [7]:
for patient in patients:
    print("Loading train sets...")
    train_set, _ = load_files(patient)
    print("Loading test sets...")
    test_sets: dict = load_patient(patient=patient)
    test_set = list(test_sets.values())
    test_set = pd.concat(test_set, axis=0)

    
    
    if "CellID" in train_set.columns:
        train_set.drop(columns=["CellID"])
    train_set = train_set[COLUMNS_OF_INTEREST].copy()
    #train_set = train_set.drop(columns=["Patient Id", "Sample Id", "MouseIgG1"])
    train_set.reset_index(drop=True, inplace=True)
   
    if "CellID" in test_set.columns:
        test_set.drop(columns=["CellID"])
    test_set = test_set[COLUMNS_OF_INTEREST].copy()
    #test_set = test_set.drop(columns=["Patient Id", "Sample Id", "MouseIgG1"])
    experiment = setup(data=train_set,test_data=test_set, target='Treatment', index=False)
    # Perform initial model comparison.
    best = compare_models()
    model_comparison_df = pull()
    #save model comparison
    excluded_patient_file_name: Path = Path(f"{patient}_excluded_model_performance.csv")
    if not excluded_patient_file_name.exists():
        model_comparison_df.to_csv(Path(save_folder,excluded_patient_file_name))

    # Predict on test set
    # Evaluate the model (this will show the performance metrics in PyCaret's UI)
    predictions = predict_model(best, data=test_set)
    metrics = pull()
    metrics.to_csv(Path(save_folder, f"{patient}_metrics.csv"))

    


Loading train sets...
9_15_1.csv
9_15_2.csv
9_14_2.csv
9_14_1.csv
9_3_2.csv
9_3_1.csv
Loading test sets...
9_2_1.csv
9_2_2.csv


Unnamed: 0,Description,Value
0,Session id,4514
1,Target,Treatment
2,Target type,Binary
3,Target mapping,"ON: 0, PRE: 1"
4,Original data shape,"(545099, 17)"
5,Transformed data shape,"(545099, 17)"
6,Transformed train set shape,"(392713, 17)"
7,Transformed test set shape,"(152386, 17)"
8,Numeric features,16
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.9165,0.0,0.9165,0.9396,0.9178,0.7982,0.8162,0.424
lda,Linear Discriminant Analysis,0.9165,0.9762,0.9165,0.9402,0.9179,0.7997,0.8176,0.85
dt,Decision Tree Classifier,0.9111,0.9131,0.9111,0.9499,0.9105,0.8132,0.834,1.371
rf,Random Forest Classifier,0.909,0.9953,0.909,0.9607,0.9017,0.8402,0.8558,10.156
lightgbm,Light Gradient Boosting Machine,0.9082,0.9933,0.9082,0.9605,0.9012,0.838,0.8546,2.262
et,Extra Trees Classifier,0.9053,0.9968,0.9053,0.9588,0.8976,0.8318,0.8486,2.689
knn,K Neighbors Classifier,0.9028,0.9247,0.9028,0.955,0.896,0.8211,0.8391,22.253
gbc,Gradient Boosting Classifier,0.9021,0.965,0.9021,0.9527,0.8961,0.8147,0.8337,22.794
ada,Ada Boost Classifier,0.8935,0.9441,0.8935,0.9458,0.8867,0.7903,0.8119,5.508
svm,SVM - Linear Kernel,0.8865,0.0,0.8865,0.9291,0.8852,0.7526,0.775,0.472


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ridge Classifier,0.5011,0,0.5011,0.4957,0.4966,-0.0155,-0.0157


Loading train sets...
9_15_1.csv
9_15_2.csv
9_2_1.csv
9_2_2.csv
9_14_2.csv
9_14_1.csv
Loading test sets...
9_3_2.csv
9_3_1.csv


Unnamed: 0,Description,Value
0,Session id,2305
1,Target,Treatment
2,Target type,Binary
3,Target mapping,"ON: 0, PRE: 1"
4,Original data shape,"(528181, 17)"
5,Transformed data shape,"(528181, 17)"
6,Transformed train set shape,"(391633, 17)"
7,Transformed test set shape,"(136548, 17)"
8,Numeric features,16
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.92,0.9735,0.92,0.9432,0.9222,0.7741,0.7927,24.653
lightgbm,Light Gradient Boosting Machine,0.9197,0.9782,0.9197,0.9512,0.9235,0.7892,0.8126,2.651
ada,Ada Boost Classifier,0.9184,0.9574,0.9184,0.9372,0.92,0.7602,0.7762,6.055
et,Extra Trees Classifier,0.9145,0.9823,0.9145,0.9525,0.9196,0.7918,0.8153,3.064
knn,K Neighbors Classifier,0.9095,0.9206,0.9095,0.9468,0.9155,0.7773,0.799,298.753
rf,Random Forest Classifier,0.9079,0.9828,0.9079,0.9537,0.9124,0.786,0.8128,11.737
ridge,Ridge Classifier,0.8935,0.0,0.8935,0.9134,0.8926,0.6639,0.6882,0.692
lda,Linear Discriminant Analysis,0.8898,0.9179,0.8898,0.9131,0.8904,0.6642,0.6882,0.959
svm,SVM - Linear Kernel,0.8893,0.0,0.8893,0.9137,0.8892,0.6615,0.6865,0.868
lr,Logistic Regression,0.8884,0.9075,0.8884,0.9083,0.8881,0.6537,0.6751,1.046


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.447,0.4894,0.447,0.5013,0.4431,-0.0395,-0.0449


Loading train sets...
9_15_1.csv
9_15_2.csv
9_2_1.csv
9_2_2.csv
9_3_2.csv
9_3_1.csv
Loading test sets...
9_14_2.csv
9_14_1.csv


Unnamed: 0,Description,Value
0,Session id,3351
1,Target,Treatment
2,Target type,Binary
3,Target mapping,"ON: 0, PRE: 1"
4,Original data shape,"(485925, 17)"
5,Transformed data shape,"(485925, 17)"
6,Transformed train set shape,"(266562, 17)"
7,Transformed test set shape,"(219363, 17)"
8,Numeric features,16
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9383,0.9853,0.9383,0.9456,0.9379,0.877,0.8838,2.517
knn,K Neighbors Classifier,0.9363,0.9455,0.9363,0.9435,0.9359,0.873,0.8797,11.38
rf,Random Forest Classifier,0.9268,0.986,0.9268,0.9361,0.9262,0.8541,0.8628,11.212
lightgbm,Light Gradient Boosting Machine,0.9235,0.9857,0.9235,0.9328,0.923,0.8472,0.8561,2.401
gbc,Gradient Boosting Classifier,0.9056,0.9797,0.9056,0.9146,0.905,0.811,0.8197,26.02
dt,Decision Tree Classifier,0.8917,0.8928,0.8917,0.9002,0.8911,0.7838,0.7917,1.818
ada,Ada Boost Classifier,0.8645,0.936,0.8645,0.8786,0.8632,0.7298,0.7428,5.896
qda,Quadratic Discriminant Analysis,0.8472,0.917,0.8472,0.87,0.8439,0.6966,0.7167,0.44
svm,SVM - Linear Kernel,0.8373,0.0,0.8373,0.863,0.8342,0.6759,0.6997,0.384
ridge,Ridge Classifier,0.8328,0.0,0.8328,0.8595,0.8289,0.665,0.6907,0.358


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.8277,0.6601,0.8277,0.9239,0.8688,0.1119,0.1378


Loading train sets...
9_2_1.csv
9_2_2.csv
9_14_2.csv
9_14_1.csv
9_3_2.csv
9_3_1.csv
Loading test sets...
9_15_1.csv
9_15_2.csv


Unnamed: 0,Description,Value
0,Session id,7599
1,Target,Treatment
2,Target type,Binary
3,Target mapping,"ON: 0, PRE: 1"
4,Original data shape,"(542479, 17)"
5,Transformed data shape,"(542479, 17)"
6,Transformed train set shape,"(375169, 17)"
7,Transformed test set shape,"(167310, 17)"
8,Numeric features,16
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.9077,0.9276,0.9077,0.944,0.9107,0.8003,0.8255,21.847
lightgbm,Light Gradient Boosting Machine,0.8802,0.9873,0.8802,0.942,0.878,0.7722,0.8027,2.694
dt,Decision Tree Classifier,0.8778,0.8872,0.8778,0.928,0.8786,0.7612,0.7831,1.603
rf,Random Forest Classifier,0.8721,0.989,0.8721,0.9357,0.8715,0.7484,0.7835,10.067
et,Extra Trees Classifier,0.8714,0.9908,0.8714,0.9363,0.872,0.7501,0.7856,2.812
gbc,Gradient Boosting Classifier,0.8616,0.9771,0.8616,0.9276,0.8613,0.7314,0.7642,23.892
ridge,Ridge Classifier,0.8546,0.0,0.8546,0.9077,0.8547,0.6773,0.7138,0.443
lda,Linear Discriminant Analysis,0.8537,0.9468,0.8537,0.9086,0.8539,0.6782,0.7154,0.938
lr,Logistic Regression,0.8511,0.9293,0.8511,0.9101,0.8527,0.6846,0.7208,1.044
svm,SVM - Linear Kernel,0.8498,0.0,0.8498,0.9107,0.8505,0.6801,0.7186,0.519


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,K Neighbors Classifier,0.5988,0.5107,0.5988,0.5618,0.4843,0.0214,0.0433
