In [1]:
from pycaret.utils import version
version()

'3.2.0'

In [2]:
import pandas as pd
from typing import List
from pathlib import Path
import os
from pycaret.classification import *

In [3]:
patients = ["9_2","9_3","9_14", "9_15"]
radius = 15
COLUMNS_OF_INTEREST = ['pRB', 'CD45', 'CK19', 'Ki67', 'aSMA', 'Ecad', 'PR', 'CK14', 'HER2', 'AR', 'CK17', 'p21', 'Vimentin',
                  'pERK', 'EGFR', 'ER', "Treatment"]


In [4]:
save_folder = Path("v2", str(radius))

if not save_folder.exists():
    save_folder.mkdir(parents=True)

# Data Loader

In [5]:
def load_patient(patient: str) -> {}:
    data_path = Path("..","data_2", "mapped_data", f"{radius}")

    data_frames: dict = {}

    for root, dirs, files in os.walk(data_path):
        for file in files:

            if Path(file).suffix != ".csv":
                continue

            if patient not in file:
                continue

            print(file)

            data_frames[Path(file).stem] = pd.read_csv(Path(data_path, file))

    return data_frames

In [6]:
def load_files(patient_to_be_excluded: str) -> (pd.DataFrame, List):
    if not patient_to_be_excluded:
        raise ValueError("Patient to be excluded needs to be specified.")

    data_path = Path("..", "data_2", "mapped_data",  f"{radius}" )

    data_frames: [pd.DataFrame] = []
    loaded_files = []
    for root, dirs, files in os.walk(data_path):
        for file in files:

            if Path(file).suffix != ".csv":
                continue

            if patient_to_be_excluded in file:
                continue

            print(file)

            data_frames.append(pd.read_csv(Path(data_path, file)))
            loaded_files.append(file)

    data_frames = pd.concat(data_frames, axis=0)
    return data_frames, loaded_files

In [None]:
for patient in patients:
    print("Loading train sets...")
    train_set, _ = load_files(patient)
    print("Loading test sets...")
    test_sets: dict = load_patient(patient=patient)
    test_set = list(test_sets.values())
    test_set = pd.concat(test_set, axis=0)

    
    
    if "CellID" in train_set.columns:
        train_set.drop(columns=["CellID"])
    train_set = train_set[COLUMNS_OF_INTEREST].copy()
    #train_set = train_set.drop(columns=["Patient Id", "Sample Id", "MouseIgG1"])
    train_set.reset_index(drop=True, inplace=True)
   
    if "CellID" in test_set.columns:
        test_set.drop(columns=["CellID"])
    test_set = test_set[COLUMNS_OF_INTEREST].copy()
    #test_set = test_set.drop(columns=["Patient Id", "Sample Id", "MouseIgG1"])
    experiment = setup(data=train_set,test_data=test_set, target='Treatment', index=False)
    # Perform initial model comparison.
    best = compare_models()
    model_comparison_df = pull()
    #save model comparison
    excluded_patient_file_name: Path = Path(f"{patient}_excluded_model_performance.csv")
    if not excluded_patient_file_name.exists():
        model_comparison_df.to_csv(Path(save_folder,excluded_patient_file_name))

    # Predict on test set
    # Evaluate the model (this will show the performance metrics in PyCaret's UI)
    predictions = predict_model(best, data=test_set)
    metrics = pull()
    metrics.to_csv(Path(save_folder, f"{patient}_metrics.csv"))

    


Loading train sets...
9_15_1.csv
9_15_2.csv
9_14_2.csv
9_14_1.csv
9_3_2.csv
9_3_1.csv
Loading test sets...
9_2_1.csv
9_2_2.csv


Unnamed: 0,Description,Value
0,Session id,643
1,Target,Treatment
2,Target type,Binary
3,Target mapping,"ON: 0, PRE: 1"
4,Original data shape,"(545099, 17)"
5,Transformed data shape,"(545099, 17)"
6,Transformed train set shape,"(392713, 17)"
7,Transformed test set shape,"(152386, 17)"
8,Numeric features,16
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9067,0.9942,0.9067,0.9593,0.8993,0.8358,0.8515,15.488
et,Extra Trees Classifier,0.9063,0.996,0.9063,0.9591,0.8988,0.8343,0.8504,5.028
lightgbm,Light Gradient Boosting Machine,0.9063,0.9942,0.9063,0.9591,0.899,0.835,0.8508,3.255
gbc,Gradient Boosting Classifier,0.9036,0.977,0.9036,0.952,0.899,0.815,0.8341,27.357
dt,Decision Tree Classifier,0.9023,0.8922,0.9023,0.9495,0.8988,0.8124,0.8301,1.894
knn,K Neighbors Classifier,0.8975,0.9024,0.8975,0.9508,0.89,0.8093,0.8268,25.7
lda,Linear Discriminant Analysis,0.8972,0.963,0.8972,0.92,0.8967,0.7383,0.7595,1.03
ridge,Ridge Classifier,0.8963,0.0,0.8963,0.9184,0.8953,0.7325,0.7544,0.499
ada,Ada Boost Classifier,0.8864,0.9793,0.8864,0.939,0.8792,0.7747,0.7943,6.449
svm,SVM - Linear Kernel,0.8793,0.0,0.8793,0.9169,0.8778,0.72,0.7432,0.581


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.4644,0.4697,0.4644,0.4735,0.4621,-0.0572,-0.0589


Loading train sets...
9_15_1.csv
9_15_2.csv
9_2_1.csv
9_2_2.csv
9_14_2.csv
9_14_1.csv
Loading test sets...
9_3_2.csv
9_3_1.csv


Unnamed: 0,Description,Value
0,Session id,7433
1,Target,Treatment
2,Target type,Binary
3,Target mapping,"ON: 0, PRE: 1"
4,Original data shape,"(528181, 17)"
5,Transformed data shape,"(528181, 17)"
6,Transformed train set shape,"(391633, 17)"
7,Transformed test set shape,"(136548, 17)"
8,Numeric features,16
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8622,0.8899,0.8622,0.879,0.8614,0.5641,0.5831,1.163


Processing:   0%|          | 0/61 [00:00<?, ?it/s]