In [8]:
# Show the installed PyCaret version so the results below can be tied to it.
# The bare call must stay the last expression of the cell: that is what
# makes the notebook display the returned version string.
from pycaret.utils import version
version()

'3.2.0'

In [9]:
import os
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import pandas as pd
from pycaret.classification import *

In [10]:
# Radius setting: selects the ../data/mapped_data/<radius> input folder and
# also names the folder the results are written to.
radius = 90

# Patients that are held out, one at a time, as the test set.
patients = [
    "patient_c",
    "patient_d",
]


In [11]:
# All result files of this run are written to a folder named after the radius.
save_folder = Path(str(radius))

# exist_ok=True makes the cell safe to re-run and removes the
# check-then-create race of a separate exists() test.
save_folder.mkdir(parents=True, exist_ok=True)

# Data Loader

In [12]:
def load_patient(
    patient: str, data_dir: Optional[Path] = None
) -> Dict[str, pd.DataFrame]:
    """Load every CSV file that belongs to a single patient.

    The data directory is walked recursively and each ``.csv`` file whose
    name contains ``patient`` is read with pandas. The name of every loaded
    file is printed.

    Args:
        patient: Text identifying the patient; a file is loaded when this
            text occurs anywhere in its file name.
        data_dir: Directory to search. When omitted, the notebook default
            ``../data/mapped_data/<radius>`` is used.

    Returns:
        Mapping from file stem (file name without the ``.csv`` suffix) to
        the loaded DataFrame. Empty when no file matches.
    """
    if data_dir is None:
        data_path = Path("..", "data", "mapped_data", f"{radius}")
    else:
        data_path = Path(data_dir)

    data_frames: Dict[str, pd.DataFrame] = {}

    for root, dirs, files in os.walk(data_path):
        # Sorting directories in place and iterating sorted file names makes
        # the traversal order independent of the file system.
        dirs.sort()
        for file in sorted(files):

            if Path(file).suffix != ".csv":
                continue

            if patient not in file:
                continue

            print(file)

            # Join with `root`, not `data_path`: os.walk also yields files
            # that live in subdirectories of `data_path`.
            data_frames[Path(file).stem] = pd.read_csv(Path(root, file))

    return data_frames

In [13]:
def load_files(
    patient_to_be_excluded: str, data_dir: Optional[Path] = None
) -> Tuple[pd.DataFrame, List[str]]:
    """Load and concatenate all CSV files except those of one patient.

    The data directory is walked recursively; every ``.csv`` file whose name
    does NOT contain ``patient_to_be_excluded`` is read and the frames are
    stacked row-wise. The name of every loaded file is printed. Files are
    visited in sorted order so the row order of the result is reproducible.

    Args:
        patient_to_be_excluded: Text identifying the patient to leave out; a
            file is skipped when this text occurs anywhere in its name.
        data_dir: Directory to search. When omitted, the notebook default
            ``../data/mapped_data/<radius>`` is used.

    Returns:
        A tuple ``(frame, loaded_files)``: the row-wise concatenation of all
        loaded files (original per-file row indices are kept, so the index
        may contain duplicates) and the list of loaded file names.

    Raises:
        ValueError: If ``patient_to_be_excluded`` is empty, or if no CSV
            file remains to be loaded.
    """
    if not patient_to_be_excluded:
        raise ValueError("Patient to be excluded needs to be specified.")

    if data_dir is None:
        data_path = Path("..", "data", "mapped_data", f"{radius}")
    else:
        data_path = Path(data_dir)

    data_frames: List[pd.DataFrame] = []
    loaded_files: List[str] = []
    for root, dirs, files in os.walk(data_path):
        # Deterministic traversal: sorted directories, sorted file names.
        dirs.sort()
        for file in sorted(files):

            if Path(file).suffix != ".csv":
                continue

            if patient_to_be_excluded in file:
                continue

            print(file)

            # Join with `root`, not `data_path`: os.walk also yields files
            # that live in subdirectories of `data_path`.
            data_frames.append(pd.read_csv(Path(root, file)))
            loaded_files.append(file)

    if not data_frames:
        # pd.concat([]) would raise a ValueError with an unhelpful message;
        # keep the exception type but say what actually went wrong.
        raise ValueError(
            f"No CSV files found in {data_path} after excluding "
            f"'{patient_to_be_excluded}'."
        )

    return pd.concat(data_frames, axis=0), loaded_files

In [14]:
def _drop_non_feature_columns(frame: pd.DataFrame) -> pd.DataFrame:
    """Return ``frame`` without the columns excluded from modelling.

    ``CellID`` is removed when present; ``Patient Id``, ``Sample Id`` and
    ``MouseIgG1`` are required and a missing one raises ``KeyError``. The
    result gets a fresh 0..n-1 index because the concatenated input frames
    carry repeated per-file row indices.
    """
    if "CellID" in frame.columns:
        # DataFrame.drop returns a new frame; the result must be assigned.
        frame = frame.drop(columns=["CellID"])
    frame = frame.drop(columns=["Patient Id", "Sample Id", "MouseIgG1"])
    return frame.reset_index(drop=True)


# Leave-one-patient-out evaluation: for each patient, train on all other
# patients' files and test on the held-out patient's files.
for patient in patients:

    train_set, _ = load_files(patient)
    test_sets: dict = load_patient(patient=patient)
    if not test_sets:
        raise ValueError(f"No data files found for held-out patient '{patient}'.")
    test_set = pd.concat(list(test_sets.values()), axis=0)

    # Same preprocessing for both splits so they expose identical columns.
    train_set = _drop_non_feature_columns(train_set)
    test_set = _drop_non_feature_columns(test_set)

    experiment = setup(data=train_set, test_data=test_set, target='Treatment', index=False)

    # Perform initial model comparison.
    best = compare_models()
    model_comparison_df = pull()

    # Save the model comparison. The existence check is done on the path that
    # is actually written (inside save_folder), so an existing result is not
    # overwritten; delete the file to regenerate it.
    comparison_path = Path(save_folder, f"{patient}_excluded_model_performance.csv")
    if not comparison_path.exists():
        model_comparison_df.to_csv(comparison_path)

    # Predict on the held-out patient.
    # Evaluate the model (this will show the performance metrics in PyCaret's UI)
    predictions = predict_model(best, data=test_set)
    metrics = pull()
    metrics.to_csv(Path(save_folder, f"{patient}_metrics.csv"))

    


patient_d_272840.csv
patient_d_321920.csv
patient_c_272830.csv
patient_c_303148.csv


Unnamed: 0,Description,Value
0,Session id,8854
1,Target,Treatment
2,Target type,Binary
3,Target mapping,"ON: 0, PRE: 1"
4,Original data shape,"(153144, 55)"
5,Transformed data shape,"(153144, 55)"
6,Transformed train set shape,"(51732, 55)"
7,Transformed test set shape,"(101412, 55)"
8,Numeric features,54
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9866,0.9963,0.9866,0.9881,0.9855,0.944,0.9501,0.566
gbc,Gradient Boosting Classifier,0.9819,0.9943,0.9819,0.9847,0.9808,0.9281,0.9358,14.498
lr,Logistic Regression,0.9816,0.9954,0.9816,0.9842,0.9798,0.9227,0.932,2.975
ada,Ada Boost Classifier,0.9813,0.9935,0.9813,0.9839,0.9808,0.9292,0.9352,2.652
rf,Random Forest Classifier,0.9806,0.9978,0.9806,0.9845,0.9798,0.9258,0.9344,4.511
lightgbm,Light Gradient Boosting Machine,0.9793,0.9946,0.9793,0.9843,0.9786,0.9225,0.9323,1.474
ridge,Ridge Classifier,0.9718,0.0,0.9718,0.977,0.971,0.8953,0.9052,0.073
lda,Linear Discriminant Analysis,0.9711,0.9764,0.9711,0.9776,0.9707,0.8963,0.9068,0.273
svm,SVM - Linear Kernel,0.963,0.0,0.963,0.9734,0.9641,0.8792,0.8906,0.172
knn,K Neighbors Classifier,0.9618,0.9713,0.9618,0.9645,0.9627,0.8677,0.8692,0.857


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.4899,0.4969,0.4899,0.4946,0.3892,-0.0036,-0.0063


patient_c_272830.csv
patient_c_303148.csv
patient_d_272840.csv
patient_d_321920.csv


Unnamed: 0,Description,Value
0,Session id,8964
1,Target,Treatment
2,Target type,Binary
3,Target mapping,"ON: 0, PRE: 1"
4,Original data shape,"(120482, 55)"
5,Transformed data shape,"(120482, 55)"
6,Transformed train set shape,"(51740, 55)"
7,Transformed test set shape,"(68742, 55)"
8,Numeric features,54
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.983,0.9902,0.983,0.9838,0.983,0.9661,0.9668,2.457
et,Extra Trees Classifier,0.9817,0.9958,0.9817,0.9819,0.9817,0.9633,0.9635,0.609
rf,Random Forest Classifier,0.9764,0.9937,0.9764,0.9769,0.9764,0.9527,0.9532,3.93
gbc,Gradient Boosting Classifier,0.976,0.9933,0.976,0.9764,0.9759,0.9518,0.9523,13.597
lightgbm,Light Gradient Boosting Machine,0.9734,0.9917,0.9734,0.974,0.9734,0.9468,0.9474,1.587
ada,Ada Boost Classifier,0.9723,0.9886,0.9723,0.9734,0.9722,0.9447,0.9457,2.728
qda,Quadratic Discriminant Analysis,0.9607,0.9753,0.9607,0.9657,0.9601,0.9209,0.926,0.2
svm,SVM - Linear Kernel,0.9597,0.0,0.9597,0.9623,0.9596,0.9194,0.9219,0.187
lda,Linear Discriminant Analysis,0.9533,0.9859,0.9533,0.9586,0.9531,0.907,0.912,0.595
ridge,Ridge Classifier,0.9532,0.0,0.9532,0.9586,0.953,0.9068,0.9118,0.078


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.2486,0.547,0.2486,0.451,0.1243,-0.0186,-0.0939
