In [1]:
# Display the installed PyCaret version so the recorded outputs below can be
# tied to the library release that produced them.
from pycaret.utils import version
version()

'3.2.0'

In [2]:
import pandas as pd
from typing import List
from pathlib import Path
import os
from pycaret.classification import *

In [9]:
# Patients processed by the evaluation loop: each one in turn is excluded from
# the training data and used as the test set.
patients = [
    "patient_c",
    "patient_d",
]
# Selects the "../data/mapped_data/<radius>" input folder and names the output folder.
radius = 120


In [18]:
# All result files are written to a folder named after the radius (relative to
# the current working directory).
save_folder = Path(str(radius))

# exist_ok=True makes the cell idempotent and avoids the check-then-create race.
save_folder.mkdir(parents=True, exist_ok=True)

# Data Loader

In [14]:
def load_patient(patient: str, data_path=None) -> dict:
    """Load every CSV file that belongs to one patient.

    The directory tree below ``data_path`` is searched recursively. A file is
    loaded when its suffix is ``.csv`` and ``patient`` occurs in its file name.

    Args:
        patient: Substring that must be part of the file name.
        data_path: Directory to search. Defaults to
            ``../data/mapped_data/<radius>`` using the notebook-level ``radius``.

    Returns:
        Dict mapping the file stem (name without ``.csv``) to the loaded
        DataFrame. Empty when no file matches. Files with the same stem in
        different subdirectories overwrite each other.
    """
    if data_path is None:
        data_path = Path("..", "data", "mapped_data", f"{radius}")

    data_frames: dict = {}

    for root, dirs, files in os.walk(data_path):
        # Sort in place so the traversal order (and therefore the row order of
        # any later concatenation) does not depend on the file system.
        dirs.sort()
        for file in sorted(files):
            if Path(file).suffix != ".csv":
                continue

            if patient not in file:
                continue

            print(file)

            # Join with ``root`` (not ``data_path``): os.walk also yields files
            # located in subdirectories.
            data_frames[Path(file).stem] = pd.read_csv(Path(root, file))

    return data_frames

In [15]:
def load_files(patient_to_be_excluded: str, data_path=None) -> "tuple[pd.DataFrame, List[str]]":
    """Load all CSV files except those of one patient and stack them row-wise.

    The directory tree below ``data_path`` is searched recursively. A file is
    skipped when its suffix is not ``.csv`` or when ``patient_to_be_excluded``
    occurs in its file name.

    Args:
        patient_to_be_excluded: Substring identifying the files to leave out.
        data_path: Directory to search. Defaults to
            ``../data/mapped_data/<radius>`` using the notebook-level ``radius``.

    Returns:
        A tuple ``(combined, loaded_files)``: the row-wise concatenation of all
        loaded frames (original per-file indices are kept) and the list of file
        names that were loaded, in load order.

    Raises:
        ValueError: If ``patient_to_be_excluded`` is empty, or if no CSV file
            remains after the exclusion.
    """
    if not patient_to_be_excluded:
        raise ValueError("Patient to be excluded needs to be specified.")

    if data_path is None:
        data_path = Path("..", "data", "mapped_data", f"{radius}")

    frames = []
    loaded_files = []
    for root, dirs, files in os.walk(data_path):
        # Sort in place so the load order is reproducible across file systems.
        dirs.sort()
        for file in sorted(files):
            if Path(file).suffix != ".csv":
                continue

            if patient_to_be_excluded in file:
                continue

            print(file)

            # Join with ``root`` (not ``data_path``): os.walk also yields files
            # located in subdirectories.
            frames.append(pd.read_csv(Path(root, file)))
            loaded_files.append(file)

    if not frames:
        # pd.concat([]) would raise a ValueError as well, but with a message
        # that does not say which directory or patient was involved.
        raise ValueError(
            f"No CSV files found under '{data_path}' after excluding '{patient_to_be_excluded}'."
        )

    return pd.concat(frames, axis=0), loaded_files

In [20]:
def _drop_non_feature_columns(frame: pd.DataFrame) -> pd.DataFrame:
    """Return ``frame`` without the columns excluded from modelling, with a fresh index."""
    # DataFrame.drop returns a new frame, so the result has to be assigned;
    # calling it without assignment leaves "CellID" in the data.
    if "CellID" in frame.columns:
        frame = frame.drop(columns=["CellID"])
    frame = frame.drop(columns=["Patient Id", "Sample Id", "MouseIgG1"])
    return frame.reset_index(drop=True)


# Leave-one-patient-out evaluation: for every patient, train on the files of
# all other patients and evaluate on the files of the held-out patient.
# NOTE(review): setup() is called without a session_id, so results differ
# between runs — consider passing a fixed seed.
for patient in patients:
    print(patient)
    train_set, _ = load_files(patient)
    test_sets: dict = load_patient(patient=patient)
    if not test_sets:
        raise ValueError(f"No CSV files found for held-out patient '{patient}'.")
    test_set = pd.concat(list(test_sets.values()), axis=0)

    train_set = _drop_non_feature_columns(train_set)
    test_set = _drop_non_feature_columns(test_set)

    experiment = setup(data=train_set, test_data=test_set, target='Treatment', index=False)

    # Perform initial model comparison.
    best = compare_models()
    model_comparison_df = pull()

    # Save the model comparison. The existence check must look at the same path
    # that is written to; delete the file to regenerate it.
    comparison_path = Path(save_folder, f"{patient}_excluded_model_performance.csv")
    if not comparison_path.exists():
        model_comparison_df.to_csv(comparison_path)

    # Predict on the test set (the held-out patient passed as test_data).
    holdout_pred = predict_model(best)
    # Name the results after the held-out patient; the previously used
    # ``file_name`` is not defined anywhere in this notebook.
    holdout_pred.to_csv(Path(save_folder, f"{patient}_results.csv"))

    


patient_c
patient_d_272840.csv
patient_d_321920.csv
patient_c_272830.csv
patient_c_303148.csv


Unnamed: 0,Description,Value
0,Session id,1498
1,Target,Treatment
2,Target type,Binary
3,Target mapping,"ON: 0, PRE: 1"
4,Original data shape,"(153144, 55)"
5,Transformed data shape,"(153144, 55)"
6,Transformed train set shape,"(51732, 55)"
7,Transformed test set shape,"(101412, 55)"
8,Numeric features,54
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9883,0.9983,0.9883,0.9895,0.9875,0.9524,0.957,0.518
ada,Ada Boost Classifier,0.9873,0.9947,0.9873,0.9883,0.9865,0.9488,0.953,2.84
gbc,Gradient Boosting Classifier,0.9854,0.9943,0.9854,0.987,0.9843,0.9398,0.9461,13.553
rf,Random Forest Classifier,0.9849,0.9993,0.9849,0.9872,0.9843,0.9413,0.9472,4.383
lr,Logistic Regression,0.983,0.9948,0.983,0.9854,0.9814,0.9286,0.9373,3.215
lightgbm,Light Gradient Boosting Machine,0.983,0.993,0.983,0.9856,0.9815,0.9296,0.9382,1.453
ridge,Ridge Classifier,0.9768,0.0,0.9768,0.9811,0.9762,0.9139,0.9223,0.082
lda,Linear Discriminant Analysis,0.9758,0.9798,0.9758,0.9811,0.9756,0.913,0.9218,0.271
dt,Decision Tree Classifier,0.9706,0.9506,0.9706,0.9761,0.9701,0.8927,0.9025,1.042
svm,SVM - Linear Kernel,0.9693,0.0,0.9693,0.9768,0.9701,0.8977,0.9064,0.404


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.4898,0.494,0.4898,0.4944,0.3974,-0.0043,-0.007


patient_d
patient_c_272830.csv
patient_c_303148.csv
patient_d_272840.csv
patient_d_321920.csv


Unnamed: 0,Description,Value
0,Session id,8924
1,Target,Treatment
2,Target type,Binary
3,Target mapping,"ON: 0, PRE: 1"
4,Original data shape,"(120482, 55)"
5,Transformed data shape,"(120482, 55)"
6,Transformed train set shape,"(51740, 55)"
7,Transformed test set shape,"(68742, 55)"
8,Numeric features,54
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.9845,0.9892,0.9845,0.9854,0.9845,0.9691,0.9699,2.678
et,Extra Trees Classifier,0.9826,0.9962,0.9826,0.9828,0.9826,0.9652,0.9654,0.636
rf,Random Forest Classifier,0.9792,0.9945,0.9792,0.9796,0.9792,0.9584,0.9588,3.632
gbc,Gradient Boosting Classifier,0.9768,0.9935,0.9768,0.9772,0.9768,0.9535,0.9539,13.464
lightgbm,Light Gradient Boosting Machine,0.9766,0.9932,0.9766,0.9773,0.9766,0.9531,0.9538,1.614
ada,Ada Boost Classifier,0.9696,0.9703,0.9696,0.9712,0.9694,0.9395,0.9409,2.716
qda,Quadratic Discriminant Analysis,0.9676,0.9797,0.9676,0.9722,0.967,0.9346,0.9394,0.229
ridge,Ridge Classifier,0.9606,0.0,0.9606,0.9653,0.9604,0.9214,0.926,0.079
lda,Linear Discriminant Analysis,0.9606,0.9869,0.9606,0.9654,0.9604,0.9216,0.9261,0.295
svm,SVM - Linear Kernel,0.9514,0.0,0.9514,0.9608,0.9505,0.9036,0.9123,0.171


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Logistic Regression,0.248,0.5558,0.248,0.4455,0.1181,-0.0147,-0.0849
