In [1]:
# Display the installed PyCaret version so the environment of this run is recorded.
from pycaret.utils import version
version()

'3.2.0'

In [2]:
import pandas as pd
from typing import List
from pathlib import Path
import os
from pycaret.classification import *

In [3]:
# Patient identifiers. Each one is held out in turn by the evaluation loop and is
# matched as a substring against the CSV file names when loading data.
patients = ['209184',  '312158', '475139', '484265', 
            '566112', '657907', '672431', '798747', '894222']
# Used as the sub-folder name below "mapped_data" and as the results sub-folder.
# NOTE(review): presumably a neighbourhood radius used when the data was mapped — confirm units.
radius = 15
# Columns kept from every loaded frame before modelling; "Treatment" (last entry)
# is the classification target, all others are used as features.
# NOTE(review): both 'p44-42MAP-ERK' and 'p44-42MAPK-ERK' are listed — confirm that
# these are two distinct columns in the data and not a spelling variant of one marker.
COLUMNS_OF_INTEREST =['VISTA', 'CD163', 'FOXP3', 'IDO1', 'STING', 'CASP9', 'ERalpha', 'pMEK1', 'panRAS', 'CD44', 'CD38',
                     'MET', 'ICAM1', 'p53', 'p44-42MAP-ERK', 'EpCAM', 'VIM', 'IgD', 'KI67', 'FABP4', 'TIM3', 'TCF-1',
                     'PDL1', 'CD4', 'E-cad', 'CD39', 'AR', 'ICOS', 'CD14', 'CD15', 'CD8', 'GATA-3', 'PDL2', 'CD3',
                     'CD66b', 'GZMA', 'GSK3b', 'FAP', 'PARP', 'AKT', 'CD19', 'PTEN', 'pTuberin', 'CD45', 'CD123',
                     'CD31', 'p38MAPK', 'INPP4B', 'CD45RA', 'CD11a', 'CCR7', 'CD138', 'CD11b', 'IL-1B', 'pJNK', 'CD56',
                     'GITR', 'CD27', 'NF-kBp65', 'LAG3', 'CD16', 'BCL2', 'pGSK3ab', 'MART-1', 'CTLA4', 'SMA', 'GZMB',
                     'BIM', 'CD68', 'p44-42MAPK-ERK', 'CD25', 'CyclinD1', 'HLA-DRA', 'CD45RO', 'OX40L', 'LAMP1',
                     'gH2AX', 'IL-18', 'BRAF', 'PLCG1', 'CD20', 'PVR', 'B7-H3', 'iNOS', '4-1BB', 'Desmin', 'CD11c',
                     'CD40', 'CD226', 'cJun', 'pPRAS40', 'PD1', 'FN1', 'MPO', 'Beta-catenin', 'CD34', 'BCLXL', 'EGFR',
                     'RSK1p90', 'Pan-AKT', 'NY-ESO-1', 'GAPDH', 'CD80', 'T-bet', 'BCL6', 'Arg', 'CD127', "Treatment"]

In [4]:
# Name of the data directory (resolved relative to the parent directory by the loaders).
folder = "data_v3"
# Results are written to v3/<radius>/.
save_folder = Path("v3", str(radius))

# exist_ok=True makes the cell safely re-runnable and avoids the
# check-then-create race of a separate exists() test.
save_folder.mkdir(parents=True, exist_ok=True)

# Data Loader

In [5]:
def load_patient(patient: str) -> dict:
    """Load every CSV file belonging to one patient.

    Walks ``../<folder>/mapped_data/<radius>`` (``folder`` and ``radius`` are
    notebook-level globals) and reads each ``.csv`` file whose file name
    contains ``patient``.

    Args:
        patient: Patient identifier; matched as a substring of the file name.

    Returns:
        Dict mapping the file stem (e.g. ``"209184_1"``) to the loaded
        DataFrame. Empty when no file matches.

    Raises:
        AssertionError: If a loaded file has no "Treatment" column.
    """
    data_path = Path("..", folder, "mapped_data", f"{radius}")

    data_frames: dict = {}

    for root, _dirs, files in os.walk(data_path):
        for file in files:
            if Path(file).suffix != ".csv":
                continue

            if patient not in file:
                continue

            print(file)
            # os.walk descends into sub-directories, so the file must be joined
            # with the directory it was found in (`root`), not the top-level path.
            df = pd.read_csv(Path(root, file))
            assert "Treatment" in df.columns, f"Treatment column is missing for dataframe of patient {file}"

            data_frames[Path(file).stem] = df

    return data_frames

In [6]:
def load_files(patient_to_be_excluded: str) -> tuple:
    """Load and concatenate the CSV files of all patients except one.

    Walks ``../<folder>/mapped_data/<radius>`` (``folder`` and ``radius`` are
    notebook-level globals) and reads every ``.csv`` file whose file name does
    not contain ``patient_to_be_excluded``.

    Args:
        patient_to_be_excluded: Patient identifier to hold out; matched as a
            substring of the file name.

    Returns:
        A ``(data, loaded_files)`` tuple: the row-wise concatenation of all
        loaded frames (original per-file indices are kept) and the list of
        file names that were loaded.

    Raises:
        ValueError: If ``patient_to_be_excluded`` is empty, or if no CSV file
            remains after the exclusion.
        AssertionError: If a loaded file lacks the "Treatment" or "VISTA" column.
    """
    if not patient_to_be_excluded:
        raise ValueError("Patient to be excluded needs to be specified.")

    data_path = Path("..", folder, "mapped_data",  f"{radius}" )

    frames: List[pd.DataFrame] = []
    loaded_files = []
    for root, _dirs, files in os.walk(data_path):
        for file in files:
            if Path(file).suffix != ".csv":
                continue

            if patient_to_be_excluded in file:
                continue

            print(file)
            # os.walk descends into sub-directories, so the file must be joined
            # with the directory it was found in (`root`), not the top-level path.
            df = pd.read_csv(Path(root, file))
            assert "Treatment" in df.columns, f"Treatment column is missing for dataframe of patient {file}"
            assert "VISTA" in df.columns, f"VISTA column is missing for dataframe of patient {file}"
            frames.append(df)
            loaded_files.append(file)

    if not frames:
        # pd.concat([]) would raise an opaque "No objects to concatenate" ValueError.
        raise ValueError(
            f"No CSV files found in {data_path} after excluding patient {patient_to_be_excluded}."
        )

    return pd.concat(frames, axis=0), loaded_files

In [None]:
# Leave-one-patient-out evaluation: for every patient, train on the files of all
# other patients, compare models with PyCaret, and score the best model on the
# held-out patient's files. Results are written as CSV files into `save_folder`.
for patient in patients:
    print(f"Handling patient {patient}")
    print("Loading train sets...")
    train_set, _ = load_files(patient_to_be_excluded=patient)
    print("Loading test sets...")
    test_sets: dict = load_patient(patient=patient)
    if not test_sets:
        # pd.concat([]) would otherwise fail with an opaque "No objects to concatenate".
        raise ValueError(f"No test files found for patient {patient}.")
    test_set = pd.concat(list(test_sets.values()), axis=0)

    # Selecting COLUMNS_OF_INTEREST already discards every other column
    # (e.g. "CellID"), so no separate drop is needed. The index is reset on both
    # frames because concatenation leaves duplicate per-file row labels.
    train_set = train_set[COLUMNS_OF_INTEREST].reset_index(drop=True)
    test_set = test_set[COLUMNS_OF_INTEREST].reset_index(drop=True)

    # NOTE: no session_id is passed, so PyCaret chooses the seed itself and
    # results can differ between runs.
    experiment = setup(data=train_set, test_data=test_set, target='Treatment', index=False)

    # Perform initial model comparison; pull() returns the last displayed score grid.
    best = compare_models()
    model_comparison_df = pull()

    # Save the model comparison unless it already exists. The existence check
    # must use the same path the file is written to (inside save_folder).
    comparison_path = Path(save_folder, f"{patient}_excluded_model_performance.csv")
    if not comparison_path.exists():
        model_comparison_df.to_csv(comparison_path)

    # Predict on the held-out patient and store the resulting metrics.
    predictions = predict_model(best, data=test_set)
    metrics = pull()
    metrics.to_csv(Path(save_folder, f"{patient}_metrics.csv"))

    


Handling patient 209184
Loading train sets...
672431_1.csv
475139_2.csv
657907_1.csv
672431_2.csv
475139_1.csv
657907_2.csv
894222_2.csv
312158_1.csv
894222_1.csv
312158_2.csv
484265_2.csv
798747_2.csv
484265_1.csv
798747_1.csv
566112_2.csv
566112_1.csv
Loading test sets...
209184_2.csv
209184_1.csv


Unnamed: 0,Description,Value
0,Session id,2040
1,Target,Treatment
2,Target type,Binary
3,Target mapping,"ON: 0, PRE: 1"
4,Original data shape,"(575313, 108)"
5,Transformed data shape,"(575313, 108)"
6,Transformed train set shape,"(523739, 108)"
7,Transformed test set shape,"(51574, 108)"
8,Numeric features,107
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8728,0.9975,0.8728,0.962,0.8703,0.7989,0.8277,43.524


Processing:   0%|          | 0/61 [00:00<?, ?it/s]