In [1]:
from pycaret.utils import version
version()

'3.2.0'

In [2]:
import pandas as pd
from typing import List
from pathlib import Path
import os
from pycaret.classification import *

In [9]:
patients = ['209184_1', '209184_2', '312158_1', '312158_2', '475139_1', '475139_2', '484265_1', '484265_2', '566112_1', 
            '566112_2', '657907_1', '657907_2', '672431_1', '672431_2', '798747_1', '798747_2', '894222_1', '894222_2']
radius = 0
COLUMNS_OF_INTEREST = ['4-1BB', 'AKT', 'AR', 'Arg', 'B7-H3', 'BCL2', 'BCL6', 'BCLXL', 'BIM', 'BRAF', 'Beta-catenin',
              'C902-B2M', 'C902-panCK', 'CASP9', 'CCR7', 'CD11a', 'CD11b', 'CD11c', 'CD123', 'CD127', 'CD138',
              'CD14', 'CD15', 'CD16', 'CD163', 'CD19', 'CD20', 'CD226', 'CD25', 'CD27', 'CD3', 'CD31', 'CD34', 'CD38',
              'CD39', 'CD4', 'CD40', 'CD44', 'CD45', 'CD45RA', 'CD45RO', 'CD56', 'CD66b', 'CD68', 'CD8', 'CD80',
              'CTLA4',
              'CyclinD1', 'Desmin', 'E-cad', 'EGFR', 'ERalpha', 'EpCAM', 'FABP4', 'FAP', 'FN1', 'FOXP3', 'GAPDH',
              'GATA-3', 'GITR',
              'GSK3b', 'GZMA', 'GZMB', 'HER2.y', 'HLA-DRA', 'ICAM1', 'ICOS', 'IDO1', 'IL-18', 'IL-1B', 'INPP4B', 'IgD',
              'KI67', 'LAG3',
              'LAMP1', 'MART-1', 'MET', 'MPO', 'NF-kBp65', 'NY-ESO-1', 'OX40L', 'PARP', 'PD1', 'PDL1', 'PDL2', 'PLCG1',
              'PR.y', 'PTEN',
              'PVR', 'Pan-AKT', 'RSK1p90', 'SMA', 'STING', 'T-bet', 'TCF-1', 'TIM3', 'VIM', 'VISTA', 'cJun', 'gH2AX',
              'iNOS', 'p38MAPK',
              'p44-42MAP-ERK', 'p44-42MAPK-ERK', 'p53', 'pGSK3ab', 'pJNK', 'pMEK1', 'pPRAS40', 'pTuberin', 'panRAS',
              "ER", "PR", "HER2.x"]


In [10]:
folder="data_v3"
save_folder = Path("v3", str(radius))

if not save_folder.exists():
    save_folder.mkdir(parents=True)

# Data Loader

In [11]:
def load_patient(patient: str) -> {}:
    data_path = Path("..",folder, "mapped_data", f"{radius}")

    data_frames: dict = {}

    for root, dirs, files in os.walk(data_path):
        for file in files:

            if Path(file).suffix != ".csv":
                continue

            if patient not in file:
                continue

            print(file)

            data_frames[Path(file).stem] = pd.read_csv(Path(data_path, file))

    return data_frames

In [12]:
def load_files(patient_to_be_excluded: str) -> (pd.DataFrame, List):
    if not patient_to_be_excluded:
        raise ValueError("Patient to be excluded needs to be specified.")

    data_path = Path("..", folder, "mapped_data",  f"{radius}" )

    data_frames: [pd.DataFrame] = []
    loaded_files = []
    for root, dirs, files in os.walk(data_path):
        for file in files:

            if Path(file).suffix != ".csv":
                continue

            if patient_to_be_excluded in file:
                continue

            print(file)

            data_frames.append(pd.read_csv(Path(data_path, file)))
            loaded_files.append(file)

    data_frames = pd.concat(data_frames, axis=0)
    return data_frames, loaded_files

In [13]:
for patient in patients:
    print("Loading train sets...")
    train_set, _ = load_files(patient)
    print("Loading test sets...")
    test_sets: dict = load_patient(patient=patient)
    test_set = list(test_sets.values())
    test_set = pd.concat(test_set, axis=0)

    
    
    if "CellID" in train_set.columns:
        train_set.drop(columns=["CellID"])
    train_set = train_set[COLUMNS_OF_INTEREST].copy()
    #train_set = train_set.drop(columns=["Patient Id", "Sample Id", "MouseIgG1"])
    train_set.reset_index(drop=True, inplace=True)
   
    if "CellID" in test_set.columns:
        test_set.drop(columns=["CellID"])
    test_set = test_set[COLUMNS_OF_INTEREST].copy()
    #test_set = test_set.drop(columns=["Patient Id", "Sample Id", "MouseIgG1"])
    experiment = setup(data=train_set,test_data=test_set, target='Treatment', index=False)
    # Perform initial model comparison.
    best = compare_models()
    model_comparison_df = pull()
    #save model comparison
    excluded_patient_file_name: Path = Path(f"{patient}_excluded_model_performance.csv")
    if not excluded_patient_file_name.exists():
        model_comparison_df.to_csv(Path(save_folder,excluded_patient_file_name))

    # Predict on test set
    # Evaluate the model (this will show the performance metrics in PyCaret's UI)
    predictions = predict_model(best, data=test_set)
    metrics = pull()
    metrics.to_csv(Path(save_folder, f"{patient}_metrics.csv"))

    


Loading train sets...
672431_1.csv
475139_2.csv
657907_1.csv
672431_2.csv
475139_1.csv
657907_2.csv
894222_2.csv
312158_1.csv
894222_1.csv
312158_2.csv
209184_2.csv
484265_2.csv
798747_2.csv
484265_1.csv
798747_1.csv
566112_2.csv
566112_1.csv
Loading test sets...
209184_1.csv


KeyError: "['PR'] not in index"