# Risk Group Prediction for B Cell Patients Treated with CHOP

## 1. Data
* Data File: `CanineLymphoma_Bcell_PFS_data.csv` (This dataset includes data merged from the flow cytometry dates of
FC_4000-7429_20241128.xlsx and FC_4000-7429_20241128.xlsx,
filtered by cases where Type = B and retained based on the PFS3mo criteria.)
* test_Patient Information File: `yyyymmdd_PatientInformation.csv`
* test_Blood test Information File: `yyyymmdd_CBC.CSV`
* test_Drug_sensitivity File: `yyyymmdd_sample_modeling.xlsx.CSV`
* test_Flow Cytometry Data File: `Flow Cytometry Data_Master file_KR_V2.xlsx`

## 2. Model Quality Control (QC)
* Compare Result File: `Bcell_chop_test_result_20250312.csv`

## 3. Inference Method
1. Extract B cell patients using the patient information file  
   * If only T cells are present, classify as T cell.
2. Convert values to binary based on predefined thresholds.  
3. Predict classification results:  
   * **High Risk**: predict_proba >= 0.3389855457837697 
   * **Low Risk**:  predict_proba < 0.3389855457837697
     
## 4. Library Versions
* Python version: 3.11.8
* pandas version: 2.2.1
* numpy version: 1.26.4
* scikit-learn version: 1.6.1
* supervised (mljar AutoML) version: 1.1.15

In [38]:
import os
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")  # Suppress all warnings

# Allow up to 300 columns to be shown when a DataFrame is displayed.
pd.options.display.max_columns = 300

# Identifier of the data drop to score; passed to preprocess_data(), which uses
# it both as the sub-folder name under data/ and as the file-name prefix.
DATE = '20250301'

# Data loading helper
def load_data(file_path):
    """Load a tabular file into a DataFrame, dispatching on its extension.

    The extension is compared case-insensitively, so ``.CSV`` / ``.XLSX``
    files are handled the same way as ``.csv`` / ``.xlsx``.

    Args:
        file_path: Path to a ``.csv`` or ``.xlsx`` file.

    Returns:
        The DataFrame produced by ``pd.read_csv`` or ``pd.read_excel``.

    Raises:
        ValueError: If the extension is neither ``.csv`` nor ``.xlsx``.
    """
    _, file_extension = os.path.splitext(file_path)
    # Normalise the case: some exports in this pipeline use upper-case
    # extensions (e.g. ``*_CBC.CSV``) and were previously rejected.
    file_extension = file_extension.lower()
    if file_extension == '.xlsx':
        return pd.read_excel(file_path)
    if file_extension == '.csv':
        return pd.read_csv(file_path)
    raise ValueError("Unsupported file type. Provide a .csv or .xlsx file.")

# Data preprocessing function
def preprocess_data(DATE):
    """Assemble the inference table for one data drop.

    Reads the patient information, CBC, drug sensitivity and flow cytometry
    files located under ``data/<DATE>/``, left-joins them onto the patient
    information table by ``ID``, renames the columns listed in the rename
    tables below, and adds the derived columns.

    Args:
        DATE: String used both as the sub-folder name under ``data/`` and as
            the file-name prefix of the per-drop files.

    Returns:
        The merged DataFrame with the renamed columns, the recoded
        ``IsNaive`` column, the ``Small Pop.  (% of lymph)_class`` column and
        six ``auc_*`` columns filled with NaN.
    """
    folder = f'data/{DATE}/'
    prefix = folder + DATE

    def _naive_flag(value):
        # 'Y' -> 1, 'N' -> 0, anything else is passed through untouched.
        if value == 'Y':
            return 1
        if value == 'N':
            return 0
        return value

    def _small_pop_class(value):
        # 1 when the value is at least 49, otherwise 0 (NaN compares False -> 0).
        return 1 if value >= 49 else 0

    # Patient information
    patient_info = pd.read_csv(prefix + '_PatientInformation.csv')

    # Blood test (CBC): drop rows without a patient ID, then reshape so that
    # each patient ID is one row and each TestLongName is one column.
    cbc_long = pd.read_csv(prefix + '_CBC.CSV')
    cbc_long = cbc_long.dropna(subset='AnalyzerEnteredPatientID')
    cbc_wide = cbc_long.pivot(
        index="AnalyzerEnteredPatientID",
        columns="TestLongName",
        values="TestValue",
    ).reset_index()
    cbc_wide.columns = cbc_wide.columns.get_level_values(0)
    cbc_wide = cbc_wide.rename(columns={'AnalyzerEnteredPatientID': 'ID'})

    # Drug sensitivity
    drug_sensitivity = load_data(prefix + '_sample_modeling.xlsx')
    drug_sensitivity = drug_sensitivity.rename(columns={'Patient': 'ID'})

    # Flow cytometry master file; header=1 takes the second row as the header.
    flow_cytometry = pd.read_excel(
        folder + 'Flow Cytometry Data_Master file_KR_V2.xlsx', header=1
    )
    flow_cytometry = flow_cytometry.rename(columns={'Sample ID': 'ID'})

    # Left-join every source onto the patient information table by ID.
    merged = patient_info
    for right_table in (cbc_wide, drug_sensitivity, flow_cytometry):
        merged = merged.merge(right_table, on='ID', how='left')

    # Column renames, grouped by source.
    cbc_names = {
        'WBC': 'WBC_1', 'RBC': 'RBC_1', 'HGB': 'HGB_1', 'HCT': 'HCT_1',
        'MCV': 'MCV_1', 'MCH': 'MCH_1', 'PLT': 'PLT_1', 'NEU': 'Neut_1',
        'LYM': 'Lymp_1', 'MONO': 'Mono_1', 'EOS': 'Eos_1', 'BAS': 'Baso_1',
    }
    ic50_names = {
        'LAsparaginase_bfIC50': 'ic50_lasp',
        'Mitoxantrone_bfIC50': 'ic50_mito',
        'Vincristine_bfIC50': 'ic50_vinc',
        'Vinblastine_bfIC50': 'ic50_vinb',
        'Doxorubicin_bfIC50': 'ic50_doxo',
        'Actinomycin_bfIC50': 'ic50_acti',
        'Tanovea_bfIC50': 'ic50_tano',
        'Chlorambucil_bfIC50': 'ic50_chlo',
        'Mechlorethamine_bfIC50': 'ic50_mech',
        'Lomustine_bfIC50': 'ic50_lomu',
        'Prednisolone_bfIC50': 'ic50_pred',
        'Mafosfamide_bfIC50': 'ic50_mafo',
        'Melphalan_bfIC50': 'ic50_melp',
        'Dexamethasone_bfIC50': 'ic50_dexa',
    }
    emax_names = {
        'LAsparaginase_maxper': 'emax_lasp',
        'Mitoxantrone_maxper': 'emax_mito',
        'Vincristine_maxper': 'emax_vinc',
        'Vinblastine_maxper': 'emax_vinb',
        'Doxorubicin_maxper': 'emax_doxo',
        # NOTE(review): every other emax key ends in '_maxper'; confirm that
        # '_maxtox' is the real source column name and not a typo.
        'Actinomycin_maxtox': 'emax_acti',
        'Tanovea_maxper': 'emax_tano',
        'Chlorambucil_maxper': 'emax_chlo',
        'Mechlorethamine_maxper': 'emax_mech',
        'Lomustine_maxper': 'emax_lomu',
        'Prednisolone_maxper': 'emax_pred',
        'Mafosfamide_maxper': 'emax_mafo',
        'Melphalan_maxper': 'emax_melp',
        'Dexamethasone_maxper': 'emax_dexa',
    }
    merged = merged.rename(columns={**cbc_names, **ic50_names, **emax_names})

    # Recode the IsNaive feature.
    merged['IsNaive'] = merged['IsNaive'].apply(_naive_flag)

    # Binary class derived from 'Small Pop.  (% of lymph)'.
    merged['Small Pop.  (% of lymph)_class'] = (
        merged['Small Pop.  (% of lymph)'].apply(_small_pop_class)
    )

    # AUC features: no values are available here, so they are filled with NaN.
    # Per the original author's note, these columns still need to be added to
    # the rename tables separately; the values do exist in the US DS data.
    for auc_column in ('auc_doxo', 'auc_mito', 'auc_tano',
                       'auc_mafo', 'auc_melp', 'auc_vinc'):
        merged[auc_column] = np.nan

    return merged

# Build the merged inference table for the data drop selected by DATE.
test = preprocess_data(DATE)

In [39]:
# Load the B-cell PFS dataset.
bcell_df = pd.read_csv('data/CanineLymphoma_Bcell_PFS_data.csv')

# Keep only the columns listed below; '2ndline_OR', 'PFS2days', 'PFS2state'
# and '2ndline' are intentionally left out.
# Column names are reproduced exactly as they appear in the data, including
# irregular spacing and the embedded newline in 'Lymphocytes \n(% of Total)'.
filter_bcell_df = bcell_df[['ID', 'PFSdays', 'PFSstate', 
       'Age', 'IsNaive', 'WBC_1', 'Neut_1',
       'Lymp_1', 'Mono_1', 'Eos_1', 'Baso_1', 'RBC_1', 'HGB_1', 'HCT_1',
       'MCV_1', 'MCH_1', 'PLT_1', 'ic50_doxo', 'emax_doxo',
       'auc_doxo', 'ic50_mito', 'emax_mito', 'auc_mito', 'ic50_tano',
       'emax_tano', 'auc_tano', 'ic50_mafo', 'emax_mafo', 'auc_mafo',
       'ic50_melp', 'emax_melp', 'auc_melp', 'ic50_vinc', 'emax_vinc',
       'auc_vinc', 'Lymphocytes \n(% of Total)',
       'Small Pop. (%  of total)', 'Large Pop. (% of total)',
       'Small Pop.  (% of lymph)', 'Large Pop. (% of lymph)', 'All CD21+',
       'All MHC+', 'CD21+MHC-', 'CD21+MHC+', 'CD21-MHC+', 'All CD3+',
       'All CD4+', 'All CD8+', 'CD3+CD4+', 'CD3+CD8+', 'All CD5+', 'CD5-CD45+',
       'CD5+CD45+', 'CD5+CD45-', 'CD34+', 'CD14+', 'FSC, Median', 'FSC, Mean',
       'FSC, Std. Dev.', 'SSC, Median', 'SSC, Mean', 'SSC, Std. Dev.','Small Pop.  (% of lymph)_class',
       'CD3, Mean.1', 'PFS3mo']]

# Identifier and outcome columns, kept separately from the features.
target = filter_bcell_df[['ID', 'PFSdays', 'PFSstate', 'PFS3mo']]

# Remove the identifier and outcome columns so that only features remain.
filter_bcell_df = filter_bcell_df.drop(columns=['ID', 'PFSdays', 'PFSstate', 'PFS3mo'])

# Ordered feature names; later used to select the model input columns.
feature_list = list(filter_bcell_df.columns)

In [40]:
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, confusion_matrix
from supervised.automl import AutoML

# Load the previously saved test-set predictions.
save_result = pd.read_csv('data/Bcell_chop_test_result_20250312.csv')

# Initialise the container for results.
# NOTE(review): `results` is not used again in the visible cells.
results = []
data = filter_bcell_df

# Split the data into train and test sets (stratify keeps the PFS3mo class ratio).
X_train_full, X_test, y_train_full, y_test = train_test_split(
    data, target, test_size=0.2, random_state=33, stratify=target['PFS3mo']
)

# Final training data and labels.
# NOTE(review): `train` / `train_label` are not used again in the visible cells.
train = X_train_full
train_label = y_train_full

# Load the previously trained AutoML model from its results directory.
automl = AutoML(results_path='20250304_rs_all_new_33')

# Predict the class for the test split.
y_test['class_predict'] = automl.predict(X_test)

# Predict probabilities for the test split; column index 1 is kept as the score.
result = automl.predict_proba(X_test)
y_test['predict'] = result[:,1]

# Merge the saved predictions (save_predict) into y_test by ID
# (to compare the stored values with the current model's predictions).
y_test = pd.merge(y_test, save_result[['ID','save_predict']], on='ID', how='left')

# Round to 7 decimal places
# (removes the small differences introduced by the CSV save/load round trip).
y_test['predict'] = y_test['predict'].round(7)
y_test['save_predict'] = y_test['save_predict'].round(7)

# Check whether the two sets of predictions are exactly equal after rounding.
are_identical = y_test['predict'].equals(y_test['save_predict'])

# Print the final QC result.
print(f"Test result match: {are_identical}")

Test result match: True










57_RandomForest_GoldenFeatures auc 0.658436 trained in 21.96 seconds
58_CatBoost auc 0.62259 trained in 13.7 seconds
59_CatBoost auc 0.594462 trained in 13.07 seconds
60_CatBoost auc 0.649641 trained in 13.28 seconds
61_Xgboost auc 0.601205 trained in 7.22 seconds
62_Xgboost auc 0.585077 trained in 7.57 seconds
63_CatBoost auc 0.662821 trained in 15.98 seconds
64_Xgboost auc 0.588128 trained in 7.18 seconds
65_Xgboost auc 0.605667 trained in 7.24 seconds
66_NeuralNetwork_SelectedFeatures auc 0.571692 trained in 4.32 seconds
67_NeuralNetwork_SelectedFeatures auc 0.55641 trained in 5.35 seconds
68_ExtraTrees auc 0.572615 trained in 22.76 seconds
69_ExtraTrees auc 0.603128 trained in 21.17 seconds
70_NeuralNetwork auc 0.518051 trained in 8.15 seconds
71_NeuralNetwork auc 0.500359 trained in 9.35 seconds
72_NeuralNetwork auc 0.484256 trained in 8.32 seconds
73_NeuralNetwork auc 0.477795 trained in 8.78 seconds
* Step hill_climbing_2 will try to check up to 20 models
74_ExtraTrees_SelectedF











57_RandomForest_GoldenFeatures_Stacked auc 0.758949 trained in 26.73 seconds
84_Xgboost_SelectedFeatures_Stacked auc 0.703 trained in 10.57 seconds
18_CatBoost_Stacked auc 0.695487 trained in 22.11 seconds
45_NeuralNetwork_Stacked auc 0.502744 trained in 12.14 seconds
88_RandomForest_Stacked auc 0.703846 trained in 26.03 seconds
2_Default_Xgboost_Stacked auc 0.671231 trained in 11.12 seconds
16_CatBoost_Stacked auc 0.678308 trained in 17.87 seconds
44_NeuralNetwork_Stacked auc 0.550308 trained in 11.51 seconds
* Step ensemble_stacked will try to check up to 1 model
Ensemble_Stacked auc 0.789333 trained in 20.65 seconds
AutoML fit time: 2092.4 seconds
AutoML best model: Ensemble_Stacked










{'random_state': 9, 'valid_roc_auc': 0.7893333333333333, 'test_accuracy': 0.7682926829268293, 'test_roc_auc': 0.7290322580645161, 'test_sensitivity': 0.4, 'test_specificity': 0.8870967741935484, 'test_ppv': 0.5333333333333333, 'test_npv': 0.8208955223880597, 'threshold': 0.31227331445014833}
   random_state  valid_roc_auc  test_accuracy  test_roc_auc  test_sensitivity  \
0             0       0.741282       0.756098      0.637097              0.05   
1             1       0.774564       0.756098      0.616129              0.20   
2             2       0.795795       0.695122      0.533065              0.20   
3             3       0.806103       0.682927      0.606452              0.30   
4             4       0.748308       0.756098      0.619355              0.15   
5             5       0.776000       0.707317      0.546774              0.25   
6             6       0.780308       0.731707      0.562097              0.30   
7             7       0.766359       0.756098      0.679839

In [41]:
# Predetermined probability cutoff for the risk groups:
# high risk when the predicted probability is >= threshold, low risk otherwise.
threshold = 0.3389855457837697

# Class labels as returned by the AutoML model itself. Kept so the variable
# remains available for comparison; the risk group below is derived from the
# documented threshold instead.
result_test = automl.predict(test[feature_list])

# Class probabilities for the new patients, computed once and reused below
# (previously predict/predict_proba were each run twice on the same input).
result_proba = automl.predict_proba(test[feature_list])

# Store the probability from column index 1 as the risk score.
test['predict'] = result_proba[:, 1]

# Assign the risk group by applying the predetermined threshold to the score,
# so the label follows the documented rule rather than the model's own cutoff.
test['class_predict'] = test['predict'].apply(
    lambda proba: 'high_risk' if proba >= threshold else 'low_risk'
)

# Show only ID, Type, predicted probability and risk group.
test[['ID', 'Type', 'predict', 'class_predict']]

Unnamed: 0,ID,Type,predict,class_predict
0,C175KR,B,0.533933,high_risk


In [42]:
# ID	Type	predict	class_predict
# 0	C175KR	B	0.533933	high_risk