In [187]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pyperclip
import os
import re
import sys
import config
import importlib
import itertools
from datetime import datetime
sys.path.insert(1, '/Users/yifu/PycharmProjects/Radiotherapy-Prediction')
from utils.printers import print_with_color, bcolors
import objects.VarReader
import objects.Evaluator
import objects.Initializer
import objects.Data
import objects.DataProcessor
import objects.Predictor
import objects.FeatureSelector
import objects.InclusionCriteria
import objects.Experiment
import objects.SubsetColumns

importlib.reload(objects.VarReader)
importlib.reload(objects.Evaluator)
importlib.reload(objects.Initializer)
importlib.reload(objects.Data)
importlib.reload(objects.DataProcessor)
importlib.reload(objects.Predictor)
importlib.reload(objects.FeatureSelector)
importlib.reload(objects.InclusionCriteria)
importlib.reload(objects.Experiment) 
importlib.reload(objects.SubsetColumns)

<module 'objects.SubsetColumns' from '/Users/yifu/PycharmProjects/Radiotherapy-Prediction/objects/SubsetColumns.py'>

In [188]:
processed_df_path = "/Users/yifu/PycharmProjects/Radiotherapy-Prediction/data/input/input-data-with-nomogram-probs/nomogram_results_2022-09-08.csv"
metadata_path = "/Users/yifu/PycharmProjects/Radiotherapy-Prediction/data/input/metadata/Metadata.xlsx"

RTx_EN_nonzero_12cols = [
    'PRE_his_subtype___dcis', 'PRE_susp_LN_prsnt_composite',
    'PRE_her_status', 'PRE_dximg___ultrasound',
    'PRE_sln_met_nomogram_prob', 'PRE_pre_op_biopsy',
    'PRE_surg_indicat_prim___primary_tx',
    'PRE_int_mammary_lymphade_pet', 'PRE_his_subtype___idc',
    'PRE_metastatic_carcinoma_on_ax', 'PRE_dximg___mammography',
    'PRE_lymphovascular_invasion0'
]

RTx_LLasso_nonzero_8cols = [
    'PRE_his_subtype___dcis', 'PRE_susp_LN_prsnt_composite',
    'PRE_sln_met_nomogram_prob', 'PRE_dximg___ultrasound',
    'PRE_pre_op_biopsy', 'PRE_surg_indicat_prim___primary_tx',
    'PRE_her_status', 'PRE_his_subtype___idc'
] 

RTx_just_one_col = ['PRE_sln_met_nomogram_prob']

results_base_dir = "/Users/yifu/PycharmProjects/Radiotherapy-Prediction/data/output/test-results"
results_dir = os.path.join(results_base_dir, datetime.now().strftime("%Y-%m-%d_%H%M%S"))
if results_dir not in os.listdir(results_base_dir):
    os.mkdir(results_dir)
    print(f"Created results directory: {results_dir}")

Created results directory: /Users/yifu/PycharmProjects/Radiotherapy-Prediction/data/output/test-results/2022-11-10_145356


In [189]:
# Print the difference between the two sets of columns
print(f"Columns in RTx_EN_nonzero_12cols but not in RTx_LLasso_nonzero_8cols: {set(RTx_EN_nonzero_12cols) - set(RTx_LLasso_nonzero_8cols)}")
print(f"Columns in RTx_LLasso_nonzero_8cols but not in RTx_EN_nonzero_12cols: {set(RTx_LLasso_nonzero_8cols) - set(RTx_EN_nonzero_12cols)}")

Columns in RTx_EN_nonzero_12cols but not in RTx_LLasso_nonzero_8cols: {'PRE_lymphovascular_invasion0', 'PRE_dximg___mammography', 'PRE_metastatic_carcinoma_on_ax', 'PRE_int_mammary_lymphade_pet'}
Columns in RTx_LLasso_nonzero_8cols but not in RTx_EN_nonzero_12cols: set()


In [190]:
Initializer = objects.Initializer.Initializer(
    metadata_path,
    raw_df_path=None,
    results_dir=results_dir,
    processed_df_path=processed_df_path,
    DPI=80,
    models_to_show=["elastic_net", "logistic_reg", "logistic_lasso", "random_forest"]
)


Data = objects.Data.Data()
VarReader = objects.VarReader.VarReader()
Evaluator = objects.Evaluator.Evaluator()
DataProcessor = objects.DataProcessor.DataProcessor()
Predictor = objects.Predictor.Predictor()

In [191]:
target_col = "POS_did_the_patient_receive_pm"

EN_df = pd.read_csv(processed_df_path)[RTx_EN_nonzero_12cols + [target_col]]
Lasso_df = pd.read_csv(processed_df_path)[RTx_LLasso_nonzero_8cols + [target_col]]
All_df = pd.read_csv(processed_df_path)
JustOne_df = pd.read_csv(processed_df_path)[RTx_just_one_col + [target_col]]

EN_df = EN_df.apply(pd.to_numeric, errors='coerce')
Lasso_df = Lasso_df.apply(pd.to_numeric, errors='coerce')
All_df = All_df.apply(pd.to_numeric, errors='coerce')
JustOne_df = JustOne_df.apply(pd.to_numeric, errors='coerce')

All_df = All_df.dropna(axis=1, how="all")

EN_df = EN_df.dropna(subset=[target_col])
Lasso_df = Lasso_df.dropna(subset=[target_col])
All_df = All_df.dropna(subset=[target_col])
JustOne_df = JustOne_df.dropna(subset=[target_col])

df_list = [EN_df, Lasso_df, All_df, JustOne_df]
df_names = ["EN_df", "Lasso_df", "All_df", "JustOne_df"]

In [192]:
# Use machine learning to predict target column
for df, df_name in zip(df_list, df_names):
    print("-"*100)
    print(f"Predicting {target_col} using {df_name} df")
    # Print shape
    print(f"Shape of df: {df.shape}")
    target_col = "POS_did_the_patient_receive_pm"
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix

    X = df.drop(target_col, axis=1)
    y = df[target_col]

    # Standardize and impute X
    from sklearn.preprocessing import StandardScaler
    from sklearn.impute import KNNImputer
    X = KNNImputer(n_neighbors=5).fit_transform(X)
    X = StandardScaler().fit_transform(X)
    # Print shape
    print(f"Shape of X: {X.shape}")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Print the shapes
    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)
    print("y_train shape:", y_train.shape)
    print("y_test shape:", y_test.shape)
    # Logistic regression
    lr = LogisticRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    y_pred_proba = lr.predict_proba(X_test)[:, 1]

    # Random forest
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    y_pred_rf = rf.predict(X_test)
    y_pred_proba_rf = rf.predict_proba(X_test)[:, 1]

    # Print metrics
    print("Logistic regression")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    # print("Precision:", precision_score(y_test, y_pred))
    # print("Recall:", recall_score(y_test, y_pred))
    # print("F1:", f1_score(y_test, y_pred))
    print("ROC AUC:", roc_auc_score(y_test, y_pred_proba))
    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("Random forest")
    print("Accuracy:", accuracy_score(y_test, y_pred_rf))
    # print("Precision:", precision_score(y_test, y_pred_rf))
    # print("Recall:", recall_score(y_test, y_pred_rf))
    # print("F1:", f1_score(y_test, y_pred_rf))
    print("ROC AUC:", roc_auc_score(y_test, y_pred_proba_rf))
    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred_rf))

----------------------------------------------------------------------------------------------------
Predicting POS_did_the_patient_receive_pm using EN_df df
Shape of df: (786, 13)
Shape of X: (786, 12)
X_train shape: (628, 12)
X_test shape: (158, 12)
y_train shape: (628,)
y_test shape: (158,)
Logistic regression
Accuracy: 0.6772151898734177
ROC AUC: 0.7613095238095239
Confusion matrix:
[[68 30]
 [21 39]]
Random forest
Accuracy: 0.6518987341772152
ROC AUC: 0.6954931972789116
Confusion matrix:
[[63 35]
 [20 40]]
----------------------------------------------------------------------------------------------------
Predicting POS_did_the_patient_receive_pm using Lasso_df df
Shape of df: (786, 9)
Shape of X: (786, 8)
X_train shape: (628, 8)
X_test shape: (158, 8)
y_train shape: (628,)
y_test shape: (158,)
Logistic regression
Accuracy: 0.6708860759493671
ROC AUC: 0.7504251700680272
Confusion matrix:
[[69 29]
 [23 37]]
Random forest
Accuracy: 0.6645569620253164
ROC AUC: 0.7173469387755103
Conf