In [144]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pyperclip
import os
import re
import sys
import config
import importlib
import itertools
from datetime import datetime
sys.path.insert(1, '/Users/yifu/PycharmProjects/Radiotherapy-Prediction')
from utils.printers import print_with_color, bcolors
import objects.VarReader
import objects.Evaluator
import objects.Initializer
import objects.Data
import objects.DataProcessor
import objects.Predictor
import objects.FeatureSelector
import objects.InclusionCriteria
import objects.Experiment
import objects.SubsetColumns

importlib.reload(objects.VarReader)
importlib.reload(objects.Evaluator)
importlib.reload(objects.Initializer)
importlib.reload(objects.Data)
importlib.reload(objects.DataProcessor)
importlib.reload(objects.Predictor)
importlib.reload(objects.FeatureSelector)
importlib.reload(objects.InclusionCriteria)
importlib.reload(objects.Experiment) 
importlib.reload(objects.SubsetColumns)

<module 'objects.SubsetColumns' from '/Users/yifu/PycharmProjects/Radiotherapy-Prediction/objects/SubsetColumns.py'>

In [145]:
processed_df_path = "/Users/yifu/PycharmProjects/Radiotherapy-Prediction/data/input/input-data-with-nomogram-probs/nomogram_results_2022-09-08.csv"
metadata_path = "/Users/yifu/PycharmProjects/Radiotherapy-Prediction/data/input/metadata/Metadata.xlsx"

RTx_EN_nonzero_12cols = [
    'PRE_his_subtype___dcis', 'PRE_susp_LN_prsnt_composite',
    'PRE_her_status', 'PRE_dximg___ultrasound',
    'PRE_sln_met_nomogram_prob', 'PRE_pre_op_biopsy',
    'PRE_surg_indicat_prim___primary_tx',
    'PRE_int_mammary_lymphade_pet', 'PRE_his_subtype___idc',
    'PRE_metastatic_carcinoma_on_ax', 'PRE_dximg___mammography',
    'PRE_lymphovascular_invasion0'
]

RTx_LLasso_nonzero_8cols = [
    'PRE_his_subtype___dcis', 'PRE_susp_LN_prsnt_composite',
    'PRE_sln_met_nomogram_prob', 'PRE_dximg___ultrasound',
    'PRE_pre_op_biopsy', 'PRE_surg_indicat_prim___primary_tx',
    'PRE_her_status', 'PRE_his_subtype___idc'
]

results_base_dir = "/Users/yifu/PycharmProjects/Radiotherapy-Prediction/data/output/test-results"
results_dir = os.path.join(results_base_dir, datetime.now().strftime("%Y-%m-%d_%H%M%S"))
if results_dir not in os.listdir(results_base_dir):
    os.mkdir(results_dir)
    print(f"Created results directory: {results_dir}")

Created results directory: /Users/yifu/PycharmProjects/Radiotherapy-Prediction/data/output/test-results/2022-11-10_144017


In [146]:
Initializer = objects.Initializer.Initializer(
    metadata_path,
    raw_df_path=None,
    results_dir=results_dir,
    processed_df_path=processed_df_path,
    DPI=80,
    models_to_show=["elastic_net", "logistic_reg", "logistic_lasso", "random_forest"]
)


Data = objects.Data.Data()
VarReader = objects.VarReader.VarReader()
Evaluator = objects.Evaluator.Evaluator()
DataProcessor = objects.DataProcessor.DataProcessor()
Predictor = objects.Predictor.Predictor()

In [147]:
target_col = "POS_did_the_patient_receive_pm"

EN_df = pd.read_csv(processed_df_path)[RTx_EN_nonzero_12cols + [target_col]]
Lasso_df = pd.read_csv(processed_df_path)[RTx_LLasso_nonzero_8cols + [target_col]]
All_df = pd.read_csv(processed_df_path)

EN_df = EN_df.apply(pd.to_numeric, errors='coerce')
Lasso_df = Lasso_df.apply(pd.to_numeric, errors='coerce')
All_df = All_df.apply(pd.to_numeric, errors='coerce')
All_df = All_df.dropna(axis=1, how="all")
EN_df = EN_df.dropna(subset=[target_col])
Lasso_df = Lasso_df.dropna(subset=[target_col])
All_df = All_df.dropna(subset=[target_col])

for df, name in zip([EN_df, Lasso_df, All_df], ["EN", "Lasso", "All"]):
    print(f"Shape of {name} df:", df.shape)

Shape of EN df: (786, 13)
Shape of Lasso df: (786, 9)
Shape of All df: (786, 132)


In [148]:
# Impute missing values using sklearn's IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

def preprocess_values(df):
    # Do not use target column as a feature
    X = df.drop(columns=[target_col])
    y = df[target_col]
    # Standardize all columns except target column to have mean 0 and std 1
    # X = X.dropna(axis=1, how="all")
    imputer = KNNImputer(n_neighbors=5)
    X_imputed = imputer.fit_transform(X)
    df_imputed = pd.DataFrame(X_imputed, columns=X.columns)
    df_imputed[target_col] = y
    return df_imputed

EN_df_imputed = preprocess_values(EN_df)
Lasso_df_imputed = preprocess_values(Lasso_df)
All_df_imputed = preprocess_values(All_df)
# Drop rows with NaN in target column
EN_df_imputed = EN_df_imputed.dropna(subset=[target_col])
Lasso_df_imputed = Lasso_df_imputed.dropna(subset=[target_col])
All_df_imputed = All_df_imputed.dropna(subset=[target_col])

In [149]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

df = All_df

# Impute missing values in df
imputer = KNNImputer(n_neighbors=5)
X = df.drop(columns=[target_col])
X_imputed = imputer.fit_transform(X)
y = df[target_col]
df = pd.DataFrame(X_imputed, columns=X.columns)
df[target_col] = y
# Drop rows with NaN in target
df = df.dropna(subset=[target_col])
X = df.drop(columns=[target_col])
y = df[target_col]
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)
# Predict target column using random forest classifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_pred))

Shape of X: (772, 131)
Shape of y: (772,)
Accuracy: 0.5077720207253886
Precision: 0.37888198757763975
Recall: 0.40397350993377484
F1: 0.391025641025641
ROC AUC: 0.48922079752007896


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [150]:
# Use Random Forest to predict target column
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LogisticRegressionCV
from sklearn.exceptions import ConvergenceWarning
import warnings 

def evaluate_model(y_pred, y_test):
    auc = roc_auc_score(y_test, y_pred)
    # cm = confusion_matrix(y_test, y_pred)
    return auc

with warnings.catch_warnings():
    LR = LogisticRegressionCV(cv=5, max_iter=10000, n_jobs=-1, solver="saga", random_state=1)
    RF = RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=1)

    for df, df_name in zip([EN_df_imputed, Lasso_df_imputed, All_df_imputed], ["EN", "Lasso", "All"]):
        for model, name in zip([LR, RF], ["LR", "RF"]):
            X = df.drop(columns=[target_col])
            y = df[target_col]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            auc = evaluate_model(y_pred, y_test)
            print(f"DF: {df_name} | Model: {name} | AUC: {auc}")

DF: EN | Model: LR | AUC: 0.5
DF: EN | Model: RF | AUC: 0.4995833333333333
DF: Lasso | Model: LR | AUC: 0.5
DF: Lasso | Model: RF | AUC: 0.49249999999999994
