In [None]:
import nibabel as nib
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import csv
import glob
import shutil
import seaborn as sns
import sklearn

In [None]:
from statsmodels.stats.multitest import fdrcorrection
from scipy.stats import pearsonr

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.feature_selection import RFE, SelectKBest, mutual_info_classif,f_classif
from sklearn.model_selection import (KFold, train_test_split, cross_validate, GridSearchCV, RepeatedStratifiedKFold,
                                     cross_val_score, GroupKFold, StratifiedGroupKFold, StratifiedKFold)
from sklearn.metrics import roc_auc_score, recall_score, make_scorer, f1_score, confusion_matrix, roc_curve, accuracy_score

from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC,LinearSVC
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis                              
from sklearn.tree import DecisionTreeClassifier
from feature_engine.selection import RecursiveFeatureAddition

## Data Loading and Preprocessing

In [None]:
# Locations of the pre-SRS radiomics feature tables, one folder per MRI sequence
root = '/Users/cxl037/PycharmProjects/pythonProject1/Example_Data/'

# Sequence folders
PreT1Bravo_path = os.path.join(root, 'PreT1Bravo_Immuno')
PreT1Vasc_path = os.path.join(root, 'PreT1Vasc_Immuno')
PreT2Flair_path = os.path.join(root, 'PreT2Flair_Immuno')

# Feature CSV inside each sequence folder
PreT1Bravo_batchfile = os.path.join(root, 'PreT1Bravo_Immuno', 'radiomics_features_re.csv')
PreT1Vasc_batchfile = os.path.join(root, 'PreT1Vasc_Immuno', 'radiomics_features_re.csv')
PreT2Flair_batchfile = os.path.join(root, 'PreT2Flair_Immuno', 'radiomics_features_re.csv')

In [None]:
# Load the T1Bravo feature table. Every column gets a 'T1_' prefix, except that
# the mask column is renamed back to plain 'Mask' because it is the merge key.
extractedFeatures = (
    pd.read_csv(PreT1Bravo_batchfile)
    .add_prefix('T1_')
    .rename(columns={'T1_Mask': 'Mask'})
)
# The first 7 characters of the mask name serve as the patient ID for later merging
extractedFeatures['PatientID'] = [mask[:7] for mask in extractedFeatures['Mask']]
extractedFeatures['PatientID'] = extractedFeatures['PatientID'].astype(str)
extractedFeatures

In [None]:
# Load the T1Vasc feature table; columns are prefixed with 'Vasc_' while the
# merge key keeps the plain name 'Mask'.
extractedFeatures2 = (
    pd.read_csv(PreT1Vasc_batchfile)
    .add_prefix('Vasc_')
    .rename(columns={'Vasc_Mask': 'Mask'})
)
extractedFeatures2

In [None]:
# Load the T2Flair feature table; columns are prefixed with 'T2_' while the
# merge key keeps the plain name 'Mask'.
extractedFeatures3 = (
    pd.read_csv(PreT2Flair_batchfile)
    .add_prefix('T2_')
    .rename(columns={'T2_Mask': 'Mask'})
)
extractedFeatures3

In [None]:
# Join the three sequences on the mask name. Left joins keep every T1Bravo row;
# rows without a Vasc/T2 match get NaN in those columns.
allFeatures = (
    extractedFeatures
    .merge(extractedFeatures2, on='Mask', how='left')
    .merge(extractedFeatures3, on='Mask', how='left')
)
allFeatures

In [None]:
# Get patient immunotherapy information
patientDetails = pd.read_excel('/Users/cxl037/PycharmProjects/pythonProject1/SRS_immune_list.xlsx')
# Build the merge key by left-padding the MRN with zeros to 7 characters, the
# same width as the mask-derived PatientID of the feature tables. zfill pads
# MRNs of any shorter length (the previous code only ever added one '0') and
# leaves MRNs that are already 7+ characters untouched.
# NOTE(review): assumes MRN is read as integers or strings; a float column
# (e.g. caused by blank cells) would stringify as '1234567.0' - confirm.
patientDetails['PatientID'] = patientDetails['MRN'].astype(str).str.zfill(7)

In [None]:
# Attach the per-patient immunotherapy information to every lesion row.
# A left join keeps lesions whose patient is missing from the spreadsheet.
allFeatures = allFeatures.merge(patientDetails, on='PatientID', how='left')
allFeatures

In [None]:
# Keep only the '<sequence>_original*' radiomic features plus the label and the
# patient ID; every other column (diagnostics, clinical details, ...) is removed.
remove_cols = [
    feature for feature in allFeatures.columns
    if not feature.startswith(("T2_original", "T1_original", "Vasc_original"))
    and feature not in ('Immunotherapy_prior_3_months', 'PatientID')
]
# Rows with any missing value are discarded, so only lesions with information
# for all sequences remain.
filteredFeatures = allFeatures.drop(columns=remove_cols).dropna()
# One subject id per remaining row (used as `groups` by RepeatedStratifiedGroupKFold)
allSubjects = filteredFeatures['PatientID'].tolist()
print(len(set(allSubjects)))
print(len(allSubjects))

In [None]:
rd = 1
# Feature matrix: everything except the label and the patient identifier
X = filteredFeatures.drop(columns=['Immunotherapy_prior_3_months', 'PatientID'])
# Target: the immunotherapy label
y = filteredFeatures['Immunotherapy_prior_3_months']
X

## Univariate Analysis

In [None]:
# Keep references to the untouched data
X_train_ori = X
y_train_ori = y
# Standardise every feature; the result is re-wrapped in a DataFrame that
# carries the original column names (its index is a fresh 0..n-1 range).
sc = StandardScaler()
X_norm = pd.DataFrame(sc.fit_transform(X), columns=X.columns)
X_norm

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Order features by highest univariate AUC: fit a one-feature logistic
# regression per column and score it on the same data it was fitted on.
lr = LogisticRegression()
AUCforUniLogit = {}
# DataFrame.items() replaces DataFrame.iteritems(), which was removed in pandas 2.0
for (featureName, featureData) in X_norm.items():
    featureColumn = featureData.values.reshape(-1, 1)  # sklearn expects a 2-D array
    lr = lr.fit(featureColumn, y)
    auc = roc_auc_score(y, lr.predict_proba(featureColumn)[:, 1])
    AUCforUniLogit[featureName] = auc
# Sort once (descending AUC) and derive the full ranking and the top 20 from it
AUCforUniLogitTop = dict(sorted(AUCforUniLogit.items(), key=lambda item: item[1], reverse=True))
AUCforUniLogitRank = list(AUCforUniLogitTop.keys())
AUCforUniLogitTop20 = dict(list(AUCforUniLogitTop.items())[:20])
print("{:<50} {:<50}".format('feature','auc value'))
for key, value in AUCforUniLogitTop20.items():
    print("{:<50} {:<50}".format(key, value))

## Removing Correlated Features

In [None]:
# Correlation heatmap of all normalised features, before any feature is removed
fig, ax = plt.subplots(figsize=(30, 30))
ax = sns.heatmap(X_norm.corr(), ax=ax)

In [None]:
from statsmodels.stats.multitest import fdrcorrection
from scipy.stats import pearsonr

def get_correlated_features(df, threshold=0.95, max_pvalue=0.05):
    """Find features that are strongly and significantly correlated with an earlier column.

    For every pair of columns (i, j) with i < j the Pearson correlation and its
    p-value are computed. The p-values of all pairs are corrected with the
    Benjamini-Hochberg procedure (``fdrcorrection``). A pair is flagged when
    ``abs(r) > threshold`` and the corrected p-value is below ``max_pvalue``;
    in each flagged pair the later column (j) is marked for removal.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per feature.
    threshold : float, default=0.95
        Absolute correlation above which two features count as redundant.
    max_pvalue : float, default=0.05
        Upper bound on the FDR-corrected p-value for a pair to be flagged.

    Returns
    -------
    dict
        ``'corr_matrix'``: (n, n) Pearson coefficients;
        ``'p_values_matrix'``: (n, n) uncorrected p-values;
        ``'p_values_corrected_matrix'``: (n, n) corrected p-values (zero diagonal);
        ``'correlated_feats'``: kept feature -> set of dropped features flagged with it;
        ``'feats_to_drop'``: names of the features to remove, in column order.
    """
    msk_cols = list(df.columns)
    n_feats = len(msk_cols)
    columns = [df.iloc[:, k].to_numpy() for k in range(n_feats)]

    # Pearson's r is symmetric, so each unordered pair (and the diagonal) is
    # evaluated once and mirrored instead of being computed for both (i, j) and (j, i).
    corr_matrix = np.zeros((n_feats, n_feats))
    pvalue_matrix = np.zeros((n_feats, n_feats))
    for i in range(n_feats):
        for j in range(i, n_feats):
            corrtest = pearsonr(columns[i], columns[j])
            corr_matrix[i, j] = corr_matrix[j, i] = corrtest[0]
            pvalue_matrix[i, j] = pvalue_matrix[j, i] = corrtest[1]

    # Strict upper-triangle indices, in the same row-major order as a nested
    # "for i / for j > i" loop.
    upper_i, upper_j = np.triu_indices(n_feats, k=1)

    # Corrected p-values to make sure that no false significant results occur.
    # Element [1] of the result holds the adjusted p-values.
    pvalues_corrected_matrix = np.zeros((n_feats, n_feats))
    if upper_i.size:  # a single-column frame has no pairs to correct
        p_values_corrected = fdrcorrection(
            pvalue_matrix[upper_i, upper_j], alpha=0.05, method='indep', is_sorted=False)[1]
        pvalues_corrected_matrix[upper_i, upper_j] = p_values_corrected
        pvalues_corrected_matrix[upper_j, upper_i] = p_values_corrected

    # to_drop_matrix[i, j] == 1 (i < j) marks a redundant pair
    to_drop_matrix = np.zeros((n_feats, n_feats))
    for i, j in zip(upper_i, upper_j):
        if not abs(corr_matrix[i, j]) > threshold:
            continue
        if pvalues_corrected_matrix[i, j] < max_pvalue:
            to_drop_matrix[i, j] = 1
        else:
            # Strongly correlated but not significant after correction: report, keep both
            print("%s * %s corr, pvalue, fdr: %f, %f, %f" % (
                msk_cols[i], msk_cols[j],
                np.round(corr_matrix[i, j], decimals=3),
                np.round(pvalue_matrix[i, j], decimals=3),
                np.round(pvalues_corrected_matrix[i, j], decimals=3)))

    # A column is dropped when it is flagged against at least one earlier column
    to_drop = [j for j in range(n_feats) if to_drop_matrix[:, j].any()]
    feats_to_drop = [msk_cols[j] for j in to_drop]

    # For every earlier partner of a dropped column, remember which columns it displaced
    correlated_feats = {feature: set() for feature in msk_cols}
    for j in to_drop:
        for i in np.flatnonzero(to_drop_matrix[:, j]):
            correlated_feats[msk_cols[i]].add(msk_cols[j])

    # Show how the kept features correlate with the dropped features:
    # only kept features with at least one displaced partner are reported.
    dropped = set(feats_to_drop)
    correlated_feats = {feat: partners for feat, partners in correlated_feats.items()
                        if feat not in dropped and len(partners) > 0}

    printDict = {'corr_matrix': corr_matrix,
                 'p_values_matrix': pvalue_matrix,
                 'p_values_corrected_matrix': pvalues_corrected_matrix,
                 'correlated_feats': correlated_feats,
                 'feats_to_drop': feats_to_drop
                }
    return printDict
    

# Run the correlation analysis on the feature matrix and unpack its results
result = get_correlated_features(X)

feats_to_drop = result['feats_to_drop']
correlated_feats = result['correlated_feats']
corr_pvalues_corrected = result['p_values_corrected_matrix']
corr_pvalues = result['p_values_matrix']
corr_matrix = np.abs(result['corr_matrix'])  # magnitudes only

# Summary: what gets removed, how many, and which kept features they duplicate
print('feats_to_drop', feats_to_drop)
print(len(feats_to_drop))
print('\n')
print('correlated_feats', correlated_feats)

In [None]:
# Remove the redundant features, then work on independent copies so that
# later cells cannot modify X_uncorr / y.
X_uncorr = X.drop(columns=feats_to_drop)
X_train = X_uncorr.copy()
display(X_train)
y_train = y.copy()
feature_names = X_train.columns.tolist()

In [None]:
# Correlation heatmap of the features that survived the correlation filter
fig, ax = plt.subplots(figsize=(30, 30))
ax = sns.heatmap(X_train.corr(), ax=ax)

## Defining Functions and Variables for ML Analysis

In [None]:
from sklearn.model_selection import learning_curve

def plot_learning_curve(
    estimator,
    title,
    X,
    y,
    scoring,
    groups=None,
    axes=None,
    ylim=None,
    cv=None,
    n_jobs=None,
    train_sizes=np.linspace(0.1, 1.0, 5),
):
    """
    Generate 3 plots: the test and training learning curve, the training
    samples vs fit times curve, the fit times vs score curve.

    Parameters
    ----------
    estimator : estimator instance
        An estimator instance implementing `fit` and `predict` methods which
        will be cloned for each validation.

    title : str
        Title for the chart.

    X : array-like of shape (n_samples, n_features)
        Training vector, where ``n_samples`` is the number of samples and
        ``n_features`` is the number of features.

    y : array-like of shape (n_samples) or (n_samples, n_features)
        Target relative to ``X`` for classification or regression;
        None for unsupervised learning.

    scoring : str or callable
        Scoring specification forwarded unchanged to
        :func:`sklearn.model_selection.learning_curve`.

    groups : array-like of shape (n_samples,), default=None
        Group labels forwarded unchanged to
        :func:`sklearn.model_selection.learning_curve` (and from there to the
        ``cv`` splitter).

    axes : array-like of shape (3,), default=None
        Axes to use for plotting the curves. When None, a new figure with
        three side-by-side axes is created.

    ylim : tuple of shape (2,), default=None
        Defines minimum and maximum y-values plotted, e.g. (ymin, ymax).
        Applied to the first (learning curve) axes only.

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

          - None, to use the default 5-fold cross-validation,
          - integer, to specify the number of folds.
          - :term:`CV splitter`,
          - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : int or None, default=None
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    train_sizes : array-like of shape (n_ticks,)
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the ``dtype`` is float, it is regarded
        as a fraction of the maximum size of the training set (that is
        determined by the selected validation method), i.e. it has to be within
        (0, 1]. Otherwise it is interpreted as absolute sizes of the training
        sets. Note that for classification the number of samples usually have
        to be big enough to contain at least one sample from each class.
        (default: np.linspace(0.1, 1.0, 5))

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module (``plt``), so the caller can continue
        to work with the current figure.
    """
    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")

    # The fifth return value (score times) is not used
    train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(
        estimator,
        X,
        y,
        groups=groups,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        scoring=scoring,
        return_times=True,
    )
    # Aggregate over the CV folds (axis 1): one mean/std per training-set size
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve (shaded band = mean +/- one standard deviation)
    axes[0].grid()
    axes[0].fill_between(
        train_sizes,
        train_scores_mean - train_scores_std,
        train_scores_mean + train_scores_std,
        alpha=0.1,
        color="r",
    )
    axes[0].fill_between(
        train_sizes,
        test_scores_mean - test_scores_std,
        test_scores_mean + test_scores_std,
        alpha=0.1,
        color="g",
    )
    axes[0].plot(
        train_sizes, train_scores_mean, "o-", color="r", label="Training score"
    )
    axes[0].plot(
        train_sizes, test_scores_mean, "o-", color="g", label="Cross-validation score"
    )
    axes[0].legend(loc="best")

    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, "o-")
    axes[1].fill_between(
        train_sizes,
        fit_times_mean - fit_times_std,
        fit_times_mean + fit_times_std,
        alpha=0.1,
    )
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    # Plot fit_time vs score
    # Points are ordered by mean fit time so the line runs left to right
    fit_time_argsort = fit_times_mean.argsort()
    fit_time_sorted = fit_times_mean[fit_time_argsort]
    test_scores_mean_sorted = test_scores_mean[fit_time_argsort]
    test_scores_std_sorted = test_scores_std[fit_time_argsort]
    axes[2].grid()
    axes[2].plot(fit_time_sorted, test_scores_mean_sorted, "o-")
    axes[2].fill_between(
        fit_time_sorted,
        test_scores_mean_sorted - test_scores_std_sorted,
        test_scores_mean_sorted + test_scores_std_sorted,
        alpha=0.1,
    )
    axes[2].set_xlabel("fit_times")
    axes[2].set_ylabel("Score")
    axes[2].set_title("Performance of the model")

    return plt

In [None]:
class RepeatedStratifiedGroupKFold():
    """Repeat ``StratifiedGroupKFold`` several times, rejecting degenerate shuffles.

    Each repetition draws a shuffled ``StratifiedGroupKFold`` partition. A
    partition is rejected, and redrawn with the next seed, when any of its
    train or test folds contains fewer than two classes, because such folds
    cause errors in AUC calculation.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds per repetition.
    n_repeats : int, default=10
        Number of accepted partitions to generate.
    random_state : int or None, default=None
        Base seed. Attempt number ``t`` (counted over all repetitions) uses
        ``random_state + t``. With None every attempt is shuffled without a
        fixed seed, so the splits are not reproducible.
    max_tries : int, default=1000
        Maximum number of consecutive rejected partitions within one
        repetition before giving up with a ValueError.
    """

    def __init__(self, n_splits=5, n_repeats=10, random_state=None, max_tries=1000):
        self.random_state = random_state
        self.n_repeats = n_repeats
        self.n_splits = n_splits
        self.max_tries = max_tries

    @staticmethod
    def _has_both_classes(labels, index):
        """Return True when ``labels[index]`` contains at least two distinct values."""
        return np.unique(labels[index]).size >= 2

    def split(self, X, y, groups=None):
        """Yield ``(train_index, test_index)`` for ``n_repeats`` accepted partitions.

        Raises
        ------
        ValueError
            If ``max_tries`` consecutive partitions of one repetition are rejected.
        """
        # Positional label array: works for pandas Series, lists and ndarrays
        labels = np.asarray(y)
        tries = 0
        for _ in range(self.n_repeats):
            folds = None
            rejected = 0
            while folds is None:
                if rejected >= self.max_tries:
                    raise ValueError(
                        "No partition with both classes in every train/test fold "
                        "found after %d attempts; reduce n_splits." % self.max_tries)
                # A None base seed cannot be offset; fall back to unseeded shuffling
                seed = None if self.random_state is None else self.random_state + tries
                tries = tries + 1
                cv = StratifiedGroupKFold(n_splits=self.n_splits, shuffle=True, random_state=seed)
                # Materialise the folds once so the partition that was validated
                # is exactly the one that is yielded (also when unseeded).
                candidate = list(cv.split(X, y, groups))
                if all(self._has_both_classes(labels, train_index)
                       and self._has_both_classes(labels, test_index)
                       for train_index, test_index in candidate):
                    folds = candidate
                else:
                    rejected += 1

            for train_index, test_index in folds:
                yield train_index, test_index

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the total number of (train, test) pairs: ``n_splits * n_repeats``."""
        return self.n_splits * self.n_repeats

In [None]:
# Demonstration of RepeatedStratifiedGroupKFold: print the class balance and the
# patient groups of every fold, then the mean and spread of the positive ratios.
cv = RepeatedStratifiedGroupKFold(n_splits=10, n_repeats=10, random_state=5)
print("ORIGINAL POSITIVE RATIO:", y.mean())
train_positive, test_positive = [], []
for fold, (train_idxs, test_idxs) in enumerate(cv.split(X, y, allSubjects)):
    train_ratio = y.iloc[train_idxs].mean()
    test_ratio = y.iloc[test_idxs].mean()
    train_positive.append(train_ratio)
    test_positive.append(test_ratio)
    print("Fold :", fold)
    print("TRAIN POSITIVE RATIO:", train_ratio)
    print("TEST POSITIVE RATIO :", test_ratio)
    print("TRAIN GROUPS        :", [allSubjects[i] for i in train_idxs])
    print("TEST GROUPS         :", [allSubjects[i] for i in test_idxs])
print("ALL TRAIN POSITIVE RATIO:", np.mean(train_positive))
print("STD TRAIN POSITIVE RATIO:", np.std(train_positive))
print("ALL TEST POSITIVE RATIO :", np.mean(test_positive))
print("STD TEST POSITIVE RATIO:", np.std(test_positive))

In [None]:
# Shared settings for all model evaluations below
N_SPLITS, N_REPEATS = 10, 10   # folds per repetition, number of repetitions
RD = 5                         # base random seed of the CV splitter
K = 10

# Cross-validation scheme used by every grid search / evaluation
cv = RepeatedStratifiedGroupKFold(n_splits=N_SPLITS, n_repeats=N_REPEATS, random_state=RD)

# Recall of the positive class is the sensitivity, recall of class 0 the specificity
sensitivity = make_scorer(recall_score, pos_label=1)
specificity = make_scorer(recall_score, pos_label=0)

scoring = dict(specificity=specificity, sensitivity=sensitivity, roc_auc='roc_auc')

## ML Analysis

### Feature Selection with Lasso

In [None]:
def Lasso(X_train):
    """Tune and cross-validate an L1-penalised logistic regression.

    A StandardScaler + LogisticRegression(l1, liblinear) pipeline is tuned over
    C and tol by grid search (ROC AUC). The best pipeline is then evaluated
    with ``cross_validate`` and the fold statistics are printed.

    Reads the module-level ``y_train``, ``allSubjects``, ``cv`` and ``scoring``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix whose rows line up with ``y_train`` / ``allSubjects``.

    Returns
    -------
    tuple
        ``(best_params, best_estimator, mean test ROC AUC)``.
    """
    search = GridSearchCV(
        Pipeline([
            ('scalar', StandardScaler()),
            ('clf', LogisticRegression(penalty='l1', solver='liblinear', max_iter=10000)),
        ]),
        {
            'clf__C': [0.001, 0.01, 0.1, 0.5, 1.0, 10, 100, 1000],
            'clf__tol': [0.00001, 0.0001, 0.0005, 0.001, 0.005, 0.01],
        },
        cv=cv,
        scoring='roc_auc',
    )
    search.fit(X_train, y_train, groups=allSubjects)

    # Re-evaluate the winning pipeline to obtain all metrics
    best_pipe = search.best_estimator_
    print('Best estimator: ', best_pipe)
    cv_scores = cross_validate(best_pipe, X_train, y_train, groups=allSubjects, cv=cv,
                               scoring=scoring, n_jobs=-1, return_train_score=True)

    print('Cross-Validation Evaluation Scores')
    report_rows = (
        ('Train AUC:', 'train_roc_auc'),
        ('Test AUC:', 'test_roc_auc'),
        ('Sensitivity:', 'test_sensitivity'),
        ('Specificity:', 'test_specificity'),
    )
    for label, key in report_rows:
        print(label, 'mean-', np.mean(cv_scores[key]), 'std dev-', np.std(cv_scores[key]))
    return search.best_params_, search.best_estimator_, np.mean(cv_scores['test_roc_auc'])

# Tune the lasso model on all uncorrelated features and plot its learning curves
lasso_best_param, lasso_best_model, lasso_scores = Lasso(X_train)

fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 15))
title = "Learning Curves (Lasso)"
plot_learning_curve(
    lasso_best_model,
    title,
    X_train,
    y_train,
    groups=allSubjects,
    scoring='roc_auc',
    axes=axes,
    ylim=(0.3, 1.01),
    cv=cv,
    n_jobs=4,
)
print('Best parameters', lasso_best_param)

In [None]:
# Importance score of each feature = magnitude of its coefficient in the tuned lasso model
coefficients = lasso_best_model.named_steps['clf'].coef_
importance = np.abs(coefficients).ravel()

In [None]:
def FeatureSelection(Model):
    """Pick the importance threshold whose feature subset gives the best score.

    Up to ten thresholds are taken at evenly spaced positions of the sorted
    positive importance values. For each threshold the features with
    ``importance >= threshold`` are selected; subsets with fewer than 3 or more
    than 30 features are skipped. ``Model`` is run on each remaining subset.

    Reads the module-level ``importance`` (one score per entry of
    ``feature_names``), ``feature_names`` and ``X``.

    Parameters
    ----------
    Model : callable
        Called as ``Model(X_subset)``; must return
        ``(params, fitted_model, score)``.

    Returns
    -------
    tuple
        ``(highest_score, best_threshold, best_model)``.

    Raises
    ------
    ValueError
        If no importance value is positive, or if no threshold produced a
        subset of 3-30 features with a score above 0.
    """
    # sort all the positive importance values to obtain different thresholds to filter the importance scores
    posImportance = sorted(i for i in importance if i > 0)
    if not posImportance:
        raise ValueError("No feature has a positive importance score; nothing to select.")
    importance_thresholds_index = np.round(np.linspace(0, len(posImportance) - 1, 10)).astype(int)
    # With fewer than 10 positive scores (or tied scores) several positions map
    # to the same threshold; evaluate each distinct threshold only once because
    # every evaluation is a full grid search.
    thresholds = list(dict.fromkeys(posImportance[idx] for idx in importance_thresholds_index))

    highest_score = 0
    best_threshold = None
    best_model = None
    # for each threshold, filter features lower than that threshold
    for th in thresholds:
        selected = np.array(feature_names)[importance >= th]
        print(len(selected))
        if (len(selected) > 30 or len(selected) < 3): # too many/too little features
            continue

        # Train model on selected features
        model_param, model, model_score = Model(X[selected])

        # Update highest score and best threshold
        if model_score > highest_score:
            highest_score = model_score
            best_threshold = th
            best_model = model

    if best_model is None:
        raise ValueError("No importance threshold yielded 3-30 features with a score above 0.")
    return (highest_score, best_threshold, best_model)

### Running Lasso, Ridge, and LinearSVC with Feature Selection

In [None]:
# Lasso: search the importance thresholds, then rebuild the winning feature subset
best_lasso_score, best_threshold, best_lasso_model = FeatureSelection(Lasso)
selected = np.array(feature_names)[importance >= best_threshold]
X_train = X[selected]

print('The threshold {} gives the highest AUC score {} with model\n {}'.format(best_threshold, best_lasso_score, best_lasso_model))
print('This threshold gives {} features which are {}'.format(len(selected), selected))

# Learning curves of the winning model on its selected features
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 15))
title = "Learning Curves (Lasso)"
plot_learning_curve(
    best_lasso_model,
    title,
    X_train,
    y_train,
    groups=allSubjects,
    scoring='roc_auc',
    axes=axes,
    ylim=(0.3, 1.01),
    cv=cv,
    n_jobs=4,
)

In [None]:
def Ridge(X_train):
    """Tune and cross-validate an L2-penalised logistic regression.

    A StandardScaler + LogisticRegression(l2) pipeline is tuned over C and tol
    by grid search (ROC AUC). The best pipeline is then evaluated with
    ``cross_validate`` and the fold statistics are printed.

    Reads the module-level ``y_train``, ``allSubjects``, ``cv`` and ``scoring``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix whose rows line up with ``y_train`` / ``allSubjects``.

    Returns
    -------
    tuple
        ``(best_params, best_estimator, mean test ROC AUC)``.
    """
    search = GridSearchCV(
        Pipeline([
            ('scalar', StandardScaler()),
            ('clf', LogisticRegression(penalty='l2')),
        ]),
        {
            'clf__C': [0.001, 0.01, .1, .25, .5, .75, 1.0, 5, 10, 50, 100],
            'clf__tol': [0.00001, 0.0001, 0.0005, 0.001, 0.005, 0.01],
        },
        cv=cv,
        scoring='roc_auc',
    )
    search.fit(X_train, y_train, groups=allSubjects)

    # Re-evaluate the winning pipeline to obtain all metrics
    best_pipe = search.best_estimator_
    print('Best estimator: ', best_pipe)
    cv_scores = cross_validate(best_pipe, X_train, y_train, groups=allSubjects, cv=cv,
                               scoring=scoring, n_jobs=-1, return_train_score=True)

    print('Cross-Validation Evaluation Scores')
    report_rows = (
        ('Train AUC:', 'train_roc_auc'),
        ('AUC:', 'test_roc_auc'),
        ('Sensitivity:', 'test_sensitivity'),
        ('Specificity:', 'test_specificity'),
    )
    for label, key in report_rows:
        print(label, 'mean-', np.mean(cv_scores[key]), 'std dev-', np.std(cv_scores[key]))
    return search.best_params_, search.best_estimator_, np.mean(cv_scores['test_roc_auc'])

In [None]:
# Ridge: search the importance thresholds, then rebuild the winning feature subset
best_ridge_score, best_threshold, best_ridge_model = FeatureSelection(Ridge)
selected = np.array(feature_names)[importance >= best_threshold]
X_train = X[selected]

print('The threshold {} gives the highest AUC score {} with model\n {}'.format(best_threshold, best_ridge_score, best_ridge_model))
print('This threshold gives {} features which are {}'.format(len(selected), selected))

# Learning curves of the winning model on its selected features
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 15))
title = "Learning Curves (Ridge)"
plot_learning_curve(
    best_ridge_model,
    title,
    X_train,
    y_train,
    groups=allSubjects,
    scoring='roc_auc',
    axes=axes,
    ylim=(0.3, 1.01),
    cv=cv,
    n_jobs=4,
)

In [None]:
def SVC_classifier(X_train):
    """Tune and cross-validate a linear support vector classifier.

    A StandardScaler + LinearSVC(dual=False) pipeline is tuned over C and tol
    by grid search (ROC AUC, all cores). The best pipeline is then evaluated
    with ``cross_validate`` and the fold statistics are printed.

    Reads the module-level ``y_train``, ``allSubjects``, ``cv`` and ``scoring``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix whose rows line up with ``y_train`` / ``allSubjects``.

    Returns
    -------
    tuple
        ``(best_params, best_estimator, mean test ROC AUC)``.
    """
    search = GridSearchCV(
        Pipeline([
            ('scalar', StandardScaler()),
            ('clf', LinearSVC(dual=False)),
        ]),
        param_grid={
            'clf__C': [0.001, 0.01, 0.1, 0.5, 1.0, 10, 100, 1000, 1e4, 1e5],
            'clf__tol': [0.00001, 0.0001, 0.0005, 0.001, 0.005, 0.01],
        },
        cv=cv,
        verbose=0,
        n_jobs=-1,
        scoring='roc_auc',
    )
    search.fit(X_train, y_train, groups=allSubjects)

    # Re-evaluate the winning pipeline to obtain all metrics
    best_pipe = search.best_estimator_
    print('Best estimator: ', best_pipe)
    cv_scores = cross_validate(best_pipe, X_train, y_train, groups=allSubjects, scoring=scoring,
                               cv=cv, n_jobs=-1, return_train_score=True)

    print('Cross-Validation Evaluation Scores')
    report_rows = (
        ('Train AUC:', 'train_roc_auc'),
        ('AUC:', 'test_roc_auc'),
        ('Sensitivity:', 'test_sensitivity'),
        ('Specificity:', 'test_specificity'),
    )
    for label, key in report_rows:
        print(label, 'mean-', np.mean(cv_scores[key]), 'std dev-', np.std(cv_scores[key]))

    return search.best_params_, search.best_estimator_, np.mean(cv_scores['test_roc_auc'])

In [None]:
# LinearSVC: search the importance thresholds, then rebuild the winning feature subset
best_svc_score, best_threshold, best_svc_model = FeatureSelection(SVC_classifier)
selected = np.array(feature_names)[importance >= best_threshold]
X_train = X[selected]

print('The threshold {} gives the highest AUC score {} with model\n {}'.format(best_threshold, best_svc_score, best_svc_model))
print('This threshold gives {} features which are {}'.format(len(selected), selected))

# Learning curves of the winning model on its selected features
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 15))
title = "Learning Curves (LinearSVC)"
plot_learning_curve(
    best_svc_model,
    title,
    X_train,
    y_train,
    groups=allSubjects,
    scoring='roc_auc',
    axes=axes,
    ylim=(0.3, 1.01),
    cv=cv,
    n_jobs=4,
)

## Cross analysis between Pre and Post3Mo SRS

### Post3Mo Selected Features on Pre-SRS Data

In [None]:
# Hard-coded feature list (per the section title, the features selected on the
# Post3Mo data), applied here to the pre-SRS feature matrix.
post_selected = [
    'T1_original_shape_Sphericity',
    'T1_original_shape_SurfaceVolumeRatio',
    'T1_original_glcm_Imc1',
    'T1_original_glcm_Idm',
    'T1_original_glcm_Idmn',
    'T1_original_glcm_InverseVariance',
    'T1_original_gldm_DependenceNonUniformityNormalized',
    'Vasc_original_firstorder_Maximum',
    'Vasc_original_glcm_ClusterProminence',
    'Vasc_original_glcm_Imc2',
    'Vasc_original_glrlm_LongRunLowGrayLevelEmphasis',
    'Vasc_original_glszm_GrayLevelNonUniformityNormalized',
    'Vasc_original_glszm_GrayLevelVariance',
    'Vasc_original_glszm_SmallAreaLowGrayLevelEmphasis',
    'Vasc_original_gldm_DependenceNonUniformityNormalized',
    'Vasc_original_gldm_SmallDependenceLowGrayLevelEmphasis',
    'Vasc_original_ngtdm_Complexity',
    'T2_original_firstorder_90Percentile',
    'T2_original_firstorder_InterquartileRange',
    'T2_original_firstorder_Kurtosis',
    'T2_original_firstorder_Maximum',
    'T2_original_firstorder_Skewness',
    'T2_original_firstorder_Variance',
    'T2_original_glcm_ClusterProminence',
    'T2_original_glcm_JointEnergy',
    'T2_original_glcm_Idm',
    'T2_original_glszm_SmallAreaLowGrayLevelEmphasis',
    'T2_original_gldm_SmallDependenceLowGrayLevelEmphasis',
    'T2_original_ngtdm_Strength',
]
# Pre-SRS data restricted to those features
X_sel = X[post_selected]

In [None]:
# Tune a LinearSVC on the pre-SRS data restricted to the post-selected features
svc_best_param, svc_best_model, svc_scores = SVC_classifier(X_sel)

fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 15))
title = "Learning Curves (LinearSVC)"
plot_learning_curve(
    svc_best_model,
    title,
    X_sel,
    y_train,
    groups=allSubjects,
    scoring='roc_auc',
    axes=axes,
    ylim=(0.3, 1.01),
    cv=cv,
    n_jobs=4,
)

### Pre-SRS Model on Post3Mo SRS Data

In [None]:
# Locations of the post-3-month radiomics feature tables, one folder per MRI sequence
root = '/Users/cxl037/PycharmProjects/pythonProject1/Example_Data/'

# Sequence folders
Post3MoT1Bravo_path = os.path.join(root, 'Post3MoT1Bravo_Immuno')
Post3MoT1Vasc_path = os.path.join(root, 'Post3MoT1Vasc_Immuno')
Post3MoT2Flair_path = os.path.join(root, 'Post3MoT2Flair_Immuno')

# Feature CSV inside each sequence folder
Post3MoT1Bravo_batchfile = os.path.join(root, 'Post3MoT1Bravo_Immuno', 'radiomics_features.csv')
Post3MoT1Vasc_batchfile = os.path.join(root, 'Post3MoT1Vasc_Immuno', 'radiomics_features.csv')
Post3MoT2Flair_batchfile = os.path.join(root, 'Post3MoT2Flair_Immuno', 'radiomics_features.csv')

In [None]:
# T1Bravo (post-3-month): prefix the columns with 'T1_', keep 'Mask' as the merge
# key and derive the PatientID from the first 7 characters of the mask name.
extractedFeatures_post = (
    pd.read_csv(Post3MoT1Bravo_batchfile)
    .add_prefix('T1_')
    .rename(columns={'T1_Mask': 'Mask'})
)
extractedFeatures_post['PatientID'] = [mask[:7] for mask in extractedFeatures_post['Mask']]
extractedFeatures_post['PatientID'] = extractedFeatures_post['PatientID'].astype(str)

# T1Vasc (post-3-month): 'Vasc_' prefix, 'Mask' kept as the merge key
extractedFeatures2_post = (
    pd.read_csv(Post3MoT1Vasc_batchfile)
    .add_prefix('Vasc_')
    .rename(columns={'Vasc_Mask': 'Mask'})
)

# T2Flair (post-3-month): 'T2_' prefix, 'Mask' kept as the merge key
extractedFeatures3_post = (
    pd.read_csv(Post3MoT2Flair_batchfile)
    .add_prefix('T2_')
    .rename(columns={'T2_Mask': 'Mask'})
)

# Attach the patient details first (by PatientID), then the other two sequences
# (by Mask). NOTE: this replaces the pre-SRS `allFeatures` defined earlier.
allFeatures = (
    extractedFeatures_post
    .merge(patientDetails, on='PatientID', how='left')
    .merge(extractedFeatures2_post, on='Mask', how='left')
    .merge(extractedFeatures3_post, on='Mask', how='left')
)

In [None]:
# Keep only the '<sequence>_original*' radiomic features plus the label and the
# patient ID, then discard rows with any missing value.
# NOTE: this replaces the pre-SRS `filteredFeatures` defined earlier.
remove_cols = [
    feature for feature in allFeatures.columns
    if not feature.startswith(("T2_original", "T1_original", "Vasc_original"))
    and feature not in ('Immunotherapy_prior_3_months', 'PatientID')
]
filteredFeatures = allFeatures.drop(columns=remove_cols).dropna()

In [None]:
# Post-3-month labels and feature matrix (label and patient ID columns removed)
y_post = filteredFeatures['Immunotherapy_prior_3_months']
X_post = filteredFeatures.drop(columns=['Immunotherapy_prior_3_months', 'PatientID'])

In [None]:
# Hard-coded feature list (per the section title, the features the pre-SRS model
# uses), applied here to the post-3-month feature matrix.
pre_selected = [
    'T1_original_shape_Sphericity',
    'T1_original_glcm_Correlation',
    'T1_original_glcm_Imc1',
    'T1_original_glszm_SizeZoneNonUniformityNormalized',
    'T1_original_gldm_DependenceVariance',
    'T1_original_gldm_LargeDependenceHighGrayLevelEmphasis',
    'Vasc_original_glcm_ClusterShade',
    'Vasc_original_glcm_Idmn',
    'Vasc_original_glszm_GrayLevelNonUniformityNormalized',
    'Vasc_original_gldm_DependenceNonUniformityNormalized',
    'T2_original_gldm_LargeDependenceHighGrayLevelEmphasis',
    'T2_original_gldm_SmallDependenceLowGrayLevelEmphasis',
]
X_sel_post = X_post[pre_selected]
X_sel_post

In [None]:
# ROC AUC needs a continuous score per sample. Hard 0/1 labels from predict()
# collapse the ROC curve to a single operating point, so the signed distance to
# the decision boundary is used instead.
# NOTE(review): assumes `pre_selected` matches the features best_svc_model was
# fitted on - confirm, otherwise scikit-learn rejects the input.
roc_auc_score(y_post, best_svc_model.decision_function(X_sel_post))

## Neural Network Model

Hyperparameter tuning with Multilayer Perceptron (MLP)

In [None]:
from keras.layers.core import Dense, Activation
from tensorflow.keras.optimizers import Adadelta, Adam
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from keras.metrics import AUC
from keras import regularizers
from keras.callbacks import EarlyStopping,CSVLogger
from scikeras.wrappers import KerasClassifier, KerasRegressor

In [None]:
# Pre-SRS feature matrix restricted to the pre-selected features (input of the MLP)
X_train = X.loc[:, pre_selected]
X_train

In [None]:
# Standardise the selected features; the result has positional column names
# (0..k-1) and a fresh 0..n-1 index.
sc = StandardScaler().fit(X_train)
X_scaled = pd.DataFrame(sc.transform(X_train))
y_train = y.copy()

In [None]:
# Hold out roughly 20% of the rows for validation. The split is done per patient
# and stratified on the label (first fold of a 5-fold StratifiedGroupKFold), so
# that rows of one patient never end up on both sides. A plain row-level
# train_test_split would ignore the patient grouping that every other
# evaluation in this notebook respects.
holdout_splitter = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
train_rows, test_rows = next(holdout_splitter.split(X_scaled, y, groups=allSubjects))
x_train, x_test = X_scaled.iloc[train_rows], X_scaled.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]
display(x_test)

In [None]:
# Network dimensions: one input per feature column, a single sigmoid output
input_dim = x_train.shape[1]
output_dim = 1
print ("input", input_dim, "output", output_dim)

# Hyper-parameters of the hand-built MLP
# Units per hidden layer
layer1, layer2, layer3 = 1, 2, 3
# Dropout rate after each hidden layer
dropout1 = 0.3
dropout2 = 0.5
dropout3 = 0.45801654819487025
batch_size = 16
optimizer = Adam(learning_rate=0.008)

In [None]:
def build_clf(layer1, dropout1, learning_rate):
    """Build and compile a one-hidden-layer MLP.

    Architecture: input (``input_dim`` features) -> Dense(layer1, relu) ->
    Dropout(dropout1) -> Dense(``output_dim``, sigmoid). ``input_dim`` and
    ``output_dim`` are read from module level.

    Parameters
    ----------
    layer1 : int
        Number of units in the hidden layer.
    dropout1 : float
        Dropout rate applied after the hidden layer.
    learning_rate : float
        Learning rate of the Adam optimizer.

    Returns
    -------
    keras Model
        Compiled with binary cross-entropy loss and an AUC metric.
    """
    inputs = Input(shape=(input_dim,))
    hidden = Dropout(dropout1)(Dense(layer1, activation='relu')(inputs))
    outputs = Dense(output_dim, activation='sigmoid')(hidden)

    network = Model(inputs, outputs)
    network.compile(
        loss=['binary_crossentropy'],
        metrics=[AUC()],
        optimizer=Adam(learning_rate=learning_rate),
    )
    return network

In [None]:
# scikit-learn compatible wrapper around build_clf. The keyword values are the
# defaults that the grid search overrides through its parameter grid.
model = KerasClassifier(
    model=build_clf,
    layer1=1,
    dropout1=0.1,
    learning_rate=0.001,
)

In [None]:
# List the parameter names the wrapped estimator exposes (the names usable in a parameter grid)
print(model.get_params().keys())

In [None]:
# Hyper-parameter grid for the MLP wrapper. The learning-rate list used to
# contain both 1e-3 and 0.001, i.e. the same value twice, which re-trained a
# quarter of the grid for nothing.
params = {'batch_size': [1, 2, 4, 8, 16, 32, 64],
          'dropout1': [0.1, 0.2, 0.3, 0.4, 0.5],
          'layer1': [1, 2, 3, 4, 5],
          'learning_rate': [1e-4, 1e-3, 1e-2]
          }
# Score with ROC AUC, like every other model in this notebook (the default for
# classifiers would be accuracy).
gs = GridSearchCV(estimator=model, param_grid=params, cv=cv, scoring='roc_auc', verbose=3)
# Fit on the full label vector `y`: `y_train` now only holds the training part
# of the hold-out split, so its length matches neither X_scaled nor allSubjects.
gs = gs.fit(X_scaled, y, groups=allSubjects)

In [None]:
# Best configuration found by the grid search
mlp_best_params = gs.best_params_
mlp_best_score = gs.best_score_
mlp_best_model = gs.best_estimator_
fig, axes = plt.subplots(3, 1, figsize=(10, 15))

title = "Learning Curves (MLP)"
# Use the data the MLP was tuned on: the scaled pre-selected features and the
# full label vector. (X_sel holds a different, unscaled feature set whose width
# does not match the network input, and y_train is only the training part of
# the hold-out split.)
plot_learning_curve(
    mlp_best_model, title, X_scaled, y, groups=allSubjects, scoring='roc_auc', axes=axes, ylim=(0.3, 1.01), cv=cv, n_jobs=4
)

In [None]:
#MLP
input_img = Input(shape=(input_dim,))
deep = Dense(layer1, activation='relu')(input_img)
deep = Dropout(dropout1)(deep)
# deep = Dense(layer2, activation='relu')(deep)
# deep = Dropout(dropout2)(deep)
#deep = Dense(layer3, activation='relu')(deep)
#deep = Dropout(dropout3)(deep)
outlayer = Dense(output_dim, activation='sigmoid')(deep)
model = Model(input_img, outlayer)

model.compile(loss=['binary_crossentropy'], metrics=[AUC()],#, 'sparse_categorical_accuracy'
                        optimizer= optimizer)
print(model.summary())
print(model.metrics_names)


In [None]:
# Stop when the validation loss has not improved for 200 epochs
es = EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=200)

# Train on the training part of the hold-out split and validate on its test
# part. The split variables are lower-case (x_train / x_test); `X_train` is the
# full unscaled frame and `X_test` is not defined anywhere.
history = model.fit(x_train, y_train,
                    epochs=2000,
                    batch_size=batch_size,
                    shuffle=True,
                    verbose=2,
                    validation_data=(x_test, y_test), callbacks=[es])

In [None]:
# plot loss during training
plt.subplot(211)
plt.title('Loss')
plt.plot(history.history['val_loss'], label='valid')
plt.plot(history.history['loss'], label='train')
plt.legend()
# plot accuracy during training
plt.subplot(212)
plt.title('Accuracy')
plt.plot(history.history['val_auc_33'], label='valid')
plt.plot(history.history['auc_33'], label='train')
plt.legend()
plt.show()

In [None]:
# Predicted probabilities for the held-out rows (the split variable is the
# lower-case x_test; `X_test` is not defined)
y_pred = model.predict(x_test)
print(y_pred)

In [None]:
# Post-3-month labels and feature matrix (label and patient ID columns removed)
y_post = filteredFeatures['Immunotherapy_prior_3_months']
X_post = filteredFeatures.drop(columns=['Immunotherapy_prior_3_months', 'PatientID'])

In [None]:
# Hard-coded feature list (per the section title, the features the pre-SRS model
# uses), applied here to the post-3-month feature matrix.
pre_selected = [
    'T1_original_shape_Sphericity',
    'T1_original_glcm_Correlation',
    'T1_original_glcm_Imc1',
    'T1_original_glszm_SizeZoneNonUniformityNormalized',
    'T1_original_gldm_DependenceVariance',
    'T1_original_gldm_LargeDependenceHighGrayLevelEmphasis',
    'Vasc_original_glcm_ClusterShade',
    'Vasc_original_glcm_Idmn',
    'Vasc_original_glszm_GrayLevelNonUniformityNormalized',
    'Vasc_original_gldm_DependenceNonUniformityNormalized',
    'T2_original_gldm_LargeDependenceHighGrayLevelEmphasis',
    'T2_original_gldm_SmallDependenceLowGrayLevelEmphasis',
]
X_sel_post = X_post[pre_selected]
X_sel_post

In [None]:
# ROC AUC needs a continuous score per sample. Hard 0/1 labels from predict()
# collapse the ROC curve to a single operating point, so the signed distance to
# the decision boundary is used instead.
# NOTE(review): assumes `pre_selected` matches the features best_svc_model was
# fitted on - confirm, otherwise scikit-learn rejects the input.
roc_auc_score(y_post, best_svc_model.decision_function(X_sel_post))