In [1]:
import numpy as np
import pandas as pd
import pickle
import glob
import matplotlib.pyplot as plt
from scipy.spatial import ConvexHull
import math
import cv2

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix

pi = math.pi

%matplotlib inline

In [2]:
def standardize_feature_matrix(X):
    min_X = X.min(axis=1)[:,None]
    max_X = X.max(axis=1)[:,None]
    
    return (X- min_X) / (max_X - min_X)

In [3]:
clinical_data = pd.read_csv('NSCLCR01Radiogenomic_DATA_LABELS_2018-05-22_1500-shifted.csv')
clinical_data = clinical_data.loc[clinical_data['Case ID']>='R01-001'] #Dropping AMC patients
clinical_data = clinical_data.loc[clinical_data['Case ID']<='R01-146'] #Dropping the rest of R01 patients (with no CT)

clinical_data = clinical_data[['Case ID', 'Pathological T stage',
                               'Pathological N stage', 'Pathological M stage']] #Dropping all irrelavant info

clinical_data.rename(columns = {'Case ID': 'ID',
                                'Pathological T stage': 'T',
                               'Pathological N stage' : 'N',
                               'Pathological M stage' : 'M'}, inplace = True)

# Dropping two patients who had no CT:
clinical_data = clinical_data[clinical_data.ID != 'R01-009']
clinical_data = clinical_data[clinical_data.ID != 'R01-143']

clinical_data['label'] = 0
clinical_data.loc[clinical_data.M != 'M0', 'label'] = 2
clinical_data.loc[(clinical_data.M != 'M0') | (clinical_data.N != 'N0'), 'label'] = 1

label0_count = clinical_data[(clinical_data.label == 0)].count()[0]
label1_count = clinical_data[(clinical_data.label == 1)].count()[0]
label2_count = clinical_data[(clinical_data.label == 2)].count()[0]
print('Class 0 count = %d patients. \nClass 1 count = %d patients. \nClass 2 count = %d patients'%
      (label0_count, label1_count, label2_count))
clinical_data.head(10)

Class 0 count = 112 patients. 
Class 1 count = 32 patients. 
Class 2 count = 0 patients


Unnamed: 0,ID,T,N,M,label
49,R01-001,T1a,N0,M0,0
50,R01-002,T1a,N0,M0,0
51,R01-003,T3,N0,M0,0
52,R01-004,T1b,N2,M0,1
53,R01-005,T2a,N0,M0,0
54,R01-006,T1b,N0,M0,0
55,R01-007,T1a,N1,M0,1
56,R01-008,Tis,N0,M0,0
58,R01-010,T3,N0,M0,0
59,R01-011,T1a,N2,M0,1


In [4]:
labels = clinical_data.iloc[:,-1].values
patients_names = clinical_data.iloc[:,0].values
del clinical_data
labels

array([0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1])

In [5]:
Statistcal_features = np.load(open('Statistical_no_surrouding', 'rb'))
Shape_size_features = np.load(open('Shape_size_features', 'rb'))
All_features = np.hstack((Statistcal_features, Shape_size_features))

print(Statistcal_features.shape, Shape_size_features.shape, All_features.shape)

(144, 14) (144, 8) (144, 22)


In [6]:
print(Statistcal_features[3])
print(Shape_size_features[3])
print(All_features[3])

print('\n')

print(Statistcal_features[50])
print(Shape_size_features[50])
print(All_features[50])

print('\n')

print(Statistcal_features[143])
print(Shape_size_features[143])
print(All_features[143])

[ 1.49402893e+09  1.80788842e+05  5.80062329e+02  1.39000000e+03
  9.49934670e+02 -1.17544920e+07  1.04500000e+03  7.50000000e+01
  1.31500000e+03  9.83048078e+02 -3.49964940e-01  3.10159332e+02
  1.53783688e+08  7.29123326e+06]
[8.82410318e+00 4.17916966e-01 5.68950109e+02 1.33752939e+00
 7.47647125e-01 7.80750084e+02 5.88742199e-01 1.32613236e+03]
[ 1.49402893e+09  1.80788842e+05  5.80062329e+02  1.39000000e+03
  9.49934670e+02 -1.17544920e+07  1.04500000e+03  7.50000000e+01
  1.31500000e+03  9.83048078e+02 -3.49964940e-01  3.10159332e+02
  1.53783688e+08  7.29123326e+06  8.82410318e+00  4.17916966e-01
  5.68950109e+02  1.33752939e+00  7.47647125e-01  7.80750084e+02
  5.88742199e-01  1.32613236e+03]


[ 6.47869133e+09  4.60459695e+05  9.18095849e+01  1.48600000e+03
  8.64472926e+02 -2.22117640e+07  1.00200000e+03  1.52000000e+02
  1.33400000e+03  9.01149048e+02 -5.39981893e-01  3.87883850e+02
  6.74854584e+08  2.47186848e+06]
[3.34023634e+01 6.54094797e-01 2.60451518e+03 1.15200165e+

In [7]:
k = 5

experiments = [Statistcal_features, Shape_size_features, All_features]
experiments_names = ['Statistcal_features', 'Shape_size_features', 'All_features']
results = open("Results.txt", 'w+')

results.write("Tumor without surronding environment\n")
for e, experiment in enumerate(experiments):
    results.write(experiments_names[e]+':\n')
    features_matrix = experiment
    if not np.array_equal(features_matrix[:,-1],labels): features_matrix = np.hstack((features_matrix, labels[:,None]))

    X0 = features_matrix[features_matrix[:,-1]==0]
    X1 = features_matrix[features_matrix[:,-1]==1]

    X0_folds = np.array_split(X0, k)
    X1_folds = np.array_split(X1, k)

    Avg_acc_MLP, Avg_fn = 0, 0
    Avg_acc_logistic_regression, Avg_acc_SVM = 0, 0
    Logistic_reg_fn, SVM_FN = 0, 0

    for i in range(k):
        print('Fold %d/%d is the validation fold:'%(i+1, k))

        X0_train = np.concatenate([X0_folds[j] for j in range(k) if j!=i])
        X1_train = np.concatenate([X1_folds[j] for j in range(k) if j!=i])

        X0_val = X0_folds[i]
        X1_val = X1_folds[i]

        x_train = np.concatenate((X0_train, X1_train), axis=0)
        x_val = np.concatenate((X0_val, X1_val), axis=0)

        np.random.shuffle(x_train)
        np.random.shuffle(x_val)

        y_train = x_train[:,-1]
        y_val = x_val[:,-1]

        x_train = x_train[:,:-1]
        x_val = x_val[:,:-1]

        scaler = StandardScaler()
        scaler.fit(x_train)
        x_train_mlp = scaler.transform(x_train)
        x_val_mlp = scaler.transform(x_val)


        clf = MLPClassifier(solver='lbfgs', alpha=1e-10, hidden_layer_sizes=(1000, 100, 100), random_state=1, max_iter=500)
        clf.fit(x_train_mlp, y_train)

        MLP_result = clf.score(x_val_mlp, y_val)*100

        print('MLP accuracy: ', MLP_result, '%')

        Avg_acc_MLP += MLP_result

        nm = np.asarray(np.where(y_val==1))
        nm = nm.reshape(nm.size)
        fn = (1-clf.score(x_train_mlp[nm], y_val[nm]))*100
        print('MLP FN: ', fn)
        Avg_fn += fn
        
        x_train = standardize_feature_matrix(x_train)
        x_val = standardize_feature_matrix(x_val)
        
        clf2 = LogisticRegression(class_weight= 'balanced', solver='liblinear')
        clf2.fit(x_train, y_train)

        Logistic_regression_result = clf2.score(x_val, y_val)*100
        log_reg_fn = (1-clf2.score(x_val[nm], y_val[nm]))*100

        print('Logistic regression accuracy: ', Logistic_regression_result, '%')
        print('Logistic regression False negative: ', log_reg_fn, '%')

        clf3 = SVC(kernel='linear', class_weight='balanced')
        clf3.fit(x_train, y_train)

        SVM_result = clf3.score(x_val, y_val)*100
        SVM_fn = (1-clf3.score(x_val[nm], y_val[nm]))*100

        print('SVM accuracy: ', SVM_result, '%')
        print('SVM False Negative: ', SVM_fn, '%\n\n\n')
        
        Avg_acc_logistic_regression += Logistic_regression_result
        Avg_acc_SVM += SVM_result

        Logistic_reg_fn += log_reg_fn
        SVM_FN += SVM_fn


    Avg_acc_MLP /= k
    Avg_fn /= k
    Avg_acc_logistic_regression /= k
    Avg_acc_SVM /= k
    Logistic_reg_fn /= k
    SVM_FN /= k

    results.write('\t\t| Average MLP accuracy:                 '+ str(Avg_acc_MLP)+ '% |\n')
    results.write('\t| Average false negative:               '+ str(Avg_fn)+ '% |\n')
    results.write('\t\t| Average logistic regression accuracy: '+ str(Avg_acc_logistic_regression)+ '% |\n')
    results.write('\t| Average logistic regression FN:       '+ str(Logistic_reg_fn)+ '% |\n')
    results.write('\t\t| Average SVM accuracy:                 '+ str(Avg_acc_SVM)+ '% |\n')
    results.write('\t| Average SVM FN:                       '+ str(SVM_FN)+ '% |\n')
    results.write('\t\t\t ------------------------------------------------------------\n\n\n')



results.close()

Fold 1/5 is the validation fold:
MLP accuracy:  80.0 %
MLP FN:  71.42857142857143
Logistic regression accuracy:  60.0 %
Logistic regression False negative:  71.42857142857143 %
SVM accuracy:  63.33333333333333 %
SVM False Negative:  57.14285714285714 %



Fold 2/5 is the validation fold:
MLP accuracy:  83.33333333333334 %
MLP FN:  85.71428571428572
Logistic regression accuracy:  76.66666666666667 %
Logistic regression False negative:  42.85714285714286 %
SVM accuracy:  73.33333333333333 %
SVM False Negative:  42.85714285714286 %



Fold 3/5 is the validation fold:
MLP accuracy:  67.85714285714286 %
MLP FN:  66.66666666666667
Logistic regression accuracy:  71.42857142857143 %
Logistic regression False negative:  33.333333333333336 %
SVM accuracy:  71.42857142857143 %
SVM False Negative:  33.333333333333336 %



Fold 4/5 is the validation fold:
MLP accuracy:  64.28571428571429 %
MLP FN:  66.66666666666667
Logistic regression accuracy:  53.57142857142857 %
Logistic regression False negati

In [13]:
print(Avg_acc_MLP, Avg_acc_logistic_regression, Avg_acc_SVM)
print(Avg_fn, Logistic_reg_fn, SVM_FN)

66.52380952380952 71.47619047619048 74.38095238095238
62.38095238095239 52.38095238095239 83.80952380952382
