Skip to content

Latest commit

 

History

History
234 lines (192 loc) · 9.15 KB

NIH_notebook.org

File metadata and controls

234 lines (192 loc) · 9.15 KB

Processing the NIH chest x-ray dataset

Data preparation

Cleaning the data

We need to separate the diagnosis variable into multiple binary variables, one for each pathology, using the trick of https://www.kaggle.com/sbernadac/lung-deseases-data-analysis:

import pandas as pd
import numpy as np
# Load only the metadata columns used downstream from the NIH label file.
labels = pd.read_csv("Data_Entry_2017.csv",
                     usecols=['Image Index', 'Finding Labels',
                              'Follow-up #', 'Patient ID',
                              'Patient Age', 'Patient Gender'])
# Give the patient columns (group, name) tuple keys so that the final
# MultiIndex.from_tuples call can build a two-level column index.
labels.rename(columns={"Follow-up #": ("Patient Data", "Follow-up #"),
                       "Patient ID": ("Patient Data", "Patient ID"),
                       "Patient Age": ("Patient Data", "Patient Age"),
                       "Patient Gender": ("Patient Data", "Patient Gender")},
              inplace=True)
pathology_list = ['Cardiomegaly', 'Emphysema', 'Effusion', 'Hernia', 'Nodule',
                  'Pneumothorax', 'Atelectasis', 'Pleural_Thickening', 'Mass',
                  'Edema', 'Consolidation', 'Infiltration', 'Fibrosis', 'Pneumonia']
findings = labels['Finding Labels']
# One 0/1 indicator column per pathology: 1 when the pathology name occurs
# as a substring of the row's 'Finding Labels' string.
for pathology in pathology_list:
    labels[("Pathology", pathology)] = (
        findings.str.contains(pathology, regex=False).astype(int))
# "Abnormal" does not depend on the loop variable, so it is computed once
# here instead of on every iteration: 0 when the label string contains
# "No Finding", 1 otherwise.
labels[("Pathology", "Abnormal")] = (
    ~findings.str.contains("No Finding", regex=False)).astype(int)
# Index the rows by image file name, then drop the raw label string so that
# every remaining column key is a (group, name) tuple.
labels.set_index("Image Index", inplace=True)
labels.drop(columns=["Finding Labels"], inplace=True)
labels.columns = pd.MultiIndex.from_tuples(labels.columns)

Combining the labels with TDA data

We need to merge the labels with the TDA data output. I mistakenly didn’t distinguish the labels of the dim0 and dim1 persistence stats, so I will correct that here:

tda_data = pd.read_hdf("tda_data.h5")
# Keep only the label rows for images that have TDA output, in the same
# row order as tda_data.
tda_labels = labels.loc[tda_data.index.values, :]
# Column layout of tda_data by position: the first 512 columns are the Betti
# numbers, the next 17 are the dim-0 statistics, the rest the dim-1 statistics.
betti_nos = tda_data.iloc[:, 0:512]
# The dim-0 and dim-1 statistics were stored under the same column names, so
# prefix them here. rename() is applied without inplace=True so that each
# result is a new frame rather than an in-place edit of an iloc slice.
dim0_stats = tda_data.iloc[:, 512:529].rename(
    columns={"n_life": "dim0_norm_life", "midlife": "dim0_midlife",
             "entropy": "dim0_entropy"})
dim1_stats = tda_data.iloc[:, 529:].rename(
    columns={"n_life": "dim1_norm_life", "midlife": "dim1_midlife",
             "entropy": "dim1_entropy"})
# Side-by-side merge; all four pieces share tda_data's row index.
df = pd.concat([tda_labels, betti_nos, dim0_stats, dim1_stats], axis=1)

Splitting into test and training data

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
# Binary target: 1 for any finding, 0 for "No Finding".
Y = df["Pathology","Abnormal"]
X = df[["dim0_norm_life", "dim1_norm_life", "dim0_midlife", 
        "dim1_midlife","dim0_entropy","dim1_entropy"]]
# normalize() returns the scaled data; in-place operation with copy=False is
# not guaranteed for DataFrame input, so the result is assigned back explicitly.
# With its default arguments normalize() rescales each row (sample) to unit
# L2 norm. The index and column labels are kept so later concat calls align.
X = pd.DataFrame(normalize(X), index=X.index, columns=X.columns)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.2, random_state=42)

B = df[["b0","b1"]]
B = pd.DataFrame(normalize(B), index=B.index, columns=B.columns)
# Same random_state and the same number of rows as the split above, so the
# rows land in the same train/test partition and Y_train / Y_test are
# re-bound to the same values.
B_train, B_test, Y_train, Y_test = train_test_split(B, Y, test_size=0.2, random_state=42)

# Fraction of abnormal x-rays in the sample.
Y.sum()/len(Y)

So 44% of the x-rays are abnormal in this sample.

Analysis of TDA data

Logistic regression

from sklearn.linear_model import LogisticRegression
# Fit a logistic regression with default settings on the training features.
logistic = LogisticRegression()
logistic.fit(X_train, Y_train)
# Hard class predictions for both splits. The AUC for this model is computed
# in the "Checking results" step from logistic.decision_function, after
# roc_auc_score has been imported. The two lines that previously stood here
# scored linear_svc, which is not defined until the SVM section.
predict_train = logistic.predict(X_train)
predict_test = logistic.predict(X_test)

Checking results:

from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
# Signed decision scores on the test set; the AUC is computed from these
# rather than from the hard 0/1 predictions.
dec_test_logistic = logistic.decision_function(X_test)
print(
    "AUC score is "
    + str(roc_auc_score(Y_test, dec_test_logistic))
)
# The training-set report is left disabled:
#print(classification_report(Y_train,predict_train))
print(
    "Test set:\n"
    + classification_report(Y_test, predict_test)
)

Support Vector Machines

Linear

from sklearn import svm
# Linear SVM with default settings on the persistence statistics;
# fit() returns the estimator itself, so the call can be chained.
linear_svc = svm.LinearSVC().fit(X_train, Y_train)
# Decision scores feed the AUC; hard predictions feed the report.
dec_test_lsvc = linear_svc.decision_function(X_test)
pred_train_lsvc = linear_svc.predict(X_train)
pred_test_lsvc = linear_svc.predict(X_test)
print(
    "AUC score is "
    + str(roc_auc_score(Y_test, dec_test_lsvc))
)
# The training-set report is left disabled:
#print("Training set:\n" + classification_report(Y_train,pred_train_lsvc))
print(
    "Test set:\n"
    + classification_report(Y_test, pred_test_lsvc)
)

Let’s look at the Betti curve data:

# Same linear SVM, trained on the Betti-curve columns instead of the
# persistence statistics.
linear_svc_B = svm.LinearSVC().fit(B_train, Y_train)
dec_Btest_lsvc = linear_svc_B.decision_function(B_test)
pred_Btrain_lsvc = linear_svc_B.predict(B_train)
pred_Btest_lsvc = linear_svc_B.predict(B_test)
print(
    "AUC score is "
    + str(roc_auc_score(Y_test, dec_Btest_lsvc))
)
# The training-set report is left disabled:
#print("Training set:\n" + classification_report(Y_train,pred_Btrain_lsvc))
print(
    "Test set:\n"
    + classification_report(Y_test, pred_Btest_lsvc)
)

The Betti curve data gives 58% accuracy and an AUC of 60%.

# Combine both feature sets column-wise; the X_* and B_* frames are aligned
# on their shared row index.
all_train = pd.concat([X_train, B_train], axis=1)
all_test = pd.concat([X_test, B_test], axis=1)
linear_svc_all = svm.LinearSVC().fit(all_train, Y_train)
dec_alltest_lsvc = linear_svc_all.decision_function(all_test)
pred_alltrain_lsvc = linear_svc_all.predict(all_train)
pred_alltest_lsvc = linear_svc_all.predict(all_test)
print(
    "AUC score is "
    + str(roc_auc_score(Y_test, dec_alltest_lsvc))
)
# The training-set report is left disabled:
#print(classification_report(Y_train,pred_alltrain_lsvc))
print(
    "Test set:\n"
    + classification_report(Y_test, pred_alltest_lsvc)
)

AUC decreases with the inclusion of the Betti curve data.

Gaussian kernel

# SVM with a Gaussian (RBF) kernel on the persistence statistics;
# fit() returns the estimator itself, so the call can be chained.
gauss_svc = svm.SVC(kernel="rbf").fit(X_train, Y_train)
pred_train_gsvc = gauss_svc.predict(X_train)
pred_test_gsvc = gauss_svc.predict(X_test)
# The training-set report is left disabled:
#print(classification_report(Y_train,pred_train_gsvc))
print(
    "Test set:\n"
    + classification_report(Y_test, pred_test_gsvc)
)

This needs some tweaking: the model is predicting only zeros.