Skip to content

Latest commit



234 lines (192 loc) · 9.15 KB

File metadata and controls

234 lines (192 loc) · 9.15 KB

Processing the NIH chest x-ray dataset

Data preparation

Cleaning the data

We need to separate the diagnosis variable into multiple binary variables, one for each pathology, using the trick of testing whether each pathology name appears in the `Finding Labels` string:

import pandas as pd
import numpy as np

# Load only the columns needed for labelling plus basic patient metadata.
labels = pd.read_csv("Data_Entry_2017.csv",
                     usecols=['Image Index', 'Finding Labels',
                              'Follow-up #', 'Patient ID',
                              'Patient Age', 'Patient Gender'])

# Rename the patient columns to (group, name) tuples so that the columns can
# be converted to a two-level MultiIndex once the flat helper columns
# ("Image Index", "Finding Labels") have been dropped below.
labels.rename(columns={"Follow-up #": ("Patient Data", "Follow-up #"),
                       "Patient ID": ("Patient Data", "Patient ID"),
                       "Patient Age": ("Patient Data", "Patient Age"),
                       "Patient Gender": ("Patient Data", "Patient Gender")},
              inplace=True)

# NOTE(review): only the first five entries of this list were visible in the
# source; the rest is filled in from the 14 finding labels of the NIH
# ChestX-ray14 dataset -- confirm against the original notebook.
pathology_list = ['Cardiomegaly', 'Emphysema', 'Effusion', 'Hernia', 'Nodule',
                  'Pneumothorax', 'Atelectasis', 'Pleural_Thickening', 'Mass',
                  'Edema', 'Consolidation', 'Infiltration', 'Fibrosis',
                  'Pneumonia']

# One 0/1 indicator column per pathology: 1 when the pathology name occurs
# anywhere in the row's "Finding Labels" string.
for pathology in pathology_list:
    labels[("Pathology", pathology)] = labels['Finding Labels'].apply(
        lambda x: 1 if pathology in x else 0)

# "Abnormal" does not depend on the loop variable, so it is computed once
# here instead of on every iteration: 0 for "No Finding", 1 otherwise.
labels[("Pathology", "Abnormal")] = labels['Finding Labels'].apply(
    lambda x: 0 if "No Finding" in x else 1)

# Index rows by image name, drop the flat helper columns, and promote the
# remaining tuple column names to a proper MultiIndex.
labels.index = labels["Image Index"]
labels.drop(["Image Index", "Finding Labels"], inplace=True, axis=1)
labels.columns = pd.MultiIndex.from_tuples(labels.columns)

Combining the labels with TDA data

We need to merge the labels with the TDA data output. I mistakenly didn’t distinguish the labels of the dim0 and dim1 persistence stats, so I will correct that here:

# Load the TDA features and pick the label rows that share their index values
# (`labels` is indexed by "Image Index", so tda_data's index must hold the
# same image identifiers).
tda_data = pd.read_hdf("tda_data.h5")
tda_labels = labels.loc[tda_data.index.values, :]

# The feature table is split purely by column position.
# NOTE(review): the 512/529 boundaries are taken as given from the original;
# confirm they match the layout of tda_data.h5.
betti_nos = tda_data.iloc[:, 0:512]

# The dim0 and dim1 persistence statistics share the same column names, so
# each slice gets a dimension prefix. A non-mutating rename is used because
# renaming an iloc slice with inplace=True can raise SettingWithCopyWarning
# on pandas versions without copy-on-write.
dim0_stats = tda_data.iloc[:, 512:529].rename(
    columns={"n_life": "dim0_norm_life", "midlife": "dim0_midlife",
             "entropy": "dim0_entropy"})
dim1_stats = tda_data.iloc[:, 529:].rename(
    columns={"n_life": "dim1_norm_life", "midlife": "dim1_midlife",
             "entropy": "dim1_entropy"})

# Column-wise concatenation; rows are aligned on the shared index.
df = pd.concat([tda_labels, betti_nos, dim0_stats, dim1_stats], axis=1)

Splitting into test and training data

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

# Binary target: 1 for any finding, 0 for "No Finding".
Y = df["Pathology", "Abnormal"]

# Persistence summary statistics.
# NOTE(review): the column list was cut off after "dim0_midlife" in the
# source; the remaining names are taken from the dim0/dim1 renames applied
# when building df -- confirm this is the intended feature set.
X = df[["dim0_norm_life", "dim1_norm_life", "dim0_midlife",
        "dim1_midlife", "dim0_entropy", "dim1_entropy"]]

# Betti-number features.
B = df[["b0", "b1"]]

# normalize() returns the scaled array; with copy=False an in-place update of
# the input is only attempted, not guaranteed, so the result is assigned
# explicitly instead of being discarded.
# NOTE(review): by default normalize() rescales each row (sample) to unit L2
# norm -- confirm that per-sample rather than per-feature scaling is wanted.
X = pd.DataFrame(normalize(X), index=X.index, columns=X.columns)
B = pd.DataFrame(normalize(B), index=B.index, columns=B.columns)

# Split all three objects in a single call so X, B and Y are guaranteed to
# share the same train/test rows.
X_train, X_test, B_train, B_test, Y_train, Y_test = train_test_split(
    X, B, Y, test_size=0.2, random_state=42)


So 44% of the x-rays are abnormal in this sample.

Analysis of TDA data

Logistic regression

from sklearn.linear_model import LogisticRegression

# Fit a default logistic regression on the persistence statistics.
# NOTE(review): the fit call was garbled in the source; `.fit(X_train,
# Y_train)` is reconstructed from the surviving `, Y_train)` fragment and the
# X_train/X_test predictions below.
logistic = LogisticRegression().fit(X_train, Y_train)

# Hard class predictions for both splits. The AUC for this model is printed
# in the results check that follows, once roc_auc_score has been imported.
predict_train = logistic.predict(X_train)
predict_test = logistic.predict(X_test)

Checking results:

from sklearn.metrics import classification_report, roc_auc_score

# Signed distance of each test sample from the logistic decision boundary;
# used as the ranking score for the ROC AUC.
dec_test_logistic = logistic.decision_function(X_test)

logistic_auc = roc_auc_score(Y_test, dec_test_logistic)
logistic_report = classification_report(Y_test, predict_test)

print(f"AUC score is {logistic_auc!s}")
print(f"Test set:\n{logistic_report}")

Support Vector Machines


from sklearn import svm

# Linear SVM on the persistence statistics.
# NOTE(review): the fit call was garbled in the source; `.fit(X_train,
# Y_train)` is reconstructed from the surviving `,Y_train)` fragment and the
# X_train/X_test predictions below.
linear_svc = svm.LinearSVC().fit(X_train, Y_train)
pred_train_lsvc = linear_svc.predict(X_train)
pred_test_lsvc = linear_svc.predict(X_test)

# decision_function gives continuous scores, which roc_auc_score needs
# instead of the hard 0/1 predictions.
dec_test_lsvc = linear_svc.decision_function(X_test)
print("AUC score is " + str(roc_auc_score(Y_test, dec_test_lsvc)))
#print("Training set:\n" + classification_report(Y_train,pred_train_lsvc))
print("Test set:\n" + classification_report(Y_test, pred_test_lsvc))

Let’s look at the Betti curve data:

# Linear SVM on the Betti-number features only.
# NOTE(review): the fit call was garbled in the source; `.fit(B_train,
# Y_train)` is reconstructed from the surviving `,Y_train)` fragment and the
# B_train/B_test predictions below.
linear_svc_B = svm.LinearSVC().fit(B_train, Y_train)
pred_Btrain_lsvc = linear_svc_B.predict(B_train)
pred_Btest_lsvc = linear_svc_B.predict(B_test)

# Continuous scores for the ROC AUC.
dec_Btest_lsvc = linear_svc_B.decision_function(B_test)
print("AUC score is " + str(roc_auc_score(Y_test, dec_Btest_lsvc)))
#print("Training set:\n" + classification_report(Y_train,pred_Btrain_lsvc))
print("Test set:\n" + classification_report(Y_test, pred_Btest_lsvc))

The Betti curve data gives 58% accuracy and an AUC of 60%.

# Combine persistence statistics and Betti features side by side; both
# frames come from the same split, so their row indices line up.
all_train = pd.concat([X_train, B_train], axis=1)
all_test = pd.concat([X_test, B_test], axis=1)

# Linear SVM on the combined feature set.
# NOTE(review): the fit call was garbled in the source; `.fit(all_train,
# Y_train)` is reconstructed from the surviving `,Y_train)` fragment and the
# all_train/all_test predictions below.
linear_svc_all = svm.LinearSVC().fit(all_train, Y_train)
pred_alltrain_lsvc = linear_svc_all.predict(all_train)
pred_alltest_lsvc = linear_svc_all.predict(all_test)

# Continuous scores for the ROC AUC.
dec_alltest_lsvc = linear_svc_all.decision_function(all_test)
print("AUC score is " + str(roc_auc_score(Y_test, dec_alltest_lsvc)))
print("Test set:\n" + classification_report(Y_test, pred_alltest_lsvc))

The AUC decreases when the Betti curve data is included.

Gaussian kernel

# SVM with a Gaussian (RBF) kernel on the persistence statistics.
# NOTE(review): the fit call was garbled in the source; `.fit(X_train,
# Y_train)` is reconstructed from the surviving `,Y_train)` fragment and the
# X_train/X_test predictions below.
gauss_svc = svm.SVC(kernel="rbf").fit(X_train, Y_train)
pred_train_gsvc = gauss_svc.predict(X_train)
pred_test_gsvc = gauss_svc.predict(X_test)
print("Test set:\n" + classification_report(Y_test, pred_test_gsvc))

This needs some tweaking: the model is predicting only zeros.