## Setup

In [1]:
import numpy
# import matplotlib.pyplot as plt
# import seaborn as sns
import pandas
# %matplotlib inline
# plt.rcParams['figure.figsize'] = (16.0, 4.0)
# sns.set_style("whitegrid")
random_state = 7
numpy.random.seed(random_state)
from lib.cox_helpers import initialize_cox_store
import datetime

## Load Data

In [2]:
data = pandas.read_csv('data.csv', index_col=[0])
data.head()

Unnamed: 0,AGE,SEX_F,SEX_M,CURADM_DAYS,OUTCOME_H,OUTCOME_N,OUTCOME_I,OUTCOME_D,CURRICU_FLAG,PREVADM_NO,PREVADM_DAYS,PREVICU_DAYS,READMISSION_30_DAYS
0,62.0,1,0,1,0,1,0,0,0,0,0,0,1
1,24.0,0,1,2,0,0,1,0,0,0,0,0,0
2,77.0,0,1,2,0,0,1,0,0,2,2,0,0
3,68.0,0,1,7,0,0,1,0,0,2,2,0,0
4,83.0,0,1,2,0,0,1,0,0,1,1,0,0


In [3]:
from sklearn import model_selection
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score, matthews_corrcoef, accuracy_score
from sklearn.metrics import classification_report

In [4]:
zero_indices = data[data['READMISSION_30_DAYS'] == 0].index

sample_size_to_remove = sum(data['READMISSION_30_DAYS'] == 0) - sum(data['READMISSION_30_DAYS'] == 1)
random_indices = numpy.random.choice(zero_indices, sample_size_to_remove, replace=False)
data = data.drop(random_indices)
print(len(data))
readmission_count = data.groupby('READMISSION_30_DAYS').size().sort_values(ascending=False)
print(readmission_count)

44384
READMISSION_30_DAYS
1    22192
0    22192
dtype: int64


In [5]:
dataset = data.values
numpy.random.shuffle(dataset)
# split into input (X) and output (Y) variables
X = dataset[:,0:12].astype(float)
Y = dataset[:,12]
# X,Y



In [6]:
test_size = 0.1
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size=test_size, random_state=7)

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
scaler = scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
from math import floor
def partition(vector, fold, k):

    size = vector.shape[0]

    start = floor((size/k)*fold)

    end = floor((size/k)*(fold+1))

    validation = vector[start:end]
    
    #print(str(type(vector)))

    if str(type(vector)) == "<class 'scipy.sparse.csr.csr_matrix'>":

        indices = range(start, end)

        mask = numpy.ones(vector.shape[0], dtype=bool)

        mask[indices] = False

        training = vector[mask]

    elif str(type(vector)) == "<class 'numpy.ndarray'>":
        
        training = numpy.concatenate((vector[:start], vector[end:]))

    return training, validation



def Cross_Validation(learner, k, examples, labels):

    train_folds_score = []

    validation_folds_score = []
    
    test_score_auc = []
    
    test_score_mcc = []

    for fold in range(0, k):

        training_set, validation_set = partition(examples, fold, k)

        training_labels, validation_labels = partition(labels, fold, k)

        learner.fit(training_set, training_labels)

        training_predicted = learner.predict(training_set)

        validation_predicted = learner.predict(validation_set)

        # print(training_predicted, validation_predicted)
        
        test_predicted = learner.predict(X_test)

        train_folds_score.append(roc_auc_score(training_labels, training_predicted))

        # print(training_labels, training_predicted)
        # print(numpy.sum(training_labels), numpy.sum(training_predicted))
        # print(classification_report(training_labels, training_predicted))

        validation_folds_score.append(roc_auc_score(validation_labels, validation_predicted))
        
        test_score_auc.append(roc_auc_score(Y_test, test_predicted))
        
        test_score_mcc.append(matthews_corrcoef(Y_test, test_predicted))

    return train_folds_score, validation_folds_score, test_score_auc, test_score_mcc

In [8]:
def run(model, features, labels) :
    
    cox_store = initialize_cox_store()
    cox_store['experiments'].update_row({
                    'k': 1,
                    'random_state': random_state,
                    'start_time': datetime.datetime.now().strftime('%Y%m%d%H%M%S'),
                    'classifier': model.__str__().split("(")[0],
                    'classifier_full': model.__str__()
                })
    train_scores, validation_scores, test_scores_auc, test_scores_mcc = Cross_Validation(model, 10, features, labels)
    #print(train_scores, validation_scores, test_scores)
    print(model)
    print('Train AUC', float(format(numpy.mean(train_scores), '.3f')))
    print('Validation AUC',float(format(numpy.mean(validation_scores), '.3f')))
    print('Test AUC',float(format(numpy.mean(test_scores_auc), '.3f')))
    print('Test MCC',float(format(numpy.mean(test_scores_mcc), '.3f')))
    print()
    cox_store['experiments'].update_row({
                    'Train AUC': float(format(numpy.mean(train_scores), '.3f')),
                    'Validation AUC': float(format(numpy.mean(validation_scores), '.3f')),
                    'Test AUC': float(format(numpy.mean(test_scores_auc), '.3f')),
                    'Test MCC': float(format(numpy.mean(test_scores_mcc), '.3f'))
    })
                                      
    cox_store['experiments'].flush_row()
    cox_store.close()
    

In [9]:
%%time
models = [LogisticRegression(solver='liblinear'), KNeighborsClassifier(), GaussianNB(), SVC(gamma='auto')] #LogisticRegression(solver='liblinear')
for model in models:
    run(model, X_test, Y_test)

Logging in: C:\code\python\anonymity_evaluation\cox\2f47921a-aeba-4d71-b498-b9a6710bceb8
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)
Train AUC 0.723
Validation AUC 0.723
Test AUC 0.723
Test MCC 0.456

Logging in: C:\code\python\anonymity_evaluation\cox\b99829a3-8ccd-406a-83d5-f46be0ca1683
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
Train AUC 0.793
Validation AUC 0.712
Test AUC 0.785
Test MCC 0.57

Logging in: C:\code\python\anonymity_evaluation\cox\25bcbfe1-fd38-4834-92ca-106f297c7547
GaussianNB(priors=None, var_smoothing=1e-09)
Train AUC 0.708
Validation AUC 0.708

In [10]:
from cox.readers import CollectionReader

reader = CollectionReader('cox')
a = reader.df('experiments')

print(a.to_string())
a.to_excel('experimentalResults.xlsx')

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 38.48it/s]

   k  random_state  Train AUC  Validation AUC  Test AUC  Test MCC      start_time            classifier                                    classifier_full                                exp_id
0  1             7      0.711           0.711     0.711     0.437  20200213155611                   SVC  SVC(C=1.0, cache_size=200, class_weight=None, ...  23239e4f-d5f2-4716-973f-1baff69caf15
0  1             7      0.708           0.708     0.708     0.431  20200213155611            GaussianNB       GaussianNB(priors=None, var_smoothing=1e-09)  25bcbfe1-fd38-4834-92ca-106f297c7547
0  1             7      0.723           0.723     0.723     0.456  20200213155604    LogisticRegression  LogisticRegression(C=1.0, class_weight=None, d...  2f47921a-aeba-4d71-b498-b9a6710bceb8
0  1             7      0.793           0.712     0.785     0.570  20200213155605  KNeighborsClassifier  KNeighborsClassifier(algorithm='auto', leaf_si...  b99829a3-8ccd-406a-83d5-f46be0ca1683





