In [1]:
import pandas as pd
import numpy as np
import os
# Load the re-stroke table from ../data (relative to the working directory).
csv_path = os.path.join('..', 'data', 'Restroke_SPSS.csv')
# Every cell equal to 999.0 becomes NaN -- presumably the export's
# missing-value code; TODO confirm it never occurs as a legitimate value.
raw_data = pd.read_csv(csv_path).replace({999.0: np.nan})

In [2]:
# Report the table size: one row per subject, one column per feature.
n_rows, n_cols = raw_data.shape
print('Subjects:{}, Features:{}'.format(n_rows, n_cols))

Subjects:7870, Features:62


In [3]:
# Preview the first five rows as a quick sanity check of the loaded columns.
raw_data.head(n=5)

Unnamed: 0,CHT_No,Sex,Age,HTN,DM,Dyslipidemia,AF,CVA_Hx,CT_Old_Lesion,Smoking,...,CT_0ew_Lesio0,Alcohol,Adm_AF_0otEKG,EKG_AF,BU0-Cr Ratio,BUN-Cr,Glucose,NIHSS_Dis,BI_Dis,MRS_Dis
0,20290.0,0.0,74.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,,,,,0.0,100.0,1
1,23614.0,0.0,87.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,16.14,0.0,94.0,1.0,100.0,1
2,24832.0,0.0,65.0,0.0,0.0,0.0,0.0,0.0,,0.0,...,,0.0,0.0,0.0,,,84.0,2.0,100.0,1
3,34018.0,0.0,75.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,19.63,0.0,,2.0,100.0,1
4,45874.0,0.0,84.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,28.57,1.0,133.0,31.0,0.0,5


In [4]:
# Descriptive statistics for the full (raw) cohort.
# Series.mean/std skip NaN; ddof=0 is the population standard deviation,
# which is what np.std computes.
age = raw_data['Age']
print('Age_mean: %.2f, Age_std: %.2f,' % (age.mean(), age.std(ddof=0)))

# The denominators count every row, including rows where the column is NaN.
n_subjects = len(raw_data)
male_protion = len(raw_data.loc[raw_data['Sex'] == 1]) / n_subjects * 100
print('Male:%.2f%%' % male_protion)
restroke_protion = len(raw_data.loc[raw_data['reStroke'] == 1]) / n_subjects * 100
print('reStroke:%.2f%%' % restroke_protion)

Age_mean: 67.75, Age_std: 12.74,
Male:62.48%
reStroke:8.58%


In [5]:
# Complete-case subset: drop every subject that has a NaN in any column.
tidy_data = raw_data.dropna(axis='index', how='any')
# To persist this subset: tidy_data.to_csv('tidy.csv', index=False)

# Same descriptive statistics as for the raw cohort, on the reduced table.
n_tidy, n_tidy_features = tidy_data.shape
print('Tidy_subjects:{}, Tidy_features:{}'.format(n_tidy, n_tidy_features))

tidy_age = tidy_data['Age']
# ddof=0 is the population standard deviation, which is what np.std computes.
print('Tidy_Age_mean: %.2f, Tidy_Age_std: %.2f' % (tidy_age.mean(), tidy_age.std(ddof=0)))

tidy_male_protion = len(tidy_data.loc[tidy_data['Sex'] == 1]) / n_tidy * 100
print('Tidy_Male:%.2f%%' % tidy_male_protion)
tidy_restroke_protion = len(tidy_data.loc[tidy_data['reStroke'] == 1]) / n_tidy * 100
print('Tidy_reStroke:%.2f%%' % tidy_restroke_protion)

Tidy_subjects:927, Tidy_features:62
Tidy_Age_mean: 67.44, Tidy_Age_std: 13.12
Tidy_Male:64.94%
Tidy_reStroke:7.77%


In [6]:
# Split the complete-case table into predictors and target.
# CHT_No is removed from the predictors along with the reStroke label itself.
target_column = 'reStroke'
X_data = tidy_data.drop(columns=['CHT_No', target_column])
# Kept as a one-column DataFrame (not a Series); later cells flatten it as needed.
y_data = tidy_data.loc[:, [target_column]]
print(X_data.shape, y_data.shape)

(927, 60) (927, 1)


In [7]:
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.metrics import auc, roc_curve
from sklearn import preprocessing
from imblearn import over_sampling

In [8]:
from sklearn.svm import SVC
# 10-fold cross-validated AUROC for an RBF-kernel SVM.
# Per fold: min-max scale (fitted on the training fold only), SMOTE-oversample
# the training fold only, fit, then score the untouched held-out fold.
svm_all_auroc = []
print('SVM--')
cv = KFold(n_splits=10, random_state=42, shuffle=True)
# NOTE(review): plain KFold does not stratify on reStroke, so the number of
# positives can differ between held-out folds -- consider StratifiedKFold.
for train_index, test_index in cv.split(X_data):
    X_train, X_test = X_data.iloc[train_index], X_data.iloc[test_index]
    y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]

    # scaling: statistics come from the training fold to avoid test leakage
    scaler = preprocessing.MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # over-sampling: synthetic minority samples are added to the training fold only
    sm = over_sampling.SMOTE(random_state=42)
    X_train, y_train = sm.fit_resample(X_train, y_train)

    # model: probability=True calibrates probabilities with an internal,
    # randomly shuffled cross-validation; random_state pins that shuffle so the
    # reported AUROCs are reproducible across runs.
    model = SVC(kernel='rbf', probability=True, random_state=42)
    model.fit(X_train, y_train.values.ravel())

    # Column 1 of predict_proba is the score for the positive class (reStroke == 1).
    y_pred = model.predict_proba(X_test)
    fpr, tpr, thresholds = roc_curve(y_test.values.ravel(), y_pred[:, 1])
    auroc = auc(fpr, tpr)
    print('auc', auroc)
    svm_all_auroc.append(auroc)
print('auc_mean: %.2f, auc_std: %.2f' %(np.mean(svm_all_auroc), np.std(svm_all_auroc)))

SVM--
auc 0.638235294117647
auc 0.5421455938697318
auc 0.5529411764705883
auc 0.5112359550561798
auc 0.606312292358804
auc 0.5199556541019956
auc 0.4985294117647059
auc 0.4117647058823529
auc 0.5647058823529412
auc 0.5406976744186047
auc_mean: 0.54, auc_std: 0.06


In [9]:
from sklearn.linear_model import LogisticRegression
# 10-fold cross-validated AUROC for L1-penalised logistic regression.
# Per fold: min-max scale (fitted on the training fold only), SMOTE-oversample
# the training fold only, fit, then score the untouched held-out fold.
lr_all_auroc = []
print('LogisticRegression--')
cv = KFold(n_splits=10, random_state=42, shuffle=True)
# NOTE(review): plain KFold does not stratify on reStroke, so the number of
# positives can differ between held-out folds -- consider StratifiedKFold.
for train_index, test_index in cv.split(X_data):
    X_train, X_test = X_data.iloc[train_index], X_data.iloc[test_index]
    y_train, y_test = y_data.iloc[train_index], y_data.iloc[test_index]

    # scaling: statistics come from the training fold to avoid test leakage
    scaler = preprocessing.MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # over-sampling: synthetic minority samples are added to the training fold only
    sm = over_sampling.SMOTE(random_state=42)
    X_train, y_train = sm.fit_resample(X_train, y_train)

    # model: the saga solver shuffles the data stochastically, so random_state
    # is fixed to make the reported AUROCs reproducible across runs.
    # NOTE(review): tol=0.1 is far looser than scikit-learn's default (1e-4);
    # confirm the early stopping is intentional.
    model = LogisticRegression(penalty='l1', solver='saga', tol=0.1, random_state=42)
    model.fit(X_train, y_train.values.ravel())

    # Column 1 of predict_proba is the score for the positive class (reStroke == 1).
    y_pred = model.predict_proba(X_test)
    fpr, tpr, thresholds = roc_curve(y_test.values.ravel(), y_pred[:, 1])
    auroc = auc(fpr, tpr)
    print('auc', auroc)
    lr_all_auroc.append(auroc)
print('auc_mean: %.2f, auc_std: %.2f' %(np.mean(lr_all_auroc), np.std(lr_all_auroc)))

LogisticRegression--
auc 0.5176470588235293
auc 0.41954022988505746
auc 0.6132352941176471
auc 0.4859550561797753
auc 0.5664451827242525
auc 0.5343680709534367
auc 0.46470588235294125
auc 0.549579831932773
auc 0.6252100840336133
auc 0.6531007751937985
auc_mean: 0.54, auc_std: 0.07


In [10]:
from sklearn.ensemble import ExtraTreesClassifier
# 10-fold cross-validated AUROC for an extremely-randomised-trees ensemble.
et_all_auroc = []
print('ExtraTreesClassifier--')
# Shuffled splitter with a fixed seed, so the folds are the same on every run.
cv = KFold(n_splits=10, shuffle=True, random_state=42)
for train_index, test_index in cv.split(X_data):
    # Slice this fold's predictors and labels by position.
    X_train = X_data.iloc[train_index]
    X_test = X_data.iloc[test_index]
    y_train = y_data.iloc[train_index]
    y_test = y_data.iloc[test_index]

    # Min-max scaling: learn the ranges on the training fold, apply to both.
    scaler = preprocessing.MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # SMOTE balances the classes in the training fold only; the held-out fold
    # keeps its original class distribution.
    sm = over_sampling.SMOTE(random_state=42)
    X_train, y_train = sm.fit_resample(X_train, y_train)

    # Fit the ensemble on the resampled training fold.
    model = ExtraTreesClassifier(n_estimators=250, random_state=42)
    model.fit(X_train, y_train.values.ravel())

    # Score the held-out fold using the predicted probability in column 1.
    y_pred = model.predict_proba(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred[:, 1])
    auroc = auc(fpr, tpr)
    print('auc', auroc)
    et_all_auroc.append(auroc)
print('auc_mean: %.2f, auc_std: %.2f' %(np.mean(et_all_auroc), np.std(et_all_auroc)))

ExtraTreesClassifier--
auc 0.65
auc 0.6168582375478927
auc 0.6139705882352942
auc 0.5997191011235955
auc 0.5705980066445183
auc 0.6025498891352551
auc 0.6044117647058824
auc 0.5092436974789916
auc 0.653781512605042
auc 0.5784883720930233
auc_mean: 0.60, auc_std: 0.04
