In [1]:
import pandas as pd
import numpy as np
import os
import datetime
# Source extract is annotated as covering 2010-2018.
# Read the stroke vital-sign table, turn every 9999 placeholder into NaN,
# then fill missing death_date values with 20181231
# (presumably the end of follow-up, 2018-12-31 — TODO confirm).
stroke_csv_path = os.path.join('..', 'data', 'STROKE_VITAL_SIGN.csv')
raw_data = (
    pd.read_csv(stroke_csv_path)
    .replace({9999: np.nan})
    .fillna(value={'death_date': 20181231})
)

In [2]:
# Keep only subjects with at least one year of follow-up: the last recorded
# date is 20181231, so discharges must fall before 2017-12-31.
# NOTE(review): the comparison is strict, so a discharge on 2017-12-31 itself
# is excluded — confirm that is intended.
followup_cutoff = datetime.datetime(2017, 12, 31)

print('Before removing:', raw_data.shape)

# NOTE(review): errors='coerce' turns unparseable dates into NaT; such rows
# fail the comparison below and are dropped silently — confirm.
parsed_discharge = pd.to_datetime(
    raw_data['discharge_date'], format='%Y/%m/%d', errors='coerce'
)
raw_data = raw_data.assign(discharge_date=parsed_discharge)
raw_data = raw_data.loc[raw_data['discharge_date'] < followup_cutoff]

print('After removing:', raw_data.shape)

Before removing: (21653, 61)
After removing: (19847, 61)


In [3]:
# Descriptive statistics for the cohort currently held in raw_data.
cohort_size, feature_count = raw_data.shape
print('Subjects:{}, Features:{}'.format(cohort_size, feature_count))

# NOTE(review): np.std defaults to ddof=0 (population SD) — confirm that this
# is the estimator wanted for the report.
print('Age_mean: %.2f, Age_std: %.2f,' % (np.mean(raw_data.Age), np.std(raw_data.Age)))

# Percentages are taken over all rows; a NaN in the column counts as "not equal to 1".
male_protion = (raw_data.Sex == 1).sum() / cohort_size * 100
print('Male: %.2f%%' % male_protion)
Mortality_protion = (raw_data.Mortality == 1).sum() / cohort_size * 100
print('Mortality: %.2f%%' % Mortality_protion)

# Cumulative share of subjects whose SurvivalWeeks lies below each cut-off.
# The cut-offs 4.1 / 12.1 / 48.1 weeks are labelled as 1 / 3 / 12 months.
# The 3- and 12-month figures are cumulative (they include the earlier
# windows), so the labels say so explicitly instead of "1–3" / "3–12" months.
# Interval-specific shares can be obtained by subtracting successive values.
# NOTE(review): rows are not filtered on Mortality == 1; this presumes a
# SurvivalWeeks value below the cut-off always corresponds to a death — confirm.
survival_weeks = raw_data.SurvivalWeeks

# short term: < 1 month
short_protion = (survival_weeks < 4.1).sum() / cohort_size * 100
print('short term  < 1 month death: %.2f%%' % short_protion)

# intermediate term: < 3 months (cumulative)
intermediate_protion = (survival_weeks < 12.1).sum() / cohort_size * 100
print('intermediate term  < 3 month death (cumulative): %.2f%%' % intermediate_protion)

# long term: < 12 months (cumulative)
long_protion = (survival_weeks < 48.1).sum() / cohort_size * 100
print('long term  < 12 month death (cumulative): %.2f%%' % long_protion)

Subjects:19847, Features:61
Age_mean: 67.35, Age_std: 12.88,
Male: 61.92%
Mortality: 31.10%
short term  < 1 month death: 4.09%
intermediate term  1–3 month death: 6.07%
long term  3–12 month death: 10.92%


In [4]:
# Complete-case subset: keep only the rows of raw_data with no missing value
# in any column, then report the same descriptive statistics for it.
tidy_data = raw_data.dropna(axis=0)
# NOTE(review): 'a.csv' is written to the current working directory and looks
# like a scratch export — confirm it is still needed.
tidy_data.to_csv('a.csv', index=False)

tidy_size, tidy_feature_count = tidy_data.shape
print('Tidy_subjects:{}, Tidy_features:{}'.format(tidy_size, tidy_feature_count))

# NOTE(review): np.std defaults to ddof=0 (population SD) — confirm that this
# is the estimator wanted for the report.
print('Tidy_Age_mean: %.2f, Tidy_Age_std: %.2f' % (np.mean(tidy_data.Age), np.std(tidy_data.Age)))

tidy_male_protion = (tidy_data.Sex == 1).sum() / tidy_size * 100
print('Tidy_Male: %.2f%%' % tidy_male_protion)
tidy_Mortality_protion = (tidy_data.Mortality == 1).sum() / tidy_size * 100
print('Tidy_Mortality: %.2f%%' % tidy_Mortality_protion)

# Cumulative share of subjects whose SurvivalWeeks lies below each cut-off.
# The cut-offs 4.1 / 12.1 / 48.1 weeks are labelled as 1 / 3 / 12 months.
# The short-term cut-off is 4.1 (it was 4 here) so that it matches the
# threshold used for the full-cohort summary and the two are comparable.
# The 3- and 12-month figures are cumulative (they include the earlier
# windows), so the labels say so explicitly instead of "1–3" / "3–12" months.
# NOTE(review): rows are not filtered on Mortality == 1; this presumes a
# SurvivalWeeks value below the cut-off always corresponds to a death — confirm.
tidy_survival_weeks = tidy_data.SurvivalWeeks

# short term: < 1 month
tidy_short_protion = (tidy_survival_weeks < 4.1).sum() / tidy_size * 100
print('tidy_short term  < 1 month death: %.2f%%' % tidy_short_protion)

# intermediate term: < 3 months (cumulative)
tidy_intermediate_protion = (tidy_survival_weeks < 12.1).sum() / tidy_size * 100
print('tidy_intermediate term  < 3 month death (cumulative): %.2f%%' % tidy_intermediate_protion)

# long term: < 12 months (cumulative)
tidy_long_protion = (tidy_survival_weeks < 48.1).sum() / tidy_size * 100
print('tidy_long term  < 12 month death (cumulative): %.2f%%' % tidy_long_protion)

Tidy_subjects:2405, Tidy_features:61
Tidy_Age_mean: 66.66, Tidy_Age_std: 13.22
Tidy_Male: 64.41%
Tidy_Mortality: 28.57%
tidy_short term  < 1 month death: 4.70%
tidy_intermediate term  1–3 month death: 6.74%
tidy_long term  3–12 month death: 12.39%


In [5]:
# Per-column count of missing values in raw_data, largest first.
nan_counts = raw_data.isnull().sum().sort_values(ascending=False)
print("Count total NaN at each column in a DataFrame : \n\n", nan_counts)

Count total NaN at each column in a DataFrame : 

 AST         10855
HbA1c        9404
BMI          6463
MPsum        2619
V            1372
            ...  
HR SD           0
MeanHR G        0
Mean HR         0
ICU             0
LOC             0
Length: 61, dtype: int64


# Make experiment dataset

In [8]:
# Experiment dataset: discard the AST and HbA1c columns (the two columns with
# the most missing values in the NaN count printed earlier), then keep only
# rows that are complete in every remaining column.
ex_data = raw_data.drop(columns=['AST', 'HbA1c']).dropna()
print('subjects:{}, features:{}'.format(*ex_data.shape))

subjects:10723, features:59


In [7]:
# Export step is currently disabled (commented out). If re-enabled, it would
# write ex_data to ../data/tidy_Stroke_Vital_Sign.csv without the index column.
# ex_data.to_csv(os.path.join('..', 'data','tidy_Stroke_Vital_Sign.csv'), index=False)