In [1]:
import pandas as pd
import numpy as np
import os
# 2010-2018
csv_path = os.path.join('..', 'data', 'Clinical data with 3 days vital sign parameters.csv')
# Load the table and turn every 9999 cell (in any column) into NaN.
# NOTE(review): 9999 is presumably the dataset's missing-value code — confirm
# against the data dictionary, since identifier columns are affected too.
raw_data = pd.read_csv(csv_path).replace({9999: np.nan})

In [2]:
# Drop records with less than one year of follow-up: the last date is
# 20181231, so only discharges strictly before 20171231 are kept.
# NOTE(review): discharge_date is compared as a plain number, which presumes a
# YYYYMMDD encoding, and the strict "<" also drops discharges on 20171231
# itself — confirm both are intended.
print('Before removing:', raw_data.shape)
has_full_follow_up = raw_data['discharge_date'] < 20171231
raw_data = raw_data.loc[has_full_follow_up]
print('After removing:', raw_data.shape)

Before removing: (20764, 53)
After removing: (19023, 53)


In [3]:
# Descriptive summary of raw_data: size, age, sex and all-cause mortality.
n_subjects, n_features = raw_data.shape
print('Subjects:{}, Features:{}'.format(n_subjects, n_features))
# ddof=0 is the population standard deviation, i.e. what np.std computes.
print('Age_mean: %.2f, Age_std: %.2f,' % (raw_data['Age'].mean(), raw_data['Age'].std(ddof=0)))
male_protion = (int((raw_data['Sex'] == 1).sum()) / n_subjects) * 100
print('Male: %.2f%%' % male_protion)
AllMortality_protion = (int((raw_data['AllMortality'] == 1).sum()) / n_subjects) * 100
print('AllMortality: %.2f%%' % AllMortality_protion)

# Percentage of subjects whose SurvivalWeeks lies below each cut-off.
# The windows are cumulative: none of them has a lower bound, so the
# "1–3 month" and "3–12 month" figures also contain every subject counted by
# the shorter windows. Rows are selected on SurvivalWeeks alone; AllMortality
# is not consulted here.
survival_weeks = raw_data['SurvivalWeeks']
# short term  < 1 month
short_protion = (int((survival_weeks < 4.1).sum()) / len(survival_weeks)) * 100
print('short term  < 1 month death: %.2f%%' % short_protion)
# intermediate term: everything below 12.1 weeks
intermediate_protion = (int((survival_weeks < 12.1).sum()) / len(survival_weeks)) * 100
print('intermediate term  1–3 month death: %.2f%%' % intermediate_protion)
# long term: everything below 48.1 weeks
long_protion = (int((survival_weeks < 48.1).sum()) / len(survival_weeks)) * 100
print('long term  3–12 month death: %.2f%%' % long_protion)

Subjects:19023, Features:53
Age_mean: 67.09, Age_std: 12.78,
Male: 62.31%
AllMortality: 28.06%
short term  < 1 month death: 0.00%
intermediate term  1–3 month death: 1.92%
long term  3–12 month death: 6.99%


In [4]:
# Complete-case subset: keep only the rows of raw_data without any NaN, then
# print the same descriptive summary as for the full cohort.
tidy_data = raw_data.dropna(axis=0)
print('Tidy_subjects:{}, Tidy_features:{}'.format(tidy_data.shape[0], tidy_data.shape[1]))
# np.std uses ddof=0, i.e. the population standard deviation.
print('Tidy_Age_mean: %.2f, Tidy_Age_std: %.2f' %(np.mean(tidy_data.Age), np.std(tidy_data.Age)))
tidy_male_protion = (tidy_data[tidy_data.Sex==1].shape[0]/tidy_data.Sex.shape[0])*100
print('Tidy_Male: %.2f%%' %tidy_male_protion)
tidy_AllMortality_protion = (tidy_data[tidy_data.AllMortality==1].shape[0]/tidy_data.AllMortality.shape[0])*100
print('Tidy_AllMortality: %.2f%%' %tidy_AllMortality_protion)

# Percentage of subjects whose SurvivalWeeks lies below each cut-off. The
# windows are cumulative (no lower bound), so each longer window also contains
# the subjects counted by the shorter ones.
# short term  < 1 month
# The cut-off is 4.1 (not 4) so that it matches the raw_data summary and the
# ".1" convention of the 12.1 / 48.1 cut-offs below; otherwise the two tables
# would not be comparable.
tidy_short_protion = (tidy_data[tidy_data.SurvivalWeeks < 4.1].shape[0]/tidy_data.SurvivalWeeks.shape[0])*100
print('tidy_short term  < 1 month death: %.2f%%' %tidy_short_protion)
# intermediate term: everything below 12.1 weeks
tidy_intermediate_protion = (tidy_data[tidy_data.SurvivalWeeks < 12.1].shape[0]/tidy_data.SurvivalWeeks.shape[0])*100
print('tidy_intermediate term  1–3 month death: %.2f%%' %tidy_intermediate_protion)
# long term: everything below 48.1 weeks
tidy_long_protion = (tidy_data[tidy_data.SurvivalWeeks < 48.1].shape[0]/tidy_data.SurvivalWeeks.shape[0])*100
print('tidy_long term  3–12 month death: %.2f%%' %tidy_long_protion)

Tidy_subjects:1748, Tidy_features:53
Tidy_Age_mean: 66.96, Tidy_Age_std: 12.91
Tidy_Male: 65.90%
Tidy_AllMortality: 27.75%
tidy_short term  < 1 month death: 0.00%
tidy_intermediate term  1–3 month death: 2.29%
tidy_long term  3–12 month death: 9.10%


In [5]:
# Number of missing (NaN) cells in each column of raw_data.
nan_counts_per_column = raw_data.isna().sum()
print(" \nCount total NaN at each column in a DataFrame : \n\n", nan_counts_per_column)

 
Count total NaN at each column in a DataFrame : 

 ID                       0
CHT_NO                   0
admin_date               0
discharge_date           0
Sex                      0
Age                      0
AF                       0
DM                       0
HTN                      0
Dyslipidemia             0
CHF                      0
Smoking                  0
BMI                   6267
Cancer before adm        0
E                      166
V                     1165
M                      177
MPsum                 2476
SSI                      0
Creatinine            4653
ALT                    636
AST                  10563
CHOL                  1871
HbA1c                 8913
TG                    1876
Mean HR                 16
MeanHR G                16
HR SD                   16
HRSD G                  16
HR CV                   16
HRCV G                  16
Mean SBP                13
Mean SBP G              13
SBP SD                  13
SBPSD G                 13
SB

# Make experiment dataset

In [6]:
# Build the experiment dataset: drop four lab/anthropometric columns, then
# keep only the rows that are complete in the remaining columns.
# (These four have the largest NaN counts among the columns visible in the
# per-column summary printed earlier in the notebook.)
dropped_columns = ['AST', 'BMI', 'HbA1c', 'Creatinine']
ex_data = raw_data.drop(columns=dropped_columns).dropna(axis=0)
print('subjects:{}, features:{}'.format(*ex_data.shape))

subjects:13244, features:49


In [7]:
# Write the experiment dataset to ../data/ex_data.csv without the index column.
ex_data_path = os.path.join('..', 'data', 'ex_data.csv')
ex_data.to_csv(ex_data_path, index=False)