In [2]:
# 시작하기 전에 mount하기
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import numpy as np

data = pd.read_sas('/content/drive/MyDrive/hn18_all.sas7bdat', format = 'sas7bdat')
# hn18_all.sas7bdat 파일의 경로 복사하기 (국민건강영양조사 데이터셋 사용)

In [4]:
# 원시 데이터 가공하기

selected_data = data[['DI1_dg', 'sex', 'age', 'HE_sbp', 'HE_dbp', 'HE_BMI', 'HE_PLS']]

#흡연 여부
selected_data['sm_present'] = ((data['BS1_1'].isin([1, 2]) & data['BS3_1'].isin([1, 2, 3])) | (data['BS1_1'] == 3)).astype(int)
selected_data.loc[data['BS1_1'] == 2, 'sm_present'] = (data['BS3_1'].isin([1, 2])).astype(int)

#pa_walk : 주5회 걷기 운동을 30분이상 실천한 여부
data['pa_hb30_1'] = data['BE3_32'] * 60 + data['BE3_33']
selected_data['pa_walk'] = (data['BE3_31'].isin([6, 7, 8]) & (data['pa_hb30_1'] >= 30)).astype(int)

#수면시간
data['sleep'] = np.where((data['BP16_11'].isin([88,99]) | data['BP16_12'].isin([88,99]) | data['BP16_13'].isin([88,99]) | data['BP16_14'].isin([88,99])),
                         None, (data['BP16_13']+data['BP16_14']/60 - data['BP16_11']+data['BP16_12']/60 + 24)%24)
selected_data['total_sleep'] = data['sleep']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_data['sm_present'] = ((data['BS1_1'].isin([1, 2]) & data['BS3_1'].isin([1, 2, 3])) | (data['BS1_1'] == 3)).astype(int)
  data['pa_hb30_1'] = data['BE3_32'] * 60 + data['BE3_33']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_data['pa_walk'] = (data['BE3_31'].isin([6, 7, 8]) & (data['pa_hb30_1'] >= 30)).astype(int)
  data['sleep'] = np.where((data['BP16_11'].isin([88,99]) | data['BP16_12'].isin([88,99]) | data['BP16_13'].isin([88,99]) | data['BP16_14'].isin([88,99])),
A value is trying to be set

In [5]:
# dropna 함수를 사용해 결측치가 포함된 행을 제거

print(selected_data.shape)
selected_data = selected_data.dropna()
print(selected_data.shape)

(7992, 10)
(5929, 10)


In [7]:
from sklearn.model_selection import train_test_split

X = selected_data.drop('DI1_dg', axis=1).dropna()
y = selected_data.loc[X.index, 'DI1_dg']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = X_train.astype('float')
X_test = X_test.astype('float')
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [8]:
print(X_train.dtypes)
print('-')
print('고혈압 여부: ' + str(y_train.dtypes))

sex            float64
age            float64
HE_sbp         float64
HE_dbp         float64
HE_BMI         float64
HE_PLS         float64
sm_present     float64
pa_walk        float64
total_sleep    float64
dtype: object
-
고혈압 여부: int64


In [9]:
print('나이')
print(max(X_train['age'].values))
print(min(X_train['age'].values))
print('수축기 혈압')
print(max(X_train['HE_sbp'].values))
print(min(X_train['HE_sbp'].values))
print('확장기 혈압')
print(max(X_train['HE_dbp'].values))
print(min(X_train['HE_dbp'].values))
print('체질량 지수')
print(max(X_train['HE_BMI'].values))
print(min(X_train['HE_BMI'].values))
print('15초 맥박수')
print(max(X_train['HE_PLS'].values))
print(min(X_train['HE_PLS'].values))
print('수면 시간')
print(max(X_train['total_sleep'].values))
print(min(X_train['total_sleep'].values))
print('흡연 여부')
print(max(X_train['sm_present'].values))
print(min(X_train['sm_present'].values))
print('유산소 운동 여부')
print(max(X_train['pa_walk'].values))
print(min(X_train['pa_walk'].values))

나이
80.0
12.0
수축기 혈압
205.0
78.0
확장기 혈압
130.0
35.0
체질량 지수
44.480753799431355
13.319458896982308
15초 맥박수
53.0
15.0
수면 시간
15.0
2.0
흡연 여부
1.0
0.0
유산소 운동 여부
1.0
0.0


In [10]:
# 데이터 정규화하기 (0~1 사이의 값으로)
def normalize_column(column):
    return (column - column.min()) / (column.max() - column.min())

norm_x_train = X_train.apply(normalize_column)
norm_x_test = X_test.apply(normalize_column)

In [11]:
print(norm_x_test)

      sex       age    HE_sbp    HE_dbp    HE_BMI  HE_PLS  sm_present  \
6304  0.0  0.941176  0.566667  0.357143  0.270551     0.6         1.0   
1689  1.0  1.000000  0.183333  0.000000  0.391836     0.1         1.0   
5403  1.0  0.470588  0.458333  0.520408  0.520266     0.2         1.0   
7330  1.0  1.000000  0.583333  0.244898  0.173775     0.3         1.0   
5312  1.0  0.705882  0.125000  0.091837  0.306746     0.0         1.0   
...   ...       ...       ...       ...       ...     ...         ...   
6227  1.0  0.044118  0.333333  0.224490  0.357575     0.4         0.0   
2537  1.0  0.088235  0.250000  0.295918  0.379116     0.6         0.0   
6080  0.0  0.191176  0.258333  0.275510  0.387101     0.6         1.0   
6674  0.0  0.455882  0.250000  0.306122  0.365862     0.5         1.0   
847   0.0  0.985294  0.808333  0.602041  0.251972     0.3         1.0   

      pa_walk  total_sleep  
6304      0.0     0.619048  
1689      0.0     0.238095  
5403      1.0     0.714286  
7330   

In [12]:
# mnist 숫자 인식의 모델 구조 참고해서 구성해보기
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [13]:
mlp = Sequential()
mlp.add(Dense(units=512, activation='tanh', input_shape=(9,), name='one'))
mlp.add(Dense(units=1, activation='sigmoid', name='two'))

mlp.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.01), metrics=['accuracy'])
history = mlp.fit(norm_x_train, y_train, batch_size=128, epochs=50, validation_data=(norm_x_test, y_test))

res = mlp.evaluate(norm_x_test, y_test)
print('정확률=', res[1]*100)
# 지금은 정확률이 0.63 정도로 SVM보다 못한 성능을 보여준다.
# loss 값이 왜 저렇게 음의 방향으로 커지는 지에 대해서도 조사가 필요해 보인다.
# 모델 구조 개선이나 하이퍼파라미터 변경, 추가 데이터셋이 필요하다. (추후 할 예정)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
정확률= 63.91230821609497


In [14]:
from keras import models

# 모델 저장
# mlp.save('/content/drive/MyDrive/my_model')

# 고혈압 예측 확률(추정)
a = mlp.predict(norm_x_test[:1])[0][0]
print(a)
print(type(a))

0.11385199
<class 'numpy.float32'>
