In [1]:
import numpy as np
import pandas as pd


import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

In [14]:
# Load the heart-disease table from the local CSV copy.
URL = './datas/heart.csv'
dataframe = pd.read_csv(URL)

# Text label for each integer `thal` code; the position in the list is the code.
thals = ['fixed', 'normal', 'reversible', 'unknown']

# Decode every code with a single vectorized lookup instead of growing an array
# with np.append inside a Python loop (np.append copies the whole array on each
# call). Integer indexing keeps the original failure mode: a code that is out of
# range for `thals` still raises IndexError.
thal_str = np.array(thals)[dataframe['thal'].to_numpy()]

dataframe['thal'] = thal_str
dataframe.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,normal,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,reversible,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,reversible,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,reversible,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,reversible,1


In [9]:
# Fixed seed so the train / validation / test partition is identical on every
# Restart & Run All; without it each execution shuffles the rows differently.
SPLIT_SEED = 42

# Hold out 20% of the rows for testing, then 20% of the remainder for validation.
# The intermediate frame gets its own name so `train` is never derived from itself.
train_val, test = train_test_split(dataframe, test_size=0.2, random_state=SPLIT_SEED)
train, val = train_test_split(train_val, test_size=0.2, random_state=SPLIT_SEED)
print(len(train), '훈련 샘플')
print(len(val), '검증 샘플')
print(len(test), '테스트 샘플')

193 훈련 샘플
49 검증 샘플
61 테스트 샘플


In [13]:
# Summarize the training split: per-column non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 193 entries, 284 to 165
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       193 non-null    int64  
 1   sex       193 non-null    int64  
 2   cp        193 non-null    int64  
 3   trestbps  193 non-null    int64  
 4   chol      193 non-null    int64  
 5   fbs       193 non-null    int64  
 6   restecg   193 non-null    int64  
 7   thalach   193 non-null    int64  
 8   exang     193 non-null    int64  
 9   oldpeak   193 non-null    float64
 10  slope     193 non-null    int64  
 11  ca        193 non-null    int64  
 12  thal      193 non-null    object 
 13  target    193 non-null    int64  
dtypes: float64(1), int64(12), object(1)
memory usage: 22.6+ KB


In [10]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32, label_column='target'):
    """Build a batched ``tf.data.Dataset`` of ``(features, label)`` pairs from a DataFrame.

    Args:
        dataframe: pandas DataFrame holding the feature columns and the label
            column. It is copied first, so the caller's frame is not modified.
        shuffle: When true, shuffle with a buffer as large as the number of rows.
        batch_size: Number of rows per emitted batch.
        label_column: Name of the column to split off as the label. Defaults to
            ``'target'``, which was previously hard-coded.

    Returns:
        A dataset whose elements are ``(dict of column name -> values, labels)``.

    Raises:
        KeyError: If ``label_column`` is not a column of ``dataframe``.
    """
    features = dataframe.copy()
    labels = features.pop(label_column)
    ds = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    # Skip shuffling an empty frame so shuffle() is never asked for a
    # zero-sized buffer.
    if shuffle and len(features) > 0:
        ds = ds.shuffle(buffer_size=len(features))
    ds = ds.batch(batch_size)
    return ds

In [11]:
# A deliberately tiny batch size keeps the printed example batches readable.
batch_size = 5

# Only the training split is shuffled; validation and test keep their row order.
train_ds, val_ds, test_ds = (
    df_to_dataset(split, shuffle=reshuffle, batch_size=batch_size)
    for split, reshuffle in ((train, True), (val, False), (test, False))
)

In [12]:
# Peek at a single training batch: the feature names, one feature's values,
# and the matching labels.
for feature_batch, label_batch in train_ds.take(1):
    print('전체 특성:', list(feature_batch))
    print('나이 특성의 배치:', feature_batch['age'])
    print('타깃의 배치:', label_batch)

전체 특성: ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
나이 특성의 배치: tf.Tensor([51 53 57 39 62], shape=(5,), dtype=int64)
타깃의 배치: tf.Tensor([1 0 1 1 0], shape=(5,), dtype=int64)


In [7]:
# Take the feature dict (element 0 of the (features, labels) pair) of one
# training batch, to try the feature columns on.
example_batch = next(iter(train_ds))[0]

In [108]:
def demo(feature_column, batch=None):
    """Run a batch through a ``DenseFeatures`` layer built from ``feature_column``.

    Note: inside this function the parameter name shadows the imported
    ``feature_column`` module; the name is kept so existing calls keep working.

    Args:
        feature_column: Feature column (or columns) handed to
            ``layers.DenseFeatures``.
        batch: Feature batch to transform. When omitted, the notebook-level
            ``example_batch`` is used, which is the original behaviour.

    Returns:
        The layer output for the batch, converted with ``.numpy()``.
    """
    if batch is None:
        # Fall back to the shared sample batch defined in an earlier cell.
        batch = example_batch
    feature_layer = layers.DenseFeatures(feature_column)
    return feature_layer(batch).numpy()

In [109]:
# Numeric column over the raw "age" feature.
age = feature_column.numeric_column(key="age")

# Show the sample batch's ages before and after the feature layer.
print("층 지나기 전", example_batch['age'].numpy())
print("층 지난 후", demo(age))

층 지나기 전 [58 46 41 59 63]
층 지난 후 [[58.]
 [46.]
 [41.]
 [59.]
 [63.]]


In [110]:
# Age cut points used to bucketize the numeric `age` column.
boundaries = np.array([18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
print("버킷 사이즈", boundaries.shape)

age_buckets = feature_column.bucketized_column(
    source_column=age,
    boundaries=list(boundaries),
)

# Compare the raw ages with the bucketized layer output and report its shape.
print("층 지나기 전", example_batch['age'].numpy())
print("층 지난 후", demo(age_buckets))
print("버킷 결과 사이즈", demo(age_buckets).shape)

버킷 사이즈 (10,)
층 지나기 전 [58 46 41 59 63]
층 지난 후 [[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]]
버킷 결과 사이즈 (5, 11)


In [113]:
# Vocabulary of the string labels stored in the `thal` column.
thals = ['fixed', 'normal', 'reversible', 'unknown']
thal = feature_column.categorical_column_with_vocabulary_list(
    key='thal',
    vocabulary_list=thals,
)

# Wrap the categorical column in an indicator column so DenseFeatures accepts it.
thal_one_hot = feature_column.indicator_column(thal)

print("층 지나기 전", example_batch['thal'].numpy())
print("층 지난 후", demo(thal_one_hot))

층 지나기 전 [b'normal' b'reversible' b'reversible' b'unknown' b'unknown']
층 지난 후 [[0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]]


In [115]:
# The embedding column takes the categorical `thal` column defined earlier as
# its input and uses 8 dimensions.
thal_embedding = feature_column.embedding_column(
    categorical_column=thal,
    dimension=8,
)

# Print the indicator-column output first, then the embedding output.
print("층 지나기 전", demo(thal_one_hot))
print("층 지난 후", demo(thal_embedding))

층 지나기 전 [[0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]]
층 지난 후 [[ 0.23888093  0.30186707 -0.25977695 -0.1994912  -0.34000975 -0.5717341
   0.18679073  0.26635703]
 [-0.5017554   0.30119684  0.38963363 -0.24513417  0.52775043  0.17985776
   0.04607302 -0.25360796]
 [-0.5017554   0.30119684  0.38963363 -0.24513417  0.52775043  0.17985776
   0.04607302 -0.25360796]
 [-0.01357286 -0.08513401  0.35047606 -0.15298954  0.66920584  0.02583694
   0.14474444 -0.23812744]
 [-0.01357286 -0.08513401  0.35047606 -0.15298954  0.66920584  0.02583694
   0.14474444 -0.23812744]]


In [116]:
# Hash the `thal` strings into 1000 buckets instead of using a vocabulary.
thal_hashed = feature_column.categorical_column_with_hash_bucket(
    key='thal',
    hash_bucket_size=1000,
)

print("층 지나기 전", example_batch['thal'].numpy())
print("층 지난 후", demo(feature_column.indicator_column(thal_hashed)))

층 지나기 전 [b'normal' b'reversible' b'reversible' b'unknown' b'unknown']
층 지난 후 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [132]:
# Cross the bucketized age with `thal`, hashing the combinations into 1000 buckets.
crossed_feature = feature_column.crossed_column(
    keys=[age_buckets, thal],
    hash_bucket_size=1000,
)

# Print both raw inputs, then the indicator-column view of the crossed feature.
print("층 지나기 전", example_batch['age'].numpy())
print("층 지나기 전", example_batch['thal'].numpy())
print("층 지난 후", demo(feature_column.indicator_column(crossed_feature)))

층 지나기 전 [58 46 41 59 63]
층 지나기 전 [b'normal' b'reversible' b'reversible' b'unknown' b'unknown']
층 지난 후 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [135]:
feature_columns = []

# Numeric columns: passed to the model as raw values.
for header in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca']:
    feature_columns.append(feature_column.numeric_column(header))

# Bucketized column: age split at fixed boundaries. The numeric source column is
# created here rather than taken from the `age` variable of an earlier demo
# cell, so this cell does not depend on that cell having been run.
age_buckets = feature_column.bucketized_column(
    feature_column.numeric_column('age'),
    boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65],
)
feature_columns.append(age_buckets)

# Categorical column (one-hot). The vocabulary must be the string labels that
# the `thal` column actually holds after it is decoded in the data-loading cell
# ('fixed', 'normal', ...). The previous vocabulary ['0', '1', '2', '3'] matched
# none of those values.
thal = feature_column.categorical_column_with_vocabulary_list('thal', thals)
thal_one_hot = feature_column.indicator_column(thal)
feature_columns.append(thal_one_hot)

# Embedding column over the same categorical column.
thal_embedding = feature_column.embedding_column(thal, dimension=8)
feature_columns.append(thal_embedding)

# Crossed feature column: age bucket x thal, hashed into 1000 buckets and wrapped
# in an indicator column. The raw cross gets its own name so `crossed_feature`
# only ever refers to the indicator column that is appended.
age_thal_cross = feature_column.crossed_column([age_buckets, thal], hash_bucket_size=1000)
crossed_feature = feature_column.indicator_column(age_thal_cross)
feature_columns.append(crossed_feature)

In [149]:
# Reduced feature set: raw age, bucketized age and the thal embedding.
feature_columns = [feature_column.numeric_column('age')]

# Bucketized column, built on the numeric age column that was just added.
age_buckets = feature_column.bucketized_column(
    feature_columns[0],
    boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65],
)
feature_columns.append(age_buckets)

# Categorical column and its one-hot form (the one-hot column is defined here
# but is not added to `feature_columns`).
thal = feature_column.categorical_column_with_vocabulary_list(
    key='thal',
    vocabulary_list=thals,
)
thal_one_hot = feature_column.indicator_column(thal)

# Embedding column.
thal_embedding = feature_column.embedding_column(thal, dimension=8)
feature_columns.append(thal_embedding)

# Build the dense feature layer and inspect what it produces for the sample batch.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
print("학습 배치 벡터 크기", feature_layer(example_batch).shape)
feature_layer(example_batch)

학습 배치 벡터 크기 (5, 20)


<tf.Tensor: shape=(5, 20), dtype=float32, numpy=
array([[ 5.8000000e+01,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
         0.0000000e+00,  1.0000000e+00,  0.0000000e+00,  0.0000000e+00,
         4.5416072e-01,  4.1832155e-01,  2.2656007e-01,  2.2586979e-01,
        -1.8606263e-01,  1.4371683e-01,  2.2610144e-01, -5.2489495e-01],
       [ 4.6000000e+01,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  1.0000000e+00,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
        -3.9937806e-01,  5.8340001e-01,  3.4974739e-01,  2.4795340e-01,
        -4.7335532e-02, -4.6615475e-01, -5.8903295e-01, -2.9781199e-01],
       [ 4.1000000e+01,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
         0.0000000e+00,  0.0000000e+00,  1.0000000e+00,  0.0000000e+00,
         0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00,
        -3.99

In [136]:
# Build the DenseFeatures layer from the current `feature_columns` list.
# NOTE(review): in file order the preceding cell reassigns `feature_columns` to a
# reduced (age / age buckets / thal embedding) list, so a top-to-bottom run builds
# this layer from that reduced list — confirm which feature set is intended.
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [139]:
# Apply the feature layer to the sample batch and display the resulting tensor.
feature_layer(example_batch)

<tf.Tensor: shape=(5, 1030), dtype=float32, numpy=
array([[ 58.,   0.,   0., ...,   0., 140., 114.],
       [ 46.,   0.,   0., ...,   0., 147., 150.],
       [ 41.,   0.,   0., ...,   0., 172., 130.],
       [ 59.,   0.,   0., ...,   0., 159., 170.],
       [ 63.,   0.,   0., ...,   0., 132., 130.]], dtype=float32)>

In [59]:
# Rebuild the three datasets with the batch size used for training.
batch_size = 32

# Only the training split is shuffled; validation and test keep their row order.
train_ds, val_ds, test_ds = (
    df_to_dataset(split, shuffle=reshuffle, batch_size=batch_size)
    for split, reshuffle in ((train, True), (val, False), (test, False))
)

In [60]:
# Feature layer first, then two 128-unit ReLU hidden layers and a single
# sigmoid output unit.
model = tf.keras.Sequential(
    [feature_layer]
    + [layers.Dense(128, activation='relu') for _ in range(2)]
    + [layers.Dense(1, activation='sigmoid')]
)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 5 epochs, validating on the validation split after each one.
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=5,
)

Epoch 1/5
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f9b710e7bb0>

In [61]:
# Evaluate on the test split; the result unpacks into the loss and the
# accuracy metric configured in compile().
loss, accuracy = model.evaluate(
    test_ds,
)
print("정확도", accuracy)

정확도 0.6393442749977112
