In [6]:
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
import numpy as np
import pandas as pd

In [7]:
data = pd.read_csv('train.csv')
print(data.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
dtype: int64


In [8]:
# 결측치 처리
age_mean = data['Age'].mean()
print(age_mean)

mode = data['Embarked'].mode()
print(mode)

data['Age'].fillna(value=30 , inplace=True)
data['Embarked'].fillna(value='S' , inplace=True)

print(data.isnull().sum())

29.69911764705882
0    S
Name: Embarked, dtype: object
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


In [9]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked'],
      dtype='object')

In [10]:

anwsers = data.pop('Survived') # 정답 데이터를 떼어냄

data_set = tf.data.Dataset.from_tensor_slices((dict(data), anwsers))
print(data_set)

<TensorSliceDataset element_spec=({'PassengerId': TensorSpec(shape=(), dtype=tf.int64, name=None), 'Pclass': TensorSpec(shape=(), dtype=tf.int64, name=None), 'Name': TensorSpec(shape=(), dtype=tf.string, name=None), 'Sex': TensorSpec(shape=(), dtype=tf.string, name=None), 'Age': TensorSpec(shape=(), dtype=tf.float64, name=None), 'SibSp': TensorSpec(shape=(), dtype=tf.int64, name=None), 'Parch': TensorSpec(shape=(), dtype=tf.int64, name=None), 'Ticket': TensorSpec(shape=(), dtype=tf.string, name=None), 'Fare': TensorSpec(shape=(), dtype=tf.float64, name=None), 'Embarked': TensorSpec(shape=(), dtype=tf.string, name=None)}, TensorSpec(shape=(), dtype=tf.int64, name=None))>


In [11]:
# 각 컬럼을 전처리할때 어떻게 할 것인가?
feature_columns = [] # 특성 열을 저장할 리스트

# PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Embarked

# 숫자그대로 집어넣 Fare, Parch, SibSp : numeric_column
# 범주화 하여 넣 Age (10대, 20대, 30대) : bucketized_column
# 종류 몇개없는 카테고리 Sex, Embarkedm, Pclass : indicator_column
# 종류가 너무 많은 카테고리 Ticket : embedding_column

# 숫자그대로 집어넣 Fare, Parch, SibSp : numeric_column
feature_columns.append(tf.feature_column.numeric_column('Age'))
feature_columns.append(tf.feature_column.numeric_column('Sibsp'))
feature_columns.append(tf.feature_column.numeric_column('Parch'))
feature_columns.append(tf.feature_column.numeric_column('Fare'))


# 범주화 하여 넣 Age (10대, 20대, 30대) : bucketized_column
Age = tf.feature_column.numeric_column('Age') 
Age_butket = tf.feature_column.bucketized_column(Age, boundaries=[10,20,30,40,50,60,70,80])

feature_columns.append(Age_butket)

print(Age)
print(feature_columns)



NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)
[NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='Sibsp', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='Parch', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='Fare', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), BucketizedColumn(source_column=NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(10, 20, 30, 40, 50, 60, 70, 80))]


In [12]:
print(feature_columns)

[NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='Sibsp', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='Parch', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), NumericColumn(key='Fare', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), BucketizedColumn(source_column=NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(10, 20, 30, 40, 50, 60, 70, 80))]


In [13]:
# 종류 몇개없는 카테고리 Sex, Embarkedm, Pclass : indicator_column
word = data['Sex'].unique()
categori = tf.feature_column.categorical_column_with_vocabulary_list('Sex', word)
one_hot = tf.feature_column.indicator_column(categori)
feature_columns.append(one_hot)

word = data['Embarked'].unique()
categori = tf.feature_column.categorical_column_with_vocabulary_list('Embarked', word)
one_hot = tf.feature_column.indicator_column(categori)
feature_columns.append(one_hot)

word = data['Pclass'].unique()
categori = tf.feature_column.categorical_column_with_vocabulary_list('Pclass', word)
one_hot = tf.feature_column.indicator_column(categori)
feature_columns.append(one_hot)

# 종류가 너무 많은 카테고리 Ticket : embedding_column
word = data['Ticket'].unique()
categori = tf.feature_column.categorical_column_with_vocabulary_list('Ticket', word)
one_hot = tf.feature_column.embedding_column(categori, dimension=10)
feature_columns.append(one_hot)


In [14]:
model = tf.keras.Sequential([
   tf.keras.layers.DenseFeatures(feature_columns), # 특성 열을 입력으로 받음
   tf.keras.layers.Dense(128, activation='relu'),
   tf.keras.layers.Dense(64, activation='relu'),
   tf.keras.layers.Dropout(0.2),
   tf.keras.layers.Dense(1, activation='sigmoid'), # 0~1 사이의 값으로 출력 (살아남을 확률)
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc']) # 이진분류

data_set_batch = data_set.batch(32) # 데이터를 섞고 32개씩 묶음

model.fit(data_set_batch, epochs=100)

Epoch 1/100


ValueError: in user code:

    File "e:\anaconda\envs\tensorflow_env\lib\site-packages\keras\engine\training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "e:\anaconda\envs\tensorflow_env\lib\site-packages\keras\engine\training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "e:\anaconda\envs\tensorflow_env\lib\site-packages\keras\engine\training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "e:\anaconda\envs\tensorflow_env\lib\site-packages\keras\engine\training.py", line 1023, in train_step
        y_pred = self(x, training=True)
    File "e:\anaconda\envs\tensorflow_env\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None

    ValueError: Exception encountered when calling layer 'dense_features' (type DenseFeatures).
    
    Feature Sibsp is not in features dictionary.
    
    Call arguments received by layer 'dense_features' (type DenseFeatures):
      • features={'PassengerId': 'tf.Tensor(shape=(None,), dtype=int64)', 'Pclass': 'tf.Tensor(shape=(None,), dtype=int64)', 'Name': 'tf.Tensor(shape=(None,), dtype=string)', 'Sex': 'tf.Tensor(shape=(None,), dtype=string)', 'Age': 'tf.Tensor(shape=(None,), dtype=float32)', 'SibSp': 'tf.Tensor(shape=(None,), dtype=int64)', 'Parch': 'tf.Tensor(shape=(None,), dtype=int64)', 'Ticket': 'tf.Tensor(shape=(None,), dtype=string)', 'Fare': 'tf.Tensor(shape=(None,), dtype=float32)', 'Embarked': 'tf.Tensor(shape=(None,), dtype=string)'}
      • cols_to_output_tensors=None
      • training=True
