#### 라이브러리 불러오기

In [8]:
import os
import numpy as np
import pandas as pd

import tensorflow as tf
from sklearn.model_selection import train_test_split

#### 원본 데이터를 이용한 모델링

##### 데이터 불러오기

In [2]:
# Read the raw training CSV from the "data" directory under the current
# working directory.
DATA_PATH = os.path.join(os.getcwd(), 'data')
data_df = pd.read_csv(filepath_or_buffer=os.path.join(DATA_PATH, 'train.csv'))

##### 전처리

* 결측치 처리

In [3]:
# Replace missing ages with the median age of the whole dataset.
# The filled Series is assigned back to the column on purpose: calling
# fillna(inplace=True) on data_df['Age'] operates on an intermediate Series
# (chained assignment), which pandas deprecates and which leaves data_df
# unchanged when Copy-on-Write is enabled.
data_df['Age'] = data_df['Age'].fillna(data_df['Age'].median())

# Replace missing embarkation ports with the most frequent value
# (the first entry returned by mode()).
data_df['Embarked'] = data_df['Embarked'].fillna(data_df['Embarked'].mode()[0])

* 문자열 컬럼 처리

In [4]:
# Encode the string-valued columns as integer codes so they can be fed to the
# network: Sex -> {male: 0, female: 1}, Embarked -> {S: 0, C: 1, Q: 2}.
data_df['Sex'] = data_df['Sex'].map({'male': 0, 'female': 1})
data_df['Embarked'] = data_df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

##### 훈련/시험용 데이터 분리

In [9]:
# Pick the label column and the input columns, then hold out 20% of the rows
# for testing. random_state is fixed so the same split is produced on each run.
target = 'Survived'
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

data_x, data_y = data_df[features], data_df[target]

train_x, test_x, train_y, test_y = train_test_split(
    data_x,
    data_y,
    test_size=0.2,
    random_state=1234,
)

In [10]:
# Build tf.data pipelines of (features, label) pairs.
BATCH_SIZE = 32

# Training pipeline: shuffle with a buffer as large as the training set,
# then group into batches.
train_ds = (
    tf.data.Dataset
    .from_tensor_slices((train_x, train_y))
    .shuffle(buffer_size=train_x.shape[0])
    .batch(batch_size=BATCH_SIZE)
)

# Test pipeline: batched only, no shuffling.
test_ds = (
    tf.data.Dataset
    .from_tensor_slices((test_x, test_y))
    .batch(batch_size=BATCH_SIZE)
)

##### 신경망 모델 생성

In [13]:
# Fully connected binary classifier: two hidden ReLU layers (64 and 32 units)
# followed by a single sigmoid output unit. The input width is taken from the
# number of columns in train_x.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(units=64, activation='relu', input_shape=(train_x.shape[1],)))
model.add(tf.keras.layers.Dense(units=32, activation='relu'))
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Compile the model: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    metrics=['accuracy'],
)

##### 모델 훈련

In [21]:
# Train on the batched training dataset for 100 epochs with progress output
# turned off (verbose=0).
model.fit(x=train_ds, epochs=100, verbose=0)

<keras.callbacks.History at 0x23799412e20>

##### 모델 평가

In [22]:
# Evaluate on the test dataset; the result unpacks into the loss and the
# accuracy metric configured in compile().
loss, accuracy = model.evaluate(x=test_ds, verbose=0)

In [23]:
# Show the test loss and accuracy as the cell's output.
loss, accuracy

(0.9847401976585388, 0.8156424760818481)

#### 전처리 데이터를 이용한 모델링

##### 전처리 데이터 불러오기

In [24]:
# Load the preprocessed features and labels from data/11_titanic under the
# current working directory.
DATA_PATH = os.path.join(os.getcwd(), 'data', '11_titanic')

data_df = pd.read_csv(filepath_or_buffer=os.path.join(DATA_PATH, 'preprocessing_data.csv'))
label_df = pd.read_csv(filepath_or_buffer=os.path.join(DATA_PATH, 'preprocessing_label.csv'))

# Hold out 20% of the rows for testing with a fixed seed.
# NOTE(review): every column of both CSVs is passed through unchanged; if
# either file was written with an extra index column it would be used as a
# feature/label here -- confirm the file schemas.
train_x, test_x, train_y, test_y = train_test_split(
    data_df,
    label_df,
    test_size=0.2,
    random_state=1234,
)

In [25]:
# Build tf.data pipelines of (features, label) pairs from the preprocessed split.
BATCH_SIZE = 32

# Training pipeline: shuffle with a buffer as large as the training set,
# then group into batches.
train_ds = (
    tf.data.Dataset
    .from_tensor_slices((train_x, train_y))
    .shuffle(buffer_size=train_x.shape[0])
    .batch(batch_size=BATCH_SIZE)
)

# Test pipeline: batched only, no shuffling.
test_ds = (
    tf.data.Dataset
    .from_tensor_slices((test_x, test_y))
    .batch(batch_size=BATCH_SIZE)
)

##### 모델 생성

In [30]:
# Fully connected binary classifier: two hidden ReLU layers (64 and 32 units)
# followed by a single sigmoid output unit. The input width is taken from the
# number of columns in train_x.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(units=64, activation='relu', input_shape=(train_x.shape[1],)))
model.add(tf.keras.layers.Dense(units=32, activation='relu'))
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Compile the model: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    metrics=['accuracy'],
)

##### 훈련/평가

In [31]:
# Train for 1000 epochs with progress output turned off, evaluate on the test
# dataset, and show (loss, accuracy) as the cell's output.
model.fit(x=train_ds, epochs=1000, verbose=0)
loss, accuracy = model.evaluate(x=test_ds, verbose=0)
(loss, accuracy)

(16.74186897277832, 0.659217894077301)