In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import keras

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold

In [25]:
df = pd.read_csv('../data/titanic.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
'''
 0   PassengerId  탑승자 일련번호  
 1   Survived     생존여부 0: 사망, 1: 생존 
 2   Pclass       티켓의 선실 등급 / 1등급, 2등급, 3등급
 3   Name         탑승자 이름
 4   Sex          탑승자 성별 
 5   Age          탑승자 나이
 6   SibSp        같이 탑승한 형제 자매 또는 배우자의 수 
 7   Parch        같이 탑승한 부모님 또는 어린아이 수  
 8   Ticket       티켓 일련 번호
 9   Fare         요금
 10  Cabin        선실번호
 11  Embarked     중간 정착항구
    
[딥러닝 실습]
1. 전처리 작업

 1) 11개의 독립변수 중 생존여부와 관련 없는 변수는 무엇인가?  ; 탑승자 일련번호 df.corr()에서
 
 2) 문자열 컬럼은 무엇인가? : 성별, 정박항구, cabin, name, ticket
 
 3) 결측치가 존재하는 컴럼은 어떤 값으로 대체하여야 하는가?
    Age: 평균값 처리
    cabin: 첫번째 문자만 남기고 나머지 문자 제거, 결측치 값은 N 값으로 설정
    Embarked: 결측치 값은 N 값을 설정
    
2. 훈련 데이터와 테스트 데이터 생성
3. 모델 생성 및 학습
4. 검증 정확도 80% 이상 확보

'''

In [26]:
new_column = ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Survived']
df = df[new_column]

In [28]:
df['Cabin'] = df['Cabin'].str[0]
df['Cabin'] = df['Cabin'].fillna('N') # cabin 결측치 처리
df['Cabin']

0      N
1      C
2      N
3      C
4      N
      ..
886    N
887    B
888    N
889    C
890    N
Name: Cabin, Length: 891, dtype: object

In [30]:
e = LabelEncoder()
e.fit(df['Sex'])
df['Sex'] = e.transform(df['Sex'])
e.fit(df['Embarked'])
df['Embarked'] = e.transform(df['Embarked'])
e.fit(df['Cabin'])
df['Cabin'] = e.transform(df['Cabin'])
df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,7,2,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,2,0,1
2,3,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,7,2,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,2,2,1
4,5,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,7,2,0


In [32]:
df['Age'] = df['Age'].fillna(df['Age'].mean())

In [35]:
df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,7,2,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,2,0,1
2,3,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,7,2,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,2,2,1
4,5,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,7,2,0


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    int64  
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        891 non-null    int32  
 10  Embarked     891 non-null    int64  
 11  Survived     891 non-null    int64  
dtypes: float64(2), int32(1), int64(7), object(2)
memory usage: 80.2+ KB


In [36]:
df.corr()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived
PassengerId,1.0,-0.035144,0.042939,0.033207,-0.057527,-0.001652,0.012658,-0.03308,0.013083,-0.005007
Pclass,-0.035144,1.0,0.1319,-0.331339,0.083081,0.018443,-0.5495,0.742093,0.157112,-0.338481
Sex,0.042939,0.1319,1.0,0.084153,-0.114631,-0.245489,-0.182333,0.118635,0.104057,-0.543351
Age,0.033207,-0.331339,0.084153,1.0,-0.232625,-0.179191,0.091566,-0.249098,-0.022239,-0.069809
SibSp,-0.057527,0.083081,-0.114631,-0.232625,1.0,0.414838,0.159651,0.041058,0.066654,-0.035322
Parch,-0.001652,0.018443,-0.245489,-0.179191,0.414838,1.0,0.216225,-0.031553,0.038322,0.081629
Fare,0.012658,-0.5495,-0.182333,0.091566,0.159651,0.216225,1.0,-0.525742,-0.221226,0.257307
Cabin,-0.03308,0.742093,0.118635,-0.249098,0.041058,-0.031553,-0.525742,1.0,0.191973,-0.295113
Embarked,0.013083,0.157112,0.104057,-0.022239,0.066654,0.038322,-0.221226,0.191973,1.0,-0.163517
Survived,-0.005007,-0.338481,-0.543351,-0.069809,-0.035322,0.081629,0.257307,-0.295113,-0.163517,1.0


In [37]:
new_column_arange = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch','Fare', 'Cabin', 'Embarked', 'Survived']
df = df[new_column_arange]
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived
0,3,1,22.0,1,0,7.25,7,2,0
1,1,0,38.0,1,0,71.2833,2,0,1
2,3,0,26.0,0,0,7.925,7,2,1
3,1,0,35.0,1,0,53.1,2,2,1
4,3,1,35.0,0,0,8.05,7,2,0


In [38]:
seed = 1234
np.random.seed(seed)
tf.random.set_seed(seed)

In [40]:
df.shape

(891, 9)

In [50]:
MODEL_DIR = './model_titanic/'
if not os.path.exists(MODEL_DIR):
    os.mkdir(MODEL_DIR)
modelpath = MODEL_DIR + "{epoch:02d}-{val_loss:.4f}.hdf5"

In [43]:
dataset = df.values
X = dataset[:, :8]
Y = dataset[:,8]

In [44]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size= 0.3, random_state=seed)

model = Sequential()
model.add(Dense(40, input_dim=8, activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [101]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [102]:
checkpoint = ModelCheckpoint(filepath=modelpath, monitor='val_loss', verbose=1, save_best_only=True)
stopping = EarlyStopping(monitor='val_loss', patience=200)

In [103]:
history = model.fit(X_train, Y_train, validation_split=0.3, epochs=1000, batch_size=20, callbacks=[stopping, checkpoint])

Epoch 1/1000

Epoch 00001: val_loss improved from inf to 0.94944, saving model to ./model_titanic\01-0.9494.hdf5
Epoch 2/1000

Epoch 00002: val_loss improved from 0.94944 to 0.70003, saving model to ./model_titanic\02-0.7000.hdf5
Epoch 3/1000

Epoch 00003: val_loss improved from 0.70003 to 0.64718, saving model to ./model_titanic\03-0.6472.hdf5
Epoch 4/1000

Epoch 00004: val_loss improved from 0.64718 to 0.62990, saving model to ./model_titanic\04-0.6299.hdf5
Epoch 5/1000

Epoch 00005: val_loss improved from 0.62990 to 0.62900, saving model to ./model_titanic\05-0.6290.hdf5
Epoch 6/1000

Epoch 00006: val_loss improved from 0.62900 to 0.59791, saving model to ./model_titanic\06-0.5979.hdf5
Epoch 7/1000

Epoch 00007: val_loss improved from 0.59791 to 0.58904, saving model to ./model_titanic\07-0.5890.hdf5
Epoch 8/1000

Epoch 00008: val_loss did not improve from 0.58904
Epoch 9/1000

Epoch 00009: val_loss improved from 0.58904 to 0.58525, saving model to ./model_titanic\09-0.5852.hdf5
Epo


Epoch 00038: val_loss did not improve from 0.47673
Epoch 39/1000

Epoch 00039: val_loss did not improve from 0.47673
Epoch 40/1000

Epoch 00040: val_loss did not improve from 0.47673
Epoch 41/1000

Epoch 00041: val_loss did not improve from 0.47673
Epoch 42/1000

Epoch 00042: val_loss did not improve from 0.47673
Epoch 43/1000

Epoch 00043: val_loss did not improve from 0.47673
Epoch 44/1000

Epoch 00044: val_loss did not improve from 0.47673
Epoch 45/1000

Epoch 00045: val_loss did not improve from 0.47673
Epoch 46/1000

Epoch 00046: val_loss did not improve from 0.47673
Epoch 47/1000

Epoch 00047: val_loss did not improve from 0.47673
Epoch 48/1000

Epoch 00048: val_loss did not improve from 0.47673
Epoch 49/1000

Epoch 00049: val_loss improved from 0.47673 to 0.47398, saving model to ./model_titanic\49-0.4740.hdf5
Epoch 50/1000

Epoch 00050: val_loss improved from 0.47398 to 0.46529, saving model to ./model_titanic\50-0.4653.hdf5
Epoch 51/1000

Epoch 00051: val_loss did not improve


Epoch 00079: val_loss did not improve from 0.45906
Epoch 80/1000

Epoch 00080: val_loss did not improve from 0.45906
Epoch 81/1000

Epoch 00081: val_loss did not improve from 0.45906
Epoch 82/1000

Epoch 00082: val_loss improved from 0.45906 to 0.45504, saving model to ./model_titanic\82-0.4550.hdf5
Epoch 83/1000

Epoch 00083: val_loss did not improve from 0.45504
Epoch 84/1000

Epoch 00084: val_loss did not improve from 0.45504
Epoch 85/1000

Epoch 00085: val_loss did not improve from 0.45504
Epoch 86/1000

Epoch 00086: val_loss did not improve from 0.45504
Epoch 87/1000

Epoch 00087: val_loss did not improve from 0.45504
Epoch 88/1000

Epoch 00088: val_loss did not improve from 0.45504
Epoch 89/1000

Epoch 00089: val_loss did not improve from 0.45504
Epoch 90/1000

Epoch 00090: val_loss did not improve from 0.45504
Epoch 91/1000

Epoch 00091: val_loss did not improve from 0.45504
Epoch 92/1000

Epoch 00092: val_loss improved from 0.45504 to 0.44985, saving model to ./model_titanic\9


Epoch 00120: val_loss did not improve from 0.44155
Epoch 121/1000

Epoch 00121: val_loss did not improve from 0.44155
Epoch 122/1000

Epoch 00122: val_loss did not improve from 0.44155
Epoch 123/1000

Epoch 00123: val_loss did not improve from 0.44155
Epoch 124/1000

Epoch 00124: val_loss did not improve from 0.44155
Epoch 125/1000

Epoch 00125: val_loss did not improve from 0.44155
Epoch 126/1000

Epoch 00126: val_loss did not improve from 0.44155
Epoch 127/1000

Epoch 00127: val_loss did not improve from 0.44155
Epoch 128/1000

Epoch 00128: val_loss did not improve from 0.44155
Epoch 129/1000

Epoch 00129: val_loss did not improve from 0.44155
Epoch 130/1000

Epoch 00130: val_loss did not improve from 0.44155
Epoch 131/1000

Epoch 00131: val_loss did not improve from 0.44155
Epoch 132/1000

Epoch 00132: val_loss did not improve from 0.44155
Epoch 133/1000

Epoch 00133: val_loss did not improve from 0.44155
Epoch 134/1000

Epoch 00134: val_loss did not improve from 0.44155
Epoch 135/


Epoch 00162: val_loss did not improve from 0.44155
Epoch 163/1000

Epoch 00163: val_loss did not improve from 0.44155
Epoch 164/1000

Epoch 00164: val_loss did not improve from 0.44155
Epoch 165/1000

Epoch 00165: val_loss did not improve from 0.44155
Epoch 166/1000

Epoch 00166: val_loss did not improve from 0.44155
Epoch 167/1000

Epoch 00167: val_loss did not improve from 0.44155
Epoch 168/1000

Epoch 00168: val_loss did not improve from 0.44155
Epoch 169/1000

Epoch 00169: val_loss did not improve from 0.44155
Epoch 170/1000

Epoch 00170: val_loss did not improve from 0.44155
Epoch 171/1000

Epoch 00171: val_loss did not improve from 0.44155
Epoch 172/1000

Epoch 00172: val_loss did not improve from 0.44155
Epoch 173/1000

Epoch 00173: val_loss did not improve from 0.44155
Epoch 174/1000

Epoch 00174: val_loss did not improve from 0.44155
Epoch 175/1000

Epoch 00175: val_loss did not improve from 0.44155
Epoch 176/1000

Epoch 00176: val_loss did not improve from 0.44155
Epoch 177/


Epoch 00203: val_loss did not improve from 0.43917
Epoch 204/1000

Epoch 00204: val_loss did not improve from 0.43917
Epoch 205/1000

Epoch 00205: val_loss did not improve from 0.43917
Epoch 206/1000

Epoch 00206: val_loss did not improve from 0.43917
Epoch 207/1000

Epoch 00207: val_loss did not improve from 0.43917
Epoch 208/1000

Epoch 00208: val_loss did not improve from 0.43917
Epoch 209/1000

Epoch 00209: val_loss did not improve from 0.43917
Epoch 210/1000

Epoch 00210: val_loss did not improve from 0.43917
Epoch 211/1000

Epoch 00211: val_loss did not improve from 0.43917
Epoch 212/1000

Epoch 00212: val_loss did not improve from 0.43917
Epoch 213/1000

Epoch 00213: val_loss did not improve from 0.43917
Epoch 214/1000

Epoch 00214: val_loss did not improve from 0.43917
Epoch 215/1000

Epoch 00215: val_loss did not improve from 0.43917
Epoch 216/1000

Epoch 00216: val_loss did not improve from 0.43917
Epoch 217/1000

Epoch 00217: val_loss did not improve from 0.43917
Epoch 218/


Epoch 00245: val_loss did not improve from 0.43917
Epoch 246/1000

Epoch 00246: val_loss did not improve from 0.43917
Epoch 247/1000

Epoch 00247: val_loss did not improve from 0.43917
Epoch 248/1000

Epoch 00248: val_loss did not improve from 0.43917
Epoch 249/1000

Epoch 00249: val_loss did not improve from 0.43917
Epoch 250/1000

Epoch 00250: val_loss did not improve from 0.43917
Epoch 251/1000

Epoch 00251: val_loss did not improve from 0.43917
Epoch 252/1000

Epoch 00252: val_loss did not improve from 0.43917
Epoch 253/1000

Epoch 00253: val_loss did not improve from 0.43917
Epoch 254/1000

Epoch 00254: val_loss did not improve from 0.43917
Epoch 255/1000

Epoch 00255: val_loss did not improve from 0.43917
Epoch 256/1000

Epoch 00256: val_loss did not improve from 0.43917
Epoch 257/1000

Epoch 00257: val_loss did not improve from 0.43917
Epoch 258/1000

Epoch 00258: val_loss did not improve from 0.43917
Epoch 259/1000

Epoch 00259: val_loss did not improve from 0.43917
Epoch 260/


Epoch 00287: val_loss did not improve from 0.43917
Epoch 288/1000

Epoch 00288: val_loss did not improve from 0.43917
Epoch 289/1000

Epoch 00289: val_loss did not improve from 0.43917
Epoch 290/1000

Epoch 00290: val_loss did not improve from 0.43917
Epoch 291/1000

Epoch 00291: val_loss did not improve from 0.43917
Epoch 292/1000

Epoch 00292: val_loss did not improve from 0.43917
Epoch 293/1000

Epoch 00293: val_loss did not improve from 0.43917
Epoch 294/1000

Epoch 00294: val_loss did not improve from 0.43917
Epoch 295/1000

Epoch 00295: val_loss did not improve from 0.43917
Epoch 296/1000

Epoch 00296: val_loss did not improve from 0.43917
Epoch 297/1000

Epoch 00297: val_loss did not improve from 0.43917
Epoch 298/1000

Epoch 00298: val_loss did not improve from 0.43917
Epoch 299/1000

Epoch 00299: val_loss did not improve from 0.43917
Epoch 300/1000

Epoch 00300: val_loss did not improve from 0.43917
Epoch 301/1000

Epoch 00301: val_loss did not improve from 0.43917
Epoch 302/


Epoch 00329: val_loss did not improve from 0.43917
Epoch 330/1000

Epoch 00330: val_loss did not improve from 0.43917
Epoch 331/1000

Epoch 00331: val_loss did not improve from 0.43917
Epoch 332/1000

Epoch 00332: val_loss did not improve from 0.43917
Epoch 333/1000

Epoch 00333: val_loss did not improve from 0.43917
Epoch 334/1000

Epoch 00334: val_loss did not improve from 0.43917
Epoch 335/1000

Epoch 00335: val_loss did not improve from 0.43917
Epoch 336/1000

Epoch 00336: val_loss did not improve from 0.43917
Epoch 337/1000

Epoch 00337: val_loss did not improve from 0.43917
Epoch 338/1000

Epoch 00338: val_loss did not improve from 0.43917
Epoch 339/1000

Epoch 00339: val_loss did not improve from 0.43917
Epoch 340/1000

Epoch 00340: val_loss did not improve from 0.43917
Epoch 341/1000

Epoch 00341: val_loss did not improve from 0.43917
Epoch 342/1000

Epoch 00342: val_loss did not improve from 0.43917
Epoch 343/1000

Epoch 00343: val_loss did not improve from 0.43917
Epoch 344/


Epoch 00371: val_loss did not improve from 0.43917
Epoch 372/1000

Epoch 00372: val_loss did not improve from 0.43917
Epoch 373/1000

Epoch 00373: val_loss did not improve from 0.43917
Epoch 374/1000

Epoch 00374: val_loss did not improve from 0.43917
Epoch 375/1000

Epoch 00375: val_loss did not improve from 0.43917
Epoch 376/1000

Epoch 00376: val_loss did not improve from 0.43917
Epoch 377/1000

Epoch 00377: val_loss did not improve from 0.43917
Epoch 378/1000

Epoch 00378: val_loss did not improve from 0.43917


In [99]:
best_model = load_model('./model_titanic/112-0.4492.hdf5')
result = best_model.evaluate(X_test,Y_test, verbose=0)
print('test loss: {:.4f}, test_accuracy: {:.4f}'.format(result[0], result[1]))

test loss: 0.5426, test_accuracy: 0.7799


In [None]:
## 1. 컬럼 삭제
titanic_df = pd.read_csv('../data/titanic.csv')
titanic_df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
titanic_df.info()

In [106]:
# 2. 결측치 처리하기 
titanic_df['Age'].fillna(titanic_df['Age'].mean(), inplace=True)
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Cabin     204 non-null    object 
 8   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [107]:
#3 결측치 처리하기 Cabin
titanic_df['Cabin'].fillna('N', inplace=True)

titanic_df['Cabin'] = titani_df['Cabin'].str[:1fillna('N', inplace=True)
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Cabin     891 non-null    object 
 8   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [None]:
#4 결측치 처리하기 Cabin
