## 載入需要的模組

In [1]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

import pandas as pd
import numpy as np
import math

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## 讀入資料

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submit = pd.read_csv('gender_submission.csv')

## 先看一下資料內容

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 看看缺失值的狀況

In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


## 看看資料的分布狀況

In [6]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


## 從上面觀察的結果，選取自己想要的特徵

In [8]:
selected_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex', 'Embarked']

In [9]:
X_train = train[selected_features]

In [10]:
y_train = train['Survived']

In [11]:
X_test = test[selected_features]

## 處理缺失值

In [12]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass      891 non-null int64
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Sex         891 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(3), object(2)
memory usage: 48.8+ KB


In [13]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null int64
Age         332 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        417 non-null float64
Sex         418 non-null object
Embarked    418 non-null object
dtypes: float64(2), int64(3), object(2)
memory usage: 22.9+ KB


In [14]:
X_train['Age'].fillna(X_train['Age'].mean(), inplace=True)
X_train['Embarked'].fillna('S', inplace=True)
X_test['Age'].fillna(X_test['Age'].mean(), inplace=True)
X_test['Fare'].fillna(X_test['Fare'].mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [15]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass      891 non-null int64
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Sex         891 non-null object
Embarked    891 non-null object
dtypes: float64(2), int64(3), object(2)
memory usage: 48.8+ KB


In [16]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass      418 non-null int64
Age         418 non-null float64
SibSp       418 non-null int64
Parch       418 non-null int64
Fare        418 non-null float64
Sex         418 non-null object
Embarked    418 non-null object
dtypes: float64(2), int64(3), object(2)
memory usage: 22.9+ KB


## 將字串類型的類別資料做轉換

In [17]:
X_train.Embarked = X_train.Embarked.map({'S': 1, 'C': 2, 'Q':3})
X_test.Embarked = X_test.Embarked.map({'S': 1, 'C': 2, 'Q':3})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [18]:
X_train.Sex = X_train.Sex.map({'female': 0, 'male': 1})
X_test.Sex = X_test.Sex.map({'female': 0, 'male': 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [19]:
age = np.zeros(len(X_train))
age[X_train['Age']<20] = 1
age[(X_train['Age']>=20)&(X_train['Age']<65)] = 2
age[(X_train['Age']>=65)] = 3
X_train['Age'] = age

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [20]:
age = np.zeros(len(X_test))
age[X_test['Age']<20] = 1
age[(X_test['Age']>=20)&(X_test['Age']<65)] = 2
age[(X_test['Age']>=65)] = 3
X_test['Age'] = age

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [21]:
fare = np.zeros(len(X_train))
fare[X_train['Fare']<8] = 1
fare[(X_train['Fare']>=8)&(X_train['Fare']<14)] = 2
fare[(X_train['Fare']>=14)&(X_train['Fare']<31)] = 3
fare[(X_train['Fare']>=31)] = 4
X_train['Fare'] = fare

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [22]:
fare = np.zeros(len(X_test))
fare[X_test['Fare']<8] = 1
fare[(X_test['Fare']>=8)&(X_test['Fare']<14)] = 2
fare[(X_test['Fare']>=14)&(X_test['Fare']<31)] = 3
fare[(X_test['Fare']>=31)] = 4
X_test['Fare'] = fare

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [23]:
X_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex,Embarked
0,3,2.0,1,0,1.0,1,1
1,1,2.0,1,0,4.0,0,2
2,3,2.0,0,0,1.0,0,1
3,1,2.0,1,0,4.0,0,1
4,3,2.0,0,0,2.0,1,1


## 建立keras sequential模型

In [24]:
model = Sequential()

### 建立輸入層和隱藏層1

In [25]:
model.add(Dense(units = 40, input_dim = 7, 
                kernel_initializer = 'uniform', 
                bias_initializer='zeros', 
                activation = 'relu'))

### 建立隱藏層2

In [26]:
model.add(Dense(units = 30, 
                kernel_initializer = 'uniform',
                bias_initializer='zeros', 
                activation = 'relu'))

###  建立輸出層

In [27]:
model.add(Dense(units = 1, 
                kernel_initializer = 'uniform',
                activation = 'sigmoid'))

## 訓練模型

### 定義訓練方式、訓練模型

In [28]:
model.compile(loss = 'binary_crossentropy', 
              optimizer = 'adam', 
              metrics=['accuracy'])

In [29]:
model.fit(X_train, y_train, validation_split=0.1, epochs=500, batch_size=30, verbose=2)

Train on 801 samples, validate on 90 samples
Epoch 1/500
 - 0s - loss: 0.6897 - acc: 0.6117 - val_loss: 0.6808 - val_acc: 0.6222
Epoch 2/500
 - 0s - loss: 0.6703 - acc: 0.6155 - val_loss: 0.6470 - val_acc: 0.6222
Epoch 3/500
 - 0s - loss: 0.6377 - acc: 0.6155 - val_loss: 0.6003 - val_acc: 0.6222
Epoch 4/500
 - 0s - loss: 0.5957 - acc: 0.6704 - val_loss: 0.5339 - val_acc: 0.7889
Epoch 5/500
 - 0s - loss: 0.5484 - acc: 0.7353 - val_loss: 0.4789 - val_acc: 0.7889
Epoch 6/500
 - 0s - loss: 0.5142 - acc: 0.7516 - val_loss: 0.4485 - val_acc: 0.8000
Epoch 7/500
 - 0s - loss: 0.4920 - acc: 0.7728 - val_loss: 0.4363 - val_acc: 0.8111
Epoch 8/500
 - 0s - loss: 0.4795 - acc: 0.8002 - val_loss: 0.4300 - val_acc: 0.8111
Epoch 9/500
 - 0s - loss: 0.4844 - acc: 0.7865 - val_loss: 0.4429 - val_acc: 0.8222
Epoch 10/500
 - 0s - loss: 0.4695 - acc: 0.7915 - val_loss: 0.4275 - val_acc: 0.8111
Epoch 11/500
 - 0s - loss: 0.4656 - acc: 0.7990 - val_loss: 0.4243 - val_acc: 0.8111
Epoch 12/500
 - 0s - loss: 0.

Epoch 97/500
 - 0s - loss: 0.4397 - acc: 0.8090 - val_loss: 0.3993 - val_acc: 0.8222
Epoch 98/500
 - 0s - loss: 0.4351 - acc: 0.8040 - val_loss: 0.3992 - val_acc: 0.8111
Epoch 99/500
 - 0s - loss: 0.4399 - acc: 0.8002 - val_loss: 0.3973 - val_acc: 0.8333
Epoch 100/500
 - 0s - loss: 0.4389 - acc: 0.8090 - val_loss: 0.3973 - val_acc: 0.8333
Epoch 101/500
 - 0s - loss: 0.4349 - acc: 0.8115 - val_loss: 0.3996 - val_acc: 0.8111
Epoch 102/500
 - 0s - loss: 0.4350 - acc: 0.8090 - val_loss: 0.4034 - val_acc: 0.8222
Epoch 103/500
 - 0s - loss: 0.4426 - acc: 0.8015 - val_loss: 0.3989 - val_acc: 0.8111
Epoch 104/500
 - 0s - loss: 0.4410 - acc: 0.8040 - val_loss: 0.4012 - val_acc: 0.8111
Epoch 105/500
 - 0s - loss: 0.4341 - acc: 0.8077 - val_loss: 0.3982 - val_acc: 0.8222
Epoch 106/500
 - 0s - loss: 0.4357 - acc: 0.8052 - val_loss: 0.3955 - val_acc: 0.8333
Epoch 107/500
 - 0s - loss: 0.4341 - acc: 0.8115 - val_loss: 0.3984 - val_acc: 0.8222
Epoch 108/500
 - 0s - loss: 0.4329 - acc: 0.8102 - val_lo

Epoch 193/500
 - 0s - loss: 0.4298 - acc: 0.8127 - val_loss: 0.3821 - val_acc: 0.8556
Epoch 194/500
 - 0s - loss: 0.4255 - acc: 0.8115 - val_loss: 0.3832 - val_acc: 0.8556
Epoch 195/500
 - 0s - loss: 0.4255 - acc: 0.8127 - val_loss: 0.3847 - val_acc: 0.8556
Epoch 196/500
 - 0s - loss: 0.4227 - acc: 0.8190 - val_loss: 0.3888 - val_acc: 0.8222
Epoch 197/500
 - 0s - loss: 0.4250 - acc: 0.8177 - val_loss: 0.3849 - val_acc: 0.8444
Epoch 198/500
 - 0s - loss: 0.4267 - acc: 0.8115 - val_loss: 0.3846 - val_acc: 0.8333
Epoch 199/500
 - 0s - loss: 0.4302 - acc: 0.8215 - val_loss: 0.3905 - val_acc: 0.8222
Epoch 200/500
 - 0s - loss: 0.4268 - acc: 0.8165 - val_loss: 0.3853 - val_acc: 0.8444
Epoch 201/500
 - 0s - loss: 0.4243 - acc: 0.8165 - val_loss: 0.3826 - val_acc: 0.8444
Epoch 202/500
 - 0s - loss: 0.4236 - acc: 0.8140 - val_loss: 0.3813 - val_acc: 0.8333
Epoch 203/500
 - 0s - loss: 0.4248 - acc: 0.8190 - val_loss: 0.3862 - val_acc: 0.8222
Epoch 204/500
 - 0s - loss: 0.4214 - acc: 0.8227 - val

Epoch 289/500
 - 0s - loss: 0.4132 - acc: 0.8252 - val_loss: 0.3698 - val_acc: 0.8333
Epoch 290/500
 - 0s - loss: 0.4182 - acc: 0.8215 - val_loss: 0.3737 - val_acc: 0.8444
Epoch 291/500
 - 0s - loss: 0.4114 - acc: 0.8240 - val_loss: 0.3735 - val_acc: 0.8444
Epoch 292/500
 - 0s - loss: 0.4231 - acc: 0.8127 - val_loss: 0.3810 - val_acc: 0.8222
Epoch 293/500
 - 0s - loss: 0.4156 - acc: 0.8252 - val_loss: 0.3676 - val_acc: 0.8444
Epoch 294/500
 - 0s - loss: 0.4109 - acc: 0.8240 - val_loss: 0.3691 - val_acc: 0.8444
Epoch 295/500
 - 0s - loss: 0.4111 - acc: 0.8227 - val_loss: 0.3733 - val_acc: 0.8333
Epoch 296/500
 - 0s - loss: 0.4116 - acc: 0.8252 - val_loss: 0.3681 - val_acc: 0.8444
Epoch 297/500
 - 0s - loss: 0.4134 - acc: 0.8265 - val_loss: 0.3748 - val_acc: 0.8333
Epoch 298/500
 - 0s - loss: 0.4110 - acc: 0.8227 - val_loss: 0.3699 - val_acc: 0.8444
Epoch 299/500
 - 0s - loss: 0.4113 - acc: 0.8240 - val_loss: 0.3690 - val_acc: 0.8444
Epoch 300/500
 - 0s - loss: 0.4141 - acc: 0.8165 - val

Epoch 385/500
 - 0s - loss: 0.4006 - acc: 0.8377 - val_loss: 0.3621 - val_acc: 0.8556
Epoch 386/500
 - 0s - loss: 0.4062 - acc: 0.8252 - val_loss: 0.3592 - val_acc: 0.8667
Epoch 387/500
 - 0s - loss: 0.4043 - acc: 0.8290 - val_loss: 0.3624 - val_acc: 0.8556
Epoch 388/500
 - 0s - loss: 0.4045 - acc: 0.8252 - val_loss: 0.3583 - val_acc: 0.8667
Epoch 389/500
 - 0s - loss: 0.4026 - acc: 0.8215 - val_loss: 0.3578 - val_acc: 0.8333
Epoch 390/500
 - 0s - loss: 0.4039 - acc: 0.8240 - val_loss: 0.3549 - val_acc: 0.8333
Epoch 391/500
 - 0s - loss: 0.4054 - acc: 0.8352 - val_loss: 0.3578 - val_acc: 0.8444
Epoch 392/500
 - 0s - loss: 0.4032 - acc: 0.8340 - val_loss: 0.3568 - val_acc: 0.8444
Epoch 393/500
 - 0s - loss: 0.4002 - acc: 0.8327 - val_loss: 0.3552 - val_acc: 0.8444
Epoch 394/500
 - 0s - loss: 0.4020 - acc: 0.8265 - val_loss: 0.3573 - val_acc: 0.8444
Epoch 395/500
 - 0s - loss: 0.4028 - acc: 0.8365 - val_loss: 0.3590 - val_acc: 0.8556
Epoch 396/500
 - 0s - loss: 0.4028 - acc: 0.8265 - val

Epoch 481/500
 - 0s - loss: 0.3980 - acc: 0.8252 - val_loss: 0.3558 - val_acc: 0.8556
Epoch 482/500
 - 0s - loss: 0.3952 - acc: 0.8439 - val_loss: 0.3503 - val_acc: 0.8556
Epoch 483/500
 - 0s - loss: 0.3976 - acc: 0.8365 - val_loss: 0.3587 - val_acc: 0.8556
Epoch 484/500
 - 0s - loss: 0.3959 - acc: 0.8315 - val_loss: 0.3509 - val_acc: 0.8444
Epoch 485/500
 - 0s - loss: 0.3948 - acc: 0.8390 - val_loss: 0.3522 - val_acc: 0.8667
Epoch 486/500
 - 0s - loss: 0.3937 - acc: 0.8340 - val_loss: 0.3544 - val_acc: 0.8667
Epoch 487/500
 - 0s - loss: 0.4007 - acc: 0.8277 - val_loss: 0.3551 - val_acc: 0.8556
Epoch 488/500
 - 0s - loss: 0.3927 - acc: 0.8414 - val_loss: 0.3554 - val_acc: 0.8667
Epoch 489/500
 - 0s - loss: 0.3975 - acc: 0.8327 - val_loss: 0.3478 - val_acc: 0.8444
Epoch 490/500
 - 0s - loss: 0.3924 - acc: 0.8427 - val_loss: 0.3508 - val_acc: 0.8556
Epoch 491/500
 - 0s - loss: 0.3958 - acc: 0.8340 - val_loss: 0.3528 - val_acc: 0.8667
Epoch 492/500
 - 0s - loss: 0.3954 - acc: 0.8352 - val

<keras.callbacks.History at 0x1c728ce3240>

In [30]:
scores = model.evaluate(X_train, y_train)
scores[1]



0.8428731757382351

## 用訓練好的模型來做預測

In [31]:
survived_predict = model.predict(X_test)
survived_predict = np.array(survived_predict)
survived_predict = np.round(survived_predict)
survived_predict

array([[0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],

## 將結果輸出成規定的格式

In [32]:
submission=pd.DataFrame({'PassengerId':test['PassengerId'], 'Survived':survived_predict[0:418,0]})

In [33]:
submission.to_csv('submission.csv', index=False)