In [28]:
import os

import numpy as np
import pandas as pd
import tensorflow.keras as keras
from sklearn.model_selection import LeavePOut, train_test_split

# Locations of the training and test CSV files, resolved relative to the
# current working directory. os.path.join picks the separator of the host OS,
# so the paths are valid on Windows as well as on POSIX systems.
DIR_TRAIN = os.path.join(os.getcwd(), "data", "train.csv")
DIR_TEST = os.path.join(os.getcwd(), "data", "test.csv")

# Column names, in file order, for the two CSV files. The training file has
# one extra column ('Survived', the target) that the test file lacks.
test_names = ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
train_names = ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin',
               'Embarked']

In [42]:
def model():
    '''
    Define the model.

    Stacks four ReLU Dense layers (32, 64, 128 and 32 units), each followed by
    Dropout(0.2), and finishes with a single sigmoid unit. The network is
    compiled with RMSprop, binary cross-entropy loss and the 'acc' metric.
    :return: keras model
    '''
    hidden_units = (32, 64, 128, 32)
    stack = []
    for units in hidden_units:
        stack.append(keras.layers.Dense(units, activation='relu'))
        stack.append(keras.layers.Dropout(0.2))
    # Single sigmoid output for the binary target.
    stack.append(keras.layers.Dense(1, activation='sigmoid'))

    network = keras.models.Sequential(stack)
    network.compile(
        optimizer=keras.optimizers.RMSprop(),
        loss=keras.losses.binary_crossentropy,
        metrics=['acc'],
    )
    return network

In [43]:
# Build and compile the network once; later cells train this instance.
keras_model = model()

In [4]:
def loadData(is_train, dir):
    """Read one of the CSV files and drop the columns that are not used.

    :param is_train: True to apply the training column names (which include
        'Survived'), False to apply the test column names.
    :param dir: path of the CSV file to read.
    :return: DataFrame without the 'Cabin', 'Name' and 'Ticket' columns.
    """
    column_names = train_names if is_train else test_names
    # header=0 consumes the file's own header row; `names` then replaces it.
    frame = pd.read_csv(dir, header=0, names=column_names)
    for unused in ('Cabin', 'Name', 'Ticket'):
        del frame[unused]
    return frame

In [5]:
def progressData(data):
    """Fill in missing values and return the completed frame.

    'Age' and 'Fare' gaps are replaced by that column's mean rounded to one
    decimal. Any value still missing afterwards is forward-filled from the
    previous row, so a gap in the very first row stays missing.

    :param data: DataFrame containing at least 'Age' and 'Fare' columns.
    :return: a new DataFrame; the input frame is not modified.
    """
    # Take the mean of each column directly. Calling mean() on the whole
    # frame fails on pandas >= 2.0 while string columns are still present.
    mean_age = round(data['Age'].mean(), 1)
    mean_fare = round(data['Fare'].mean(), 1)
    filled = data.fillna({'Age': mean_age, 'Fare': mean_fare})
    # .ffill() replaces the deprecated fillna(method='ffill') spelling.
    return filled.ffill()

In [29]:
def splitData(datas, labels, splite):
    """Split features and labels into train and test parts.

    :param datas: feature rows.
    :param labels: targets aligned with `datas`.
    :param splite: value forwarded as `test_size` to train_test_split.
    :return: whatever train_test_split returns for (datas, labels); the
        random_state is fixed at 42 so the split is repeatable.
    """
    split_options = {'test_size': splite, 'random_state': 42}
    return train_test_split(datas, labels, **split_options)

In [7]:
def generator(data, lables, batch_size):
    """Yield (features, labels) mini-batches in a shuffled order, forever.

    The row order is shuffled once with NumPy's global random state and cut
    into consecutive batches of `batch_size` rows; the last batch may be
    shorter. The same batches are then replayed in the same order on every
    pass.

    Rows are selected by position (`iloc`), so the frame may carry any index,
    for example the shuffled labels left behind by a train/test split.

    :param data: DataFrame of feature rows.
    :param lables: Series of targets, positionally aligned with `data`.
    :param batch_size: positive number of rows per batch.
    :return: generator of (DataFrame, Series) tuples. It finishes immediately
        when `data` is empty instead of yielding empty batches forever.
    :raises ValueError: on first iteration, if `batch_size` is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    n_rows = len(data)
    order = np.arange(n_rows)
    np.random.shuffle(order)
    # Stepping by batch_size never produces an empty trailing slice, unlike
    # the previous int(n / batch_size + 1) batch count when n was an exact
    # multiple of batch_size.
    batches = [order[start:start + batch_size] for start in range(0, n_rows, batch_size)]
    if not batches:
        return

    while True:
        for rows in batches:
            yield (data.iloc[rows], lables.iloc[rows])

In [8]:
# Load the training CSV; loadData already drops Cabin, Name and Ticket.
data_train = loadData(is_train=True, dir=DIR_TRAIN)
# Detach the target column so data_train holds feature columns only.
lables = data_train.pop('Survived')

In [9]:
# Load the test CSV, which has no 'Survived' column.
data_test = loadData(is_train=False,dir=DIR_TEST)

In [10]:
# Stack the training and test rows into one frame so the later imputation
# and scaling cells treat both sets the same way.
print(data_train.shape)
print(data_test.shape)
# DataFrame.append was removed in pandas 2.0. pd.concat is the replacement
# and, like append, keeps each frame's original row labels, so labels repeat.
data = pd.concat([data_train, data_test])
print(data.shape)

(891, 8)
(418, 8)
(1309, 8)


In [11]:
# Preview the stacked frame before any imputation or encoding.
data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,male,22.0,1,0,7.25,S
1,2,1,female,38.0,1,0,71.2833,C
2,3,3,female,26.0,0,0,7.925,S
3,4,1,female,35.0,1,0,53.1,S
4,5,3,male,35.0,0,0,8.05,S


In [12]:
# Impute missing values; `data` is rebound to the completed frame.
data = progressData(data)
data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,male,22.0,1,0,7.25,S
1,2,1,female,38.0,1,0,71.2833,C
2,3,3,female,26.0,0,0,7.925,S
3,4,1,female,35.0,1,0,53.1,S
4,5,3,male,35.0,0,0,8.05,S


### 数据处理
- Sex 字段:male->1,female->0
- Embarked 字段:S->0,C->1,Q->2
- Age 字段缩放:除以 100
- Fare 字段标准化:(x - mean) / std

In [13]:
# Integer codes that replace the categorical strings in the 'Sex' and
# 'Embarked' columns.
sex = dict(male=1, female=0)
embarked = dict(S=0, C=1, Q=2)

In [14]:
def mean_std(name_column, data_set):
    """Print and return the mean and standard deviation of one column.

    The statistics are taken from the selected column only, so the frame may
    still contain non-numeric columns. Reducing the whole frame first raises
    TypeError on pandas >= 2.0 when string columns are present.

    :param name_column: name of the numeric column to summarise.
    :param data_set: DataFrame containing that column.
    :return: (mean, std) tuple, where std is pandas' default sample standard
        deviation.
    """
    column = data_set[name_column]
    mean, std = column.mean(), column.std()
    print(mean, std)
    return mean, std

In [15]:
# Fare statistics over the combined train+test frame, used to standardise Fare below.
mean_fare,std_fare = mean_std('Fare',data)

33.295482734912184 51.738879032622215


In [16]:
# Replace the 'male'/'female' strings with their integer codes from `sex`.
# Not idempotent: once the column holds integers, re-running this cell raises
# KeyError because the codes themselves are not keys of `sex`.
data['Sex'] = data['Sex'].map(lambda x:sex[x])
data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,1,22.0,1,0,7.25,S
1,2,1,0,38.0,1,0,71.2833,C
2,3,3,0,26.0,0,0,7.925,S
3,4,1,0,35.0,1,0,53.1,S
4,5,3,1,35.0,0,0,8.05,S


In [17]:
# Replace the port letters with their integer codes from `embarked`.
# Any value that is not a key of `embarked` (including a still-missing value)
# raises KeyError, and so does re-running the cell after it has succeeded.
data['Embarked'] = data['Embarked'].map(lambda x:embarked[x])
data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,1,22.0,1,0,7.25,0
1,2,1,0,38.0,1,0,71.2833,1
2,3,3,0,26.0,0,0,7.925,0
3,4,1,0,35.0,1,0,53.1,0
4,5,3,1,35.0,0,0,8.05,0


In [18]:
# Scale ages by a fixed factor of 100 (e.g. 22.0 becomes 0.22).
# Re-running this cell divides the column again.
data['Age'] = data['Age'] / 100.0

In [19]:
# Check the rescaled Age column.
data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,1,0.22,1,0,7.25,0
1,2,1,0,0.38,1,0,71.2833,1
2,3,3,0,0.26,0,0,7.925,0
3,4,1,0,0.35,1,0,53.1,0
4,5,3,1,0.35,0,0,8.05,0


In [20]:
# Standardise Fare with the mean/std computed earlier: (x - mean) / std.
# Re-running this cell standardises the already-standardised values again.
data['Fare'] = (data['Fare'] - mean_fare) / std_fare
data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,1,0.22,1,0,-0.503403,0
1,2,1,0,0.38,1,0,0.734222,1
2,3,3,0,0.26,0,0,-0.490356,0
3,4,1,0,0.35,1,0,0.382778,0
4,5,3,1,0.35,0,0,-0.48794,0


In [21]:
# Undo the earlier stacking: the training rows were placed first, so the
# boundary is the length of the training frame rather than a hard-coded count.
n_train = len(data_train)
train = data.iloc[:n_train]
# NOTE(review): `eval` shadows the Python builtin of the same name; the name
# is kept here because other cells may refer to it.
eval = data.iloc[n_train:]
eval.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,1,0.345,0,0,-0.492208,2
1,893,3,0,0.47,1,0,-0.508234,0
2,894,2,1,0.62,0,0,-0.456291,2
3,895,3,1,0.27,0,0,-0.476102,0
4,896,3,0,0.22,1,1,-0.406039,0


In [22]:
# 20% of the training rows, truncated to an integer.
p =int( train.shape[0]*0.2)
p

178

In [23]:
# LeavePOut comes from sklearn.model_selection and must be imported in the
# imports cell, otherwise this line is a NameError on a fresh kernel.
# NOTE(review): leave-p-out enumerates every distinct test subset of size p,
# which is infeasible for p in the hundreds, and `loo` is not used by any
# cell visible here. Confirm whether this cell is still needed.
loo = LeavePOut(p=p)

In [25]:
# IPython source lookup for keras_model.fit: a development aid that is not needed for a clean run.
??keras_model.fit

In [30]:
# Hold out 20% of the labelled rows for validation (fixed seed inside splitData).
# NOTE(review): `train` still carries the PassengerId column, so the row
# identifier is fed to the network as a feature. Confirm this is intended.
x_train,x_test,y_train,y_test = splitData(train,lables,0.2)

In [36]:
# Inspect the training labels as a NumPy array.
y_train.values

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0,

In [44]:
# Train for 20 epochs with mini-batches of 128 rows, validating on the
# held-out split after every epoch. validation_data is passed as an (x, y)
# tuple, the form documented by Keras; a list can be taken for a multi-input
# x by some versions.
history = keras_model.fit(
    x_train.values,
    y_train.values,
    batch_size=128,
    validation_data=(x_test.values, y_test.values),
    epochs=20,
)

Train on 712 samples, validate on 179 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
