In [15]:
import urllib.request
import os
##下載鐵達泥旅客資料集
url='http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls'
filepath='./data/titanic3.xls'
#判斷是否有資料集
if not os.path.isfile(filepath):
    result = urllib.request.urlretrieve(url,filepath)
    print("download:",result)

In [16]:
import numpy
import pandas as pd 
all_df=pd.read_excel(filepath)
all_df[:2]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"


In [46]:
#把ticket cabin艙位號碼捨棄
cols=['survived','name','pclass','sex','age','sibsp','parch','fare','embarked']
all_df = all_df[cols]
df = all_df.drop(['name'],axis=1)
all_df.isnull().sum()


survived      0
name          0
pclass        0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64

In [47]:
age_mean = df['age'].mean()
df['age'] = df['age'].fillna(age_mean)
fare_mean = df['fare'].mean()
df['fare'] = df['fare'].fillna(fare_mean)
df['sex'] = df['sex'].map({'female':0,'male':1}).astype(int)
all_df[:2]

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked
0,1,"Allen, Miss. Elisabeth Walton",1,female,29.0,0,0,211.3375,S
1,1,"Allison, Master. Hudson Trevor",1,male,0.9167,1,2,151.55,S


In [51]:
x_OneHot_df = pd.get_dummies(data=df,columns=['embarked'])
x_OneHot_df[:10]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S
0,1,1,0,29.0,0,0,211.3375,0,0,1
1,1,1,1,0.9167,1,2,151.55,0,0,1
2,0,1,0,2.0,1,2,151.55,0,0,1
3,0,1,1,30.0,1,2,151.55,0,0,1
4,0,1,0,25.0,1,2,151.55,0,0,1
5,1,1,1,48.0,0,0,26.55,0,0,1
6,1,1,0,63.0,1,0,77.9583,0,0,1
7,0,1,1,39.0,0,0,0.0,0,0,1
8,1,1,0,53.0,2,0,51.4792,0,0,1
9,0,1,1,71.0,0,0,49.5042,1,0,0


In [64]:
ndarray = x_OneHot_df.values
ndarray.shape
ndarray[:2]

array([[   1.    ,    1.    ,    0.    ,   29.    ,    0.    ,    0.    ,
         211.3375,    0.    ,    0.    ,    1.    ],
       [   1.    ,    1.    ,    1.    ,    0.9167,    1.    ,    2.    ,
         151.55  ,    0.    ,    0.    ,    1.    ]])

In [65]:
#擷取features and label
Label = ndarray[:,0]#參數一 是取：所有筆數 參數二 欄位名稱
Features = ndarray[:,1:]

In [66]:
Label[:2]

array([ 1.,  1.])

In [67]:
Features[:2]

array([[   1.    ,    0.    ,   29.    ,    0.    ,    0.    ,  211.3375,
           0.    ,    0.    ,    1.    ],
       [   1.    ,    1.    ,    0.9167,    1.    ,    2.    ,  151.55  ,
           0.    ,    0.    ,    1.    ]])

In [74]:
from sklearn import preprocessing
#使用MinMaxScaler標準化刻度minmax_scale
minmax_scale = preprocessing.MinMaxScaler(feature_range = (0,1))
#使用minmax_scale.fit_transfrom進行標準化
scaledFeatures = minmax_scale.fit_transform(Features)
scaledFeatures[:2]

array([[ 0.        ,  0.        ,  0.36116884,  0.        ,  0.        ,
         0.41250333,  0.        ,  0.        ,  1.        ],
       [ 0.        ,  1.        ,  0.00939458,  0.125     ,  0.22222222,
         0.2958059 ,  0.        ,  0.        ,  1.        ]])

In [84]:
#將資料分為訓練與測試資料集
msk = numpy.random.rand(len(all_df)) < 0.8
train_df=all_df[msk]
test_df = all_df[~msk]

In [85]:
print('total:',len(all_df),
      'train:',len(train_df),
      'test:',len(test_df))

total: 1309 train: 1043 test: 266


In [92]:
#建立preprocessdata 函數進行資料的預處理
def PreprocessData(raw_df):
    df = raw_df.drop(['name'],axis=1)
    #檢查null值
    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean)
    fare_mean = df['fare'].mean()
    df['fare'] = df['fare'].fillna(fare_mean)
    df['sex'] = df['sex'].map({'female':0,'male':1}).astype(int)
    x_OneHot_df = pd.get_dummies(data= df,columns=['embarked'])
    
    ndarray = x_OneHot_df.values
    Features = ndarray[:,1:]
    Labels=ndarray[:,0]
    
    minmax_scale = preprocessing.MinMaxScaler(feature_range= (0,1))
    scaledFeatures = minmax_scale.fit_transform(Features)
    
    return scaledFeatures,Label

In [93]:
train_Features,train_Labels=PreprocessData(train_df)
test_Features,test_Labels = PreprocessData(test_df)

In [95]:
train_Labels[:2]

array([ 1.,  1.])

In [3]:
import numpy
import pandas as pd 
from sklearn import preprocessing
numpy.random.seed(10)
all_df=pd.read_excel('./data/titanic3.xls')
cols=['survived','name','pclass','sex','age','sibsp','parch','fare','embarked']
all_df = all_df[cols]
msk = numpy.random.rand(len(all_df)) < 0.8
train_df=all_df[msk]
test_df = all_df[~msk]
print('total:',len(all_df),
      'train:',len(train_df),
      'test:',len(test_df))
def PreprocessData(raw_df):
    df = raw_df.drop(['name'],axis=1)
    #檢查null值
    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean)
    fare_mean = df['fare'].mean()
    df['fare'] = df['fare'].fillna(fare_mean)
    df['sex'] = df['sex'].map({'female':0,'male':1}).astype(int)
    x_OneHot_df = pd.get_dummies(data= df,columns=['embarked'])
    
    ndarray = x_OneHot_df.values
    Features = ndarray[:,1:]
    Labels=ndarray[:,0]
    
    minmax_scale = preprocessing.MinMaxScaler(feature_range= (0,1))
    scaledFeatures = minmax_scale.fit_transform(Features)
    
    return scaledFeatures,Labels
train_Features,train_Labels=PreprocessData(train_df)
test_Features,test_Labels = PreprocessData(test_df)


total: 1309 train: 1034 test: 275


In [4]:
#建立模型
from keras.models import Sequential
from keras.layers import Dense,Dropout

Using TensorFlow backend.


In [5]:
model = Sequential()

##建立輸入層與隱藏層1

model.add(Dense(units = 40, #共有40個神經元
                input_dim=9,#輸入層9因為資料預處理有9個特徵值
                kernel_initializer = 'uniform',#使用uniform distribution 分佈亂數 初始化weight 與bias
                activation = 'relu'))
##建立隱藏層2

model.add(Dense(units = 30,
                kernel_initializer = 'uniform',
                activation = 'relu'))

#建立輸出層
model.add(Dense(units = 1,
                kernel_initializer = 'uniform',
                activation = 'sigmoid')) #定義激活函數 sigmoid

In [6]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 40)                400       
_________________________________________________________________
dense_2 (Dense)              (None, 30)                1230      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 31        
Total params: 1,661
Trainable params: 1,661
Non-trainable params: 0
_________________________________________________________________


In [7]:
#定義訓練方式
model.compile(loss='binary_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])

In [111]:
train_history = model.fit(x= train_Features,
                          y= train_Labels,
                          validation_split=0.1,
                          epochs = 30,
                          batch_size=30,
                          verbose=2)

ValueError: Input arrays should have the same number of samples as target arrays. Found 1034 input samples and 1309 target samples.