### 下載鐵達尼號旅客資料集

In [14]:
import urllib.request
import os

In [15]:
# Download the Titanic passenger spreadsheet once and keep a local copy.
url = r"http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls"
filepath = r'C:\PyCodes\Keras\File\titanic3.xls'
if not os.path.isfile(filepath):
    # titanic3.xls does not exist yet. urlretrieve opens the target file
    # directly and does not create missing folders, so make sure the
    # containing directory is there before downloading.
    target_dir = os.path.dirname(filepath)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    result = urllib.request.urlretrieve(url, filepath)
    print('downloaded:', result)

### 使用Pandas dataframe讀取資料並進行處理

In [16]:
import numpy
import pandas as pd

In [17]:
# Load the downloaded spreadsheet into a DataFrame
all_df = pd.read_excel(filepath)

In [18]:
all_df[:10]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
5,1,1,"Anderson, Mr. Harry",male,48.0,0,0,19952,26.55,E12,S,3,,"New York, NY"
6,1,1,"Andrews, Miss. Kornelia Theodosia",female,63.0,1,0,13502,77.9583,D7,S,10,,"Hudson, NY"
7,1,0,"Andrews, Mr. Thomas Jr",male,39.0,0,0,112050,0.0,A36,S,,,"Belfast, NI"
8,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53.0,2,0,11769,51.4792,C101,S,D,,"Bayside, Queens, NY"
9,1,0,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay"


In [19]:
# Keep the label ('survived') plus the fields used further down;
# ticket, cabin, boat, body and home.dest are left out.
columns = [
    'survived', 'name', 'pclass', 'sex', 'age',
    'sibsp', 'parch', 'fare', 'embarked',
]
all_df = all_df[columns]

In [20]:
all_df[:10]

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked
0,1,"Allen, Miss. Elisabeth Walton",1,female,29.0,0,0,211.3375,S
1,1,"Allison, Master. Hudson Trevor",1,male,0.9167,1,2,151.55,S
2,0,"Allison, Miss. Helen Loraine",1,female,2.0,1,2,151.55,S
3,0,"Allison, Mr. Hudson Joshua Creighton",1,male,30.0,1,2,151.55,S
4,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",1,female,25.0,1,2,151.55,S
5,1,"Anderson, Mr. Harry",1,male,48.0,0,0,26.55,S
6,1,"Andrews, Miss. Kornelia Theodosia",1,female,63.0,1,0,77.9583,S
7,0,"Andrews, Mr. Thomas Jr",1,male,39.0,0,0,0.0,S
8,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",1,female,53.0,2,0,51.4792,S
9,0,"Artagaveytia, Mr. Ramon",1,male,71.0,0,0,49.5042,C


In [21]:
# Count the missing (null) values in each column
all_df.isnull().sum()

survived      0
name          0
pclass        0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64

In [22]:
# Work on a copy of the data without the 'name' column
df = all_df.drop('name', axis=1)

In [23]:
# Impute missing 'age' entries with the mean of the ages that are present
age_avg = df['age'].mean()
df['age'] = df['age'].fillna(value=age_avg)

In [24]:
# Impute missing 'fare' entries with the mean of the fares that are present
fare_mean = df['fare'].mean()
df['fare'] = df['fare'].fillna(value=fare_mean)

In [25]:
# Encode 'sex' numerically: female -> 0, male -> 1
sex_codes = {'female': 0, 'male': 1}
df['sex'] = df['sex'].map(sex_codes).astype(int)

In [26]:
# One-hot encode 'embarked': one indicator column per distinct value
x_df_onehot = pd.get_dummies(df, columns=['embarked'])

In [28]:
x_df_onehot[:10]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S
0,1,1,0,29.0,0,0,211.3375,0,0,1
1,1,1,1,0.9167,1,2,151.55,0,0,1
2,0,1,0,2.0,1,2,151.55,0,0,1
3,0,1,1,30.0,1,2,151.55,0,0,1
4,0,1,0,25.0,1,2,151.55,0,0,1
5,1,1,1,48.0,0,0,26.55,0,0,1
6,1,1,0,63.0,1,0,77.9583,0,0,1
7,0,1,1,39.0,0,0,0.0,0,0,1
8,1,1,0,53.0,2,0,51.4792,0,0,1
9,0,1,1,71.0,0,0,49.5042,1,0,0


### 轉換為array

In [29]:
# Convert the frame to a plain float array. The explicit cast matters with
# pandas >= 2.0, where get_dummies produces bool columns and `.values` on the
# mixed frame would otherwise be an object-dtype array.
ndarray = x_df_onehot.astype('float64').values

In [30]:
ndarray.shape

(1309, 10)

In [31]:
ndarray[:10]

array([[  1.    ,   1.    ,   0.    ,  29.    ,   0.    ,   0.    ,
        211.3375,   0.    ,   0.    ,   1.    ],
       [  1.    ,   1.    ,   1.    ,   0.9167,   1.    ,   2.    ,
        151.55  ,   0.    ,   0.    ,   1.    ],
       [  0.    ,   1.    ,   0.    ,   2.    ,   1.    ,   2.    ,
        151.55  ,   0.    ,   0.    ,   1.    ],
       [  0.    ,   1.    ,   1.    ,  30.    ,   1.    ,   2.    ,
        151.55  ,   0.    ,   0.    ,   1.    ],
       [  0.    ,   1.    ,   0.    ,  25.    ,   1.    ,   2.    ,
        151.55  ,   0.    ,   0.    ,   1.    ],
       [  1.    ,   1.    ,   1.    ,  48.    ,   0.    ,   0.    ,
         26.55  ,   0.    ,   0.    ,   1.    ],
       [  1.    ,   1.    ,   0.    ,  63.    ,   1.    ,   0.    ,
         77.9583,   0.    ,   0.    ,   1.    ],
       [  0.    ,   1.    ,   1.    ,  39.    ,   0.    ,   0.    ,
          0.    ,   0.    ,   0.    ,   1.    ],
       [  1.    ,   1.    ,   0.    ,  53.    ,   2.    ,   0.  

In [32]:
# Column 0 is the label (survived or not); the remaining 9 columns are features
Label, Features = ndarray[:, 0], ndarray[:, 1:]

In [35]:
Label.shape

(1309,)

In [36]:
Features.shape

(1309, 9)

In [74]:
Label[:10]

array([1., 1., 0., 0., 0., 1., 1., 0., 1., 0.])

In [75]:
Features[:10]

array([[  1.    ,   0.    ,  29.    ,   0.    ,   0.    , 211.3375,
          0.    ,   0.    ,   1.    ],
       [  1.    ,   1.    ,   0.9167,   1.    ,   2.    , 151.55  ,
          0.    ,   0.    ,   1.    ],
       [  1.    ,   0.    ,   2.    ,   1.    ,   2.    , 151.55  ,
          0.    ,   0.    ,   1.    ],
       [  1.    ,   1.    ,  30.    ,   1.    ,   2.    , 151.55  ,
          0.    ,   0.    ,   1.    ],
       [  1.    ,   0.    ,  25.    ,   1.    ,   2.    , 151.55  ,
          0.    ,   0.    ,   1.    ],
       [  1.    ,   1.    ,  48.    ,   0.    ,   0.    ,  26.55  ,
          0.    ,   0.    ,   1.    ],
       [  1.    ,   0.    ,  63.    ,   1.    ,   0.    ,  77.9583,
          0.    ,   0.    ,   1.    ],
       [  1.    ,   1.    ,  39.    ,   0.    ,   0.    ,   0.    ,
          0.    ,   0.    ,   1.    ],
       [  1.    ,   0.    ,  53.    ,   2.    ,   0.    ,  51.4792,
          0.    ,   0.    ,   1.    ],
       [  1.    ,   1.    ,  71.    ,

### 將array進行標準化

In [38]:
from sklearn import preprocessing

In [39]:
# Scaler that maps each feature column into the range [0, 1]
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))

In [40]:
# Fit the scaler on Features and rescale them in the same call
scaled_features = minmax_scale.fit_transform(Features)

In [41]:
scaled_features[:10]

array([[0.        , 0.        , 0.36116884, 0.        , 0.        ,
        0.41250333, 0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.00939458, 0.125     , 0.22222222,
        0.2958059 , 0.        , 0.        , 1.        ],
       [0.        , 0.        , 0.0229641 , 0.125     , 0.22222222,
        0.2958059 , 0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.37369494, 0.125     , 0.22222222,
        0.2958059 , 0.        , 0.        , 1.        ],
       [0.        , 0.        , 0.31106443, 0.125     , 0.22222222,
        0.2958059 , 0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.59916476, 0.        , 0.        ,
        0.05182215, 0.        , 0.        , 1.        ],
       [0.        , 0.        , 0.78705628, 0.125     , 0.        ,
        0.15216447, 0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.48642985, 0.        , 0.        ,
        0.        , 0.        , 0.        , 1.        ],


### 將資料分為訓練資料與測試資料

In [70]:
# Draw one uniform random number per row; rows whose draw is below 0.8
# (roughly 80% of them) form the training set, the rest the test set.
msk = numpy.random.rand(len(all_df)) < 0.8
train_df, test_df = all_df[msk], all_df[~msk]

In [71]:
msk

array([ True,  True,  True, ...,  True,  True,  True])

In [72]:
# Sanity-check the split mask: count the rows on each side.
# `msk` is a boolean array, so every entry is either True or False; the
# third counter can never be incremented and is kept at 0 only so the
# printed triple keeps its original shape.
t = int(numpy.count_nonzero(msk))
f = len(msk) - t
error = 0

print(t, f, error)

1047 262 0


In [73]:
# Report how many rows ended up in each subset
print('total:', len(all_df), 'train:', len(train_df), 'test:', len(test_df))

total: 1309 train: 1047 test: 262


In [31]:
def PreprocessData(raw_df, embarked_categories=('C', 'Q', 'S')):
    """Convert raw Titanic rows into scaled features and a label vector.

    The 'name' column is dropped, missing 'age' and 'fare' values are
    replaced by the column mean, 'sex' is encoded as female=0 / male=1,
    'embarked' is one-hot encoded, and every feature column is rescaled
    to the range [0, 1].

    Args:
        raw_df: DataFrame containing at least the 'name', 'sex', 'age',
            'fare' and 'embarked' columns. After 'name' is dropped, the
            first remaining column is taken as the label and all other
            columns as features. ``raw_df`` itself is not modified.
        embarked_categories: Values of 'embarked' that each get an
            indicator column, in this order. Fixing the list keeps the
            number and order of feature columns identical between calls
            even when a subset happens to lack one of the values. A value
            not in the list is encoded as all zeros, like a missing one.
            Pass ``None`` to derive the columns from the values present
            in ``raw_df``.

    Returns:
        Tuple ``(scaledFeatures, Label)``: the min-max scaled feature
        matrix and the 1-D label array, both float64.

    Note:
        The imputation means and the min/max used for scaling are
        computed from ``raw_df`` alone, so subsets processed by separate
        calls are each imputed and scaled with their own statistics.
    """
    df = raw_df.drop(['name'], axis=1)

    # Fill gaps in the numeric columns with the mean of the rows passed in.
    df['age'] = df['age'].fillna(df['age'].mean())
    df['fare'] = df['fare'].fillna(df['fare'].mean())

    df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)

    if embarked_categories is not None:
        # A categorical column makes get_dummies emit one indicator per
        # listed category, whether or not it occurs in this subset.
        df['embarked'] = pd.Categorical(
            df['embarked'], categories=list(embarked_categories))
    x_OneHot_df = pd.get_dummies(data=df, columns=["embarked"])

    # Cast explicitly: with pandas >= 2.0 the indicator columns are bool,
    # which would otherwise turn `.values` into an object-dtype array.
    ndarray = x_OneHot_df.astype('float64').values
    Label = ndarray[:, 0]
    Features = ndarray[:, 1:]

    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaledFeatures = minmax_scale.fit_transform(Features)

    return scaledFeatures, Label