In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os

  from ._conv import register_converters as _register_converters


<p>資料來源： <a href="https://www.kaggle.com/c/titanic/data" target="_blank" rel="noopener">Titanic: Machine Learning from Disaster</a> </p>

| Variable           | Definition  | Key |
| ------------  | ----  | ---  |
| survival  |Survival |  0 = No, 1 = Yes  |
|pclass   | Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd |
| sex | Sex |   |
| Age | Age in years |   |
| sibsp | # of siblings / spouses aboard the Titanic |   |
| parch | # of parents / children aboard the Titanic |   |
| ticket | Ticket number |   |
| fare | Passenger fare  |   |
| cabin |  Cabin number |   |
| embarked | Port of Embarkation | C = Cherbourg, Q = Queenstown, S = Southampton  |

<b>Variable Notes</b>
<p>pclass: A proxy for socio-economic status (SES)</p>
<p>1st = Upper</p>
<p>2nd = Middle</p>
<p>3rd = Lower</p>

<p>age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5</p>

<p>sibsp: The dataset defines family relations in this way...</p>
<p>Sibling = brother, sister, stepbrother, stepsister</p>
<p>Spouse = husband, wife (mistresses and fiancés were ignored)</p>

<p>parch: The dataset defines family relations in this way...</p>
<p>Parent = mother, father</p>
<p>Child = daughter, son, stepdaughter, stepson</p>
<p>Some children travelled only with a nanny, therefore parch=0 for them.</p>

In [4]:
data = pd.read_csv(os.path.join("raw_data", "train.csv"))

In [5]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
data.drop(["PassengerId", "Name", "Ticket"], axis=1, inplace=True)
## save data
data.to_csv(os.path.join("raw_data", "data.csv"), index=False)

In [8]:
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [9]:
import gc
gc.collect() ## 釋放內存

0

### Miss Data

In [10]:
## 檢查遺漏值的數量
data.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [11]:
## 填補遺漏值
data.fillna({"Age":-1,
            "Cabin":"Unk",
            "Embarked":"Unk",
            "Fare":-1},
            inplace=True)

In [12]:
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,Unk,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,Unk,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,Unk,S


In [13]:
data.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Cabin       0
Embarked    0
dtype: int64

# 普通邏輯迴歸

<h3>資料前處理

In [14]:
## covert sex binary
# 男生 => 0  女生 => 1
data.loc[:,"Sex"] = (data["Sex"] == "female").astype(int)

#  切出訓練跟測試資料
feature_col_name = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
Xtr = data.loc[:, feature_col_name].sample(frac = 0.75)
Xts = data[~data.index.isin(Xtr.index)].loc[:, feature_col_name]

## One hot encode Ytr and Yts
#  快速方法
Ytr = pd.get_dummies(data[data.index.isin(Xtr.index)]["Survived"].values)
Yts = pd.get_dummies(data[~data.index.isin(Xtr.index)]["Survived"].values)

<h3> 架構 Tensorflow </h3>

In [15]:
import tensorflow as tf

num_features = Xtr.shape[1]
num_classes  = Ytr.shape[1]

##  tf.placeholder 的 shape 參數：默認是None，就是一维值，也可以是多维，比如[2,3], [None, 3]表示col是3，row不定
X = tf.placeholder("float", [None, num_features])
Y = tf.placeholder("float", [None, num_classes])

# W - 權重 array
W = tf.Variable(tf.zeros([num_features, num_classes]))
# B - 偏移 array
B = tf.Variable(tf.zeros([num_classes]))

## 定義邏輯迴歸模型
#  將 y = wx + b 作為 softmax 的參數

yhat = tf.nn.softmax(tf.matmul(X, W)+B)

## 定義 Loss function
# 參考資料 : https://blog.csdn.net/mao_xiao_feng/article/details/53382790
loss_fn = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=yhat, labels=Y))

## 定義 Optimozer 最小誤差優化器 在 loss_fn 上
opt = tf.train.AdamOptimizer(0.01).minimize(loss_fn)

<h3> 開始訓練模型 </h3>

In [16]:
sess = tf.Session()

# 初始化所有的變數
init = tf.global_variables_initializer()  ## 使用 global_variables_initializer 來初始化
sess.run(init)

num_epochs = 10
## full data each time
for i in range(num_epochs):
    print("epoch : ", i+1)
    sess.run(opt, feed_dict = {X: Xtr, Y:Ytr})

epoch :  1
epoch :  2
epoch :  3
epoch :  4
epoch :  5
epoch :  6
epoch :  7
epoch :  8
epoch :  9
epoch :  10


<h3> 訓練模型  </h3>

In [17]:
## 準確率function
equal = tf.equal(tf.argmax(yhat, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(equal, dtype = tf.float32))

In [18]:
## 取得測試資料的準確率
test_accuracy_values = sess.run(accuracy, feed_dict = {X: Xts, Y:Yts})
test_accuracy_values

0.6367713

# 邏輯迴歸使用批次處理

In [19]:
#取得資料
data = pd.read_csv(os.path.join("raw_data", "data.csv"))
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [20]:
print("Pclass 的 種類有%d種，分別是"%data["Pclass"].nunique(), data["Pclass"].unique())
print("Sex 的 種類有%d種，分別是"%data["Sex"].nunique(), data["Sex"].unique())
print("Cabin 的 種類有%d種，分別是"%data["Cabin"].nunique(), data["Cabin"].unique())
print("Embarked 的 種類有%d種，分別是"%data["Embarked"].nunique(), data["Embarked"].unique())

Pclass 的 種類有3種，分別是 [3 1 2]
Sex 的 種類有2種，分別是 ['male' 'female']
Cabin 的 種類有147種，分別是 [nan 'C85' 'C123' 'E46' 'G6' 'C103' 'D56' 'A6' 'C23 C25 C27' 'B78' 'D33'
 'B30' 'C52' 'B28' 'C83' 'F33' 'F G73' 'E31' 'A5' 'D10 D12' 'D26' 'C110'
 'B58 B60' 'E101' 'F E69' 'D47' 'B86' 'F2' 'C2' 'E33' 'B19' 'A7' 'C49'
 'F4' 'A32' 'B4' 'B80' 'A31' 'D36' 'D15' 'C93' 'C78' 'D35' 'C87' 'B77'
 'E67' 'B94' 'C125' 'C99' 'C118' 'D7' 'A19' 'B49' 'D' 'C22 C26' 'C106'
 'C65' 'E36' 'C54' 'B57 B59 B63 B66' 'C7' 'E34' 'C32' 'B18' 'C124' 'C91'
 'E40' 'T' 'C128' 'D37' 'B35' 'E50' 'C82' 'B96 B98' 'E10' 'E44' 'A34'
 'C104' 'C111' 'C92' 'E38' 'D21' 'E12' 'E63' 'A14' 'B37' 'C30' 'D20' 'B79'
 'E25' 'D46' 'B73' 'C95' 'B38' 'B39' 'B22' 'C86' 'C70' 'A16' 'C101' 'C68'
 'A10' 'E68' 'B41' 'A20' 'D19' 'D50' 'D9' 'A23' 'B50' 'A26' 'D48' 'E58'
 'C126' 'B71' 'B51 B53 B55' 'D49' 'B5' 'B20' 'F G63' 'C62 C64' 'E24' 'C90'
 'C45' 'E8' 'B101' 'D45' 'C46' 'D30' 'E121' 'D11' 'E77' 'F38' 'B3' 'D6'
 'B82 B84' 'D17' 'A36' 'B102' 'B69' 'E49' 'C47' '

In [21]:
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [22]:
# data 的預設值
_csv_column_defaults = [[0],[-1],["Unk"],[-1.],[0],[0],[-1.],["Unk"],["Unk"]]
_csv_columns = data.columns.tolist() 

## 定義 Ch4 的輸入函數
def input_fn(csv_file, feature_names, batch_size=16, n_epochs=10, shuffle=False):
    def decode_csv(line):
        parsed_line = tf.decode_csv(line, _csv_column_defaults)
        feature_dict = dict(zip(feature_names, parsed_line))
        labels = feature_dict.pop("Survived")  ## 取得 labels ， 同時從 feature_dict 移除 Survived col
        return feature_dict, labels

    if shuffle:
        dataset = dataset.shuffle(buffer_size = 100*1024) # buffer 100KB

    dataset = (tf.data.TextLineDataset(csv_file)    # Read text file
              .skip(1) # 跳過欄頭
              .map(decode_csv, num_parallel_calls=3)) # 轉換每一欄

    dataset = dataset.batch(batch_size)
    dataset = dataset.repeat(n_epochs)
    iterator = dataset.make_one_shot_iterator()
    batch_features, batch_labels = iterator.get_next()
    return batch_features, batch_labels

<h3>處理種類特徵</h3>
<p>參考資料：<a href="https://blog.danielchen.tk/2018/03/21/TensorFlow-%E5%AD%B8%E7%BF%92%E7%AD%86%E8%A8%98-1-feature-column/index.html" target="_blank" rel="noopener">TensorFlow 學習筆記 (1) - feature_column</a> </p>


<h3>基本種類特徵</h3>

In [23]:
sex = tf.feature_column.categorical_column_with_vocabulary_list("Sex", vocabulary_list=["female", "male", "Unk"])
embarked = tf.feature_column.categorical_column_with_vocabulary_list("Embarked", vocabulary_list=["S", "C", "Q", "Unk"])

<h3>基本數值特徵</h3>

In [24]:
age = tf.feature_column.numeric_column("Age")
sib = tf.feature_column.numeric_column("SibSp")
parch = tf.feature_column.numeric_column("Parch")
fare = tf.feature_column.numeric_column("Fare")

## <h2>關於 Bucketized Column</h2>
<hr>
<h4>bucketized_column</h4>
<p>字面上意思就是將資料分成好幾個籃子，我們可以透過 bucketized_column 將連續的數值資料，透過設定區間的方式將資料群組起來，並標上標籤。

例如說我們有年份的資料，我們可將時間軸切成四份<p>

<img src="https://www.tensorflow.org/versions/master/images/feature_columns/bucketized_column.jpg" alt="">

如此可將不同的年份分裝成四份，下表用 one-hot vector 表示

| Date Range   | 對應  |
| ------------  | ----  |
| < 1960  |[1, 0, 0, 0] |
| >= 1960 but < 1980   | [0, 1, 0, 0] |
| >= 1980 but < 2000 | [0, 0, 1, 0] |
| > 2000 | [0, 0, 0, 1] |

In [26]:
age_buckets = tf.feature_column.bucketized_column(age, boundaries=[5., 10., 18., 25., 35., 45., 55., 65.])

<h3>定義模型</h3>

In [27]:
import os
import shutil

## 檢查是否 path 路徑存在，如果沒有 就創立
def chech_and_mkdir(path):
    if not(os.path.isdir(path)):
        os.makedirs(path)


def delete_mkdir(path):
    if os.path.isdir(path):
        shutil.rmtree(path)

In [28]:
columns = [age, sib, parch, fare, sex, embarked]

## 輸出的模型路徑
model_dir = "lr_model"
delete_mkdir(model_dir)
chech_and_mkdir(model_dir)

model = tf.estimator.LinearClassifier(model_dir=model_dir,
                                     feature_columns=columns,
                                     optimizer=tf.train.AdamOptimizer())

INFO:tensorflow:Using default config.


INFO:tensorflow:Using config: {'_global_id_in_cluster': 0, '_service': None, '_num_worker_replicas': 1, '_save_summary_steps': 100, '_log_step_count_steps': 100, '_model_dir': 'lr_model', '_num_ps_replicas': 0, '_master': '', '_task_type': 'worker', '_save_checkpoints_secs': 600, '_task_id': 0, '_tf_random_seed': None, '_is_chief': True, '_evaluation_master': '', '_keep_checkpoint_max': 5, '_save_checkpoints_steps': None, '_session_config': None, '_keep_checkpoint_every_n_hours': 10000, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f5dd43857b8>}


<h3>Train model</h3>

In [29]:
model.train(input_fn=lambda: input_fn(os.path.join("raw_data", "data.csv"), _csv_columns))

INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Saving checkpoints for 1 into lr_model/model.ckpt.


INFO:tensorflow:step = 1, loss = 11.090355


INFO:tensorflow:global_step/sec: 135.732


INFO:tensorflow:step = 101, loss = 9.610937 (0.741 sec)


INFO:tensorflow:global_step/sec: 197.993


INFO:tensorflow:step = 201, loss = 12.629804 (0.503 sec)


INFO:tensorflow:global_step/sec: 195.618


INFO:tensorflow:step = 301, loss = 8.880047 (0.514 sec)


INFO:tensorflow:global_step/sec: 205.622


INFO:tensorflow:step = 401, loss = 9.075473 (0.487 sec)


INFO:tensorflow:global_step/sec: 191.719


INFO:tensorflow:step = 501, loss = 6.975941 (0.517 sec)


INFO:tensorflow:Saving checkpoints for 560 into lr_model/model.ckpt.


INFO:tensorflow:Loss for final step: 5.958311.


<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x7f5dd43855f8>

In [30]:
results = model.evaluate(input_fn=lambda: input_fn(os.path.join("raw_data", "data.csv"), _csv_columns, n_epochs=1))

INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Starting evaluation at 2018-05-25-06:10:54


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Restoring parameters from lr_model/model.ckpt-560


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Finished evaluation at 2018-05-25-06:10:55


INFO:tensorflow:Saving dict for global step 560: accuracy = 0.7665544, accuracy_baseline = 0.6161616, auc = 0.8267983, auc_precision_recall = 0.7445791, average_loss = 0.5473142, global_step = 560, label/mean = 0.3838384, loss = 8.70816, prediction/mean = 0.4256734


In [31]:
results

{'accuracy': 0.7665544,
 'accuracy_baseline': 0.6161616,
 'auc': 0.8267983,
 'auc_precision_recall': 0.7445791,
 'average_loss': 0.5473142,
 'global_step': 560,
 'label/mean': 0.3838384,
 'loss': 8.70816,
 'prediction/mean': 0.4256734}

<h2>Addition - 用 Bucketized Column 資料</h2>

In [33]:
columns = [age_buckets, sib, parch, fare, sex, embarked]

## 輸出的模型路徑
model_dir = "lr_model"
delete_mkdir(model_dir)
chech_and_mkdir(model_dir)

model = tf.estimator.LinearClassifier(model_dir=model_dir,
                                     feature_columns=columns,
                                     optimizer=tf.train.AdamOptimizer())

model.train(input_fn=lambda: input_fn(os.path.join("raw_data", "data.csv"), _csv_columns))
results = model.evaluate(input_fn=lambda: input_fn(os.path.join("raw_data", "data.csv"), _csv_columns, n_epochs=1))

INFO:tensorflow:Using default config.


INFO:tensorflow:Using config: {'_global_id_in_cluster': 0, '_service': None, '_num_worker_replicas': 1, '_save_summary_steps': 100, '_log_step_count_steps': 100, '_model_dir': 'lr_model', '_num_ps_replicas': 0, '_master': '', '_task_type': 'worker', '_save_checkpoints_secs': 600, '_task_id': 0, '_tf_random_seed': None, '_is_chief': True, '_evaluation_master': '', '_keep_checkpoint_max': 5, '_save_checkpoints_steps': None, '_session_config': None, '_keep_checkpoint_every_n_hours': 10000, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f5db2b78828>}


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Saving checkpoints for 1 into lr_model/model.ckpt.


INFO:tensorflow:step = 1, loss = 11.090355


INFO:tensorflow:global_step/sec: 129.651


INFO:tensorflow:step = 101, loss = 9.725036 (0.773 sec)


INFO:tensorflow:global_step/sec: 166.378


INFO:tensorflow:step = 201, loss = 10.277811 (0.602 sec)


INFO:tensorflow:global_step/sec: 258.979


INFO:tensorflow:step = 301, loss = 8.842726 (0.386 sec)


INFO:tensorflow:global_step/sec: 236.593


INFO:tensorflow:step = 401, loss = 9.405177 (0.422 sec)


INFO:tensorflow:global_step/sec: 168.666


INFO:tensorflow:step = 501, loss = 6.845979 (0.593 sec)


INFO:tensorflow:Saving checkpoints for 560 into lr_model/model.ckpt.


INFO:tensorflow:Loss for final step: 5.9806776.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Starting evaluation at 2018-05-25-06:21:24


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Restoring parameters from lr_model/model.ckpt-560


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Finished evaluation at 2018-05-25-06:21:25


INFO:tensorflow:Saving dict for global step 560: accuracy = 0.76094276, accuracy_baseline = 0.6161616, auc = 0.8273682, auc_precision_recall = 0.7470209, average_loss = 0.5438212, global_step = 560, label/mean = 0.3838384, loss = 8.652584, prediction/mean = 0.41493168


{'loss': 8.652584, 'auc': 0.8273682, 'label/mean': 0.3838384, 'average_loss': 0.5438212, 'prediction/mean': 0.41493168, 'accuracy': 0.76094276, 'auc_precision_recall': 0.7470209, 'global_step': 560, 'accuracy_baseline': 0.6161616}


In [34]:
print(results)

{'loss': 8.652584, 'auc': 0.8273682, 'label/mean': 0.3838384, 'average_loss': 0.5438212, 'prediction/mean': 0.41493168, 'accuracy': 0.76094276, 'auc_precision_recall': 0.7470209, 'global_step': 560, 'accuracy_baseline': 0.6161616}
