In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

  from ._conv import register_converters as _register_converters


### 读出数据观测

In [2]:
# Pclass Ticket class 
# Fare 票价
# ticket ticket number
# cabin 客舱号码
# embarked 地点
# parch 父母，孩子
# sibsp 亲友

test_names = ['PassengerId','Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']
train_names = ['PassengerId','Survived','Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']

In [3]:
def load_cvs(path,names,isDeleteCabin=False,isDeleteName = True,isDeleteTicket = True):
    data = pd.read_csv(path,header=0,names = names)
    if isDeleteCabin:
        data.pop('Cabin')
    if isDeleteName:
        data.pop('Name')
    if isDeleteName:
        data.pop('Ticket')
    return data

In [4]:
data_train = load_cvs('./data/train.csv',names=train_names,isDeleteCabin = True)
data_train.pop('PassengerId')
# 使用平均值进行填充
mean_age =round( data_train.mean()['Age'],1)
mean_fare =round( data_train.mean()['Fare'],1)
data_train = data_train.fillna({'Age':mean_age,'Fare':mean_fare})
data_train = data_train.fillna(method='ffill')
data_train.head()


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [5]:
data_test = load_cvs('./data/test.csv',names=test_names,isDeleteCabin = True)
data_test.pop('PassengerId')
#使用训练的平均值填充缺省值
data_test = data_test.fillna({'Age':mean_age,'Fare':mean_fare})
data_test = data_test.fillna(method='ffill')
data_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


### 标准化年龄

In [6]:
def mean_std(name_column,data_set):
    mean,std= data_set.mean()[name_column],data_set.std()[name_column]
    print(mean,std)
    return mean,std

In [7]:
mean_age,std_age = mean_std('Age',data_train)
mean_fare,std_fare = mean_std('Fare',data_train)

29.69929292929302 13.002015230774283
32.2042079685746 49.693428597180905


In [8]:
def normal_data_train(name_column,mean,std,data):
    data[name_column] = data.apply(func=lambda x: (x[name_column]-mean)/std ,axis=1)

In [9]:
def slip_train_data(data_train):
    df = data_train.sample(frac=0.8) # 全部打乱
    cut_idx = int(round(0.1 * df.shape[0]))
    df_test, df_train = df.iloc[:cut_idx], df.iloc[cut_idx:]
    train_x,train_y = df_train,df_train.pop('Survived')
    test_x,test_y = df_test,df_test.pop('Survived')
    return (train_x,train_y),(test_x,test_y)

In [10]:
(train_x,train_y),(test_x,test_y) = slip_train_data(data_train=data_train)

In [11]:
train_x.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
617,3,female,26.0,1,0,16.1,S
463,2,male,48.0,0,0,13.0,S
472,2,female,33.0,1,2,27.75,S
588,3,male,22.0,0,0,8.05,S
87,3,male,29.7,0,0,8.05,S


### 自定义Estimator

In [12]:
def model_fn(features,labels,mode,params):
    feature_columns = params['features_columns']
    print(feature_columns)
    inputs = tf.feature_column.input_layer(features=features,feature_columns=feature_columns)
    hidden_unit = params['hidden_unit']
    # 创建隐藏层
    i = 0
    for unit in hidden_unit:
        inputs = tf.layers.dense(inputs=inputs,units=unit,activation=tf.nn.relu,kernel_regularizer=tf.contrib.layers.l1_l2_regularizer(0.5,0.5),name=('layer_hidden_%d' %i))
        i +=1

    # 创建输出层
    
    logits = tf.layers.dense(inputs=inputs,units=params['n_classes'],activation=None,name='layer_output')
    
    predict_class = tf.argmax(logits,axis=1)
    
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions={'class_ids':predict_class[:,tf.newaxis],'probabilities':tf.nn.softmax(logits),'logit':logits}
        return tf.estimator.EstimatorSpec(mode,predictions)
    
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,logits=logits) +  tf.losses.get_regularization_loss()
    
    # 评估 返回准确率
    if mode == tf.estimator.ModeKeys.EVAL:
        accuracy = tf.metrics.accuracy(labels=labels,predictions=predict_class,name ='acc_op')
        metrics = {'accuracy':accuracy}
#         tf.summary.scalar(accuracy[1])
        return tf.estimator.EstimatorSpec(mode,loss = loss,eval_metric_ops = metrics)
    
    # 训练
    tf.assert_equal(mode,tf.estimator.ModeKeys.TRAIN)
    global_step = tf.train.get_global_step()
    starter_learning_rate = 0.1
    learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                           100000, 0.96, staircase=True)
    optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate)
    train_op = optimizer.minimize(loss,global_step)
    return tf.estimator.EstimatorSpec(mode,loss = loss,train_op = train_op)
    

### 生成features columns

In [13]:
feature_columns = []
# Pclass 分类特征列
feature_columns.append(tf.feature_column.numeric_column(key='Pclass',dtype=tf.int32))
# sex 分类词汇列
feature_columns.append(tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list(key='Sex',vocabulary_list=['male','female'])))
# Age 数值列
feature_columns.append(tf.feature_column.numeric_column(key='Age',normalizer_fn=lambda x: (x-mean_age)/std_age ))
# SibSp 数值列
feature_columns.append(tf.feature_column.numeric_column(key='SibSp'))
# Parch 数值列
feature_columns.append(tf.feature_column.numeric_column(key='Parch'))
# Ticket 经过哈希处理的列
# feature_columns.append(tf.feature_column.categorical_column_with_hash_bucket(key='Ticket',hash_bucket_size=))
# Fare 数值列
feature_columns.append(tf.feature_column.numeric_column(key='Fare',normalizer_fn=lambda x:(x-mean_fare)/std_fare))
# Embarked 分类词汇列
feature_columns.append(tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list(key='Embarked',vocabulary_list=['C','S','Q'])))


In [14]:
params = {'features_columns':feature_columns,'hidden_unit':[10,10],'n_classes':2}
titanic_class = tf.estimator.Estimator(model_fn=model_fn,model_dir='./model',params=params)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x00000225FC8E1E48>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [15]:
def input_fn_train(features,labels,batch_size):
    data_set = tf.data.Dataset.from_tensor_slices((dict(features),labels))
    data_set.shuffle(1000).repeat().batch(batch_size)
    return data_set

In [16]:
row = train_x.iloc[:,0].size
print(row)
for i in range(row):
    if train_x.iloc[i,0] == 0:
        print("  hanhao   ",i)

642


In [17]:
input_fn_train = tf.estimator.inputs.pandas_input_fn(train_x,train_y,batch_size=128,num_epochs=1000,shuffle=True)
titanic_class.train(input_fn=input_fn_train,steps=1000)

INFO:tensorflow:Calling model_fn.
[_NumericColumn(key='Pclass', shape=(1,), default_value=None, dtype=tf.int32, normalizer_fn=None), _IndicatorColumn(categorical_column=_VocabularyListCategoricalColumn(key='Sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), _NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function <lambda> at 0x00000225EC9289D8>), _NumericColumn(key='SibSp', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='Parch', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='Fare', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function <lambda> at 0x00000225FC87C8C8>), _IndicatorColumn(categorical_column=_VocabularyListCategoricalColumn(key='Embarked', vocabulary_list=('C', 'S', 'Q'), dtype=tf.string, default_value=-1, num_oov_buckets=0))]
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow

<tensorflow.python.estimator.estimator.Estimator at 0x225fc87acc0>

In [18]:
input_fn_eval = tf.estimator.inputs.pandas_input_fn(test_x,test_y,batch_size=128,num_epochs=1000,shuffle=True)
titanic_class.evaluate(input_fn=input_fn_eval,steps=1)

INFO:tensorflow:Calling model_fn.
[_NumericColumn(key='Pclass', shape=(1,), default_value=None, dtype=tf.int32, normalizer_fn=None), _IndicatorColumn(categorical_column=_VocabularyListCategoricalColumn(key='Sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), _NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function <lambda> at 0x00000225EC9289D8>), _NumericColumn(key='SibSp', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='Parch', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='Fare', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function <lambda> at 0x00000225FC87C8C8>), _IndicatorColumn(categorical_column=_VocabularyListCategoricalColumn(key='Embarked', vocabulary_list=('C', 'S', 'Q'), dtype=tf.string, default_value=-1, num_oov_buckets=0))]
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow

{'accuracy': 0.7265625, 'global_step': 1000, 'loss': 0.76613784}

In [19]:
input_fn_predict = tf.estimator.inputs.pandas_input_fn(data_test.head(1),shuffle = True)
predictions = titanic_class.predict(input_fn=input_fn_predict)
print(predictions)
SPECIES = ['saveed', 'dead']
for pred_dict,expec in zip(predictions,SPECIES):
    template = ('\nPrediction is "{}" ({:.1f}%), expected "{}"')
    class_id = pred_dict['class_ids'][0]
    probability = pred_dict['probabilities'][class_id]
    
    print(template.format(SPECIES[class_id],
                              100 * probability, expec))

<generator object Estimator.predict at 0x00000225FC8746D0>
INFO:tensorflow:Calling model_fn.
[_NumericColumn(key='Pclass', shape=(1,), default_value=None, dtype=tf.int32, normalizer_fn=None), _IndicatorColumn(categorical_column=_VocabularyListCategoricalColumn(key='Sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)), _NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function <lambda> at 0x00000225EC9289D8>), _NumericColumn(key='SibSp', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='Parch', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), _NumericColumn(key='Fare', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=<function <lambda> at 0x00000225FC87C8C8>), _IndicatorColumn(categorical_column=_VocabularyListCategoricalColumn(key='Embarked', vocabulary_list=('C', 'S', 'Q'), dtype=tf.string, default_value=-1, num_oov_buckets=