In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

tf.__version__

'2.0.0'

### Feature column 的使用

In [2]:
import pathlib

In [3]:
# Remote locations of the Titanic CSV files: a training split and an evaluation split.
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

In [4]:
# Fetch both CSV files. Despite the `_dir` suffix, each name holds the path of a
# single file: the values are handed straight to pd.read_csv in the following cells.
train_dir = tf.keras.utils.get_file(fname="train.csv", origin=TRAIN_DATA_URL)
test_dir = tf.keras.utils.get_file(fname="test.csv", origin=TEST_DATA_URL)

In [5]:
# Load the training CSV into a DataFrame and preview its first five rows.
train_data = pd.read_csv(filepath_or_buffer=train_dir)
train_data.head(n=5)

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y


In [6]:
# Load the evaluation CSV into a DataFrame and preview its first five rows.
test_data = pd.read_csv(filepath_or_buffer=test_dir)
test_data.head(n=5)

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,35.0,0,0,8.05,Third,unknown,Southampton,y
1,0,male,54.0,0,0,51.8625,First,E,Southampton,y
2,1,female,58.0,0,0,26.55,First,C,Southampton,y
3,1,female,55.0,0,0,16.0,Second,unknown,Southampton,y
4,1,male,34.0,0,0,13.0,Second,D,Southampton,y


In [7]:
# Build a tf.data pipeline over the training CSV: batches of 5 rows, with the
# "survived" column split off as the label and "?" given as the NA marker.
ds_train = tf.data.experimental.make_csv_dataset(
    file_pattern=train_dir,
    batch_size=5,
    label_name="survived",
    na_value="?",
)

Instructions for updating:
Use `tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.experimental.AUTOTUNE)` instead. If sloppy execution is desired, use `tf.data.Options.experimental_determinstic`.
Instructions for updating:
Use `tf.data.Dataset.shuffle(buffer_size, seed)` followed by `tf.data.Dataset.repeat(count)`. Static tf.data optimizations will take care of using the fused implementation.


In [8]:
# Pull a single batch from the dataset to inspect the feature dict and its labels.
example_data,example_label = next(iter(ds_train))
example_data,example_label

(OrderedDict([('sex',
               <tf.Tensor: id=93, shape=(5,), dtype=string, numpy=array([b'male', b'female', b'male', b'male', b'male'], dtype=object)>),
              ('age',
               <tf.Tensor: id=85, shape=(5,), dtype=float32, numpy=array([11. , 24. , 30.5, 16. , 22. ], dtype=float32)>),
              ('n_siblings_spouses',
               <tf.Tensor: id=91, shape=(5,), dtype=int32, numpy=array([0, 0, 0, 0, 0])>),
              ('parch',
               <tf.Tensor: id=92, shape=(5,), dtype=int32, numpy=array([0, 0, 0, 0, 0])>),
              ('fare',
               <tf.Tensor: id=90, shape=(5,), dtype=float32, numpy=array([18.7875, 13.    ,  8.05  ,  8.05  ,  9.    ], dtype=float32)>),
              ('class',
               <tf.Tensor: id=87, shape=(5,), dtype=string, numpy=array([b'Third', b'Second', b'Third', b'Third', b'Third'], dtype=object)>),
              ('deck',
               <tf.Tensor: id=88, shape=(5,), dtype=string, numpy=array([b'unknown', b'F', b'unknown',

In [9]:
# Columns that later cells pack into the single "numeric" feature and normalize.
key_numeric_columns = ["n_siblings_spouses","parch","fare"]

In [10]:
# Summary statistics of the selected numeric columns in the training frame.
desc = train_data.loc[:, key_numeric_columns].describe()

In [11]:
# Per-column mean and standard deviation, each a Series indexed by column name.
MEAN = desc.loc["mean"]
STD = desc.loc["std"]
(MEAN, STD)

(n_siblings_spouses     0.545455
 parch                  0.379585
 fare                  34.385399
 Name: mean, dtype: float64, n_siblings_spouses     1.151090
 parch                  0.792999
 fare                  54.597730
 Name: std, dtype: float64)

In [12]:
class PackNumbericFeature(object):
    """Pack selected numeric feature columns into one ``"numeric"`` tensor.

    Instances are callables taking ``(features, labels)``, which is how the
    notebook passes them to ``Dataset.map``.
    """

    def __init__(self,names):
        """Remember which feature keys to pack.

        Args:
            names: Iterable of feature keys; their order becomes the order along
                the last axis of the packed tensor.
        """
        # Snapshot the keys so later edits to the caller's list (or a one-shot
        # iterator) cannot change what this instance packs.
        self.names = list(names)

    def __call__(self,features,lables):
        """Replace the named features with a single stacked float32 tensor.

        Args:
            features: Mapping from feature key to tensor; it is left unmodified.
            lables: Labels of the batch (parameter spelling kept for
                compatibility); returned unchanged.

        Returns:
            A ``(features, lables)`` tuple in which ``features`` is a shallow copy
            of the input without the packed keys and with a new ``"numeric"`` entry.

        Raises:
            KeyError: If a configured name is missing from ``features``.
        """
        # Work on a shallow copy: popping from the caller's mapping would strip
        # those keys from it, so a second call on the same dict would fail.
        packed = features.copy()
        # Cast every column to float32 so integer and float columns share a dtype
        # before they are stacked.
        numeric_features = [ tf.cast(packed.pop(name),dtype=tf.float32) for name in self.names ]
        packed["numeric"] = tf.stack(numeric_features,axis=-1)
        return packed,lables

In [13]:
# Pack the numeric columns of every batch into one "numeric" tensor.
# NOTE(review): this rebinds `ds_train`, so the cell is not idempotent -- mapping the
# already-packed dataset again would look up keys that the first pass popped.
ds_train = ds_train.map(PackNumbericFeature(key_numeric_columns))

In [14]:
# Pull a batch from the mapped dataset to inspect the features after packing.
example_data,example_label = next(iter(ds_train))
example_data,example_label

(OrderedDict([('sex',
               <tf.Tensor: id=130, shape=(5,), dtype=string, numpy=array([b'male', b'female', b'male', b'male', b'male'], dtype=object)>),
              ('age',
               <tf.Tensor: id=124, shape=(5,), dtype=float32, numpy=array([11. , 24. , 30.5, 16. , 22. ], dtype=float32)>),
              ('class',
               <tf.Tensor: id=126, shape=(5,), dtype=string, numpy=array([b'Third', b'Second', b'Third', b'Third', b'Third'], dtype=object)>),
              ('deck',
               <tf.Tensor: id=127, shape=(5,), dtype=string, numpy=array([b'unknown', b'F', b'unknown', b'unknown', b'unknown'], dtype=object)>),
              ('embark_town',
               <tf.Tensor: id=128, shape=(5,), dtype=string, numpy=
               array([b'Cherbourg', b'Southampton', b'Southampton', b'Southampton',
                      b'Southampton'], dtype=object)>),
              ('alone',
               <tf.Tensor: id=125, shape=(5,), dtype=string, numpy=array([b'y', b'y', b'y', b'y

### 处理数值型数据

In [15]:
def normalizer(data,mean,std):
    """Standardize ``data`` by subtracting ``mean`` and then dividing by ``std``.

    Only the operands' own ``-`` and ``/`` operators are used, so any combination
    of values that supports them can be passed.
    """
    centered = data - mean
    return centered / std

In [16]:
import functools

In [17]:
# Bind the training-set statistics so the result can be called with the data alone.
# NOTE(review): the name is spelled `nornalizer_fn`; it is kept because the
# numeric_column call in the next cell refers to it by this spelling.
nornalizer_fn = functools.partial(normalizer,mean = MEAN,std = STD)

In [18]:
# A single numeric column covering the packed "numeric" feature; `normalizer_fn`
# standardizes it with the training mean/std. The shape is a 1-tuple with one
# entry per packed column.
numeric_column = tf.feature_column.numeric_column(
    key="numeric",
    shape=(len(key_numeric_columns),),
    normalizer_fn=nornalizer_fn,
)
numeric_columns = [numeric_column]
numeric_columns

[NumericColumn(key='numeric', shape=(3,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function normalizer at 0x0000000015EEE840>, mean=n_siblings_spouses     0.545455
 parch                  0.379585
 fare                  34.385399
 Name: mean, dtype: float64, std=n_siblings_spouses     1.151090
 parch                  0.792999
 fare                  54.597730
 Name: std, dtype: float64))]

In [19]:
# Layer that converts a feature dict into a dense tensor using the numeric column.
numeric_layer = tf.keras.layers.DenseFeatures(feature_columns=numeric_columns)

In [20]:
# Apply the layer to the sample batch to check the normalized values.
numeric_layer(example_data)

<tf.Tensor: id=143, shape=(5, 3), dtype=float32, numpy=
array([[-0.47385937, -0.4786705 , -0.28568769],
       [-0.47385937, -0.4786705 , -0.39169025],
       [-0.47385937, -0.4786705 , -0.4823534 ],
       [-0.47385937, -0.4786705 , -0.4823534 ],
       [-0.47385937, -0.4786705 , -0.4649534 ]], dtype=float32)>

#### 类别型数据

In [21]:
# Columns that are treated as categorical features.
key_category = ["sex","class","deck","embark_town","alone"]

In [22]:
# Build a Categorical from the 'sex' column.
# NOTE(review): `cate` is rebound by a later `for cate ...` loop before it is read,
# so this value appears to be unused in the visible cells.
cate = pd.Categorical(train_data['sex'].values)

In [23]:
# Vocabulary of every categorical column: the distinct values found in the
# training frame. A comprehension keeps the loop variable local, so the `cate`
# object assigned in the previous cell is not overwritten with a column name.
cates = {
    column: pd.Categorical(train_data[column].values).categories.values
    for column in key_category
}

cates

{'sex': array(['female', 'male'], dtype=object),
 'class': array(['First', 'Second', 'Third'], dtype=object),
 'deck': array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'unknown'], dtype=object),
 'embark_town': array(['Cherbourg', 'Queenstown', 'Southampton', 'unknown'], dtype=object),
 'alone': array(['n', 'y'], dtype=object)}

In [24]:
# Build one indicator column per categorical feature from its vocabulary.
cate_features = []
for cate, vocab in cates.items():
    # Map the string values of this column onto its known vocabulary ...
    cat_col = tf.feature_column.categorical_column_with_vocabulary_list(
        key=cate,
        vocabulary_list=vocab,
    )
    # ... and wrap it so that it can be fed to a DenseFeatures layer.
    cate_features += [tf.feature_column.indicator_column(categorical_column=cat_col)]

In [25]:
# Show the indicator columns collected for the categorical features.
cate_features

[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('female', 'male'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='class', vocabulary_list=('First', 'Second', 'Third'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='deck', vocabulary_list=('A', 'B', 'C', 'D', 'E', 'F', 'G', 'unknown'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Cherbourg', 'Queenstown', 'Southampton', 'unknown'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='alone', vocabulary_list=('n', 'y'), dtype=tf.string, default_value=-1, num_oov_buckets=0))]

In [26]:
# Layer that converts a feature dict into a dense tensor using the indicator columns.
category_layer = tf.keras.layers.DenseFeatures(feature_columns=cate_features)

In [27]:
# Apply the layer to the sample batch; the 0/1 encodings of the five categorical
# columns are concatenated, giving 2 + 3 + 8 + 4 + 2 = 19 values per row.
category_layer(example_data)

Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.


<tf.Tensor: id=326, shape=(5, 19), dtype=float32, numpy=
array([[0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0.,
        0., 0., 1.],
       [0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1.,
        0., 1., 0.],
       [0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
        0., 0., 1.],
       [0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
        0., 0., 1.],
       [0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
        0., 0., 1.]], dtype=float32)>

#### 区间值

In [28]:
section_column = tf.feature_column.

NameError: name 'section_column' is not defined