# 读取鸢尾花数据集

## 读取和构造dataframe数据 

In [53]:
# Setup cell: load the iris dataset into a pandas DataFrame so that the cells
# below can exercise TensorFlow TFRecord writing and reading.
# NOTE(review): the original note said the data came from kdd99, but the code
# in this cell loads iris.
import pandas as pd
import tensorflow as tf
# Eager execution is switched on before any other TensorFlow call is made.
tf.enable_eager_execution()
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Path of the TFRecord file that is written and later read back.
tf_filename = './iris.tfrecords'

# Name of the label column in the DataFrame.
target_column_name = 'target_names'

# # Loading option 1 (disabled): read the raw CSV from the UCI repository.
# # The loc/shift/iloc steps presumably recover the first data row that
# # read_csv consumed as the header - TODO confirm before re-enabling.
# pd.options.display.max_columns = 50
# data = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data') # data
# data.loc[data.shape[0]] = range(5)
# data = data.shift(1)
# data.iloc[0] = data.columns
# data.columns = ['sepal_length','sepal_width','petal_length','petal_width',target_column_name]
# data.head()

# Loading option 2 (active): use the copy of iris bundled with scikit-learn.
from sklearn.datasets import load_iris
iris = load_iris()
# Same value as assigned earlier in this cell; repeated here.
target_column_name = 'target_names'
# Place the feature matrix (named by iris.feature_names) and the target vector
# (in a column named by target_column_name) side by side.
data = pd.concat([pd.DataFrame(iris.data,columns=iris.feature_names),pd.DataFrame(iris.target,columns=[target_column_name])],axis=1)
# Last expression of the cell: the notebook displays the first rows.
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target_names
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## 建立每个特征与其类型的映射关系

In [54]:
# Map every column name to the one-letter kind code of its numpy dtype
# (for example 'f' for floating point, 'i' for signed integer, 'O' for object).
field_type_map = {column: data[column].dtype.kind for column in data.columns}
field_type_map
# Check the inferred kinds above; override individual entries here if needed, e.g.:
# field_type_map['petal_length'] = 'f'
# field_type_map['petal_width'] = 'f'
# field_type_map['sepal_length'] = 'f'
# field_type_map['sepal_width'] = 'f'
# field_type_map

{'petal length (cm)': 'f',
 'petal width (cm)': 'f',
 'sepal length (cm)': 'f',
 'sepal width (cm)': 'f',
 'target_names': 'i'}

---

# 读取dataframe并写入tfrecord文件

In [55]:
# Converters that wrap one scalar in a tf.train.Feature. There are three of
# them, for int64, float and bytes values respectively.
def _int64_feature(value):
    """Wrap ``value`` as a single-element int64 ``tf.train.Feature``.

    The value is always passed through ``int()``. The previous guard compared
    ``type(value)`` with the string ``'int'``, which is never equal, so the
    cast already ran on every call; it is now written unconditionally.

    Raises whatever ``int(value)`` raises for values that cannot be converted.
    """
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[int(value)]))
def _float_feature(value):
    """Wrap ``value`` as a single-element float ``tf.train.Feature``.

    The value is always passed through ``float()``. The previous guard compared
    ``type(value)`` with the string ``'float'``, which is never equal, so the
    cast already ran on every call; it is now written unconditionally.

    Raises whatever ``float(value)`` raises for values that cannot be converted.
    """
    return tf.train.Feature(float_list=tf.train.FloatList(value=[float(value)]))
def _bytes_feature(value):
    """Wrap ``value`` as a single-element bytes ``tf.train.Feature``.

    ``bytes`` input is stored unchanged, text is encoded as UTF-8, and any
    other object is converted with ``str()`` first. The previous version always
    called ``str(value)`` (its type guard compared a type with the string
    ``'str'`` and was therefore always true), which fails for non-ASCII text on
    Python 2 and does not produce ``bytes`` on Python 3.
    """
    if not isinstance(value, bytes):
        if not isinstance(value, type(u'')):
            # Arbitrary object: take its string form first.
            value = str(value)
        if not isinstance(value, bytes):
            # Still text (Python 3 str / Python 2 unicode): encode it.
            value = value.encode('utf-8')
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

# Serialize every DataFrame row as one tf.train.Example and write it to the
# TFRecord file named by tf_filename.

# dtype kind code -> converter that builds the matching tf.train.Feature.
_feature_converters = {
    'i': _int64_feature,
    'f': _float_feature,
    'O': _bytes_feature,
}

with tf.python_io.TFRecordWriter(tf_filename) as writer:
    # iterrows() yields each row as a Series, so values of integer columns may
    # arrive as floats; the converters cast them back to the declared kind.
    for _, row in data.iterrows():
        # Features of the current row, keyed by column name.
        feature_dict = {}
        for field, value in row.items():
            field_type = field_type_map.get(field)
            if not field_type:
                # Column has no declared kind: leave it out of the record.
                # (Previously an assert ran first and made this skip unreachable.)
                continue
            if field_type not in _feature_converters:
                raise ValueError(
                    'unsupported dtype kind %r for column %r' % (field_type, field))
            feature_dict[field] = _feature_converters[field_type](value)
        # The keyword names `features` and `feature` are required: the dict of
        # Feature objects is wrapped in Features, which is wrapped in Example.
        example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
        writer.write(example.SerializeToString())
print('tfrecords written!')

tfrecords written!


# 读tfrecord并转化为dataset格式

In [58]:
# 解码example的方式
import tensorflow as tf
def _parser_(example):
    """Decode one serialized ``tf.train.Example`` into ``(features, label)``.

    The parsing spec is derived from the module-level ``field_type_map``: every
    column whose kind is 'i', 'O' or 'f' becomes a scalar ``FixedLenFeature``
    of type int64, string or float32 respectively. Columns with any other kind
    are not parsed.

    Args:
        example: a serialized Example, as yielded by a TFRecordDataset.

    Returns:
        A tuple ``(parsed_dict, parsed_label)``. ``parsed_label`` is the entry
        named by ``target_column_name``; ``parsed_dict`` holds the remaining
        parsed features keyed by column name.
    """
    dtype_by_kind = {'i': tf.int64, 'O': tf.string, 'f': tf.float32}
    feature_spec = {}
    # items() works on both Python 2 and 3, unlike iteritems().
    for key, kind in field_type_map.items():
        if kind in dtype_by_kind:
            feature_spec[key] = tf.FixedLenFeature((), dtype_by_kind[kind], default_value=None)
    parsed_dict = tf.parse_single_example(example, feature_spec)
    # Split the label off so that only input features remain in the dict.
    parsed_label = parsed_dict.pop(target_column_name)
    return parsed_dict, parsed_label

# The first argument of list_files() is a file pattern (a glob), intended for
# reading many similarly structured files at once; here it is the single file
# named by tf_filename.
files = tf.data.Dataset.list_files(tf_filename, shuffle=True)

# Pipeline: open each listed file as a TFRecordDataset through interleave
# (cycle_length=1, i.e. one file at a time), decode every record with
# _parser_, then group the decoded records into batches of 100.
dataset = (
    files
    .interleave(lambda record_path: tf.data.TFRecordDataset(filenames=record_path),
                cycle_length=1)
    # .shuffle(buffer_size=1000)  # record-level shuffling, currently disabled
    .map(map_func=_parser_)
    .batch(100)
)

print('dataset loaded!')

dataset loaded!


# dataset转化为DataFrame

In [59]:
def _batch_to_frame(single_batch):
    """Turn one ``(features, labels)`` batch into a DataFrame with a 'labels' column."""
    batch_features, batch_labels = single_batch
    batch_data = pd.DataFrame()
    # Columns are assigned one by one to keep the dict's iteration order.
    for name, tensor in batch_features.items():
        batch_data[name] = tensor.numpy()
    batch_data['labels'] = batch_labels.numpy()
    return batch_data


def convertDatasetToDataframe(dataset, batch_cnt=None):
    """Convert the leading batches of ``dataset`` into one pandas DataFrame.

    Args:
        dataset: object whose ``take(n)`` yields ``(features, labels)`` pairs,
            where ``features`` maps column names to values exposing
            ``.numpy()`` and ``labels`` exposes ``.numpy()`` as well.
        batch_cnt: number of batches to convert.
            ``None`` or ``0``: only the first batch.
            ``N > 0``: the first N batches.
            ``N < 0``: no batch; an empty frame with the right columns.

    Returns:
        A DataFrame with one column per feature plus a 'labels' column. Row
        indices restart at 0 for every batch because the per-batch frames are
        concatenated as they are.

    Raises:
        IndexError: if ``dataset`` yields no batch at all.
    """
    # Only the first batch is needed to learn the feature names; the previous
    # take(-1) read the entire dataset for this.
    first_features = list(dataset.take(1))[0][0]
    total_data = pd.DataFrame(columns=list(first_features) + ['labels'])

    # `is None` is tested first so that None is never compared with 0 using
    # `>`, which raises TypeError on Python 3.
    if batch_cnt is None or batch_cnt == 0:
        print('batch数未指定，将默认输出第一个batch转化后的Dataframe.')
        wanted = 1
    elif batch_cnt > 0:
        print('输出前 ' + str(batch_cnt) + ' 个batch转化后的Dataframe')
        wanted = batch_cnt
    else:
        return total_data

    frames = [_batch_to_frame(single_batch) for single_batch in dataset.take(wanted)]
    # The empty frame goes first so that the column order is fixed by it.
    return pd.concat([total_data] + frames, axis=0)
    
# Convert the first two batches of the dataset into a single DataFrame; as the
# last expression of the cell, the returned frame is displayed by the notebook.
convertDatasetToDataframe(dataset,2)

输出前 2 个batch转化后的Dataframe


Unnamed: 0,sepal width (cm),petal length (cm),petal width (cm),sepal length (cm),labels
0,3.5,1.4,0.2,5.1,0
1,3.0,1.4,0.2,4.9,0
2,3.2,1.3,0.2,4.7,0
3,3.1,1.5,0.2,4.6,0
4,3.6,1.4,0.2,5.0,0
5,3.9,1.7,0.4,5.4,0
6,3.4,1.4,0.3,4.6,0
7,3.4,1.5,0.2,5.0,0
8,2.9,1.4,0.2,4.4,0
9,3.1,1.5,0.1,4.9,0
