In [2]:
import tensorflow as tf
import pandas as pd

##### libsvm数据介绍：
&emsp;&emsp;**Label feat_id1:value1 feat_id2:value2 ..**

&emsp;&emsp;label 是样本的类别标签，feat_id1 是第一个非零特征的编号，value1 是该特征的取值。

&emsp;&emsp;libsvm格式会省略特征值为0的项，只保留特征值不为0的特征。在训练中，类别特征经过one-hot编码后是一个稀疏矩阵，按稠密方式存储非常占内存，改用libsvm格式存储可以降低存储空间。feat_id是从1开始的序号。

#### 举例说明
&emsp;&emsp;假设数据有sex、status、age三列，其中sex、status是类别列，age是连续数值列；sex的取值为(man,woman,other)，status的取值为(married,single,other)。

In [3]:
# Toy dataset, written column by column: `sex` and `status` are categorical
# features, `age` is a continuous numeric feature. `columns=` pins the column
# order explicitly.
df = pd.DataFrame(
    {
        'sex': ['man', 'man', 'man', 'woman', 'other'],
        'status': ['married', 'single', 'other', 'married', 'married'],
        'age': [33, 23, 43, 28, 65],
    },
    columns=['sex', 'status', 'age'],
)

In [6]:
# Show the toy frame (5 rows: two categorical columns plus the numeric `age`).
df

Unnamed: 0,sex,status,age
0,man,married,33
1,man,single,23
2,man,other,43
3,woman,married,28
4,other,married,65


In [5]:
# One-hot encode the categorical columns (`sex`, `status`); the numeric `age`
# column is passed through unchanged.
# dtype=int forces 0/1 integer indicators. pandas >= 2.0 defaults to bool
# (True/False), which would not match the 0/1 table shown below nor the numeric
# values a libsvm line needs.
one_hot_df = pd.get_dummies(df, dtype=int)

In [7]:
# Show the encoded frame: `age` plus one indicator column per category value
# (sex_man, sex_other, sex_woman, status_married, status_other, status_single).
one_hot_df

Unnamed: 0,age,sex_man,sex_other,sex_woman,status_married,status_other,status_single
0,33,1,0,0,1,0,0
1,23,1,0,0,0,0,1
2,43,1,0,0,0,1,0
3,28,0,0,1,1,0,0
4,65,0,1,0,1,0,0


In [9]:
# Append a constant `label` column to the one-hot frame (every sample is
# assumed to have label 1 for this walkthrough), then display the result.
one_hot_df.loc[:, 'label'] = 1
one_hot_df

Unnamed: 0,age,sex_man,sex_other,sex_woman,status_married,status_other,status_single,label
0,33,1,0,0,1,0,0,1
1,23,1,0,0,0,0,1,1
2,43,1,0,0,0,1,0,1
3,28,0,0,1,1,0,0,1
4,65,0,1,0,1,0,0,1


&emsp;&emsp;从上面的例子中可以看出，经过one-hot之后数据很稀疏，每一行只有三个特征列的值不为0（age、一个sex列、一个status列）。将特征列按顺序从1开始编号（label列不参与编号），即age:1,sex_man:2,sex_other:3,sex_woman:4,status_married:5,status_other:6,status_single:7。

转换后的数据应该如下：

In [11]:
# Sample libsvm rows used for the parsing walkthrough. Each has the form
# "<label> <feat_id>:<value> ..." and lists only the non-zero features.
line1, line2 = '1 1:33 2:1 5:1', '1 1:23 2:1 7:1'

In [13]:
# Next: how to tokenise one libsvm line with TensorFlow string ops.
# `sess` is a notebook-level TF1 session reused by the following cells.
sess = tf.Session()
# Split the single input line on spaces; the first token is the label and the
# remaining tokens are "feat_id:value" pairs.
columns = tf.string_split([line1],' ')
# Evaluate the split to inspect the resulting sparse structure.
sess.run(columns)

SparseTensorValue(indices=array([[0, 0],
       [0, 1],
       [0, 2],
       [0, 3]]), values=array([b'1', b'1:33', b'2:1', b'5:1'], dtype=object), dense_shape=array([1, 4]))

In [14]:
# tf.string_split produces a sparse result (sess.run shows it as a
# SparseTensorValue); its `.values` is the flat vector of tokens, so
# index 0 is the label token.
sess.run(columns.values[0])

b'1'

In [15]:
# Extract the label (first token) and convert it from string to float32.
labels = tf.string_to_number(columns.values[0],out_type=tf.float32)
sess.run(labels)

1.0

In [16]:
# Take every token after the label and split each "feat_id:value" token on ':'.
# The result is again sparse: one row per token, holding its id and its value.
splits = tf.string_split(columns.values[1:],':')
sess.run(splits)

SparseTensorValue(indices=array([[0, 0],
       [0, 1],
       [1, 0],
       [1, 1],
       [2, 0],
       [2, 1]]), values=array([b'1', b'33', b'2', b'1', b'5', b'1'], dtype=object), dense_shape=array([3, 2]))

In [17]:
# Reshape the flat token values into a dense matrix using the sparse result's
# dense_shape ([3, 2] for this line), giving one [feat_id, value] row per feature.
# NOTE(review): this relies on every token splitting into exactly two parts
# (one ':' per token) -- confirm for real input data.
id_vals = tf.reshape(splits.values,splits.dense_shape)
sess.run(id_vals)

array([[b'1', b'33'],
       [b'2', b'1'],
       [b'5', b'1']], dtype=object)

In [22]:
# In `id_vals`, column 0 holds the feature id and column 1 holds the value.
# Splitting into two pieces along axis=1 yields two single-column tensors.
feat_ids, feat_vals = tf.split(id_vals, 2, axis=1)
# Evaluate both halves in one run and print them separated by a '###' line.
print(*sess.run([feat_ids, feat_vals]), sep='\n###\n')

[[b'1']
 [b'2']
 [b'5']]
###
[[b'33']
 [b'1']
 [b'1']]


In [23]:
# Convert the string ids to int32 and the string values to float32.
# Both are derived from `id_vals` (column 0 = id, column 1 = value) rather than
# from the previous `feat_ids` / `feat_vals`. That way the cell never feeds an
# already-converted numeric tensor back into string_to_number and can be
# re-run safely. The `:1` / `1:` slices keep the column axis, so each result
# is still a single-column matrix.
feat_ids = tf.string_to_number(id_vals[:, :1], out_type=tf.int32)
feat_vals = tf.string_to_number(id_vals[:, 1:], out_type=tf.float32)
print(sess.run(feat_ids))
print('##################')
print(sess.run(feat_vals))

[[1]
 [2]
 [5]]
##################
[[33.]
 [ 1.]
 [ 1.]]


In [26]:
# Demo: look up embedding rows by feature id and scale them by feature value.
# The session is bound to its own name (`emb_sess`) so the notebook-level
# `sess` is not rebound to a session that is closed when this block exits;
# earlier cells that call `sess.run(...)` therefore stay re-runnable.
with tf.Session() as emb_sess:
    # Embedding table: 4 rows (one per feature id), 3 dimensions each.
    Feat_Emb = tf.constant([[1.0,2.0,3.0],[4.0,5.0,6.0],[7.0,8.0,9.0],[10.0,11.0,12.0]])
    # Two samples with two feature ids each, given with a trailing size-1 axis.
    feat_ids = tf.constant([[[0],[2]],[[1],[3]]])
    print('feat_ids:'+ str(emb_sess.run(feat_ids)))
    # Drop the trailing axis: one row of two ids per sample.
    feat_ids = tf.reshape(feat_ids,shape=[-1,2])
    print('feat_ids:'+str(emb_sess.run(feat_ids)))
    # Gather one embedding row per id; the result has a 3-vector for each id.
    embeddings = tf.nn.embedding_lookup(Feat_Emb, feat_ids)
    print('embeddings: '+str(emb_sess.run(embeddings)))
    # Per-feature values, one scalar per (sample, feature).
    feat_vals = tf.constant([[[0.1],[0.2]],[[0.5],[0.6]]])
    print('feat_vals: ' + str(emb_sess.run(feat_vals)))
    # Keep the trailing size-1 axis so each scalar broadcasts across the
    # 3 embedding dimensions in the multiply below.
    feat_vals = tf.reshape(feat_vals,shape=[-1,2,1])
    # Element-wise product: every embedding vector scaled by its feature value.
    xx = tf.multiply(embeddings,feat_vals)
    print('xx:' + str(emb_sess.run(xx)))

feat_ids:[[[0]
  [2]]

 [[1]
  [3]]]
feat_ids:[[0 2]
 [1 3]]
embeddings: [[[ 1.  2.  3.]
  [ 7.  8.  9.]]

 [[ 4.  5.  6.]
  [10. 11. 12.]]]
feat_vals: [[[0.1]
  [0.2]]

 [[0.5]
  [0.6]]]
xx:[[[0.1       0.2       0.3      ]
  [1.4       1.6       1.8000001]]

 [[2.        2.5       3.       ]
  [6.        6.6000004 7.2000003]]]
