### 加载h5py 模块
* 利用h5py.File()函数读取h5文件，文件结构类似dict
* 利用keys()方法获取File的所有键（key）
* 通过键索引File，得到对应的值——Dataset对象

In [1]:
import h5py

# Open the HDF5 file read-only. The handle is left open because later cells
# read the 'sen1' and 'sen2' datasets through it.
f = h5py.File('E:/Alibaba German AI Challenge/round1_test_a_20181109.h5','r')

# Show the type of the object h5py returns for an opened file.
type(f)

h5py._hl.files.File

In [2]:
# List the names of the top-level entries stored in the file.
list(f.keys())

['sen1', 'sen2']

In [3]:
# Index the file by key, like a dict, to get the two dataset objects.
sen1 = f['sen1']
sen2 = f['sen2']
type(sen1)

h5py._hl.dataset.Dataset

In [4]:
# Both datasets share the first three axes and differ only in the last one.
sen1.shape,sen2.shape

((4838, 32, 32, 8), (4838, 32, 32, 10))

In [5]:
# Total number of elements, i.e. the product of the shape entries.
sen1.size

39632896

In [6]:
# Element type of the stored values.
sen1.dtype

dtype('float64')

##### dataset类型的属性
* shape
* size
* dtype

In [7]:
import numpy as np

# Convert the h5py dataset into an in-memory NumPy array; the shape is unchanged.
s1 = np.array(sen1)
s1.shape

(4838, 32, 32, 8)

##### load validation data

In [1]:
import h5py

# Open the validation file read-only; `f` now refers to this file.
f = h5py.File('E:/Alibaba German AI Challenge/origin_DATA/validation.h5','r')

type(f)

h5py._hl.files.File

In [2]:
# Unlike the test file, the validation file also carries a 'label' dataset.
list(f.keys())

['label', 'sen1', 'sen2']

In [2]:
import numpy as np
# Load the two input datasets and the labels into memory as NumPy arrays.
s1 = np.array(f['sen1'])
s2 = np.array(f['sen2'])
y = np.array(f['label'])
# All three share the same number of samples along axis 0.
s1.shape,s2.shape,y.shape

((24119, 32, 32, 8), (24119, 32, 32, 10), (24119, 17))

In [3]:
# Flatten every sample of s1 and s2 to one row and place the two side by side:
# each row of x is [ s1[i].flatten() | s2[i].flatten() ].
# A single reshape + hstack produces exactly what the former per-sample
# Python loop built, without tens of thousands of small array allocations.
n_samples = s1.shape[0]
x = np.hstack((s1.reshape(n_samples, -1), s2.reshape(n_samples, -1)))
x.shape

(24119, 18432)

In [4]:
# Append the label columns to the right of the features, so the last
# y.shape[1] columns of every row hold that sample's label vector.
data = np.hstack((x,y))
data.shape

(24119, 18449)

##### CNN demo

In [1]:
# start tensorflow interactiveSession                           
import tensorflow as tf
import pandas as pd
import numpy as np
from PIL import Image
import random
import h5py

filename = 'E:/Alibaba German AI Challenge/origin_DATA/validation.h5'
f = h5py.File(filename,'r')
print('Get the h5 file')

# Read the two input datasets and the labels fully into memory.
s1 = np.array(f['sen1'])
s2 = np.array(f['sen2'])
y = np.array(f['label'])

# Build one row per sample with the column layout
#   [ s1[i].flatten() | s2[i].flatten() | y[i] ]
# in a single vectorized step. This yields the same matrix as flattening the
# samples one by one in a Python loop and then appending the labels, but it
# avoids the per-sample allocations and the large intermediate feature copy.
n_samples = s1.shape[0]
data = np.hstack((
    s1.reshape(n_samples, -1),
    s2.reshape(n_samples, -1),
    y,
))
print('The shape of data is ',data.shape)

# Interactive session: the later `.eval(...)` / `.run(...)` calls on tensors and
# ops are made without passing a session explicitly and rely on this one.
sess = tf.InteractiveSession()

#####################################################     Net Define     ##################################################### 

# weight initialization
def weight_variable(shape):
    """Return a ``tf.Variable`` of ``shape`` drawn from a truncated normal (stddev 0.1)."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
    """Return a ``tf.Variable`` of ``shape`` whose entries all start at 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))

# convolution
def conv2d(x, W):
    """Convolve ``x`` with kernel ``W`` using stride 1 in every dimension and 'SAME' padding."""
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')
# pooling
def max_pool_2x2(x):
    """Max-pool ``x`` over 2x2 windows with stride 2 and 'SAME' padding."""
    window = [1, 2, 2, 1]
    step = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=step, padding='SAME')

# Create the model
# placeholder
# x: one flattened sample per row, laid out as
#    [ sen1 patch flattened (32*32*8) | sen2 patch flattened (32*32*10) ].
x = tf.placeholder("float", [None, 18432])
# y_: label vectors with 17 entries per sample.
y_ = tf.placeholder("float", [None, 17])


# first convolutional layer
w_conv1 = weight_variable([5, 5, 18, 32])
b_conv1 = bias_variable([32])

# Rebuild the 32x32 image with 18 channels from the flat row.
# The two sources were flattened separately and then placed side by side, so
# reshaping the whole row straight to [-1, 32, 32, 18] would mix values from
# different pixels and channels. Instead, restore each source to its own
# (32, 32, C) layout and stack them along the channel axis.
n_sen1 = 32 * 32 * 8
sen1_image = tf.reshape(x[:, :n_sen1], [-1, 32, 32, 8])
sen2_image = tf.reshape(x[:, n_sen1:], [-1, 32, 32, 10])
x_image = tf.concat([sen1_image, sen2_image], axis=3)

h_conv1 = tf.nn.relu(conv2d(x_image, w_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)

# second convolutional layer
# 5x5 kernels taking the 32 feature maps of the first layer to 64.
w_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])

h_conv2 = tf.nn.relu(conv2d(h_pool1, w_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)

# densely connected layer
# Two stride-2 poolings reduce the 32x32 input to 8x8, with 64 channels.
w_fc1 = weight_variable([8*8*64, 1024])
b_fc1 = bias_variable([1024])

h_pool2_flat = tf.reshape(h_pool2, [-1, 8*8*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, w_fc1) + b_fc1)

# dropout
# keep_prob is fed at run time: 0.5 for training steps, 1.0 for evaluation.
keep_prob = tf.placeholder("float")
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

# readout layer
# Maps the 1024 hidden units to softmax outputs over the 17 classes.
w_fc2 = weight_variable([1024, 17])
b_fc2 = bias_variable([17])

y_conv = tf.nn.softmax(tf.matmul(h_fc1_drop, w_fc2) + b_fc2)

# train and evaluate the model
# Cross-entropy is used as the loss function.
# delta is added inside the log so its argument stays positive even when a
# softmax output is exactly 0.
delta = 1e-7
# The loss is summed (not averaged) over every sample and class in the batch.
cross_entropy = -tf.reduce_sum(y_*tf.log(y_conv+delta))
train_step = tf.train.GradientDescentOptimizer(1e-4).minimize(cross_entropy)
# A prediction is correct when its arg-max class equals the label's arg-max.
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
# Initialise every variable created above before any training step runs.
sess.run(tf.global_variables_initializer())


#####################################################       Train     ##################################################### 


## Randomly draw a subset of rows to use as one mini-batch
def get_batch(data, batch_size, n_labels=17):
    """Draw a random mini-batch, without replacement, from ``data``.

    Args:
        data: 2-D array-like with one sample per row; the last ``n_labels``
            columns of each row are the label vector and everything before
            them is the feature vector.
        batch_size: number of distinct rows to draw.
        n_labels: number of trailing label columns (positive, default 17).

    Returns:
        ``(train_x, train_y)``: the feature columns and the label columns of
        the sampled rows, both with ``batch_size`` rows.

    Raises:
        ValueError: if ``batch_size`` exceeds the number of rows (raised by
            ``random.sample``).
    """
    rows = np.asarray(data)
    # Sample row *indices* instead of rows: converting the whole matrix to a
    # Python list on every call creates one view object per row just to pick
    # a handful of them. For the same RNG state this selects the same rows.
    picked = random.sample(range(len(rows)), batch_size)
    sample = rows[picked]
    train_x = sample[:, :-n_labels]
    train_y = sample[:, -n_labels:]

    return train_x, train_y


batch_size = 200
for i in range(64000):
    # Draw a fresh random mini-batch for this step.
    batch_x,batch_y = get_batch(data,batch_size)
    # Every 200 steps, report accuracy on the current batch with dropout
    # disabled (keep_prob 1.0), before this step's update is applied.
    if i%200 == 0:
        train_accuracy = accuracy.eval(feed_dict={x:batch_x, y_:batch_y, keep_prob:1.0})
        print ("step %d, train accuracy %g" %(i, train_accuracy))
    # One optimizer step on the batch with dropout keep probability 0.5.
    train_step.run(feed_dict={x:batch_x, y_:batch_y, keep_prob:0.5})

# NOTE(review): the line below looks like a leftover from an MNIST example;
# `mnist` is not defined in the visible cells, so no held-out evaluation runs.
#print ("test accuracy %g" % accuracy.eval(feed_dict={x:mnist.test.images, y_:mnist.test.labels, keep_prob:1.0}))

Get the h5 file
The shape of data is  (24119, 18449)


TypeError: unhashable type: 'numpy.ndarray'

In [2]:
# Rebind `filename` and `f` to the unlabeled test file; from here on `f` no
# longer refers to the validation file.
filename = 'E:/Alibaba German AI Challenge/origin_DATA/round1_test_a_20181109.h5'
f = h5py.File(filename,'r')
list(f.keys())

['sen1', 'sen2']

In [3]:
test_s1 = f['sen1']
test_s2 = f['sen2']

# Read each dataset into memory once and flatten every sample to one row, so
# each row of `test` is [ sen1 sample flattened | sen2 sample flattened ] --
# the same layout the per-sample loop produced, without one HDF5 read and
# several small allocations per sample.
n_test = test_s1.shape[0]
test = np.hstack((
    np.array(test_s1).reshape(n_test, -1),
    np.array(test_s2).reshape(n_test, -1),
))
test.shape

(4838, 18432)

In [4]:
# All-zero stand-in labels with one 17-wide row per test sample; the test file
# has no 'label' dataset.
test_y = np.zeros((test.shape[0],17))
test_y.shape

(4838, 17)

In [6]:
# Predicted class index = position of the largest softmax output.
pred = tf.argmax(y_conv, 1)

# Run the network over the test set in consecutive chunks of 1500 rows (the
# last chunk holds whatever remains) instead of feeding everything at once,
# then join the per-chunk predictions in their original order.
chunk_size = 1500
chunk_preds = []
for start in range(0, test.shape[0], chunk_size):
    stop = start + chunk_size
    chunk_preds.append(
        pred.eval(feed_dict={x: test[start:stop], y_: test_y[start:stop], keep_prob: 1.0})
    )

P = np.hstack(chunk_preds)
P

array([16,  2, 13, ...,  8, 13, 16], dtype=int64)

In [7]:
# One predicted class index per test sample.
P.shape

(4838,)

In [11]:
# One-hot encode the predicted class indices over 17 classes. This only builds
# a tensor; its values are obtained later by running it in the session.
one_hot=tf.one_hot(P,17)
one_hot.shape

TensorShape([Dimension(4838), Dimension(17)])

In [12]:
# Still a symbolic tensor at this point, not an array of values.
one_hot

<tf.Tensor 'one_hot_1:0' shape=(4838, 17) dtype=float32>

In [20]:
# Evaluate the one-hot tensor in the session to get its concrete values.
Pred_one_hot = sess.run(one_hot)

In [15]:
# One 17-wide row per test sample.
Pred_one_hot.shape

(4838, 17)

In [21]:
# Wrap the one-hot predictions in a DataFrame with columns named 0..16 and
# preview the first rows.
out = pd.DataFrame(Pred_one_hot, columns = list(range(17)))
out.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Cast the float one-hot values to integers so they appear as 0/1 rather than
# 0.0/1.0.
Pred_one_hot = Pred_one_hot.astype(np.int32)

In [23]:
# Rebuild the DataFrame from the integer-typed predictions and preview it.
out = pd.DataFrame(Pred_one_hot, columns = list(range(17)))
out.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [24]:
# Sanity check: one row per test sample, one column per class.
out.shape

(4838, 17)

In [25]:
# Write the predictions as bare CSV: no index column and no header row.
out.to_csv('first_64k_vali_as_train.csv', index = False, header = False)