In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import platform as plat
import os
import time

import keras as kr
import numpy as np
import random
import pdb

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Reshape, BatchNormalization # , Flatten
from keras.layers import Lambda, TimeDistributed, Activation,Conv2D, MaxPooling2D,GRU, Bidirectional #, Merge
from keras.layers.merge import add, concatenate
from keras import backend as K
from keras.optimizers import SGD, Adadelta, Adam

Using TensorFlow backend.


## 搭建模型

In [23]:
class SpeechRecognitionModel():
    '''
    CNN + bidirectional-GRU acoustic model trained with a CTC loss (Keras functional API).

    Input layer:  feature sequences shaped (time, AUDIO_FEATURE_LENGTH, 1). The time axis is
                  declared as None, so the network itself does not fix the utterance length.
    Hidden layers: 3x3 convolutions, each followed by batch normalisation, grouped into
                  stages that end in a 2x2 max-pool; then a dense layer, two bidirectional
                  GRU layers and another dense layer.
    Output layer: dense layer with self.MS_OUTPUT_SIZE units followed by softmax.
    CTC layer:    a Lambda layer that evaluates K.ctc_batch_cost and is used as the training loss.

    Attributes:
        model:     Model mapping the audio features to the softmax output (for inference/decoding).
        ctc_model: Compiled Model taking [features, labels, input_length, label_length] and
                   returning the CTC loss (for training).
    '''

    # (filters, number of Conv2D+BatchNormalization pairs) per stage.
    # Every stage is followed by one 2x2 max-pool, so both the time axis and the
    # feature axis shrink by a factor of 2 ** len(_CONV_STAGES).
    _CONV_STAGES = ((32, 2), (64, 4), (128, 6))

    def __init__(self):
        '''
        Set the model dimensions and build both models.

        The default output size is 1423 pinyin symbols + 1 blank symbol.
        '''
        self.MS_OUTPUT_SIZE = 1423 + 1  # size of each per-timestep output vector
        self.label_max_string_length = 64  # stored for callers; not referenced when building the network
        self.AUDIO_LENGTH = 1600  # intended max utterance length (about 16 s per the original notes); not referenced when building the network
        self.AUDIO_FEATURE_LENGTH = 200

        self.model, self.ctc_model = self._model_init()

    def _flattened_feature_size(self):
        '''
        Return the per-timestep feature width after the convolutional stack.

        Each 'valid' 2x2 max-pool floors the feature axis by 2, and nested floor
        divisions by 2 equal a single floor division by 2 ** stages. The result is
        that pooled width times the filter count of the last stage
        (200 // 8 * 128 = 3200 for the default configuration).
        '''
        pool_factor = 2 ** len(self._CONV_STAGES)
        last_stage_filters = self._CONV_STAGES[-1][0]
        return (self.AUDIO_FEATURE_LENGTH // pool_factor) * last_stage_filters

    @staticmethod
    def _conv_bn(x, filters, use_bias=True):
        '''Apply one 3x3 ReLU convolution ('same' padding, he_normal init) followed by batch normalisation.'''
        x = Conv2D(filters, (3, 3), use_bias=use_bias, activation='relu', padding='same', kernel_initializer='he_normal')(x)
        return BatchNormalization()(x)

    def _model_init(self):
        '''
        Build the network.

        Returns:
            (model_data, ctc_model): the softmax inference model and the compiled
            CTC training model; both share the same layers.
        '''
        input_data = Input(name="the_inputs", shape=(None, self.AUDIO_FEATURE_LENGTH, 1))

        # Convolutional front end. Only the very first convolution is created without a bias term.
        x = input_data
        first_conv = True
        for filters, repeats in self._CONV_STAGES:
            for _ in range(repeats):
                x = self._conv_bn(x, filters, use_bias=not first_conv)
                first_conv = False
            x = MaxPooling2D(pool_size=2, strides=None, padding='valid')(x)

        # Merge the (pooled feature, channel) axes into one vector per time step.
        # The width is derived from AUDIO_FEATURE_LENGTH instead of being hard-coded,
        # so changing the feature length cannot silently mismatch the reshape.
        x = Reshape(target_shape=(-1, self._flattened_feature_size()))(x)

        x = Dropout(0.2)(x)
        x = Dense(128, activation='relu', use_bias=True, kernel_initializer='he_normal')(x)
        x = BatchNormalization()(x)

        gru_units = 128
        # Bidirectional GRU layers - an experiment to see whether they improve accuracy.
        x = Bidirectional(GRU(gru_units, return_sequences=True, kernel_initializer='he_normal', name='gru_1'))(x)
        x = BatchNormalization()(x)

        x = Bidirectional(GRU(gru_units, return_sequences=True, kernel_initializer='he_normal', name='gru_2'))(x)
        x = BatchNormalization()(x)

        x = Dense(128, activation='relu', use_bias=True, kernel_initializer='he_normal')(x)
        x = BatchNormalization()(x)

        x = Dense(self.MS_OUTPUT_SIZE, use_bias=True, kernel_initializer='he_normal')(x)

        y_pred = Activation('softmax', name='y_pred_activation')(x)
        model_data = Model(inputs=input_data, outputs=y_pred)

        # Extra inputs needed only to evaluate the CTC loss during training.
        # NOTE(review): input_length presumably has to count y_pred time steps, i.e. the
        # input frames after the pooling above - confirm against the data generator.
        labels = Input(name='the_labels', shape=[None], dtype='float32')
        input_length = Input(name='input_length', shape=[1], dtype='int64')
        label_length = Input(name='label_length', shape=[1], dtype='int64')
        loss_out = Lambda(self._ctc_batch_cost_func, output_shape=(1,), name='ctc_loss')([y_pred, labels, input_length, label_length])

        ctc_model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)
        ctc_model.summary()

        # Note: 10e-8 is 1e-7.
        optimizer = Adam(learning_rate=0.0003, beta_1=0.9, beta_2=0.999, decay=0.0, epsilon=10e-8)
        # The model output already is the loss, so the "loss function" just passes it through.
        # NOTE(review): 'acc' is therefore computed between the targets fed for 'ctc_loss' and
        # the loss value itself; it is not a recognition accuracy. Kept so the logged metric
        # names stay unchanged.
        ctc_model.compile(optimizer=optimizer, loss={'ctc_loss': lambda y_true, y_pred: y_pred}, metrics=['acc'])

        print('[*Info] Create Model Successful, Compiles Model Successful. ')
        return model_data, ctc_model

    def _ctc_batch_cost_func(self, args):
        '''
        Lambda-layer body: compute the CTC loss.

        Args:
            args: list [y_pred, labels, input_length, label_length] in that order.

        Returns:
            The tensor returned by K.ctc_batch_cost.
        '''
        y_pred, labels, input_length, label_length = args
        return K.ctc_batch_cost(labels, y_pred, input_length, label_length)
        

In [None]:
import tensorflow as tf

# with tf.Session() as s:




## 训练模型

In [20]:
from speech_data import *

# Smoke test of the data pipeline: pull a single batch and inspect its labels.
# batch_size is defined here so the cell also runs on a fresh kernel; previously it
# relied on the value leaking in from the training cell further down.
batch_size = 4

wav_list = get_thchs30_data(wav_base_path="H:\\PycharmProjects\\dataset\\data_thchs30\\train-test")
batch = data_generator(wav_list, batch_size)
# NOTE(review): despite its name, y_pred holds the 'the_labels' entry of the first
# element yielded by the generator, not a model prediction.
y_pred = next(batch)[0]['the_labels']


{'ctc': <function __main__.<lambda>(y_true, y_pred)>}

In [None]:
from speech_data import *

model = SpeechRecognitionModel()

# Training hyper-parameters. `epochs` is now the single source of truth for the
# fit call below (it used to be declared as 5 while the call hard-coded 8).
epochs = 8
batch_size = 4

# Root of the THCHS-30 dataset; edit this one line to point at a different location.
THCHS30_ROOT = "H:\\PycharmProjects\\dataset\\data_thchs30"

train_wav_list = get_thchs30_data(wav_base_path=THCHS30_ROOT + "\\train")
validation_wav_list = get_thchs30_data(wav_base_path=THCHS30_ROOT + "\\dev")
# Number of whole batches per pass; a trailing partial batch is not counted.
batch_num = len(train_wav_list) // batch_size
val_batch_num = len(validation_wav_list) // batch_size

train_batch = data_generator(train_wav_list, batch_size)
# NOTE(review): validation_batch / val_batch_num are prepared but not passed to
# fit_generator, so no validation is run. Wire them in via validation_data /
# validation_steps once it is confirmed that data_generator can be iterated
# for more than one pass.
validation_batch = data_generator(validation_wav_list, batch_size)

history = model.ctc_model.fit_generator(train_batch,
                                        steps_per_epoch=batch_num,
                                        epochs=epochs
                                       )

# for i in range(epochs):
#     print("Begin epoch:", i+1)
#     train_batch = data_generator(train_wav_list, batch_size)
#     history = model.ctc_model.fit_generator(train_batch, steps_per_epoch=batch_num, epochs=1)
#     print(history)


y_pred: Tensor("y_pred_activation_11/truediv:0", shape=(?, ?, 1424), dtype=float32)
labels Tensor("the_labels_11:0", shape=(?, ?), dtype=float32)
Model: "model_24"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
the_inputs (InputLayer)         (None, None, 200, 1) 0                                            
__________________________________________________________________________________________________
conv2d_133 (Conv2D)             (None, None, 200, 32 288         the_inputs[0][0]                 
__________________________________________________________________________________________________
batch_normalization_177 (BatchN (None, None, 200, 32 128         conv2d_133[0][0]                 
__________________________________________________________________________________________________
conv2d_134 (Conv2D)             (None, None,

[*Info] Create Model Successful, Compiles Model Successful. 
Epoch 1/8