# Train Deep Speech neural net on text data

1. Global set up - setting up logging and root directory for including the modules, files etc.
2. Load Simple Wikipedia, clean up the data, transform into idx-s and one-hot vectors.
3. Initialize training and validation datasets.
4. Run training.

## 1. Global setup
Set up logging and paths

In [1]:
# Run the project's shared setup script (logging and paths) inside this
# notebook's namespace.
# NOTE(review): a missing file is reported as "setup already completed" --
# presumably the script changes the working directory on its first run, so a
# second run of this cell no longer finds it; confirm.
try:
    setupfile = open("global_setup.py")
except FileNotFoundError:
    print('Setup already completed')
else:
    # Execute outside the `try` so that a FileNotFoundError raised by the
    # setup script itself is not misreported as "already completed".
    with setupfile:
        exec(setupfile.read())

## 2. Import the required packages

In [2]:
import random
import tensorflow as tf
from keras.utils import to_categorical
import numpy as np
from nltk import tokenize # Text to sentences
import pandas as pd # Train and Validation data generation
import re
import pprint
from src.wikipedia import Wikipedia
#random.seed(12345)

## 4. Load the Simple Wikipedia

In [3]:
# Load the Simple Wikipedia corpus through the project's wrapper class.
# NOTE(review): the meaning of cache_directory_url=False is defined in
# src.wikipedia and is not visible here -- confirm.
wikipedia = Wikipedia(language="simple", cache_directory_url=False)

simplewiki-latest-pages-articles-multistream.xml.bz2
Loading parsed documents.
Loading preprocessed documents.
Wikipedia loaded.


## 5. Clean-up the data

In [4]:
# Cleaning up simple wikipedia texts

# Markup leftovers to delete: image options such as "thumb|left|200px|",
# lines that begin with "|", and a "REDIRECT" keyword at the start of a line.
# NOTE(review): DOTALL lets ".+" in the second alternative run across line
# breaks, so a match can extend far beyond the end of the "|" line -- confirm
# that this is intended.
pattern_ignored_words = re.compile(
    r"""
    (?:(?:thumb|thumbnail|left|right|\d+px|upright(?:=[0-9\.]+)?)\|)+
    |^\s*\|.+$
    |^REDIRECT\b""",
    flags=re.DOTALL | re.UNICODE | re.VERBOSE | re.MULTILINE)
# Any run of line breaks and spaces is collapsed into a single space.
pattern_new_lines = re.compile('[\n\r ]+', re.UNICODE)


def _clean_text(raw_text):
    """Strip markup leftovers from one article text and normalise its whitespace."""
    cleaned = pattern_ignored_words.sub('', raw_text)
    cleaned = pattern_new_lines.sub(' ', cleaned)
    # Drop backslashes, then turn non-breaking spaces into ordinary spaces.
    return cleaned.replace("\\", "").replace("\xa0", " ")


# Documents are addressed by position, exactly as wikipedia.documents exposes them.
texts = [
    _clean_text(wikipedia.documents[doc_no].text)
    for doc_no in range(len(wikipedia.documents))
]

## 6. Divide into sentences

In [26]:
# Split every cleaned article into sentences with NLTK's sentence tokenizer and
# collect the results in one flat list (the per-article grouping is not kept).
sentences = [
    sentence
    for text in texts
    for sentence in tokenize.sent_tokenize(text)
]

In [6]:
# Preview the first three sentences.
pprint.pprint(sentences[:3])

['The Month Spring flowers in April in the Northern Hemisphere.',
 'April comes between March and May, making it the fourth month of the year.',
 'It also comes first in the year out of the four months that have 30 days, as '
 'June, September and November are later in the year.']


## 7. Clean-up sentences and remove too long and short ones

Median sentence length is 83 symbols. We remove the sentences shorter than 20 symbols or longer than 100 symbols to clean up the dataset.<br><br>
We also remove the sentences starting with "Category:", "Related pages", "References" or "Other websites".<br>
These are technical Wikipedia elements that we do not need. TODO: check for more such prefixes, e.g. "Gallery".

In [27]:
# Keep only sentences of a reasonable length that do not start with Wikipedia
# boilerplate.
MIN_SENTENCE_LENGTH = 20
MAX_SENTENCE_LENGTH = 100
# str.startswith accepts a tuple and returns True if any prefix matches.
IGNORED_PREFIXES = ("Category:", "Related pages", "References", "Other websites")
# TODO: Gallery - do something?

print(len(sentences))  # number of sentences before filtering
# Build the filtered list in a single pass; removing items one by one with
# list.pop(i) would be quadratic on a list of this size.
sentences = [
    sentence for sentence in sentences
    if MIN_SENTENCE_LENGTH <= len(sentence) <= MAX_SENTENCE_LENGTH
    and not sentence.startswith(IGNORED_PREFIXES)
]
print(len(sentences))  # number of sentences after filtering

1142223
648280


In [8]:
# Spot-check five consecutive sentences from deep inside the filtered list.
pprint.pprint(sentences[530000:530005])

['McCarty later explained the work for the general reader.',
 'Uses Propene is produced from fossil fuels, and from coal.',
 'Propene is the second most important product used in the petrochemical '
 'industry, after Ethene.',
 'About two thirds are used to produce Polypropylene.',
 'Propene and benzene are converted to acetone and phenol via the cumene '
 'process.']


In [9]:
import statistics
from collections import Counter

# Length (in characters) of every remaining sentence, in ascending order.
sentence_lengths = sorted(len(sentence) for sentence in sentences)
print(statistics.median(sentence_lengths))

# Histogram: how many sentences there are of each length.
appearances = Counter(sentence_lengths)

# Walk the keys in sorted order explicitly: iterating a set gives no ordering
# guarantee, so the table could otherwise come out shuffled.
for length in sorted(appearances):
    print("{} - {}".format(length, appearances[length]))

63.0
20 - 2262
21 - 2398
22 - 6644
23 - 2670
24 - 3251
25 - 3208
26 - 3194
27 - 3488
28 - 3590
29 - 3899
30 - 4177
31 - 4754
32 - 4765
33 - 5145
34 - 5282
35 - 5572
36 - 5968
37 - 6002
38 - 6286
39 - 6727
40 - 6990
41 - 7439
42 - 7753
43 - 8142
44 - 8835
45 - 9162
46 - 9368
47 - 9875
48 - 10349
49 - 10546
50 - 10651
51 - 10699
52 - 10742
53 - 10845
54 - 10767
55 - 10812
56 - 10723
57 - 10611
58 - 10619
59 - 10610
60 - 10586
61 - 10580
62 - 10482
63 - 10394
64 - 10473
65 - 10232
66 - 10094
67 - 10029
68 - 10016
69 - 9925
70 - 9713
71 - 9810
72 - 9634
73 - 9984
74 - 9663
75 - 9281
76 - 9452
77 - 9147
78 - 9420
79 - 8992
80 - 8905
81 - 8782
82 - 8805
83 - 8615
84 - 8441
85 - 8453
86 - 8247
87 - 8388
88 - 8352
89 - 7819
90 - 7911
91 - 7842
92 - 7697
93 - 7515
94 - 7169
95 - 7359
96 - 7289
97 - 7207
98 - 7034
99 - 6955
100 - 6768


## 8. Generate training data

1. Convert sentences into IDXs (replace characters with integers).
2. Convert IDXs into one-hot vectors

In [28]:
# Character vocabulary: every supported symbol maps to its integer class index
# (a-z -> 0..25, 0-9 -> 26..35, then punctuation).
# The empty-string entry reserves index 43 for the "unknown symbol" class.
# NOTE(review): only lower-case letters are listed, so capital letters are
# encoded as "unknown" unless the text is lower-cased beforehand -- confirm
# that this is intended.
alphabets = {
    symbol: index
    for index, symbol in enumerate(
        list("abcdefghijklmnopqrstuvwxyz")
        + list("0123456789")
        + [' ', ',', '.', ':', ';', '"', "'", '', '(', ')']
    )
}

# Smoke test on a tiny string: '#' is not in the vocabulary and falls back to 43.
idxs = [alphabets.get(ch, 43) for ch in 'az 123#']

# One row per character, one column per vocabulary entry.
one_hot = to_categorical(idxs, num_classes=len(alphabets))
one_hot

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 

In [29]:
# Encode every sentence as a list of vocabulary indices, one per character.
# Characters missing from `alphabets` are encoded as 43 (the unknown-symbol index).
sentences_idxs = [
    [alphabets.get(char, 43) for char in sentence]
    for sentence in sentences
]

## 9. Get first 100K observations for test purposes

In [30]:
# Work on a fixed-size prefix of the corpus for test purposes.
N_SAMPLES = 100000

# Slicing (instead of indexing over range(N_SAMPLES)) does not fail when fewer
# than N_SAMPLES sentences are available, and keeps the one-hot inputs and the
# raw sentences the same length.
sentences = sentences[:N_SAMPLES]
sentences_onehot = [
    to_categorical(sentence_idxs, num_classes=len(alphabets))
    for sentence_idxs in sentences_idxs[:N_SAMPLES]
]

In [31]:
# Generate the data examples: one row per sentence.
# X is the one-hot encoding of a sentence and Y is the text of that same
# sentence, i.e. input and target carry identical content for test purposes.
data = pd.DataFrame({'X': sentences_onehot, 'Y': sentences})

# Sanity checks: width of a single one-hot row, then the number of examples.
print(len(sentences_onehot[100][0]))
print(len(sentences_onehot))

46
100000


## 11. Initialize the DeepSpeech NN to train

<p>The original DeepSpeech paper uses a language model on top of the RNN (p. 4), see: https://arxiv.org/pdf/1412.5567.pdf</p>
<p>I have disabled the language model in file: <i>report.py (67)</i>, because the "KenLM" package is hard to install on Windows. To use a language model again, we would need to train a new one, in particular for the Danish language.</p>

In [15]:
#####################################################

import os

import keras
from keras.callbacks import TensorBoard
from keras.optimizers import Adam, Nadam

#from KerasDeepSpeech.data import combine_all_wavs_and_trans_from_csvs
from KerasDeepSpeech.generator import BatchGenerator
from KerasDeepSpeech.model import *
from KerasDeepSpeech.report import ReportCallback
from KerasDeepSpeech.utils import load_model_checkpoint, save_model, MemoryCallback

#####################################################


#######################################################

# Prevent pool_allocator message
# NOTE(review): tensorflow and keras are already imported by earlier cells --
# confirm that the variable still takes effect when it is set this late.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

#######################################################


def main(args):
    '''
    Train a DeepSpeech-style model on the text examples in the global `data` frame.

    Steps: split `data` into a training and a validation part, wrap both in
    batch generators, load or build the model, compile it with the `ctc` loss,
    train it with `fit_generator`, and print the WER/LER logs held by the
    report callback.

    Args:
        args: any object that accepts attribute assignment. Every setting read
            below is first overwritten with a hard-coded value, so attributes
            already present on `args` are ignored.

    Raises:
        ValueError: if `args.model_arch` or `args.opt` is not a known choice.
        AssertionError: if `args.loadcheckpointpath` is set but is not a directory.

    NOTE(review): `SGD`, `K`, `ctc` and the model builders are not imported by
    name in this notebook; presumably they arrive through
    `from KerasDeepSpeech.model import *` -- confirm.
    NOTE(review): the saved output of this notebook shows `loss: inf` from the
    first batches on. One possible cause is that inputs and labels have the
    same length here (X is the one-hot form of Y), which CTC cannot align when
    a sentence contains repeated adjacent characters -- confirm against
    BatchGenerator and the `ctc` loss.
    '''

    # 1. Settings. These assignments overwrite whatever the caller put on `args`.
    print("Getting data from arguments")

    train_ratio = 0.9 #90% of data used for training, 10% for validation
    args.model_arch = 0
    args.opt = "adam"
    args.train_steps = 0
    args.epochs = 10
    args.valid_steps = 0
    args.batchsize = 32 #was 16
    args.name = ""
    args.loadcheckpointpath = ""
    args.fc_size = 512
    args.rnn_size = 512
    args.learning_rate = 0.01
    args.memcheck = False
    args.tensorboard = True

    model_input_type = "text"

    # The first `train_ratio` share of the rows is for training, the rest for validation.
    split_at = int(train_ratio * len(sentences_onehot))
    df_train = data[0:split_at]
    df_valid = data[split_at:]

    ## 2. init data generators
    print("Creating data batch generators")
    traindata = BatchGenerator(dataframe=df_train, dataproperties=None,
                               training=True, batch_size=args.batchsize,
                               model_input_type=model_input_type)
    validdata = BatchGenerator(dataframe=df_valid, dataproperties=None,
                               training=False, batch_size=args.batchsize,
                               model_input_type=model_input_type)

    output_dir = os.path.join('checkpoints/results',
                              'model%s_%s' % (args.model_arch, args.name))
    os.makedirs(output_dir, exist_ok=True)

    ## 3. Load existing or create new model
    if args.loadcheckpointpath:
        # load existing
        print("Loading model")

        cp = args.loadcheckpointpath
        assert(os.path.isdir(cp))

        model_path = os.path.join(cp, "model")
        model = load_model_checkpoint(model_path)

        print("Model loaded")
    else:
        # new model recipes live in _build_new_model
        print('New model DS{}'.format(args.model_arch))
        model = _build_new_model(args)

        # summary() prints the table itself and returns None, so it is not
        # wrapped in print().
        model.summary(line_length=80)

        #required to save the JSON
        save_model(model, output_dir)

    opt = _build_optimizer(args)
    model.compile(optimizer=opt, loss=ctc)

    ## 4. train

    # A step count of 0 means "one full pass over the respective data set".
    if args.train_steps == 0:
        args.train_steps = len(df_train.index) // args.batchsize
    if args.valid_steps == 0:
        args.valid_steps = (len(df_valid.index) // args.batchsize)

    if args.memcheck:
        cb_list = [MemoryCallback()]
    else:
        cb_list = []

    if args.tensorboard:
        tb_cb = TensorBoard(log_dir='./tensorboard/{}/'.format(args.name), write_graph=False, write_images=True)
        cb_list.append(tb_cb)

    # Backend function from the network input to the tensor fed into the 'ctc'
    # layer; the report callback receives it together with the validation data.
    y_pred = model.get_layer('ctc').input[0]
    input_data = model.get_layer('the_input').input

    report = K.function([input_data, K.learning_phase()], [y_pred])
    report_cb = ReportCallback(report, validdata, model, args.name, save=True)

    cb_list.append(report_cb)

    # NOTE(review): `max_q_size` and `pickle_safe` are older Keras spellings
    # (later releases call them `max_queue_size` / `use_multiprocessing`) --
    # confirm against the installed Keras version.
    model.fit_generator(generator=traindata.next_batch(),
                        steps_per_epoch=args.train_steps,
                        epochs=args.epochs,
                        callbacks=cb_list,
                        validation_data=validdata.next_batch(),
                        validation_steps=args.valid_steps,
                        initial_epoch=0,
                        verbose=1,
                        class_weight=None,
                        max_q_size=10,
                        workers=1,
                        pickle_safe=False
                        )

    ## These are the most important metrics
    print("Mean WER   :", report_cb.mean_wer_log)
    print("Mean LER   :", report_cb.mean_ler_log)
    print("NormMeanLER:", report_cb.norm_mean_ler_log)

    # export to csv?
    K.clear_session()


def _build_new_model(args):
    """Create the untrained network selected by `args.model_arch` (0-6)."""
    arch = args.model_arch
    if arch == 0:
        # DeepSpeech1 with Dropout, sized to the text vocabulary.
        # The output has one class more than the vocabulary (presumably the
        # CTC blank -- confirm).
        return ds1_dropout(input_dim=len(alphabets), fc_size=args.fc_size, rnn_size=args.rnn_size,
                           dropout=[0.1, 0.1, 0.1], output_dim=len(alphabets) + 1)
    if arch == 1:
        # DeepSpeech1 - no dropout
        return ds1(input_dim=26, fc_size=args.fc_size, rnn_size=args.rnn_size, output_dim=29)
    if arch == 2:
        # DeepSpeech2 model
        return ds2_gru_model(input_dim=161, fc_size=args.fc_size, rnn_size=args.rnn_size, output_dim=29)
    if arch == 3:
        # own model
        return ownModel(input_dim=26, fc_size=args.fc_size, rnn_size=args.rnn_size,
                        dropout=[0.1, 0.1, 0.1], output_dim=29)
    if arch == 4:
        # graves model
        return graves(input_dim=26, rnn_size=args.rnn_size, output_dim=29, std=0.5)
    if arch == 5:
        # cnn city
        return cnn_city(input_dim=161, fc_size=args.fc_size, rnn_size=args.rnn_size, output_dim=29)
    if arch == 6:
        # constrained model
        return const(input_dim=26, fc_size=args.fc_size, rnn_size=args.rnn_size, output_dim=29)
    # A bare string cannot be raised in Python 3; use a real exception type.
    raise ValueError("model not found")


def _build_optimizer(args):
    """Create the Keras optimizer named by `args.opt` ('sgd', 'adam' or 'nadam')."""
    opt_name = args.opt.lower()
    if opt_name == 'sgd':
        return SGD(lr=args.learning_rate, decay=1e-6, momentum=0.9, nesterov=True, clipnorm=5)
    if opt_name == 'adam':
        return Adam(lr=args.learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-8, clipnorm=5)
    if opt_name == 'nadam':
        return Nadam(lr=args.learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-8, clipnorm=5)
    # A bare string cannot be raised in Python 3; use a real exception type.
    raise ValueError("optimiser not recognised")

In [16]:
class Object:
    """Empty attribute container; `main` stores its settings on an instance."""


# Settings holder handed to main().
args = Object()

In [None]:
# Start training. main() overwrites the settings on `args` with its own
# hard-coded values before using them.
main(args)

Getting data from arguments
Creating data batch generators
New model DS0
________________________________________________________________________________
Layer (type)              Output Shape      Param #  Connected to               
the_input (InputLayer)    (None, None, 46)  0                                   
________________________________________________________________________________
time_distributed_8 (TimeD (None, None, 512) 24064    the_input[0][0]            
________________________________________________________________________________
time_distributed_9 (TimeD (None, None, 512) 0        time_distributed_8[0][0]   
________________________________________________________________________________
time_distributed_10 (Time (None, None, 512) 262656   time_distributed_9[0][0]   
________________________________________________________________________________
time_distributed_11 (Time (None, None, 512) 0        time_distributed_10[0][0]  
____________________________________

 340/2812 [==>...........................] - ETA: 1:47:56 - loss: in - ETA: 1:01:56 - loss: in - ETA: 46:23 - loss: inf  - ETA: 39:29 - loss: in - ETA: 34:34 - loss: in - ETA: 31:26 - loss: in - ETA: 29:01 - loss: in - ETA: 27:18 - loss: in - ETA: 25:59 - loss: in - ETA: 25:06 - loss: in - ETA: 24:18 - loss: in - ETA: 23:37 - loss: in - ETA: 23:01 - loss: in - ETA: 22:30 - loss: in - ETA: 21:59 - loss: in - ETA: 21:34 - loss: in - ETA: 21:12 - loss: in - ETA: 20:52 - loss: in - ETA: 20:34 - loss: in - ETA: 20:19 - loss: in - ETA: 20:03 - loss: in - ETA: 19:50 - loss: in - ETA: 19:40 - loss: in - ETA: 19:31 - loss: in - ETA: 19:22 - loss: in - ETA: 19:14 - loss: in - ETA: 19:09 - loss: in - ETA: 19:00 - loss: in - ETA: 18:53 - loss: in - ETA: 18:43 - loss: in - ETA: 18:36 - loss: in - ETA: 18:28 - loss: in - ETA: 18:21 - loss: in - ETA: 18:16 - loss: in - ETA: 18:10 - loss: in - ETA: 18:05 - loss: in - ETA: 18:00 - loss: in - ETA: 17:56 - loss: in - ETA: 17:52 - loss: in - ETA: 17:47 - 















In [None]:
# Print the compute devices TensorFlow reports for this machine
# (e.g. to check whether a GPU is visible).
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())