In [1]:
#!pip install ../input/transformers/transformers-master/
#!pip install keras_preprocessing
#!pip install bert

In [2]:
import pandas as pd
import os
import librosa
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow.keras.backend as K
#from sklearn.metrics.pairwise import cosine_similarity
from torchaudio import load
from keras_preprocessing.sequence import pad_sequences
from transformers import BertModel, TFBertModel, BertTokenizer
import numpy as np
from math import sqrt
from tqdm import notebook

In [3]:
data_dir = '../input/common-voice'
audio_dir = os.path.join(data_dir, 'cv-valid-train')
#/cv-valid-train'
sound_len = 10
bert_path = '../input/bert-base-uncased'
bert_vocab_path = 'vocab.txt'
bert_model_path = 'bert-base-uncased/pytorch_model.bin'
max_len_text = 30
max_len_fft = 300000
window = 40
batch_size = 1
sess = tf.compat.v1.Session()
bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
epochs = 2

In [28]:
class VoiceInstance:
    
    def __init__(self, file, tokenizer, model=None, sound_len=10**10, text=None):
        
        self.file = file
        self.text = text
        self.tokenizer = tokenizer
        self.model = model
        self.mfcc = self._transform_audio()
        self._get_embeddings()
        
    @staticmethod
    def pad_seq_text(sequence):
        return np.pad(sequence, (0, max_len_text-len(sequence)), 
                      mode='constant')
    
    @staticmethod
    def pad_seq_fft(sequence):
        return np.pad(sequence, (0, max_len_fft-len(sequence)), 
                      mode='constant')
    
    def _transform_audio(self):
        
        waveform, rate = load(file)
        new_rate = rate/100
        self.fft, self.frequency = self._get_fft(waveform, new_rate)
        #self.mfcc = self._get_mfcc(waveform, new_rate)
        
    def _get_mfcc(self, waveform, sample_rate=22000):
        
        mfcc_tensor = tf.signal.dct(waveform.numpy())
        return mfcc_tensor
    
    def _get_fft(self, waveform, rate):
        
        length = len(waveform)
        frequency = np.fft.rfftfreq(length)
        fft = np.fft.rfft(waveform)/length
        fft = np.array([sqrt(np.real(x)**2 + np.imag(x)**2) for x in fft[0]])
        return self.pad_seq_fft(fft), frequency
    
    def _get_embeddings(self):
        
        def tokenize_text(text):
            return self.tokenizer.tokenize(text)
        
        def get_ids(tokens):
            return self.tokenizer.convert_tokens_to_ids(tokens)
        
        if self.text:
            self.tokens = tokenize_text(self.text)
            emb = np.array(get_ids(self.tokens))
            emb = self.pad_seq_text(emb)
            self.embeddings = emb
            #self.embeddings = self.model.call(emb.reshape(1, -1))


In [70]:
def build_voice_model(max_len_fft=max_len_fft, batch_size=1):
    
    input_layer = tf.keras.layers.Input(shape=(max_len_fft, ), batch_size=batch_size)
    norm_1 = tf.keras.layers.BatchNormalization()(input_layer)
    reshaped = tf.keras.layers.Reshape((-1, max_len_fft))(norm_1)
    lstm_1 = tf.keras.layers.LSTM(256, return_sequences=True)(reshaped)
    flatten = tf.keras.layers.Flatten()(lstm_1)
    dense_1 = tf.keras.layers.Dense(256, activation='relu')(flatten)
    
    model = tf.keras.Model(inputs=[input_layer], outputs=[dense_1])
    model.compile(optimizer='Adam', loss='mean_squared_error', metrics=['accuracy'])
    return model
"""
class VoiceModel(tf.keras.Model):
    
    def __init__(self, max_text_len=max_len_text,
                batch_size=batch_size,
                max_len_fft=max_len_fft, embs=768):
        super(VoiceModel, self).__init__(self)
        #self.input_layer = tf.keras.layers.Input(shape=(max_len_fft, ), batch_size=batch_size)
        self.lstm_1 = tf.keras.layers.LSTM(256, return_sequences=True)
        self.dropout = tf.keras.layers.Dropout(0.3)
        self.lstm_2 = tf.keras.layers.LSTM(256)
        self.flatten = tf.keras.layers.Flatten()
        self.dense = tf.keras.layers.Dense(256, activation='relu')
        self.output_layer = tf.keras.layers.Dense(embs, activation='tanh')
        
    def call(self, x):
        
        #x = self.input_layer(x)
        x = self.lstm_1(x)
        x = self.dropout(x)
        x = self.lstm_2(x)
        x = self.flatten(x)
        x = self.dense(x)
        
        return self.output_layer(x)
"""

"\nclass VoiceModel(tf.keras.Model):\n    \n    def __init__(self, max_text_len=max_len_text,\n                batch_size=batch_size,\n                max_len_fft=max_len_fft, embs=768):\n        super(VoiceModel, self).__init__(self)\n        #self.input_layer = tf.keras.layers.Input(shape=(max_len_fft, ), batch_size=batch_size)\n        self.lstm_1 = tf.keras.layers.LSTM(256, return_sequences=True)\n        self.dropout = tf.keras.layers.Dropout(0.3)\n        self.lstm_2 = tf.keras.layers.LSTM(256)\n        self.flatten = tf.keras.layers.Flatten()\n        self.dense = tf.keras.layers.Dense(256, activation='relu')\n        self.output_layer = tf.keras.layers.Dense(embs, activation='tanh')\n        \n    def call(self, x):\n        \n        #x = self.input_layer(x)\n        x = self.lstm_1(x)\n        x = self.dropout(x)\n        x = self.lstm_2(x)\n        x = self.flatten(x)\n        x = self.dense(x)\n        \n        return self.output_layer(x)\n"

In [71]:
train_loss = tf.keras.metrics.KLDivergence(name='train_loss')
train_accuracy = tf.keras.metrics.CosineSimilarity(name='train_accuracy')
test_loss = tf.keras.metrics.KLDivergence(name='test_loss')
test_accuracy = tf.keras.metrics.CosineSimilarity(name='test_accuracy')
loss = tf.keras.losses.KLDivergence()
optimizer = tf.keras.optimizers.Adam()

#instance = 
#model = VoiceModel(instance.fft, instance.embeddings)
model = build_voice_model()

NameError: name 'Model' is not defined

In [57]:
@tf.function
def train_step(images, labels):
    predictions = model(images, training=True)
    loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)
    train_accuracy(labels, predictions)


In [58]:
@tf.function
def test_step(images, labels):
    predictions = model(images, training=False)
    t_loss = loss(labels, predictions)
    test_loss(t_loss)
    test_accuracy(labels, predictions)


In [59]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
#model = TFBertModel.from_pretrained('bert-base-cased')
info_frame = pd.read_csv('../input/common-voice/cv-valid-train.csv')

prepocessed_frame = pd.DataFrame(columns=['filename', 'tokens', 'fft', 'hidden_emb'])

for epoch in range(epochs):

    for i in notebook.tqdm(info_frame.index):
        if info_frame.loc[i, 'down_votes'] > 2:
            continue
        train_loss.reset_states()
        train_accuracy.reset_states()
        test_loss.reset_states()
        test_accuracy.reset_states()

        file = os.path.join(audio_dir, info_frame.loc[i, 'filename'])
        instance = VoiceInstance(file=file, 
                                 tokenizer=tokenizer,
                                 text=info_frame.loc[i, 'text'])
        #print(tf.reshape(instance.embeddings[0], -1, 1).shape)
        #print(instance.__dir__())
        print(instance.fft, instance.embeddings)
        model.fit(instance.fft, instance.embeddings)
        #print(instance.embeddings[1].shape)
        prepocessed_frame = prepocessed_frame.append({'index': i,
                                                     'tokens': instance.tokens,
                                                     'fft': instance.fft,
                                                    'hidden_emb': instance.embeddings[1]}, 
                                                     ignore_index=True)
    print(prepocessed_frame.head())
    prepocessed_frame.to_csv('prepocessed_frame.csv')
    

HBox(children=(FloatProgress(value=0.0, max=195776.0), HTML(value='')))

[ 8.24768387 10.0985757   9.56836605 ...  0.          0.
  0.        ] [3858 1106 6239  184 2354 1116 1105 2812 1172 1103 1385 2226 1125 1163
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]



ValueError: in converted code:

    <ipython-input-55-6490e3ee3e36>:18 call  *
        x = self.lstm_1(x)
    /opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/layers/recurrent.py:644 __call__
        return super(RNN, self).__call__(inputs, **kwargs)
    /opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/base_layer.py:737 __call__
        self.name)
    /opt/conda/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/input_spec.py:177 assert_input_compatibility
        str(x.shape.as_list()))

    ValueError: Input 0 of layer lstm_12 is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: [None, 1]
