In [1]:
import pyaudio
import wave
import time
import struct
import speech_recognition as sr
print("speech_recognition version - {}".format(sr.__version__))

speech_recognition version - 3.6.3


### Functions for audio clipping, playing, and recording.

In [2]:
def clip16(x):    
    # Clipping for 16 bits
    if x > 32767:
        x = 32767
    elif x < -32768:
        x = -32768
    else:
        x = x        
    return int(x)

def play_wav(filename):

    import wave
    import pyaudio
    import time

    wf = wave.open(filename, 'rb')
    CHANNELS = wf.getnchannels()
    RATE = wf.getframerate() 
    WIDTH = wf.getsampwidth() 

    def my_callback(input_string, block_size, time_info, status):
        output_string = wf.readframes(block_size)
        return (output_string, pyaudio.paContinue)

    p = pyaudio.PyAudio()
    stream = p.open(format = p.get_format_from_width(WIDTH),
                    channels = CHANNELS,
                    rate = RATE,
                    input = False,
                    output = True,
                    stream_callback = my_callback)

    print('* Playing audio file: ' + filename)
    stream.start_stream()

    while stream.is_active():
        time.sleep(0.1)

    stream.stop_stream()
    print('Finished.')
    stream.close()

    p.terminate()
    
def record_wav(filename='test_01.wav', CHANNELS = 1, RATE = 16000, WIDTH = 2, GAIN = 1.0):

    def my_callback_fun(binary_input_data, block_size, time_info, status):
        input_tuple = struct.unpack('h', binary_input_data)
        output_sample = clip16(GAIN * input_tuple[0])
        output_list.append(output_sample)
        return(binary_input_data, pyaudio.paContinue)

    p = pyaudio.PyAudio()

    # Set PyAudio format
    PA_format = p.get_format_from_width(WIDTH)

    stream = p.open(format = PA_format,
                    channels = CHANNELS,
                    rate = RATE,
                    input = True,
                    output = False,
                    frames_per_buffer = 1,
                    stream_callback = my_callback_fun)

    output_list = []

    stream.start_stream()
    print('* Start recording')

    time.sleep(10.0)

    stream.stop_stream()
    print('* Finish recording')

    stream.close()
    p.terminate()

    # Convert output signal to binary signal to write to wave file 
    output_string = struct.pack('h'*len(output_list), *output_list)

    # write data into wav file
    wf = wave.open(filename, 'w')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(WIDTH)
    wf.setframerate(RATE)
    wf.writeframes(output_string)
    wf.close()

    print('* Audio saved to file: ' + filename)

filename = 'test_01.wav'

### Record an audio File

In [3]:
record_wav(filename)

* Start recording
* Finish recording
* Audio saved to file: test_01.wav


### Play the file

In [3]:
play_wav(filename)

* Playing audio file: test_01.wav
Finished.


### Run Speech Recognition

In [4]:
r = sr.Recognizer()
test_file = sr.AudioFile(filename)

with test_file as source:
    audio = r.record(source)
# type(audio)

start = time.time()
text = r.recognize_google(audio)
end = time.time()

print("Recognised speech : \n\n{}".format(text))
print("\nTime taken for speech recognition : {} seconds".format( round(end-start, 3) ))

f = open("test_01.txt", "w")
print(text, file=f)
f.close()

Recognised speech : 

this is a speech recognition demo using python speech recognition library and tacotron 2

Time taken for speech recognition : 4.698 seconds


In [5]:
run -i "synthesize.py" --model='Tacotron-2' --mode='eval' --text_list=test_01.txt

Using TensorFlow backend.


Running End-to-End TTS Evaluation. Model: Tacotron-2
Synthesizing mel-spectrograms from text..
loaded model at logs-Tacotron-2/taco_pretrained/tacotron_model.ckpt-35000
Hyperparameters:
  allow_clipping_in_normalization: True
  attention_dim: 128
  attention_filters: 32
  attention_kernel: (31,)
  cbhg_conv_channels: 128
  cbhg_highway_units: 128
  cbhg_highwaynet_layers: 4
  cbhg_kernels: 8
  cbhg_pool_size: 2
  cbhg_projection: 256
  cbhg_projection_kernel_size: 3
  cbhg_rnn_units: 128
  cin_channels: 80
  cleaners: english_cleaners
  clip_for_wavenet: True
  clip_mels_length: True
  cross_entropy_pos_weight: 20
  cumulative_weights: True
  decoder_layers: 2
  decoder_lstm_units: 1024
  embedding_dim: 512
  enc_conv_channels: 512
  enc_conv_kernel_size: (5,)
  enc_conv_num_layers: 3
  encoder_lstm_units: 256
  fmax: 7600
  fmin: 55
  frame_shift_ms: None
  freq_axis_kernel_size: 3
  gate_channels: 256
  gin_channels: -1
  griffin_lim_iters: 60
  hop_size: 275
  input_type: raw
  kern

  0%|          | 0/1 [00:00<?, ?it/s]

Starting Synthesis


  if np.issubdtype(x.dtype, float) or np.issubdtype(x.dtype, complex):
100%|██████████| 1/1 [00:14<00:00, 14.21s/it]

synthesized mel spectrograms at tacotron_output/eval
Tacotron TTS synthesis complete!

Time taken for speech generation : 19.922 seconds





In [6]:
# filename = "wavenet_output/wavs/wavenet-audio-mel-batch_0_sentence_0.wav"

filename = "tacotron_output/logs-eval/wavs/wav-batch_0_sentence_0-linear.wav"
play_wav(filename)

* Playing audio file: tacotron_output/logs-eval/wavs/wav-batch_0_sentence_0-linear.wav
Finished.


In [14]:
from gtts import gTTS
start = time.time()
tts = gTTS(text=text, lang='en')
end = time.time()
print("\nTime taken for TTS generation : {} seconds".format( round(end-start, 3) ))
tts.save('sample.mp3')


Time taken for TTS generation : 0.644 seconds


In [7]:
from pygame import mixer # Load the required library

mixer.init()
mixer.music.load('sample.mp3')
mixer.music.play()

pygame 1.9.4
Hello from the pygame community. https://www.pygame.org/contribute.html
