# This file uses Google Speech to Text to transcribe all of the EGOCOM audio (as well as ICA source estimates).

## This script uses HTTP POST and GET requests to transcribe many audio files asynchronously and in parallel. It submits jobs to Google's servers as fast as the request quota allows, sleeps automatically whenever the rate limit is hit, and then polls for each job's result.

In [1]:
from __future__ import print_function, absolute_import, division, unicode_literals, with_statement # Python 2 compatibility

import io
import os

from datetime import timedelta, datetime
import subprocess
import json
import time
import os
import pickle

In [2]:
def async_srt_format_timestamp(seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Parameters
    ----------
    seconds : float or str
        Non-negative offset in seconds; anything accepted by ``float()``
        (e.g. ``1.5`` or ``"1.500"``).

    Returns
    -------
    str
        The zero-padded timestamp, e.g. ``"00:01:00,000"``. The hours field
        is not capped at 24, so long offsets stay in H:M:S form.
    """
    # Round to whole milliseconds *before* splitting into fields. Rounding
    # only the seconds field afterwards would turn 59.9996 into the invalid
    # "00:00:60,000" instead of carrying into the minutes.
    total_ms = int(round(float(seconds) * 1000))
    hours, remainder = divmod(total_ms, 3600 * 1000)
    minutes, remainder = divmod(remainder, 60 * 1000)
    secs, millis = divmod(remainder, 1000)
    return "{:02d}:{:02d}:{:02d},{:03d}".format(hours, minutes, secs, millis)

def write_subtitles(lrrr, wfn = None):
    '''
    Write word-timed transcription results as SRT subtitle entries.

    Parameters
    ----------
    lrrr : dict
        JSON-decoded long-running-recognize response. It is indexed as
        lrrr["results"][i]["alternatives"][0]["words"], where every word is a
        dict with "word", "startTime" and "endTime" keys. The last character
        of each time string is dropped before conversion to float
        (presumably the trailing "s" unit of values such as "1.300s").
        Requires "enableWordTimeOffsets": true in the request config.
    wfn : str or None
        Path of the subtitle file to create or overwrite. When None, the
        subtitles are printed to standard output instead.

    Example for where lrrr comes from
    ---------------------------------

    out = json.loads(<body of GET https://speech.googleapis.com/v1/operations/NAME>)
    write_subtitles(out["response"], wfn = "video.srt")
    '''

    max_words_per_subtitle = 16
    # Mode 'w' truncates any previous file. print(..., file=None) falls back
    # to sys.stdout, so a single code path serves both destinations.
    wf = open(wfn, 'w') if wfn is not None else None

    try:
        cnt = 0
        for r in lrrr.get("results", []):
            # Skip results that carry no word timings instead of raising.
            word_dicts = r["alternatives"][0].get("words", [])
            for i in range(0, len(word_dicts), max_words_per_subtitle):
                cnt += 1
                words_range = word_dicts[i:i+max_words_per_subtitle]
                words = [word_info["word"] for word_info in words_range]
                half = len(words) // 2
                if half:
                    # Break the subtitle into two roughly equal lines.
                    text = " ".join(words[:half]) + "\n" + " ".join(words[half:])
                else:
                    # A single word stays on one line: a leading empty line
                    # would put a blank line before the cue text.
                    text = " ".join(words)
                start_time = words_range[0]["startTime"][:-1]
                end_time = words_range[-1]["endTime"][:-1]
                print(cnt, file=wf)
                print(async_srt_format_timestamp(start_time), "-->", async_srt_format_timestamp(end_time), file=wf)
                print(text, file=wf)
                print(file=wf)
    finally:
        # Always release the file handle, even if a result is malformed.
        if wf is not None:
            wf.close()

In [3]:
# Directory that receives the pickled transcription data.
pickle_loc = "/home/cgn/Downloads/egocom-pickles/"
# subtitle_loc = "/home/cgn/Downloads/egocom-aligned-final/"
# Directory for the temporary JSON request bodies.
json_loc = "/home/cgn/Downloads/egocom-intermediate/"

# Alternative audio sets (bucket prefix + local directory listing the files):
# gcs_uri_loc = "gs://egocom-audio/wav16-summed/"
# keys = sorted([fn[:-4] for fn in os.listdir("/home/cgn/Downloads/egocom-audio-only-summed/")])
# gcs_uri_loc = "gs://egocom-audio/wav16-all/"
# keys = sorted([fn[:-4] for fn in os.listdir("/home/cgn/Downloads/egocom-audio-only/")])

# Active audio set: the ICA source estimates.
gcs_uri_loc = "gs://egocom-audio/wav16-ica/"
# One key per local file: the file name minus its last four characters
# (assumes a 4-character suffix such as ".wav" — TODO confirm), sorted.
keys = [name[:-4] for name in os.listdir("/home/cgn/Downloads/egocom-ICA/")]
keys.sort()

In [4]:
# POST service requests for transcription (via the REST protocol)
# See: https://cloud.google.com/speech-to-text/docs/async-recognize#speech-async-recognize-gcs-protocol
#
# For every key, submit one long-running-recognize job and record the
# operation name the API returns. process_ids ends up in the same order as
# keys, which the fetch step relies on when it zips the two together.

process_ids = []
for key in keys:
    json_fn = json_loc + key + ".json"
    # Serialise with json.dumps so the body is always strict, correctly
    # escaped JSON (a hand-written template is easy to get subtly wrong,
    # e.g. with a trailing comma).
    # NOTE(review): the bucket prefix says "wav16" while sampleRateHertz is
    # 44100 — confirm the sample rate matches the uploaded audio.
    json_string = json.dumps({
        "config": {
            "sampleRateHertz": 44100,
            "languageCode": "en-US",
            "enableWordTimeOffsets": True,
        },
        "audio": {
            "uri": gcs_uri_loc + key + ".wav",
        },
    })
    # Close the file before curl reads it so the body is fully flushed.
    with open(json_fn, 'w') as request_file:
        request_file.write(json_string)

    try:
        done = False
        while not done:
            # Fetch a fresh access token for every attempt; retries can be
            # spread over a long time because of the rate-limit sleeps.
            token = subprocess.check_output(
                ["gcloud", "auth", "application-default", "print-access-token"]
            ).decode("utf-8").strip()
            # An argument list (no shell) keeps keys containing spaces or
            # shell metacharacters from breaking or altering the command.
            cmd = [
                "curl", "-s",
                "-H", "Content-Type: application/json",
                "-H", "Authorization: Bearer " + token,
                "https://speech.googleapis.com/v1/speech:longrunningrecognize",
                "-d", "@" + json_fn,
            ]
            out = json.loads(subprocess.check_output(cmd))
            if 'name' in out:
                # Accepted: remember the operation name for the fetch step.
                process_ids.append(out["name"])
                done = True
            elif 'error' in out and out['error'].get('code') == 429:
                # Google Speech API limits the amount of requests per 100 seconds.
                print('Sleeping 50 seconds. Started sleeping at', datetime.now())
                time.sleep(50)
            else:
                raise Exception("ERROR parsing {f}:\n".format(f=json_fn)+str(out))
    finally:
        # Remove the temporary request body even when submission fails.
        os.remove(json_fn)

# GET service requests (via the REST protocol)
#
# Poll each submitted operation until it completes, then pickle the top
# alternative of every result under pickle_loc.

print("POST requests completed. Now we fetch results via GET requests (once Google Speech API completes).\n")
for name, key in zip(process_ids, keys):
    print('Fetching. Job name:', name, "Key:", key)
    done = False
    get_request_cnt = 0
    while not done:
        get_request_cnt += 1
        # If multiple tries, wait 30s for results to finish computing.
        if get_request_cnt > 1:
            time.sleep(30)
        # Fetch a fresh access token for every poll; a job may be polled for
        # a long time.
        token = subprocess.check_output(
            ["gcloud", "auth", "application-default", "print-access-token"]
        ).decode("utf-8").strip()
        # An argument list (no shell) avoids quoting problems in the URL.
        cmd = [
            "curl",
            "-H", "Authorization: Bearer " + token,
            "-H", "Content-Type: application/json; charset=utf-8",
            "https://speech.googleapis.com/v1/operations/{name}".format(name=name),
        ]
        out = json.loads(subprocess.check_output(cmd))
        if 'error' in out:
            raise Exception("ERROR during longrunningrecognize():\n"+str(out))
        if out.get('done') or 'response' in out:
            # The operation reports completion explicitly; relying on
            # progressPercent alone can pass before "response" is attached.
            print('Success. Transcription received.')
            done = True
            continue
        pp = out.get('metadata', {}).get('progressPercent')
        if pp is None:
            print('Job', name, 'is still waiting to be processed.')
        else:
            print('Still transcribing... Percent done:', pp)

    # Pickle all transcription data for re-use later.
    # A finished job may carry no "results"; store an empty list in that
    # case instead of raising KeyError.
    data = [z['alternatives'][0] for z in out.get("response", {}).get("results", [])]
#     confidences = [z["confidence"] for z in data]
#     transcripts = [z["transcript"] for z in data]
#     starts = [d['words'][0]['startTime'] for d in data]
#     ends = [d['words'][-1]['endTime'] for d in data]
#     # Create dictionary of lists
#     DL = {
#         'confidence': confidences,
#         'transcript': transcripts,
#         'startTime': starts,
#         'endTime': ends,
#     }
#     # Convert to list of dictionaries
#     LD = [dict(zip(DL,t)) for t in zip(*DL.values())]
    # Use a context manager so the pickle is flushed and closed per file.
    with open(pickle_loc + key + "_all-transcription-data-ica.p", "wb") as pickle_file:
        pickle.dump(data, pickle_file)

    # Generate subtitles for the associated .MP4 video.
#     write_subtitles(out["response"], wfn = subtitle_loc + key + ".srt")

Sleeping 50 seconds. Started sleeping at 2018-09-24 19:57:20.865536
Sleeping 50 seconds. Started sleeping at 2018-09-24 19:58:57.938776
Sleeping 50 seconds. Started sleeping at 2018-09-24 20:00:09.093969
Sleeping 50 seconds. Started sleeping at 2018-09-24 20:01:02.175163
Sleeping 50 seconds. Started sleeping at 2018-09-24 20:02:02.145161
Sleeping 50 seconds. Started sleeping at 2018-09-24 20:02:55.171030
Sleeping 50 seconds. Started sleeping at 2018-09-24 20:03:58.148949
Sleeping 50 seconds. Started sleeping at 2018-09-24 20:04:57.948956
Sleeping 50 seconds. Started sleeping at 2018-09-24 20:05:50.519916
Sleeping 50 seconds. Started sleeping at 2018-09-24 20:06:50.147128
Sleeping 50 seconds. Started sleeping at 2018-09-24 20:07:42.057669
Sleeping 50 seconds. Started sleeping at 2018-09-24 20:08:41.978810
Sleeping 50 seconds. Started sleeping at 2018-09-24 20:09:34.158080
Sleeping 50 seconds. Started sleeping at 2018-09-24 20:10:34.102139
Sleeping 50 seconds. Started sleeping at 2018-09

## DEPRECATED BELOW HERE - The client-library method blocks in Python until each job finishes, so jobs are processed one at a time

In [None]:


# Imports the Google Cloud client library
# from google.cloud import speech
# from google.cloud.speech import enums
# from google.cloud.speech import types

# Instantiates a client
# client = speech.SpeechClient()

# def transcribe_gcs(gcs_uri, verbose = False):
#     """Asynchronously transcribes the audio file specified by the gcs_uri."""
#     from google.cloud import speech
#     from google.cloud.speech import enums
#     from google.cloud.speech import types
#     client = speech.SpeechClient()

#     audio = types.RecognitionAudio(uri=gcs_uri)
#     config = types.RecognitionConfig(
#         sample_rate_hertz=44100,
#         language_code='en-US',
#         enable_word_time_offsets=True)

#     operation = client.long_running_recognize(config, audio)

#     print('Waiting for operation to complete...')
#     response = operation.result(timeout=90)

#     # Each result is for a consecutive portion of the audio. Iterate through
#     # them to get the transcripts for the entire audio file.
#     for result in response.results:
#         # The first alternative is the most likely one for this portion.
#         print(u'Transcript: {}'.format(result.alternatives[0].transcript))
#         print('Confidence: {}'.format(result.alternatives[0].confidence))
        
#     return response


# def srt_format_timestamp(seconds):
#     stamp = str(timedelta(seconds=seconds))
#     first, second, third = stamp.split(":")
#     third = "{:.3f}".format(float(third))
#     sec, milisec = third.split(".")
#     third = ",".join([sec.zfill(2), milisec])
#     return ":".join([first.zfill(2), second.zfill(2), third])



# # The name of the audio file to transcribe
# file_name = "/home/cgn/Downloads/egocom-audio-only-summed/day_1__con_1__part1.wav"

# # Loads the audio into memory
# with io.open(file_name, 'rb') as audio_file:
#     content = audio_file.read()
#     audio = types.RecognitionAudio(content=content)

# config = types.RecognitionConfig(
#     encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
#     sample_rate_hertz=16000,
#     language_code='en-US')

# # Detects speech in the audio file
# response = client.recognize(config, audio)

# for result in response.results:
#     print('Transcript: {}'.format(result.alternatives[0].transcript))
    
# result = transcribe_gcs("gs://egocom-audio/wav16-summed/day_1__con_1__part1.wav")


# max_words_per_subtitle = 16
# wfn = "/home/cgn/Downloads/egocom-aligned/out.srt"
# wf = open(wfn, 'a') if wfn is not None else wfn

# cnt = 0
# for r in result.results:
#     alternative = r.alternatives[0]
#     for i in range(0, len(alternative.words), max_words_per_subtitle):
#         cnt += 1
#         words_range = alternative.words[i:i+max_words_per_subtitle]
#         words = [word_info.word for word_info in words_range]
#         words = " ".join(words[:len(words) // 2]) +  "\n" + " ".join(words[len(words) // 2:])
#         start_time = words_range[0].start_time.seconds + words_range[0].start_time.nanos * 1e-9
#         end_time =  words_range[-1].end_time.seconds + words_range[-1].end_time.nanos * 1e-9
#         print(cnt, file=wf)
#         print(srt_format_timestamp(start_time), "-->", srt_format_timestamp(end_time), file=wf)
#         print(words, file=wf)
#         print(file=wf)