In [2]:
import os
from google.cloud import speech
import wave
from pydub import AudioSegment
from google.cloud import storage

In [3]:
# Input audio directories, one per dataset split (relative to the working directory).
train_path = os.path.relpath('./Data/train/')
test_path = os.path.relpath('./Data/test')
validation_path = os.path.relpath('./Data/validation')

# Output directories for the transcripts of each split.
train_write_path = os.path.relpath('./Data/transcripts/train')
test_write_path = os.path.relpath('./Data/transcripts/test')
validation_write_path = os.path.relpath('./Data/transcripts/validation')
save_path = os.path.relpath('.')

# Cloud Storage bucket used as staging area for long-running recognition.
bucket_name = 'msc_research'

# Respect credentials that are already exported in the environment; only fall
# back to the original author's local key file when nothing is configured, so
# the notebook does not have to be edited to run on another machine.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/home/changhyun/workspace/ABI_research/config/config.json",
)


In [4]:
def transcribing(audio_path=None):
    """Transcribe every WAV file in a directory with the synchronous Speech API.

    Each file is sent whole through ``client.recognize``. Files that cannot be
    read or recognized are reported and skipped instead of aborting the batch.

    Args:
        audio_path: Directory containing the WAV files. When omitted, the
            notebook-level ``audio_path`` variable is used, which is what this
            function read before the parameter existed.

    Returns:
        A ``(responses, errors)`` tuple. ``responses`` holds the value returned
        by ``client.recognize`` for each file that succeeded, in
        ``os.listdir`` order; ``errors`` holds the names of the files that
        failed.

    Raises:
        ValueError: If no directory is passed and no ``audio_path`` global
            is defined.
    """
    if audio_path is None:
        audio_path = globals().get("audio_path")
    if audio_path is None:
        raise ValueError(
            "audio_path must be passed or defined as a notebook-level variable"
        )

    client = speech.SpeechClient()
    responses = []
    errors = []
    for file in os.listdir(audio_path):
        path = os.path.join(audio_path, file)
        try:
            # Only the header is needed; close the WAV reader before the
            # network call instead of holding it open for the whole request.
            with wave.open(path, "rb") as wf:
                channel = wf.getnchannels()
                frame_rate = wf.getframerate()
            with open(path, "rb") as audio_file:
                content = audio_file.read()
            audio = speech.RecognitionAudio(content=content)
            config = speech.RecognitionConfig(
                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz=frame_rate,
                audio_channel_count=channel,
                # Per-channel recognition is only requested for multi-channel
                # audio, matching google_transcribe.
                enable_separate_recognition_per_channel=channel > 1,
                language_code="en-US",
            )
            responses.append(client.recognize(config=config, audio=audio))
        except Exception as e:
            # Best-effort batch: record the failing file and keep going.
            print(path)
            print(e)
            errors.append(file)

    return responses, errors


In [5]:
def check_if_files_mono(audio_path=None):
    """Print the channel count of one WAV file.

    The file is loaded with ``AudioSegment.from_wav`` and its ``channels``
    attribute is printed.

    Args:
        audio_path: Path handed to ``AudioSegment.from_wav``. When omitted,
            the notebook-level ``audio_path`` variable is used, which is what
            this function read before the parameter existed.

    Raises:
        ValueError: If no path is passed and no ``audio_path`` global is
            defined.
    """
    if audio_path is None:
        audio_path = globals().get("audio_path")
    if audio_path is None:
        raise ValueError(
            "audio_path must be passed or defined as a notebook-level variable"
        )
    sample = AudioSegment.from_wav(audio_path)
    print(sample.channels)

In [29]:
def frame_rate_channel_freq(audio_path):
    """Count how many WAV files in a directory share each sample rate and channel count.

    Args:
        audio_path: Directory whose entries are all opened with ``wave.open``.

    Returns:
        A ``(frame_rates, channels)`` tuple of dicts: the first maps a sample
        rate to the number of files using it, the second maps a channel count
        to the number of files using it.
    """
    rate_counts = {}
    channel_counts = {}
    for entry in os.listdir(audio_path):
        with wave.open(os.path.join(audio_path, entry), "r") as wav_in:
            rate = wav_in.getframerate()
            n_channels = wav_in.getnchannels()
        rate_counts[rate] = rate_counts.get(rate, 0) + 1
        channel_counts[n_channels] = channel_counts.get(n_channels, 0) + 1
    return rate_counts, channel_counts


In [30]:
# Tally the sample rates and channel counts found in the training split.
frame_rates, channels = frame_rate_channel_freq(train_path)

In [31]:
# Display: sample rate -> number of training files with that rate.
frame_rates

{16000: 2753, 44100: 30}

In [17]:
# Display: channel count -> number of training files with that many channels.
channels

{1: 2753, 2: 30}

In [18]:
# responses, errors = transcribing()

In [19]:
# Default thresholds are the limits noted for this notebook: 60 sec & 10 MB.
def find_long_audios(path, max_mb=10, max_seconds=60):
    """List the WAV files in a directory that exceed a size or duration limit.

    Args:
        path: Directory whose entries are inspected.
        max_mb: Size threshold in the unit returned by ``byte_to_mb``; files
            strictly larger than this are flagged without reading the header.
        max_seconds: Duration threshold in seconds; files strictly longer
            than this are flagged.

    Returns:
        File names (not full paths) that exceed either limit, in
        ``os.listdir`` order.
    """
    files = []
    for file in os.listdir(path):
        file_path = os.path.join(path, file)
        if byte_to_mb(os.path.getsize(file_path)) > max_mb:
            files.append(file)
            # Size alone disqualifies the file; skip parsing its header.
            continue
        with wave.open(file_path, "rb") as wf:
            duration = wf.getnframes() / float(wf.getframerate())
        if duration > max_seconds:
            files.append(file)
    return files

In [20]:
def byte_to_mb(size):
    """Convert a byte count to megabytes using 1024-based units."""
    kilobytes = size / 1024
    return kilobytes / 1024

In [21]:
# Flag, for each split in turn, the clips that exceed the size/duration limits.
train_long_files, test_long_files, valid_long_files = (
    find_long_audios(split_dir)
    for split_dir in (train_path, test_path, validation_path)
)

In [22]:
# Display the training clips flagged by find_long_audios.
train_long_files

['MSP-PODCAST_1170_0047.wav',
 'MSP-PODCAST_1167_0089.wav',
 'MSP-PODCAST_0422_0206.wav',
 'MSP-PODCAST_1170_0023.wav',
 'MSP-PODCAST_0456_0084.wav',
 'MSP-PODCAST_1184_0053.wav',
 'MSP-PODCAST_0456_0086.wav',
 'MSP-PODCAST_0422_0233.wav',
 'MSP-PODCAST_0361_0032.wav',
 'MSP-PODCAST_0418_0053.wav',
 'MSP-PODCAST_1167_0092.wav',
 'MSP-PODCAST_1167_0004.wav',
 'MSP-PODCAST_1353_0031.wav',
 'MSP-PODCAST_1353_0035.wav',
 'MSP-PODCAST_0456_0077.wav',
 'MSP-PODCAST_0380_0232.wav',
 'MSP-PODCAST_0456_0094.wav']

In [23]:
# Display the test clips flagged by find_long_audios.
test_long_files

['MSP-PODCAST_1159_0022.wav',
 'MSP-PODCAST_1154_0024.wav',
 'MSP-PODCAST_0498_0348.wav',
 'MSP-PODCAST_1130_0008.wav',
 'MSP-PODCAST_1159_0007.wav',
 'MSP-PODCAST_1159_0004.wav',
 'MSP-PODCAST_1130_0002.wav',
 'MSP-PODCAST_1154_0035.wav',
 'MSP-PODCAST_1159_0020.wav',
 'MSP-PODCAST_1130_0006.wav',
 'MSP-PODCAST_1183_0037.wav',
 'MSP-PODCAST_0538_0094.wav',
 'MSP-PODCAST_1130_0004.wav']

In [24]:
# Display the validation clips flagged by find_long_audios.
valid_long_files

['MSP-PODCAST_1186_0014.wav',
 'MSP-PODCAST_1191_0026.wav',
 'MSP-PODCAST_1186_0007.wav',
 'MSP-PODCAST_1185_0010.wav',
 'MSP-PODCAST_1187_0001.wav',
 'MSP-PODCAST_1188_0023.wav',
 'MSP-PODCAST_1186_0010.wav',
 'MSP-PODCAST_1190_0063.wav',
 'MSP-PODCAST_1185_0011.wav',
 'MSP-PODCAST_1191_0010.wav',
 'MSP-PODCAST_1187_0007.wav',
 'MSP-PODCAST_1186_0006.wav',
 'MSP-PODCAST_1191_0018.wav',
 'MSP-PODCAST_1187_0015.wav',
 'MSP-PODCAST_1191_0019.wav',
 'MSP-PODCAST_1187_0026.wav',
 'MSP-PODCAST_1187_0032.wav']

In [25]:
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Copy a local file into a Cloud Storage bucket.

    Args:
        bucket_name: Name of the bucket looked up with ``get_bucket``.
        source_file_name: Local path of the file to upload.
        destination_blob_name: Name the blob gets inside the bucket.
    """
    target_blob = (
        storage.Client()
        .get_bucket(bucket_name)
        .blob(destination_blob_name)
    )
    target_blob.upload_from_filename(source_file_name)

In [26]:
def delete_blob(bucket_name, blob_name):
    """Remove a blob from a Cloud Storage bucket.

    Args:
        bucket_name: Name of the bucket looked up with ``get_bucket``.
        blob_name: Name of the blob to delete.
    """
    doomed_blob = storage.Client().get_bucket(bucket_name).blob(blob_name)
    doomed_blob.delete()

In [32]:
def frame_rate_channel(audio_file_name):
    """Read the sample rate and channel count from a WAV file header.

    Args:
        audio_file_name: Path of the WAV file to inspect.

    Returns:
        A ``(frame_rate, channels)`` tuple.
    """
    with wave.open(audio_file_name, "rb") as header:
        rate = header.getframerate()
        n_channels = header.getnchannels()
    return rate, n_channels

In [36]:
def google_transcribe(audio_file_name):
    """Transcribe one WAV file through long-running recognition.

    The file is uploaded to the ``bucket_name`` bucket under its own path as
    blob name, recognized from its ``gs://`` URI, and the staged blob is
    deleted again afterwards, including when recognition fails or times out.

    Args:
        audio_file_name: Local path of the WAV file; also used as blob name.

    Returns:
        The top alternative of every result, concatenated in result order
        with no separator added. Results without alternatives are skipped.
    """
    frame_rate, channels = frame_rate_channel(audio_file_name)

    destination_blob_name = audio_file_name
    upload_blob(bucket_name, audio_file_name, destination_blob_name)

    try:
        gcs_uri = 'gs://' + bucket_name + '/' + destination_blob_name

        client = speech.SpeechClient()
        audio = speech.RecognitionAudio(uri=gcs_uri)
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=frame_rate,
            audio_channel_count=channels,
            # Per-channel recognition is only requested for multi-channel audio.
            enable_separate_recognition_per_channel=channels > 1,
            language_code='en-US')

        # Start the asynchronous job and block until it finishes.
        operation = client.long_running_recognize(config=config, audio=audio)
        response = operation.result(timeout=10000)

        transcript = ''.join(
            result.alternatives[0].transcript
            for result in response.results
            if result.alternatives
        )
    finally:
        # Always remove the staged upload so failed runs do not leave blobs behind.
        delete_blob(bucket_name, destination_blob_name)
    return transcript

In [37]:
def write_transcripts(transcript_filename, transcript, output_filepath=None):
    """Write a transcript to a text file, replacing any existing content.

    Args:
        transcript_filename: Name of the file to create inside the output
            directory.
        transcript: Text to write.
        output_filepath: Directory to write into. When omitted, the
            notebook-level ``output_filepath`` variable is used, which is
            what this function read before the parameter existed.

    Raises:
        ValueError: If no directory is passed and no ``output_filepath``
            global is defined.
    """
    if output_filepath is None:
        output_filepath = globals().get("output_filepath")
    if output_filepath is None:
        raise ValueError(
            "output_filepath must be passed or defined as a notebook-level variable"
        )
    # os.path.join inserts the separator that plain concatenation dropped when
    # the directory has no trailing slash (os.path.relpath never returns one).
    target = os.path.join(output_filepath, transcript_filename)
    with open(target, "w") as f:
        f.write(transcript)

In [38]:
# Trial run on the first flagged training clip only; the loop over all
# flagged clips is still disabled.
# for file in train_long_files:
file_name = os.path.join(train_path, train_long_files[0])
transcript = google_transcribe(file_name)
# NOTE(review): the disabled call below passes `save_path` (a directory) where
# write_transcripts expects a transcript filename; confirm the intended
# arguments before re-enabling it.
# write_transcripts(save_path, transcript)