In [49]:
import os
from google.cloud import speech
import wave
from pydub import AudioSegment
from google.cloud import storage

In [52]:
train_path = os.path.relpath('./Data/train/')
test_path = os.path.relpath('./Data/test')
validation_path = os.path.relpath('./Data/validation')
train_write_path = os.path.relpath('./Data/transcripts/train')
test_write_path = os.path.relpath('./Data/transcripts/test')
validation_write_path = os.path.relpath('./Data/transcripts/validation')
save_path = os.path.relpath('.')
bucket_name = 'msc_research'
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="/home/changhyun/workspace/ABI_research/config/google_services.json"


In [3]:
def transcribing():
    client = speech.SpeechClient()
    responses = []
    errors = []
    for file in os.listdir(audio_path):
        path = os.path.join(audio_path, file)
        with wave.open(path, "r") as wf:
            channel = wf.getnchannels()
            frame_rate = wf.getframerate()
            with open(path, "rb") as audio_file:
                content = audio_file.read()
            audio = speech.RecognitionAudio(content=content)
            config = speech.RecognitionConfig(
                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz= frame_rate,
                audio_channel_count=channel,
                enable_separate_recognition_per_channel=True,
                language_code="en-US",
            )
            try:
                responses.append(client.recognize(config=config, audio=audio))
            except Exception as e:
                print(path)
                print(e)
                errors.append(file)

    return responses, errors


In [4]:
def check_if_files_mono():
    sample = AudioSegment.from_wav(audio_path)
    print(sample.channels)

In [5]:
def frame_rate_channel(audio_path):
    frame_rates = {}
    channels = {}
    for file in os.listdir(audio_path):
        path = os.path.join(audio_path, file)
        with wave.open(path, "r") as wf:
            frame_rate = wf.getframerate()
            channel = wf.getnchannels()
            freq = frame_rates.get(frame_rate, "None")
            if freq == "None":
                frame_rates[frame_rate] = 1
            else:
                frame_rates[frame_rate] += 1
            freq = channels.get(channel, "None")
            if freq == "None":
                channels[channel] = 1
            else:
                channels[channel] += 1
    return frame_rates, channels


In [6]:
frame_rates, channels = frame_rate_channel(audio_path)

In [7]:
frame_rates

{16000: 608, 44100: 119, 22050: 23}

In [27]:
channels

{1: 598, 2: 152}

In [8]:
responses, errors = transcribing()

Data/validation/MSP-PODCAST_1186_0014.wav
Data/validation/MSP-PODCAST_1191_0026.wav
Data/validation/MSP-PODCAST_1186_0007.wav
Data/validation/MSP-PODCAST_1185_0010.wav
Data/validation/MSP-PODCAST_1187_0001.wav
Data/validation/MSP-PODCAST_1188_0023.wav
Data/validation/MSP-PODCAST_1186_0010.wav
Data/validation/MSP-PODCAST_1190_0063.wav
Data/validation/MSP-PODCAST_1185_0011.wav
Data/validation/MSP-PODCAST_1191_0010.wav
Data/validation/MSP-PODCAST_1187_0007.wav
Data/validation/MSP-PODCAST_1186_0006.wav
Data/validation/MSP-PODCAST_1191_0018.wav
Data/validation/MSP-PODCAST_1187_0015.wav
Data/validation/MSP-PODCAST_1191_0019.wav
Data/validation/MSP-PODCAST_1187_0026.wav
Data/validation/MSP-PODCAST_1187_0032.wav


[results {
   alternatives {
     transcript: "for the evidence of proof or anything"
     confidence: 0.8420075178146362
   }
   result_end_time {
     seconds: 2
     nanos: 130000000
   }
   language_code: "en-us"
 }
 total_billed_time {
   seconds: 15
 },
 results {
   alternatives {
     transcript: "next time you\'re going to be a freaking that we\'re going to build a like a gigantic crater cross I mean"
     confidence: 0.8188715577125549
   }
   result_end_time {
     seconds: 5
     nanos: 10000000
   }
   language_code: "en-us"
 }
 total_billed_time {
   seconds: 15
 },
 results {
   alternatives {
     transcript: "the dragon Rising you can get a book"
     confidence: 0.9100139141082764
   }
   channel_tag: 2
   result_end_time {
     seconds: 6
     nanos: 50000000
   }
   language_code: "en-us"
 }
 results {
   alternatives {
     transcript: "we should just play around with The Rock"
     confidence: 0.8178876638412476
   }
   channel_tag: 1
   result_end_time {
     sec

In [21]:
# limit 60sec & 10MB
def find_long_audios(path):
    files = []
    for file in os.listdir(path):
        file_path = os.path.join(path, file)
        size = byte_to_mb(os.path.getsize(file_path))
        if size > 10:
            files.append(file)
            continue
        with wave.open(file_path, "r") as wf:
            frame_rate = wf.getframerate()
            channel = wf.getnchannels()
            n_frames = wf.getnframes()
            duration = n_frames / float(frame_rate)
            if duration > 60:
                files.append(file)
    return files

In [11]:
def byte_to_mb(size):
    return size / 1024 / 1024

In [22]:
train_long_files = find_long_audios(train_path)
test_long_files = find_long_audios(test_path)
valid_long_files = find_long_audios(validation_path)

In [23]:
train_long_files

['MSP-PODCAST_1170_0047.wav',
 'MSP-PODCAST_1167_0089.wav',
 'MSP-PODCAST_0422_0206.wav',
 'MSP-PODCAST_1170_0023.wav',
 'MSP-PODCAST_0456_0084.wav',
 'MSP-PODCAST_1184_0053.wav',
 'MSP-PODCAST_0456_0086.wav',
 'MSP-PODCAST_0422_0233.wav',
 'MSP-PODCAST_0361_0032.wav',
 'MSP-PODCAST_0418_0053.wav',
 'MSP-PODCAST_1167_0092.wav',
 'MSP-PODCAST_1167_0004.wav',
 'MSP-PODCAST_1353_0031.wav',
 'MSP-PODCAST_1353_0035.wav',
 'MSP-PODCAST_0456_0077.wav',
 'MSP-PODCAST_0380_0232.wav',
 'MSP-PODCAST_0456_0094.wav']

In [24]:
test_long_files

['MSP-PODCAST_1159_0022.wav',
 'MSP-PODCAST_1154_0024.wav',
 'MSP-PODCAST_0498_0348.wav',
 'MSP-PODCAST_1130_0008.wav',
 'MSP-PODCAST_1159_0007.wav',
 'MSP-PODCAST_1159_0004.wav',
 'MSP-PODCAST_1130_0002.wav',
 'MSP-PODCAST_1154_0035.wav',
 'MSP-PODCAST_1159_0020.wav',
 'MSP-PODCAST_1130_0006.wav',
 'MSP-PODCAST_1183_0037.wav',
 'MSP-PODCAST_0538_0094.wav',
 'MSP-PODCAST_1130_0004.wav']

In [26]:
valid_long_files

['MSP-PODCAST_1186_0014.wav',
 'MSP-PODCAST_1191_0026.wav',
 'MSP-PODCAST_1186_0007.wav',
 'MSP-PODCAST_1185_0010.wav',
 'MSP-PODCAST_1187_0001.wav',
 'MSP-PODCAST_1188_0023.wav',
 'MSP-PODCAST_1186_0010.wav',
 'MSP-PODCAST_1190_0063.wav',
 'MSP-PODCAST_1185_0011.wav',
 'MSP-PODCAST_1191_0010.wav',
 'MSP-PODCAST_1187_0007.wav',
 'MSP-PODCAST_1186_0006.wav',
 'MSP-PODCAST_1191_0018.wav',
 'MSP-PODCAST_1187_0015.wav',
 'MSP-PODCAST_1191_0019.wav',
 'MSP-PODCAST_1187_0026.wav',
 'MSP-PODCAST_1187_0032.wav']

In [28]:
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the bucket."""
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(source_file_name)

In [29]:
def delete_blob(bucket_name, blob_name):
    """Deletes a blob from the bucket."""
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(blob_name)

    blob.delete()

In [39]:
def frame_rate_channel(audio_file_name):
    with wave.open(audio_file_name, "rb") as wave_file:
        frame_rate = wave_file.getframerate()
        channels = wave_file.getnchannels()
        return frame_rate,channels

In [46]:
def google_transcribe(audio_file_name):
    
#     file_name = filepath + audio_file_name

    # The name of the audio file to transcribe
    
    frame_rate, channels = frame_rate_channel(audio_file_name)
    
#     source_file_name = filepath + audio_file_name
    destination_blob_name = audio_file_name
    
    upload_blob(bucket_name, audio_file_name, destination_blob_name)
    
    gcs_uri = 'gs://' + bucketname + '/' + audio_file_name
    transcript = ''
    
    client = speech.SpeechClient()
    audio = types.RecognitionAudio(uri=gcs_uri)

    config = types.RecognitionConfig(
    encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
    sample_rate_hertz=frame_rate,
    language_code='en-US')

    # Detects speech in the audio file
    operation = client.long_running_recognize(config, audio)
    response = operation.result(timeout=10000)

    for result in response.results:
        transcript += result.alternatives[0].transcript
    
    delete_blob(bucket_name, destination_blob_name)
    return transcript

In [31]:
def write_transcripts(transcript_filename,transcript):
    f= open(output_filepath + transcript_filename,"w+")
    f.write(transcript)
    f.close()

In [53]:
# for file in train_long_files:
file_name = os.path.join(train_path, train_long_files[0])
transcript = google_transcribe(file_name)
write_transcripts(save_path, transcript)

Forbidden: 403 GET https://storage.googleapis.com/storage/v1/b/msc_research?projection=noAcl&prettyPrint=false: abi-research@round-booking-342500.iam.gserviceaccount.com does not have storage.buckets.get access to the Google Cloud Storage bucket.