In [1]:
import os
import fnmatch
import io
from google.cloud import speech
import wave
from pydub import AudioSegment
from tqdm import tqdm
from google.cloud import storage

In [2]:
# Root of the dataset. Every path below is built from it by plain string
# concatenation, so each directory string keeps its trailing slash.
data_path = './ABI_data/'
audio_path = data_path + 'Audios/'
diary_path = data_path + 'diarization/'
# Scratch directory where sliced WAV segments are written before transcription.
temp_path = data_path + 'temp/'

# This path will be followed by series name
output_path_base = data_path + 'Transcripts/'
lld_path = data_path + 'LLDs/'

# Text file that receives one segment key per line when transcription fails.
failed_path = data_path + 'transcribing_failed.txt'
# Cloud Storage bucket used to stage audio for long-running recognition.
bucket_name = 'msc_research_kings'
# NOTE(review): absolute, machine-specific path to a credentials file — the
# notebook will not run elsewhere without editing this line.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="/home/changhyun/workspace/ABI_research/config/config3.json"

In [23]:
def transcribing():
    """Run synchronous speech recognition on every entry of ``audio_path``.

    Each entry is opened with ``wave`` to read its channel count and frame
    rate, then its raw bytes are sent as LINEAR16 audio with per-channel
    recognition enabled.

    Returns:
        tuple: ``(responses, errors)`` where ``responses`` holds the API
        response of every successful request and ``errors`` the file names
        whose request raised an exception.
    """
    speech_client = speech.SpeechClient()
    collected = []
    failed = []
    for name in os.listdir(audio_path):
        wav_path = os.path.join(audio_path, name)
        with wave.open(wav_path, "r") as header:
            n_channels = header.getnchannels()
            rate = header.getframerate()
            with open(wav_path, "rb") as raw:
                payload = raw.read()
            request_audio = speech.RecognitionAudio(content=payload)
            request_config = speech.RecognitionConfig(
                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz=rate,
                audio_channel_count=n_channels,
                enable_separate_recognition_per_channel=True,
                language_code="en-US",
            )
            try:
                result = speech_client.recognize(
                    config=request_config, audio=request_audio
                )
            except Exception as exc:
                # Report the failing file and keep going with the rest.
                print(wav_path)
                print(exc)
                failed.append(name)
            else:
                collected.append(result)

    return collected, failed


def short_transcribe(audio_file_name):
    """Transcribe a local WAV file with the synchronous recognize call.

    The file's frame rate and channel count are read via
    ``frame_rate_channel`` and the raw bytes are sent as LINEAR16 audio with
    automatic punctuation enabled and per-channel recognition disabled.

    Args:
        audio_file_name: Path of the WAV file to transcribe.

    Returns:
        tuple: ``(transcript, confidence)``. ``transcript`` is the
        concatenation of the top alternative of every result and
        ``confidence`` is the mean of their confidences (0 when there are no
        results). If the recognition request raises, the file name and error
        are printed and ``('', 0)`` is returned.
    """
    client = speech.SpeechClient()
    frame_rate, channels = frame_rate_channel(audio_file_name)
    with io.open(audio_file_name, "rb") as audio_file:
        content = audio_file.read()
    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=frame_rate,
        audio_channel_count=channels,
        enable_automatic_punctuation=True,
        enable_separate_recognition_per_channel=False,
        language_code="en-US",
    )
    try:
        response = client.recognize(config=config, audio=audio)
    except Exception as exc:
        # Previously a bare `except:` only printed the name and then fell
        # through to use the unbound `response`. Return the "no confidence"
        # result instead; `except Exception` lets KeyboardInterrupt propagate.
        print(audio_file_name)
        print(exc)
        return '', 0

    print(response)
    transcript = ''
    confidences = []
    for result in response.results:
        best = result.alternatives[0]
        transcript += best.transcript
        confidences.append(best.confidence)

    return transcript, mean(confidences)


def long_transcribe(audio_file_name):
    """Transcribe a WAV file through the long-running recognize call.

    The file is uploaded to ``bucket_name`` under its own path as the blob
    name, recognised from the resulting ``gs://`` URI, and the blob is
    deleted afterwards — also when recognition fails or times out.

    Args:
        audio_file_name: Path of the local WAV file to transcribe.

    Returns:
        tuple: ``(transcript, confidence)``. ``transcript`` is the
        concatenation of the top alternative of every result and
        ``confidence`` is the mean of their confidences (0 when there are no
        results).
    """
    frame_rate, channels = frame_rate_channel(audio_file_name)

    destination_blob_name = audio_file_name
    upload_blob(bucket_name, audio_file_name, destination_blob_name)

    try:
        gcs_uri = 'gs://' + bucket_name + '/' + destination_blob_name

        client = speech.SpeechClient()
        audio = speech.RecognitionAudio(uri=gcs_uri)
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=frame_rate,
            audio_channel_count=channels,
            enable_automatic_punctuation=True,
            enable_separate_recognition_per_channel=False,
            language_code='en-US')

        # Start the recognition job and block until it finishes.
        operation = client.long_running_recognize(config=config, audio=audio)
        response = operation.result(timeout=10000)
    finally:
        # Always remove the staged audio so failed runs do not leave blobs
        # behind in the bucket.
        delete_blob(bucket_name, destination_blob_name)

    print(response)
    transcript = ''
    confidence = []
    for result in response.results:
        best = result.alternatives[0]
        transcript += best.transcript
        confidence.append(best.confidence)

    return transcript, mean(confidence)


In [4]:
def check_if_files_mono():
    """Print the channel count pydub reports for the audio at ``audio_path``.

    Reads the module-level ``audio_path``; nothing is returned.

    NOTE(review): ``audio_path`` is configured as a directory-style string
    ending in 'Audios/' and is passed to ``from_wav`` as-is — confirm that
    this is the intended argument.
    """
    loaded = AudioSegment.from_wav(audio_path)
    n_channels = loaded.channels
    print(n_channels)

In [5]:
def frame_rate_channel_freq(audio_path):
    """Count how often each frame rate and channel count occurs in a folder.

    Args:
        audio_path: Directory whose entries are all opened with ``wave``.

    Returns:
        tuple: ``(frame_rates, channels)`` — two dicts mapping a frame rate
        (respectively a channel count) to the number of files that have it.
    """
    rate_counts = {}
    channel_counts = {}
    for entry in os.listdir(audio_path):
        with wave.open(os.path.join(audio_path, entry), "r") as wav:
            rate = wav.getframerate()
            n_channels = wav.getnchannels()
            # A missing key counts as zero occurrences so far.
            rate_counts[rate] = rate_counts.get(rate, 0) + 1
            channel_counts[n_channels] = channel_counts.get(n_channels, 0) + 1
    return rate_counts, channel_counts


In [6]:
def mean(li):
    """Return the arithmetic mean of ``li``, or 0 when ``li`` is empty."""
    count = len(li)
    if not count:
        return 0
    return sum(li) / count

In [7]:
# def short_transcribe(audio_file_name):
#     client = speech.SpeechClient()
#     confidences = []
#     transcript=''
#     frame_rate, channels = frame_rate_channel(audio_file_name)
#     value = False
#     if channels > 1:
#         value = True
#     with io.open(audio_file_name, "rb") as audio_file:
#         content = audio_file.read()
#     audio = speech.RecognitionAudio(content=content)
#     config = speech.RecognitionConfig(
#         encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
#         sample_rate_hertz=frame_rate,
#         audio_channel_count=channels,
#         enable_separate_recognition_per_channel=value,
#         language_code="en-US",
#     )
#     try:
#         response = client.recognize(config=config, audio=audio)
#     except:
#         print(audio_file_name)
#     for result in response.results:
#         transcript += result.alternatives[0].transcript
#         confidences.append(result.alternatives[0].confidence)
#     print(response)

#     return transcript, mean(confidences)

In [8]:
# Build `series`: one entry per line of selected.txt under data_path, with
# newline characters stripped from both ends of each line. Blank lines, if
# any, are kept as empty strings.
with open(data_path+'selected.txt', 'r') as f:
    lines = f.readlines()
    series = []
    for line in lines:
        series.append(line.strip('\n'))


In [24]:
import json

# Confidences collected by this cell; it was previously never initialised,
# so the final assignment raised NameError on a fresh kernel.
conf_dict = {}

# The sliced segments are exported into temp_path, so it has to exist.
os.makedirs(temp_path, exist_ok=True)

for s in series:
    s_diaries = os.listdir(diary_path + s)
    episodes = fnmatch.filter(s_diaries, '*_cleaned.json')
    episodes.sort()

    confidences = []
    for episode in tqdm(episodes):
        # Diarization file: maps a segment key to a dict with 'start' and
        # 'end' (multiplied by 1000 below before slicing the audio).
        with open(diary_path + s + '/' + episode) as f:
            data = json.load(f)

        # Episode id is the file name up to the first '.' and first '_'.
        inst = episode.split('.')[0].split('_')[0]
        audio_file_path = audio_path + s + '/' + inst + '.wav'
        file = AudioSegment.from_wav(audio_file_path)

        # makedirs also creates missing parents and tolerates existing dirs.
        output_lld_path = lld_path + s + '/'
        os.makedirs(output_lld_path, exist_ok=True)
        os.makedirs(output_path_base + s, exist_ok=True)

        print('Current episode: %s' % (inst))

        for key in data:

            segment = data[key]
            start = segment['start'] * 1000
            end = segment['end'] * 1000
            sliced = file[start:end]
            temp_file_path = temp_path + key + '.wav'
            sliced.export(temp_file_path, format="wav")
            try:
                # Segments under 60 s use the synchronous call; longer ones
                # are staged in the bucket for long-running recognition.
                if (end - start) / 1000 < 60.0:
                    transcript, conf = short_transcribe(temp_file_path)
                else:
                    transcript, conf = long_transcribe(temp_file_path)

                if conf == 0:
                    print("Failed to transcribe:", key)
                    with open(failed_path, "a") as fi:
                        fi.write(key + '\n')

                confidences.append(conf)
                new_path = os.path.join(output_path_base + s, key + '.txt')
#                 write_transcripts(new_path, transcript)
#                 print(key, '.txt has been created')

#                 extracted = smile.process_file(temp_file_path)
#                 extracted.to_csv(output_lld_path + key + '.csv')
            finally:
                # Remove the scratch file even when transcription raises or
                # the run is interrupted.
                os.remove(temp_file_path)

    # NOTE(review): this stores the confidences of the WHOLE series under the
    # id of its last episode, as the original cell did — confirm whether a
    # per-episode or per-series key was intended. The guard avoids using an
    # unbound/stale `inst` when a series has no matching episodes.
    if episodes:
        conf_dict[inst] = confidences


  0%|                                                    | 0/26 [00:00<?, ?it/s]

Current episode: Brain Injury Today Ep1 (Keeping the brain injury community connected during the coronavirus outbreak)
results {
  alternatives {
    transcript: "Brain injury today is sponsored by the Washington State traumatic. Brain injury Council and produced by goal 17 media storytellers for the common good."
    confidence: 0.9441017508506775
  }
  result_end_time {
    seconds: 9
    nanos: 330000000
  }
  language_code: "en-us"
}
total_billed_time {
  seconds: 15
}

results {
  alternatives {
    transcript: "Hi, everyone. This is Deborah crawling executive director for the brain injury. Alliance of Washington is an exciting day here today as we are launching our first podcast, brain injury today and we welcome all of you were going to start today with just kind of a dialogue with my board. President is joining us, Allison Molnar as we are in uncertain times of being able to really have communication with all of our community. We\'re taking advantage of all technology has to of

  0%|                                                    | 0/26 [01:28<?, ?it/s]


KeyboardInterrupt: 

In [10]:
# limit 60sec & 10MB
def find_long_audios(path):
    """List the files in ``path`` that exceed the size or duration limit.

    A file is reported when it is larger than 10 MB (as computed by
    ``byte_to_mb``) or, failing that, when its WAV header gives a duration
    of more than 60 seconds.

    Args:
        path: Directory to scan; every entry is examined.

    Returns:
        list: File names (not full paths) of the over-limit files, in
        ``os.listdir`` order.
    """
    long_files = []
    for name in os.listdir(path):
        full_path = os.path.join(path, name)
        if byte_to_mb(os.path.getsize(full_path)) > 10:
            # Too big already; no need to open the file.
            long_files.append(name)
        else:
            with wave.open(full_path, "r") as wav:
                rate = wav.getframerate()
                total_frames = wav.getnframes()
                seconds = total_frames / float(rate)
                if seconds > 60:
                    long_files.append(name)
    return long_files

In [11]:
def byte_to_mb(size):
    """Convert a size in bytes to megabytes, using 1024 * 1024 bytes per MB."""
    kib = size / 1024
    return kib / 1024

In [12]:
# Collect, per data split, the files that exceed the limits checked by
# find_long_audios.
# NOTE(review): train_path, test_path and validation_path are not assigned in
# the cells shown here — they must already exist in the kernel.
train_long_files = find_long_audios(train_path)
test_long_files = find_long_audios(test_path)
valid_long_files = find_long_audios(validation_path)

In [13]:
train_long_files

['MSP-PODCAST_1170_0047.wav',
 'MSP-PODCAST_1167_0089.wav',
 'MSP-PODCAST_0422_0206.wav',
 'MSP-PODCAST_1170_0023.wav',
 'MSP-PODCAST_0456_0084.wav',
 'MSP-PODCAST_1184_0053.wav',
 'MSP-PODCAST_0456_0086.wav',
 'MSP-PODCAST_0422_0233.wav',
 'MSP-PODCAST_0361_0032.wav',
 'MSP-PODCAST_0418_0053.wav',
 'MSP-PODCAST_1167_0092.wav',
 'MSP-PODCAST_1167_0004.wav',
 'MSP-PODCAST_1353_0031.wav',
 'MSP-PODCAST_1353_0035.wav',
 'MSP-PODCAST_0456_0077.wav',
 'MSP-PODCAST_0380_0232.wav',
 'MSP-PODCAST_0456_0094.wav']

In [14]:
test_long_files

['MSP-PODCAST_1159_0022.wav',
 'MSP-PODCAST_1154_0024.wav',
 'MSP-PODCAST_0498_0348.wav',
 'MSP-PODCAST_1130_0008.wav',
 'MSP-PODCAST_1159_0007.wav',
 'MSP-PODCAST_1159_0004.wav',
 'MSP-PODCAST_1130_0002.wav',
 'MSP-PODCAST_1154_0035.wav',
 'MSP-PODCAST_1159_0020.wav',
 'MSP-PODCAST_1130_0006.wav',
 'MSP-PODCAST_1183_0037.wav',
 'MSP-PODCAST_0538_0094.wav',
 'MSP-PODCAST_1130_0004.wav']

In [15]:
valid_long_files

['MSP-PODCAST_1186_0014.wav',
 'MSP-PODCAST_1191_0026.wav',
 'MSP-PODCAST_1186_0007.wav',
 'MSP-PODCAST_1185_0010.wav',
 'MSP-PODCAST_1187_0001.wav',
 'MSP-PODCAST_1188_0023.wav',
 'MSP-PODCAST_1186_0010.wav',
 'MSP-PODCAST_1190_0063.wav',
 'MSP-PODCAST_1185_0011.wav',
 'MSP-PODCAST_1191_0010.wav',
 'MSP-PODCAST_1187_0007.wav',
 'MSP-PODCAST_1186_0006.wav',
 'MSP-PODCAST_1191_0018.wav',
 'MSP-PODCAST_1187_0015.wav',
 'MSP-PODCAST_1191_0019.wav',
 'MSP-PODCAST_1187_0026.wav',
 'MSP-PODCAST_1187_0032.wav']

In [14]:
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a Cloud Storage bucket.

    Args:
        bucket_name: Name of the target bucket.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Name the blob gets inside the bucket.
    """
    target_bucket = storage.Client().get_bucket(bucket_name)
    target_blob = target_bucket.blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)

In [15]:
def delete_blob(bucket_name, blob_name):
    """Delete a blob from a Cloud Storage bucket.

    Args:
        bucket_name: Name of the bucket holding the blob.
        blob_name: Name of the blob to delete.
    """
    owning_bucket = storage.Client().get_bucket(bucket_name)
    doomed_blob = owning_bucket.blob(blob_name)
    doomed_blob.delete()

In [10]:
def frame_rate_channel(audio_file_name):
    """Read the frame rate and channel count from a WAV file's header.

    Args:
        audio_file_name: Path of the WAV file.

    Returns:
        tuple: ``(frame_rate, channels)``.
    """
    with wave.open(audio_file_name, "rb") as wav:
        rate = wav.getframerate()
        n_channels = wav.getnchannels()
    return rate, n_channels

In [43]:
def google_transcribe(audio_file_name):
    """Transcribe a WAV file through the long-running recognize call.

    The file is uploaded to ``bucket_name`` under its own path as the blob
    name, recognised from the resulting ``gs://`` URI, and the blob is
    deleted afterwards — also when recognition fails or times out.
    Per-channel recognition is enabled only when the file has more than one
    channel.

    Args:
        audio_file_name: Path of the local WAV file to transcribe.

    Returns:
        tuple: ``(transcript, confidence)``. ``transcript`` is the
        concatenation of the top alternative of every result and
        ``confidence`` is the mean of their confidences (0 when there are no
        results).
    """
    frame_rate, channels = frame_rate_channel(audio_file_name)

    destination_blob_name = audio_file_name
    upload_blob(bucket_name, audio_file_name, destination_blob_name)

    try:
        gcs_uri = 'gs://' + bucket_name + '/' + destination_blob_name

        client = speech.SpeechClient()
        audio = speech.RecognitionAudio(uri=gcs_uri)

        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=frame_rate,
            audio_channel_count=channels,
            enable_separate_recognition_per_channel=channels > 1,
            language_code='en-US')

        # Start the recognition job and block until it finishes.
        operation = client.long_running_recognize(config=config, audio=audio)
        response = operation.result(timeout=10000)
    finally:
        # Always remove the staged audio so failed runs do not leave blobs
        # behind in the bucket.
        delete_blob(bucket_name, destination_blob_name)

    transcript = ''
    confidence = []
    for result in response.results:
        best = result.alternatives[0]
        transcript += best.transcript
        confidence.append(best.confidence)

    return transcript, mean(confidence)

In [25]:
def write_transcripts(transcript_filename,transcript):
    """Write ``transcript`` to ``transcript_filename``, replacing any content.

    Args:
        transcript_filename: Path of the text file to create or truncate.
        transcript: Text to write.
    """
    # The context manager closes the handle even if the write fails.
    with open(transcript_filename, "w+") as out:
        out.write(transcript)

In [35]:
# for file in tqdm(train_long_files):
#     file_name = os.path.join(train_path, file)
#     transcript = google_transcribe(file_name)
#     write_path = os.path.join(train_write_path, file[0:21] + '.txt')
#     write_transcripts(write_path, transcript)
#     print(file[0:21], '.txt has been created')

In [45]:
# Transcribe every over-limit test file, keep its mean confidence, and write
# the transcript to a .txt file named after the first 21 characters of the
# audio file name.
# NOTE(review): test_path and test_write_path are not assigned in the cells
# shown here — they must already exist in the kernel.
test_confidence = []
for file in tqdm(test_long_files):
    file_name = os.path.join(test_path, file)
    transcript, confidence = google_transcribe(file_name)
    test_confidence.append(confidence)
    write_path = os.path.join(test_write_path, file[0:21] + '.txt')
    write_transcripts(write_path, transcript)
    # print() separates its arguments with a space, hence "<name> .txt ...".
    print(file[0:21], '.txt has been created')

  8%|███▍                                        | 1/13 [00:26<05:19, 26.60s/it]

MSP-PODCAST_1159_0022 .txt has been created


 15%|██████▊                                     | 2/13 [00:46<04:10, 22.77s/it]

MSP-PODCAST_1154_0024 .txt has been created


 23%|██████████▏                                 | 3/13 [01:57<07:27, 44.77s/it]

MSP-PODCAST_0498_0348 .txt has been created


 31%|█████████████▌                              | 4/13 [02:23<05:37, 37.47s/it]

MSP-PODCAST_1130_0008 .txt has been created


 38%|████████████████▉                           | 5/13 [03:05<05:11, 38.92s/it]

MSP-PODCAST_1159_0007 .txt has been created


 46%|████████████████████▎                       | 6/13 [04:10<05:34, 47.72s/it]

MSP-PODCAST_1159_0004 .txt has been created


 54%|███████████████████████▋                    | 7/13 [04:48<04:27, 44.52s/it]

MSP-PODCAST_1130_0002 .txt has been created


 62%|███████████████████████████                 | 8/13 [05:28<03:35, 43.11s/it]

MSP-PODCAST_1154_0035 .txt has been created


 69%|██████████████████████████████▍             | 9/13 [06:14<02:56, 44.14s/it]

MSP-PODCAST_1159_0020 .txt has been created


 77%|█████████████████████████████████          | 10/13 [06:54<02:08, 42.74s/it]

MSP-PODCAST_1130_0006 .txt has been created


 85%|████████████████████████████████████▍      | 11/13 [07:32<01:22, 41.28s/it]

MSP-PODCAST_1183_0037 .txt has been created


 92%|███████████████████████████████████████▋   | 12/13 [07:52<00:34, 34.95s/it]

MSP-PODCAST_0538_0094 .txt has been created


100%|███████████████████████████████████████████| 13/13 [08:23<00:00, 38.69s/it]

MSP-PODCAST_1130_0004 .txt has been created





In [46]:
test_confidence

[0.9495081007480621,
 0.9791086912155151,
 0.9364469528198243,
 0.9494617581367493,
 0.9273580511411031,
 0.8997351825237274,
 0.967968612909317,
 0.9590785106023153,
 0.7883926033973694,
 0.9483107626438141,
 0.9263116896152497,
 0.9501621723175049,
 0.9424928625424703]

In [47]:
# Transcribe every over-limit validation file, keep its mean confidence, and
# write the transcript to a .txt file named after the first 21 characters of
# the audio file name.
# NOTE(review): validation_path and validation_write_path are not assigned in
# the cells shown here — they must already exist in the kernel.
validation_confidence = []
for file in tqdm(valid_long_files):
    file_name = os.path.join(validation_path, file)
    transcript, confidence = google_transcribe(file_name)
    validation_confidence.append(confidence)
    write_path = os.path.join(validation_write_path, file[0:21] + '.txt')
    write_transcripts(write_path, transcript)
    # print() separates its arguments with a space, hence "<name> .txt ...".
    print(file[0:21], '.txt has been created')

  6%|██▌                                         | 1/17 [00:18<05:02, 18.93s/it]

MSP-PODCAST_1186_0014 .txt has been created


 12%|█████▏                                      | 2/17 [00:58<07:44, 30.95s/it]

MSP-PODCAST_1191_0026 .txt has been created


 18%|███████▊                                    | 3/17 [01:14<05:39, 24.24s/it]

MSP-PODCAST_1186_0007 .txt has been created


 24%|██████████▎                                 | 4/17 [02:00<07:05, 32.71s/it]

MSP-PODCAST_1185_0010 .txt has been created


 29%|████████████▉                               | 5/17 [02:38<06:55, 34.59s/it]

MSP-PODCAST_1187_0001 .txt has been created


 35%|███████████████▌                            | 6/17 [03:35<07:45, 42.30s/it]

MSP-PODCAST_1188_0023 .txt has been created


 41%|██████████████████                          | 7/17 [04:05<06:21, 38.18s/it]

MSP-PODCAST_1186_0010 .txt has been created


 47%|████████████████████▋                       | 8/17 [04:52<06:09, 41.05s/it]

MSP-PODCAST_1190_0063 .txt has been created


 53%|███████████████████████▎                    | 9/17 [05:17<04:47, 35.98s/it]

MSP-PODCAST_1185_0011 .txt has been created


 59%|█████████████████████████▎                 | 10/17 [05:44<03:53, 33.39s/it]

MSP-PODCAST_1191_0010 .txt has been created


 65%|███████████████████████████▊               | 11/17 [06:13<03:11, 31.88s/it]

MSP-PODCAST_1187_0007 .txt has been created


 71%|██████████████████████████████▎            | 12/17 [06:56<02:57, 35.50s/it]

MSP-PODCAST_1186_0006 .txt has been created


 76%|████████████████████████████████▉          | 13/17 [07:19<02:06, 31.72s/it]

MSP-PODCAST_1191_0018 .txt has been created


 82%|███████████████████████████████████▍       | 14/17 [07:33<01:18, 26.31s/it]

MSP-PODCAST_1187_0015 .txt has been created


 88%|█████████████████████████████████████▉     | 15/17 [07:52<00:48, 24.06s/it]

MSP-PODCAST_1191_0019 .txt has been created


 94%|████████████████████████████████████████▍  | 16/17 [08:20<00:25, 25.26s/it]

MSP-PODCAST_1187_0026 .txt has been created


100%|███████████████████████████████████████████| 17/17 [08:58<00:00, 31.65s/it]

MSP-PODCAST_1187_0032 .txt has been created





In [48]:
validation_confidence

[0.9499222040176392,
 0.9642936885356903,
 0.9638748466968536,
 0.9199155569076538,
 0.9694797843694687,
 0.9587657898664474,
 0.9480800032615662,
 0.9034708042939504,
 0.963307335972786,
 0.9440132677555084,
 0.9383042305707932,
 0.9420467913150787,
 0.9685478806495667,
 0.96807461977005,
 0.9333463907241821,
 0.9595717936754227,
 0.8928799304095182]