# ASR Parallel Processing

This notebook contains the results of the experiment documented [here](https://docs.google.com/document/d/16ifixrlyDK5A6-MmYMm8HEdJPwtyCmiUTc8HdWlJxM4/edit).

The outcome of this test should give us guidance for customers on whether they can split long audio files into chunks and process them in parallel, and whether doing so drastically affects the results.

In [47]:
import os 
import openai
import time
from pydub import AudioSegment
from pydub.utils import make_chunks
import requests
from concurrent.futures import ThreadPoolExecutor
from math import ceil
# Transcription compare comes from this repo: https://github.com/voicegain/transcription-compare
# Clone it and install to your virtual environment with `python setup.py transcription_compare`
from transcription_compare.levenshtein_distance_calculator import UKKLevenshteinDistanceCalculator
from transcription_compare.tokenizer import CharacterTokenizer, WordTokenizer
from transcription_compare.local_optimizer.digit_util import DigitUtil
from transcription_compare.local_optimizer.local_cer_optimizer import LocalCerOptimizer
from transcription_compare.results import MultiResult

from asr import call_asr
from transformers import convert_mp4_to_mp3

In [48]:
# Authenticate the OpenAI client with the key held in the environment
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Folder that holds the long recording(s) used in this experiment
data_dir = os.path.join(os.pardir, 'data', 'long_video')

# Alphabetically ordered names of everything in the data folder
audio_files = sorted(os.listdir(data_dir))
print(audio_files)

['gregbrockman_futureofllms.mp3', 'gregbrockman_futureofllms.mp4']


In [49]:
def transcription_extractor(audio_filepath):
    """Pass one audio file path to `call_asr` (with the configured OpenAI key) and return its response."""
    return call_asr(openai.api_key, audio_filepath)

def make_chunks_updated(audio_segment, chunk_length):
    """
    Split `audio_segment` into consecutive slices of `chunk_length` units.

    For an AudioSegment the unit is milliseconds, so a chunk_length of 50
    yields 50 millisecond pieces. Every piece has the full length except
    possibly the final one, which holds whatever remains.
    """
    chunk_count = int(ceil(len(audio_segment) / float(chunk_length)))
    pieces = []
    for index in range(chunk_count):
        start = index * chunk_length
        stop = (index + 1) * chunk_length
        pieces.append(audio_segment[start:stop])
    return pieces

In [50]:
# Load the first listed recording from the data directory as an AudioSegment
full_audio_path = os.path.join(data_dir, audio_files[0])
full_audio_file = AudioSegment.from_mp3(full_audio_path)

## Transcription

We will perform three transcriptions to get our sets to run through the API and compare:
- First we'll take the full file, run it into the ASR API and save the responses along with the elapsed time
- Next we'll take the file, break it into 5 minute chunks and do the same
- Lastly we'll take the file, break it into 10 minute chunks and pull responses and times

### Long File Transcription

This section transcribes the full file with no splitting

In [51]:
# Time a single ASR request covering the whole, unsplit recording
full_st = time.time()
print(f'Start time is {time.ctime()}')
full_audio_path = os.path.join(data_dir, audio_files[0])
full_response = call_asr(openai.api_key, full_audio_path)
full_et = time.time()
print(f'End time is {time.ctime()}')

# Wall-clock duration of the request, in seconds
full_elapsed_time = full_et - full_st
print('Execution time:', full_elapsed_time, 'seconds')

Start time is Thu Dec 15 09:59:28 2022
Sending request
Request successful
End time is Thu Dec 15 10:02:02 2022
Execution time: 154.58868384361267 seconds


### 5 Minute Chunk Transcription

In [52]:
# set 5 minute chunks directory
chunks_5_dir = os.path.join(os.curdir, 'chunks_5')

# create the directory if it doesn't yet exist (no error when it already does)
os.makedirs(chunks_5_dir, exist_ok=True)

# clear down any files left over from a previous run so stale chunks are not
# picked up later. This is done in Python instead of shelling out to `rm`, so
# it works on any OS, never passes a path through a shell, and raises if a
# file cannot be deleted rather than failing silently.
for stale_name in os.listdir(chunks_5_dir):
    stale_path = os.path.join(chunks_5_dir, stale_name)
    if os.path.isfile(stale_path):
        os.remove(stale_path)

In [53]:
# split the full recording into 5 minute chunks (chunk length is in milliseconds)
chunks_5 = make_chunks_updated(full_audio_file, 5 * 60 * 1000)

# write each chunk to disk as chunk_5_<index>.mp3 so it can be sent to the API
for i, ch in enumerate(chunks_5):
    ch.export(os.path.join(chunks_5_dir, f'chunk_5_{i}.mp3'), format='mp3')

In [54]:
# Collect the exported chunk paths in playback order. The names are sorted by
# their numeric index, because a plain string sort would place chunk_5_10
# before chunk_5_2 and the transcripts would be reassembled out of order once
# there are more than ten chunks. Files that are not chunk_5_<n>.mp3 are ignored.
chunks_5_files = [
    os.path.join(chunks_5_dir, name)
    for name in sorted(
        (n for n in os.listdir(chunks_5_dir)
         if n.startswith('chunk_5_') and n.endswith('.mp3')),
        key=lambda n: int(n[len('chunk_5_'):-len('.mp3')]),
    )
]
chunks_5_files

['./chunks_5/chunk_5_0.mp3',
 './chunks_5/chunk_5_1.mp3',
 './chunks_5/chunk_5_2.mp3',
 './chunks_5/chunk_5_3.mp3',
 './chunks_5/chunk_5_4.mp3',
 './chunks_5/chunk_5_5.mp3',
 './chunks_5/chunk_5_6.mp3',
 './chunks_5/chunk_5_7.mp3',
 './chunks_5/chunk_5_8.mp3',
 './chunks_5/chunk_5_9.mp3']

In [55]:
# Send every 5 minute chunk to the ASR concurrently and time the whole batch
with ThreadPoolExecutor(max_workers=10) as pool:
    start_5_ts = time.time()
    # pool.map yields results in input order, so responses line up with chunks_5_files
    response_list_5 = [resp for resp in pool.map(transcription_extractor, chunks_5_files)]
    end_5_ts = time.time()
    elapsed_5 = end_5_ts - start_5_ts
    print("\n", "time elapsed is :", elapsed_5)

Sending request
Sending request
Sending request
Sending request
Sending request
Sending request
Sending request
Sending request
Sending request
Sending request
Request successful
Request successful
Request successful
Request successful
Request successful
Request successful
Request successful
Request successful
Request successful
Request successful

 time elapsed is : 111.0569999217987


### 10 Minute Chunk Transcription

In [56]:
# set 10 minute chunks directory
chunks_10_dir = os.path.join(os.curdir, 'chunks_10')

# create the directory if it doesn't yet exist (no error when it already does)
os.makedirs(chunks_10_dir, exist_ok=True)

# clear down any files left over from a previous run so stale chunks are not
# picked up later. This is done in Python instead of shelling out to `rm`, so
# it works on any OS, never passes a path through a shell, and raises if a
# file cannot be deleted rather than failing silently.
for stale_name in os.listdir(chunks_10_dir):
    stale_path = os.path.join(chunks_10_dir, stale_name)
    if os.path.isfile(stale_path):
        os.remove(stale_path)

In [57]:
# split the full recording into 10 minute chunks (chunk length is in milliseconds)
chunks_10 = make_chunks_updated(full_audio_file, 10 * 60 * 1000)

# write each chunk to disk as chunk_10_<index>.mp3 so it can be sent to the API
for i, ch in enumerate(chunks_10):
    ch.export(os.path.join(chunks_10_dir, f'chunk_10_{i}.mp3'), format='mp3')

In [58]:
# Collect the exported chunk paths in playback order. The names are sorted by
# their numeric index, because a plain string sort would place chunk_10_10
# before chunk_10_2 and the transcripts would be reassembled out of order once
# there are more than ten chunks. Files that are not chunk_10_<n>.mp3 are ignored.
chunks_10_files = [
    os.path.join(chunks_10_dir, name)
    for name in sorted(
        (n for n in os.listdir(chunks_10_dir)
         if n.startswith('chunk_10_') and n.endswith('.mp3')),
        key=lambda n: int(n[len('chunk_10_'):-len('.mp3')]),
    )
]
chunks_10_files

['./chunks_10/chunk_10_0.mp3',
 './chunks_10/chunk_10_1.mp3',
 './chunks_10/chunk_10_2.mp3',
 './chunks_10/chunk_10_3.mp3',
 './chunks_10/chunk_10_4.mp3']

In [59]:
# Send every 10 minute chunk to the ASR concurrently and time the whole batch
with ThreadPoolExecutor(max_workers=10) as pool:
    start_10_ts = time.time()
    # pool.map yields results in input order, so responses line up with chunks_10_files
    response_list_10 = [resp for resp in pool.map(transcription_extractor, chunks_10_files)]
    end_10_ts = time.time()
    elapsed_10 = end_10_ts - start_10_ts
    print("\n", "time elapsed is :", elapsed_10)

Sending requestSending request
Sending request
Sending request

Sending request
Request successful
Request successful
Request successful
Request successful
Request successful

 time elapsed is : 117.20787191390991


### Timing Summary

The customer quoted 48 minutes to process a 48-minute file.

In [60]:
# How many times faster each chunked run finished compared with the single-file run
speedup_5 = full_elapsed_time / elapsed_5
speedup_10 = full_elapsed_time / elapsed_10

# Fractional gain over the single-file run, rounded to two decimals
improvement_5 = round(speedup_5 - 1, 2)
improvement_10 = round(speedup_10 - 1, 2)

In [61]:
from datetime import timedelta

# Report each run's duration as H:MM:SS and its gain over the single-file run.
# The gain is rendered with the `%` format type instead of multiplying by 100,
# because `round(x, 2) * 100` can print float artefacts such as 28.999999999999996.
print(f"Processing one file took {timedelta(seconds=full_elapsed_time)!s}")
print(f"Processing ten minute chunks took {timedelta(seconds=elapsed_10)!s} and delivered a {improvement_10:.1%} improvement")
print(f"Processing five minute chunks took {timedelta(seconds=elapsed_5)!s} and delivered a {improvement_5:.1%} improvement")

Processing one file took 0:02:34.588684
Processing ten minute chunks took 0:01:57.207872 and delivered a 32.0% improvement
Processing five minute chunks took 0:01:51.057000 and delivered a 39.0% improvement


## Accuracy

Now we'll compare the outputs to confirm whether the chunking decreases accuracy meaningfully

TODO: This is not completed, as the results above were so positive we have not investigated the chunking method further

In [28]:
# Pull the transcript string out of each chunk's JSON response, preserving chunk order
chunks_5_list = [chunk_response.json()['text'] for chunk_response in response_list_5]

chunks_10_list = [chunk_response.json()['text'] for chunk_response in response_list_10]

In [29]:
# Transcript of the unsplit recording
full_text = full_response.json()['text']

# Stitch the per-chunk transcripts back together. Each chunk is transcribed
# independently, so joining with '' would fuse the last word of one chunk with
# the first word of the next and distort any word-level comparison. Strip each
# piece and join with a single space instead.
chunks_10_text = ' '.join(text.strip() for text in chunks_10_list)
chunks_5_text = ' '.join(text.strip() for text in chunks_5_list)

In [30]:
# Character counts of the three transcripts, as a quick sanity check
tuple(len(text) for text in (full_text, chunks_10_text, chunks_5_text))

(50549, 50416, 50443)

In [None]:
# Word-level distance calculator that also returns the alignment
calculator = UKKLevenshteinDistanceCalculator(
        tokenizer=WordTokenizer(),
        get_alignment_result=True,
        local_optimizers=[DigitUtil(process_output_digit=True), LocalCerOptimizer()]
    )

# This cell previously referenced `reference_text` and `value`, which were never
# defined. The unsplit transcript is used as the reference and each stitched
# chunk transcript is compared against it.
reference_text = full_text

# Map of (transcript label -> distance result against the reference)
output_results = {
    label: calculator.get_distance(reference_text, value,
                                   brackets_list=["[]", "()", "<>"],
                                   to_lower=True,
                                   remove_punctuation=True,
                                   use_alternative_spelling=True)
    for label, value in (('chunks_10', chunks_10_text), ('chunks_5', chunks_5_text))
}

In [None]:
def run_transcription_compare(reference_data, comparison_file, output_html_path):
    """Compare transcript file(s) against a reference text and write an HTML report.

    Previously both input parameters were ignored and the body read the
    undefined names `reference_text`, `output_file_list` and `logging`, so any
    call raised NameError. The parameters are now wired in.

    Args:
        reference_data: The reference transcript text itself (not a path).
        comparison_file: Path to one transcript text file, or an iterable of
            such paths. Each file is read as UTF-8 and keyed by its base name.
        output_html_path: Where the merged HTML comparison is written.

    Returns:
        None. The result is the file written at `output_html_path`.
    """
    print("Start to compare results")

    reference_text = reference_data

    # Accept a single path as well as a collection of paths
    if isinstance(comparison_file, (str, bytes, os.PathLike)):
        output_file_list = [comparison_file]
    else:
        output_file_list = list(comparison_file)

    calculator = UKKLevenshteinDistanceCalculator(
        tokenizer=WordTokenizer(),
        get_alignment_result=True,
        local_optimizers=[DigitUtil(process_output_digit=True), LocalCerOptimizer()]
    )

    output_all = dict()  # (output identifier -> output string)
    for output_path in output_file_list:
        with open(output_path, "r", encoding='utf-8') as output_file:
            output_text = output_file.read()
        output_path_name = os.path.basename(output_path)
        output_all[output_path_name] = output_text

    output_results = dict()  # (output identifier -> distance result)
    for (key, value) in output_all.items():
        # print rather than logging: `logging` is not imported in this notebook
        print("Start to process {}".format(key))
        output_results[key] = calculator.get_distance(reference_text, value,
                                                      brackets_list=["[]", "()", "<>"],
                                                      to_lower=True,
                                                      remove_punctuation=True,
                                                      use_alternative_spelling=True)

    # A character-level calculator is handed to MultiResult to merge all results into one HTML page
    calculator_local = UKKLevenshteinDistanceCalculator(
                tokenizer=CharacterTokenizer(),
                get_alignment_result=False)

    result = MultiResult(output_results, calculator_local)
    s = result.to_html()

    # Explicit encoding so non-ASCII transcript characters are written on any platform
    with open(output_html_path, 'w', encoding='utf-8') as f:
        f.write(s)

In [31]:
# Build the word-level distance calculator, with alignment output enabled
word_level_optimizers = [DigitUtil(process_output_digit=True), LocalCerOptimizer()]
calculator = UKKLevenshteinDistanceCalculator(
    tokenizer=WordTokenizer(),
    get_alignment_result=True,
    local_optimizers=word_level_optimizers,
)