# ASR & subtitles generation

### *How to generate an SRT file?*

- Get the transcription from the **ASR** model

In [None]:
import os
from dotenv import load_dotenv
from pathlib import Path

# Load the variables declared in the workspace .env file into the process
# environment before reading them below.
dotenv_path = Path("/workspace/.env")
load_dotenv(dotenv_path=dotenv_path)

# Endpoint URL and access token used by the ASR client; each is None when the
# corresponding variable is not set.
asr_endpoint_url = os.getenv("ASR_EN_US_ENDPOINT")
ai_endpoint_token = os.getenv("OVH_AI_ENDPOINTS_ACCESS_TOKEN")

In [None]:
import riva.client

# ASR function
def asr_transcription(source_lang, audio_input):
    """Transcribe an audio file with the Riva ASR service.

    Connects to the endpoint given by the module-level ``asr_endpoint_url``
    and authenticates with ``ai_endpoint_token``.

    Args:
        source_lang: Language code passed to the recognizer (the notebook
            calls this with "en-US").
        audio_input: Path of the audio file to read and send for recognition.

    Returns:
        A list with one ``[transcript, start_time, end_time]`` entry per
        recognized result, where the times are the start of the first word
        and the end of the last word. The notebook feeds these times to
        ``ms_to_timecode``, so they are presumably milliseconds - TODO confirm
        against the Riva word-offset documentation.
        Results without any alternative or without word timings are skipped,
        since no time range can be derived for them.
    """
    # connect with riva asr server
    asr_service = riva.client.ASRService(
        riva.client.Auth(
            uri=asr_endpoint_url,
            use_ssl=True,
            metadata_args=[["authorization", f"bearer {ai_endpoint_token}"]],
        )
    )

    # set up config; word time offsets are required to build the time ranges
    asr_config = riva.client.RecognitionConfig(
        language_code=source_lang,
        max_alternatives=1,
        enable_automatic_punctuation=True,
        enable_word_time_offsets=True,
        audio_channel_count=1,
    )

    # open and read audio file
    with open(audio_input, 'rb') as fh:
        audio = fh.read()

    riva.client.add_audio_file_specs_to_config(asr_config, audio)
    riva.client.add_word_boosting_to_config(asr_config, ["Ovh", "datacenter", "cloud"], 20.0)

    resp = asr_service.offline_recognize(audio, asr_config)

    # extract sentence text and time range from the first alternative
    output_asr = []
    for result in resp.results:
        if not result.alternatives:
            continue
        best = result.alternatives[0]
        words = best.words
        if not words:
            # Without word offsets there is no start/end time; previously this
            # case raised NameError or reused the previous sentence's timings.
            continue
        output_asr.append([best.transcript, words[0].start_time, words[-1].end_time])

    return output_asr

- Convert ms into timecode

In [None]:
# convert ms into timecode
def ms_to_timecode(x):
    """Format a duration in milliseconds as an ``HH:MM:SS,mmm`` timecode.

    Args:
        x: Duration in milliseconds.

    Returns:
        The timecode string, with a comma before the millisecond field.
    """
    hours, remainder = divmod(x, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, remainder = divmod(remainder, 1_000)
    # Floor away any fractional part left over from a non-integer input.
    millis = remainder // 1

    return '%.2d:%.2d:%.2d,%.3d' % (hours, minutes, seconds, millis)

- Create SRT file for video subtitles

In [None]:
# create SRT file with subtitles
def generate_str_file(output_asr):
    """Build the text content of a SubRip (.srt) subtitle file.

    Args:
        output_asr: Sequence of entries indexed as ``[text, start, end]``,
            where ``start`` and ``end`` are passed to ``ms_to_timecode``.

    Returns:
        The SRT content as one string: for every entry a counter line, a
        ``start --> end`` timecode line, the text line, and an empty line.
        Returns an empty string when ``output_asr`` is empty.
    """
    lines = []
    # SubRip cue counters are 1-based; the previous version started at 0.
    for index, entry in enumerate(output_asr, start=1):
        start = ms_to_timecode(entry[1])
        end = ms_to_timecode(entry[2])
        lines.extend((str(index), f"{start} --> {end}", entry[0], ''))

    return '\n'.join(lines)

- Play audio sample

In [None]:
from IPython.display import Audio

# File name of the sample clip; the transcription and subtitle cells below
# reuse this variable to build their input and output paths.
audio_input = "audio_ovhcloud_en_1.wav"
# Left as the final bare expression of the cell - presumably so the notebook
# displays the resulting Audio object (verify when moving this line).
Audio(f"/workspace/ai-multimedia-translator/audio_samples/{audio_input}")

- Get results from **RIVA ASR**

In [None]:
# Transcribe the selected sample and show the raw result returned by
# asr_transcription.
output_asr = asr_transcription(
    "en-US",
    f"/workspace/ai-multimedia-translator/audio_samples/{audio_input}",
)
print("Transcription output - RIVA ASR:\n\n", output_asr)

- Generate **SRT file**

In [None]:
# subtitles generation
# Build the SRT text once and reuse it for both the file and the preview
# (it was previously generated twice).
srt_content = generate_str_file(output_asr)

# audio_input[:-4] drops the 4-character ".wav" extension of the sample name.
# Write as UTF-8 explicitly so the file does not depend on the platform's
# default encoding if the transcript contains non-ASCII characters.
with open(
    f"/workspace/ai-multimedia-translator/outputs/subtitles_{audio_input[:-4]}.srt",
    'w',
    encoding="utf-8",
) as f:
    f.write(srt_content)

print("Generated subtitles - SRT file:\n\n", srt_content)