# Speech to Text Transcription with Diarization

Use Google Cloud's Speech-to-Text API to transcribe the videos. This API also includes optional speaker diarization to label different speakers, which may help us separate what is said by the teacher from what is said by the students. Large audio files (longer than a minute) must be uploaded to Google Cloud Storage first.

References:
- https://towardsdatascience.com/extracting-audio-from-video-using-python-58856a940fd
- https://medium.com/codex/google-speech-to-text-api-tutorial-with-python-2e049ae3f525
- https://towardsdatascience.com/how-to-use-google-speech-to-text-api-to-transcribe-long-audio-files-1c886f4eb3e9
- https://cloud.google.com/speech-to-text/docs/async-recognize
- https://cloud.google.com/speech-to-text/docs/multiple-voices


In [1]:
# !pip install ffmpeg moviepy 
# !pip install google-cloud-speech
# !pip install google-cloud-storage
# !pip install protobuf==3.20.1 
# !pip install pydub

In [2]:
# Import libraries
from pydub import AudioSegment
import io
import os
from google.cloud import speech
from google.cloud import storage
import wave
import moviepy.editor as mp




In [3]:
# Directory paths
# NOTE: every path keeps its trailing '/' because the functions in this
# notebook build file paths by plain string concatenation (path + name + ext).
mp4_path = './Data/MP4_Files/' # Input mp4 file path
wav_path = './Data/WAV_Files/' # Audio file path
text_path = './Data/Text_Files/' # Final transcript path

# Create directories for wav and text files (if they don't exist).
# exist_ok=True avoids the check-then-create race of os.path.exists() and
# fails loudly if one of the paths already exists as a regular file.
os.makedirs(wav_path, exist_ok=True)
os.makedirs(text_path, exist_ok=True)

# Name of the google cloud bucket (this must be already created)
bucket_name = "education_mindfulness" 

In [4]:
# Convert mp4 file to wav file
def convert_mp4_to_wav(file_name, mp4_path, wav_path):
    '''
    Loads an mp4 file and writes its audio track to a .wav file
    
    Inputs:
    - file_name: name of file (without extension)
    - mp4_path: path to folder for mp4 files (joined by plain string
      concatenation, so it must end with a path separator)
    - wav_path: path to folder for wav files (same trailing-separator rule)
    
    Outputs:
    - Saves .wav file to wav_path
    '''
    
    # Load mp4 file
    orig_video = mp.VideoFileClip(mp4_path + file_name + ".mp4")
    try:
        # Write to wav file
        orig_video.audio.write_audiofile(wav_path + file_name + ".wav")
    finally:
        # Always release the clip, even if writing fails; otherwise every
        # converted video keeps its reader resources open for the whole batch
        orig_video.close()
    
    return

In [5]:
# Down-mix a wav file to a single channel
def stereo_to_mono(file_name, wav_path):
    '''
    Re-exports a .wav file with one audio channel, overwriting the original
    
    Inputs:
    - file_name: name of file (without extension)
    - wav_path: path to folder for wav files (concatenated directly with
      file_name, so it must end with a path separator)
    
    Outputs:
    - Overwrites wav_path + file_name + '.wav' with the single-channel audio
    '''
    wav_file = wav_path + file_name + '.wav'
    mono_audio = AudioSegment.from_wav(wav_file).set_channels(1)
    mono_audio.export(wav_file, format="wav")

# Read sample rate and channel count from a wav file
def frame_rate_channel(file_name, wav_path):
    '''
    Reads the frame rate and number of channels from a .wav file header
    
    Inputs:
    - file_name: name of file (without extension)
    - wav_path: path to folder for wav files (concatenated directly with
      file_name, so it must end with a path separator)
    
    Outputs:
    - (frame_rate, channels) tuple
    '''
    wav_file = wav_path + file_name + '.wav'
    with wave.open(wav_file, "rb") as wav_reader:
        header = wav_reader.getparams()
    return header.framerate, header.nchannels

In [6]:
# Upload a local file to Google storage
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    '''
    Uploads a local file to a Google Cloud Storage bucket
    
    Inputs:
    - bucket_name: name of the bucket to upload into
    - source_file_name: path of the local file to upload
    - destination_blob_name: name the file gets inside the bucket
    '''
    gcs_client = storage.Client()
    target_bucket = gcs_client.get_bucket(bucket_name)
    target_bucket.blob(destination_blob_name).upload_from_filename(source_file_name)
    
# Remove a file from Google storage
def delete_blob(bucket_name, blob_name):
    '''
    Deletes a blob from a Google Cloud Storage bucket
    
    Inputs:
    - bucket_name: name of the bucket holding the blob
    - blob_name: name of the blob to delete
    '''
    gcs_client = storage.Client()
    target_bucket = gcs_client.get_bucket(bucket_name)
    target_bucket.blob(blob_name).delete()

In [8]:
# Transcribe audio with diarization
def google_transcribe(file_name, wav_path, bucket_name):
    '''
    Transcribes a .wav file with Google Speech-to-Text using speaker diarization
    
    The audio is converted to mono if needed, uploaded to the bucket,
    transcribed with a long-running recognize request, and removed from the
    bucket again.
    
    Inputs:
    - file_name: name of file (without extension)
    - wav_path: path to folder for wav files (concatenated directly with
      file_name, so it must end with a path separator)
    - bucket_name: name of an existing Google Cloud Storage bucket
    
    Outputs:
    - transcript string with one 'speaker N: ...' line per speaker turn
    '''
    # Convert stereo file to mono file
    frame_rate, channels = frame_rate_channel(file_name, wav_path)
    if channels > 1:
        stereo_to_mono(file_name, wav_path)
    
    # Upload file to google storage
    source_file_name = wav_path + file_name + '.wav' # Local file path
    destination_blob_name = file_name + '.wav' # File name on google
    upload_blob(bucket_name, source_file_name, destination_blob_name) # Upload
    gcs_uri = 'gs://' + bucket_name + '/' + destination_blob_name # Bucket path to file
    
    try:
        # Setup for transcription
        client = speech.SpeechClient()
        audio = speech.RecognitionAudio(uri = gcs_uri)
        # Configuration to use speaker diarization
        diarization_config = speech.SpeakerDiarizationConfig(
            enable_speaker_diarization = True,
            min_speaker_count = 2, # Teacher and other
            max_speaker_count = 7, # Teacher, 5 students, begin/end simulation speaker
        )
        # Overall configuration
        config = speech.RecognitionConfig(
            encoding = speech.RecognitionConfig.AudioEncoding.LINEAR16,
            enable_automatic_punctuation = True,
            sample_rate_hertz = frame_rate,
            language_code = 'en-US',
            diarization_config = diarization_config)
        # Transcribe audio (blocks until the long-running operation finishes
        # or the timeout expires)
        operation = client.long_running_recognize(config = config, audio = audio)
        response = operation.result(timeout = 10000)
    finally:
        # Delete audio file from google storage even when transcription fails
        # or times out, so failed runs do not leave recordings in the bucket
        delete_blob(bucket_name, destination_blob_name)
    
    # Turn raw response into transcript with speaker tags
    return transcript_with_speaker_tags(response)

In [9]:
# Turn raw response into transcript with speaker tags
def transcript_with_speaker_tags(response):
    '''
    Builds a speaker-labelled transcript from a recognition response
    
    Only the word list of the first alternative of the LAST result is read.
    NOTE(review): this relies on the diarization behaviour described in the
    Speech-to-Text docs, where the last result carries every word together
    with its speaker tag - confirm against the API version in use.
    
    Inputs:
    - response: object with a `results` sequence; each result has
      `alternatives`, and each alternative has `words` whose items expose
      `.word` and `.speaker_tag`
    
    Outputs:
    - transcript string with one line per consecutive run of words by the
      same speaker, formatted as 'speaker <tag>: <words> \\n'.
      Returns '' when the response holds no results, alternatives or words.
    '''
    # Nothing was recognised (e.g. silent audio): return an empty transcript
    # instead of failing on results[-1]
    if not response.results:
        return ''
    alternatives = response.results[-1].alternatives
    if not alternatives:
        return ''

    lines = [] # Finished 'speaker N: ...' lines
    tag = None # Speaker tag of the turn being collected
    turn_words = [] # Words said so far in the current turn

    # For each word in transcription
    for word_info in alternatives[0].words:
        # A change in speaker closes the current turn
        if turn_words and word_info.speaker_tag != tag:
            spoken = " ".join(turn_words)
            lines.append(f'speaker {tag}: {spoken} \n')
            turn_words = []

        # The tag is taken from the data, so the first turn is labelled with
        # the real first speaker rather than an assumed speaker 1
        tag = word_info.speaker_tag
        turn_words.append(word_info.word)

    # Add final speaker's turn to overall transcript
    if turn_words:
        spoken = " ".join(turn_words)
        lines.append(f'speaker {tag}: {spoken} \n')

    return ''.join(lines)

In [10]:
# Write transcript to text file
def write_transcripts(file_name, text_path, transcript):
    '''
    Saves a transcript as a .txt file, replacing any existing file
    
    Inputs:
    - file_name: name of file (without extension)
    - text_path: path to folder for text files (concatenated directly with
      file_name, so it must end with a path separator)
    - transcript: transcript text to write
    
    Outputs:
    - Saves .txt file to text_path
    '''
    # The context manager closes the file even if the write fails; UTF-8 is
    # set explicitly so the output does not depend on the platform default
    with open(text_path + file_name + '.txt', "w", encoding="utf-8") as f:
        f.write(transcript)

In [11]:
# List of file names (without extension) for every video in the mp4 directory
# - only entries ending in '.mp4' are kept, so stray files in the folder are
#   not passed on to the conversion step
# - entries that start with '.' (e.g. ipynb checkpoints) are skipped
# - only the trailing extension is removed (a '.mp4' inside the name is kept)
# - sorted so that the processing order is the same on every run
file_list = sorted(
    os.path.splitext(x)[0]
    for x in os.listdir(mp4_path)
    if x.endswith('.mp4') and not x.startswith('.')
)

In [12]:
# Setting Google credential
# The environment variable is set to a key file path relative to the current
# working directory; it is assigned before any Google client is created below
os.environ['GOOGLE_APPLICATION_CREDENTIALS']= 'google_secret_key.json'
# Create client instance 
# NOTE(review): google_transcribe() builds its own SpeechClient, so this
# module-level client looks unused in the code shown here - confirm
client = speech.SpeechClient()

In [13]:
# Convert mp4 files to wav files
# Each name in file_list is converted from mp4_path into wav_path.
# NOTE(review): this loop sits at top level, outside the __main__ guard used
# for the transcription step, so it runs whenever this code is executed
for file_name in file_list:
    convert_mp4_to_wav(file_name, mp4_path, wav_path)

chunk:   3%|▎         | 213/7047 [00:00<00:03, 2129.26it/s, now=None]

MoviePy - Writing audio in ./Data/WAV_Files/223_2.26.20_S_SC.wav


                                                                      

MoviePy - Done.


chunk:   4%|▍         | 265/6877 [00:00<00:02, 2648.80it/s, now=None]

MoviePy - Writing audio in ./Data/WAV_Files/210_1.31.20_S_SC.wav


                                                                      

MoviePy - Done.


chunk:   4%|▍         | 275/6924 [00:00<00:02, 2747.50it/s, now=None]

MoviePy - Writing audio in ./Data/WAV_Files/228_3.4.20_S_SC.wav


                                                                      

MoviePy - Done.


chunk:   4%|▎         | 249/6898 [00:00<00:02, 2484.84it/s, now=None]

MoviePy - Writing audio in ./Data/WAV_Files/226_2.26.20_S_SC.wav


                                                                      

MoviePy - Done.




In [14]:
# Execute program
# For every name in file_list: transcribe the .wav file found in wav_path and
# save the transcript as a .txt file in text_path
if __name__ == "__main__":
    for file_name in file_list:
        # Transcribe using google speech
        transcript = google_transcribe(file_name, wav_path, bucket_name)
        # Write transcript to text file
        write_transcripts(file_name, text_path, transcript)