In [None]:
# ! pip install google
# ! pip install --user google.cloud
# ! pip install --user google.cloud.speech

In [3]:
import os
import io
import pickle
import time

import pandas as pd
import numpy as np

from pydub import AudioSegment
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
from multiprocessing.dummy import Pool

In [4]:
# Load the pickled collection of street names used later as speech-context phrases.
# NOTE(review): pickle.load can execute arbitrary code -- only load a file you created or trust.
with open("../data/Pickles/streets.pkl", "rb") as streets_file:
    streets = pickle.load(streets_file)

`Pool` is a class in the multiprocessing package that distributes work across multiple workers on a computer (the `multiprocessing.dummy` version imported above uses threads rather than separate processes). Simply put, it lets the computer assign more than one person to build a fence instead of just one. This dramatically reduces the time it takes for slow tasks to run, so a pool is created and wrapped around such tasks. 

In [6]:
pool = Pool(12)

# The Google client library reads GOOGLE_APPLICATION_CREDENTIALS as the *path*
# to a service-account JSON key file (not the key text itself). Export it in
# the environment that launches Jupyter so no credential is saved in the notebook.
if not os.environ.get("GOOGLE_APPLICATION_CREDENTIALS"):
    raise EnvironmentError(
        "Set GOOGLE_APPLICATION_CREDENTIALS to the path of your Google Cloud "
        "service-account JSON key file before creating the Speech client."
    )

# Shared client used by the transcription functions below.
client = speech.SpeechClient()

In [77]:
path = '../audio_data/audio_files/wav_files/'

# os.listdir returns entries in arbitrary order, so slicing off "the first one"
# drops an unpredictable entry. Select the audio files by extension instead
# (this also skips hidden files such as .DS_Store) and sort so that the order
# is the same on every run.
wav_file = [
    path + filename
    for filename in sorted(os.listdir(path))
    if filename.lower().endswith('.wav')
]
wav_file[:3]

['../audio_data/audio_files/wav_files/10904-20190730-1657_1.wav',
 '../audio_data/audio_files/wav_files/10904-20190730-1657_10.wav',
 '../audio_data/audio_files/wav_files/10904-20190730-1657_11.wav']

Google’s Speech-to-Text API accepts the contents of a wav file that has been read into memory. When calling the API, we can pass in a dictionary of phrases for it to reference when producing its results. We decided to run the API with and without street-name context to see which setting would provide better results. The API then detects speech in the audio file and returns a transcription of what it heard along with its confidence in the result. We returned those results as a dataframe. 

## With Street Name Context



In [84]:
def speech_to_text_context(file_name):
    """Transcribe one audio file, passing street names as speech context.

    Parameters
    ----------
    file_name : str
        Path of the audio file whose bytes are sent to the API.

    Returns
    -------
    tuple
        ``(transcript, confidence, file_name)``. When the response contains
        no results, ``transcript`` is ``''`` and ``confidence`` is ``0``.
    """
    transcript = ''
    # Must be initialised under the same name the loop assigns: if the
    # response has no results the loop never runs, and returning an unbound
    # local would raise UnboundLocalError.
    confidence = 0

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
    audio = types.RecognitionAudio(content=content)

    # Recognition settings; the phrase hints are a fresh random draw of 5000
    # entries from `streets` on every call.
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code='en-US',
        model="video",
        speech_contexts=[{
            "phrases": np.random.choice(streets, 5000)
        }]
    )

    # Detects speech in audio file
    response = client.recognize(config, audio)

    # Each result overwrites the previous one, so the values returned are the
    # top alternative of the LAST result in the response.
    for result in response.results:
        best = result.alternatives[0]
        transcript = best.transcript
        confidence = best.confidence

    # Pause before returning -- presumably to throttle API requests; TODO confirm.
    time.sleep(1)
    return transcript, confidence, file_name

In [85]:
pool = Pool(12)

# Transcribe every file concurrently; always release the pool's workers,
# even if one of the API calls raises.
try:
    list_of_transcripts_context = pool.map(speech_to_text_context, wav_file)
finally:
    pool.close()
    pool.join()

# Keep only the files for which a transcript came back.
captions_context = [a[0] for a in list_of_transcripts_context if a[0] != '']
confidence_context = [a[1] for a in list_of_transcripts_context if a[0] != '']

data_context = {'transcripts': captions_context,
                'confidence': confidence_context}
df_context = pd.DataFrame(data_context)
# df_context.to_csv('../data/Data/transcribed_radio_with_street_context.csv')

# Last expression of the cell so the preview is actually rendered.
df_context.head(25)

In [86]:
# Average confidence over the transcripts obtained with street-name context.
df_context['confidence'].mean()

0.7664087745878432

## Without Street Name Context

In [61]:
def speech_to_text(file_name):
    """Transcribe one audio file without any speech-context phrases.

    Parameters
    ----------
    file_name : str
        Path of the audio file whose bytes are sent to the API.

    Returns
    -------
    tuple
        ``(transcript, confidence, file_name)``, the same shape that
        ``speech_to_text_context`` returns. When the response contains no
        results, ``transcript`` is ``''`` and ``confidence`` is ``0``.
    """
    transcript = ''
    # Must be initialised under the same name the loop assigns: if the
    # response has no results the loop never runs, and returning an unbound
    # local would raise UnboundLocalError.
    confidence = 0

    # Loads the audio into memory
    with io.open(file_name, 'rb') as audio_file:
        content = audio_file.read()
    audio = types.RecognitionAudio(content=content)

    # Recognition settings (no phrase hints in this variant).
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        language_code='en-US',
        model="video"
    )

    # Detects speech in the audio file
    response = client.recognize(config, audio)

    # Each result overwrites the previous one, so the values returned are the
    # top alternative of the LAST result in the response.
    for result in response.results:
        best = result.alternatives[0]
        transcript = best.transcript
        confidence = best.confidence

    # Pause before returning -- presumably to throttle API requests; TODO confirm.
    time.sleep(1)

    # The file name is returned too: the cell that consumes these results
    # reads a third element from every tuple.
    return transcript, confidence, file_name

In [80]:
pool = Pool(12)

# Run the no-context transcription over every file, then release the pool.
list_of_transcripts = pool.map(speech_to_text, wav_file)
pool.close()
pool.join()

# Discard files with an empty transcript once, then split the fields apart.
recognized = [entry for entry in list_of_transcripts if entry[0] != '']
captions = [entry[0] for entry in recognized]
confidence = [entry[1] for entry in recognized]
names = [entry[2] for entry in recognized]

data = dict(transcripts=captions, confidence=confidence)
df = pd.DataFrame(data)
# df.to_csv('../data/Data/transcribed_radio.csv')
df.head(25)

Unnamed: 0,transcripts,confidence
0,I was just,0.502997
1,our brothers are critical for 1636 thank you,0.842904
2,is going to be,0.652573
3,to drive robson's plane set the driver knocked...,0.700766
4,Market 11:30 high was 31130 hi,0.803496
5,drat or kind of somebody can price and stay fa...,0.772701
6,Peridot we close races here,0.680532
7,Library 612 Smithfield Street between 6th Aven...,0.83922
8,down there markets weren't,0.785316


In [81]:
# Average confidence over the transcripts obtained without street-name context.
df['confidence'].mean()

0.7311672965685526

>Code adapted from [NY General Assembly DSI radio-to-location repository](https://github.com/mchbmn/radio-to-location)
