### Input .wav files from gs into google's speech to text model 

In [18]:
import os
import pandas as pd
import sys 
import re

sys.path.insert(0, '..')

# Add google app crendtials
#os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=""

pd.set_option('max_colwidth', 1000)

In [19]:
# Read participants df
df=pd.read_csv("../data/speaker.csv")

# Sample 100 IDs each from each race
id_lst=list(pd.concat(
    [
        df[df.race=='CHINESE'].sample(100, random_state=99),
        df[df.race=='MALAY'].sample(100, random_state=99),
        df[df.race=='INDIAN'].sample(100, random_state=99)
    ]
).id)

# Remove 2 errors id
id_lst.remove(2888)
id_lst.remove(3151)

In [20]:
df

Unnamed: 0,id,sex,race
0,2852,F,CHINESE
1,2670,F,MALAY
2,3064,F,CHINESE
3,2862,F,CHINESE
4,2861,F,CHINESE
...,...,...,...
1026,2026,F,CHINESE
1027,2068,M,CHINESE
1028,2173,F,CHINESE
1029,2216,M,CHINESE


In [None]:
%%time
# Imports the Google Cloud libraries
from google.cloud import speech
from google.cloud import storage


# Instantiates a client
client = speech.SpeechClient()

# Get the first 10 text outputs of each sampled IDs using google speech model
for p in id_lst:
    lst=[]
    i=0
    o=1
    # Speech file numbers can be missing inbetween
    while i!=10:
        if o>300:
            break
        name = f"out/0{p}{o:04d}.WAV"   
        storage_client = storage.Client()
        bucket_name = 'national_speech_corpus'
        bucket = storage_client.bucket(bucket_name)
        
        # Check if file exists
        if storage.Blob(bucket=bucket, name=name).exists(storage_client):
            gcs_uri = f"gs://national_speech_corpus/out/0{p}00{o:02d}.WAV"
            audio = speech.RecognitionAudio(uri=gcs_uri)
            config = speech.RecognitionConfig(
                language_code="en-SG",
                enable_automatic_punctuation=False,
            )
            # Detects speech in the audio file
            response = client.recognize(config=config, audio=audio)
            
            # Check if there's output from the model
            if len(response.results) > 0:
                sentence = response.results[0].alternatives[0].transcript

                # Replace digits into words
                my_dict = {'0': 'zero ', '1': 'one ', '2': 'two ', '3': 'three ', '4': 'four ', '5': 'five ', '6': 'six ', '7': 'seven ', '8': 'eight ', '9': 'nine '}
                for item in sentence:
                    if item in my_dict.keys():
                        sentence=sentence.replace(item, my_dict[item])
                lst.append(sentence)
            else:
                lst.append('')
            i+=1
        o+=1
    
    # Read transcript (truth) file for a particular participant
    # Drop even index, first column and keep only first 10 sentences 
    output_df = pd.read_csv(f"script/0{p}0.txt", sep="\t", header=None).iloc[1:20:2].drop(0,axis=1).reset_index(drop=True)
    output_df.columns=['truth']
    output_df['output']=lst
    output_df.to_csv(f"output/{p}.csv",index=False)


In [49]:
# Add accuracy metrics to the output df and sampled speaker df
df_speaker=df[df.id.isin(id_lst)]
df_speaker['truth_count']=0
df_speaker['match_count']=0
for p in id_lst:
    output_df = pd.read_csv(f'output/{p}.csv')
    output_df['truth_split'] = output_df.truth.apply(lambda x: str(x).lower().split())
    output_df['output_split'] = output_df.output.apply(lambda x: str(x).lower().split())
    output_df['truth_count']=output_df.truth_split.apply(lambda x: len(set(x)))
    output_df['match_count']=output_df.apply(lambda x:len(set(x.truth_split).intersection(set(x.output_split))),axis=1) 
    output_df.to_csv(f"output/{p}.csv",index=False)
    
    df_speaker.loc[df_speaker.id==p,'truth_count']=output_df.truth_count.sum()
    df_speaker.loc[df_speaker.id==p,'match_count']=output_df.match_count.sum()

df_speaker.reset_index().to_csv('speech_sample_output.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_speaker['truth_count']=0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_speaker['match_count']=0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [60]:
# Sample transcribed text vs truth
speaker = pd.read_csv(f"output/2002.csv")
speaker

Unnamed: 0,truth,output,truth_split,output_split,truth_count,match_count
0,nine,nine,['nine'],['nine'],1,1
1,De Fabio Factory and Apple,disable Factory in apple,"['de', 'fabio', 'factory', 'and', 'apple']","['disable', 'factory', 'in', 'apple']",5,2
2,Oyakodon Yakizakana and Carrot Halwa,oil Condon and carrot halwa,"['oyakodon', 'yakizakana', 'and', 'carrot', 'halwa']","['oil', 'condon', 'and', 'carrot', 'halwa']",5,3
3,where can I find the best Red ruby,where can I find the best way to be,"['where', 'can', 'i', 'find', 'the', 'best', 'red', 'ruby']","['where', 'can', 'i', 'find', 'the', 'best', 'way', 'to', 'be']",8,6
4,Kuih Kaswi waffle and Coconut Kuih,Quaker sweet waffle in Coconut Creek,"['kuih', 'kaswi', 'waffle', 'and', 'coconut', 'kuih']","['quaker', 'sweet', 'waffle', 'in', 'coconut', 'creek']",5,2
5,please tell me how to cook Satay Babat,please tell me how to cook saute Bobbitt,"['please', 'tell', 'me', 'how', 'to', 'cook', 'satay', 'babat']","['please', 'tell', 'me', 'how', 'to', 'cook', 'saute', 'bobbitt']",8,6
6,six five two three nine seven nine three,six five to three nine seven nine three,"['six', 'five', 'two', 'three', 'nine', 'seven', 'nine', 'three']","['six', 'five', 'to', 'three', 'nine', 'seven', 'nine', 'three']",6,5
7,Denis D Cotta Paine Eric and Chong Tze Chien,Dennis Dakota and Chong searching,"['denis', 'd', 'cotta', 'paine', 'eric', 'and', 'chong', 'tze', 'chien']","['dennis', 'dakota', 'and', 'chong', 'searching']",9,2
8,I can't eat all of this Satay Babat,I can't eat all of this Saturday Bobbitt,"['i', ""can't"", 'eat', 'all', 'of', 'this', 'satay', 'babat']","['i', ""can't"", 'eat', 'all', 'of', 'this', 'saturday', 'bobbitt']",8,6
9,Maxi Cash Cadbury and Ajinomoto,Mexican Inn Camp Bowie and Angie nomoto,"['maxi', 'cash', 'cadbury', 'and', 'ajinomoto']","['mexican', 'inn', 'camp', 'bowie', 'and', 'angie', 'nomoto']",5,1


In [23]:
# output to be passed into feat tests (subgroup disparity and min/max threshold)
df_speaker=pd.read_csv('../data/speech_sample_output.csv')
df_speaker

Unnamed: 0,sex,race,truth_count,match_count
0,F,CHINESE,77,60
1,F,CHINESE,84,64
2,M,CHINESE,80,53
3,F,INDIAN,80,53
4,M,CHINESE,70,45
...,...,...,...,...
293,M,CHINESE,74,50
294,M,MALAY,68,52
295,F,CHINESE,82,58
296,F,CHINESE,79,52


In [24]:
# Sample accuracy stats across race

tmp=df_speaker.copy()
tmp['rate']=tmp.match_count/tmp.truth_count
tmp.groupby('race').rate.mean()

race
CHINESE    0.671469
INDIAN     0.690566
MALAY      0.683185
Name: rate, dtype: float64