<a href="https://colab.research.google.com/github/dawoodwasif/Customer-Audio-Analytics-of-Accent-and-Sentiment/blob/main/4_Speech_to_Text_for_UBL_Datathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Installation and Imports

In [1]:
!pip install SpeechRecognition
!pip install pydub

Collecting SpeechRecognition
  Downloading SpeechRecognition-3.8.1-py2.py3-none-any.whl (32.8 MB)
[K     |████████████████████████████████| 32.8 MB 128 kB/s 
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.8.1
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [2]:
import speech_recognition as sr
from os import path
from pydub import AudioSegment


In [4]:
available_languages = {
'australia' :	'en-AU',
'canada'	: 'en-CA'	,
'ghana' :	'en-GH' ,	
'hong kong':	'en-HK'	,			
'india':	'en-IN',
'ireland':	'en-IE',
'kenya':	'en-KE',		
'new zealand':	'en-NZ',	
'nigeria':	'en-NG',	
'pakistan' : 'en-PK',
'philippines': 'en-PH',
'singapore' :	'en-SG'	,
'south africa':	'en-ZA',	
'tanzania': 'en-TZ',
'uk':	'en-GB'	,
'usa' :	'en-US'
}	


# files                                                                         
src = '/content/farsi1.mp3'
dst = '/content/test.wav'

# convert wav to mp3                                                            
sound = AudioSegment.from_mp3(src)
sound.export(dst, format="wav")

<_io.BufferedRandom name='/content/test.wav'>

###  WER function for transcription error

In [10]:
import sys
import numpy

def editDistance(r, h):
    '''
    This function is to calculate the edit distance of reference sentence and the hypothesis sentence.
    Main algorithm used is dynamic programming.
    Attributes: 
        r -> the list of words produced by splitting reference sentence.
        h -> the list of words produced by splitting hypothesis sentence.
    '''
    d = numpy.zeros((len(r)+1)*(len(h)+1), dtype=numpy.uint8).reshape((len(r)+1, len(h)+1))
    for i in range(len(r)+1):
        d[i][0] = i
    for j in range(len(h)+1):
        d[0][j] = j
    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
            else:
                substitute = d[i-1][j-1] + 1
                insert = d[i][j-1] + 1
                delete = d[i-1][j] + 1
                d[i][j] = min(substitute, insert, delete)
    return d

def getStepList(r, h, d):
    '''
    This function is to get the list of steps in the process of dynamic programming.
    Attributes: 
        r -> the list of words produced by splitting reference sentence.
        h -> the list of words produced by splitting hypothesis sentence.
        d -> the matrix built when calulating the editting distance of h and r.
    '''
    x = len(r)
    y = len(h)
    list = []
    while True:
        if x == 0 and y == 0: 
            break
        elif x >= 1 and y >= 1 and d[x][y] == d[x-1][y-1] and r[x-1] == h[y-1]: 
            list.append("e")
            x = x - 1
            y = y - 1
        elif y >= 1 and d[x][y] == d[x][y-1]+1:
            list.append("i")
            x = x
            y = y - 1
        elif x >= 1 and y >= 1 and d[x][y] == d[x-1][y-1]+1:
            list.append("s")
            x = x - 1
            y = y - 1
        else:
            list.append("d")
            x = x - 1
            y = y
    return list[::-1]

def alignedPrint(list, r, h, result):
    '''
    This funcition is to print the result of comparing reference and hypothesis sentences in an aligned way.
    
    Attributes:
        list   -> the list of steps.
        r      -> the list of words produced by splitting reference sentence.
        h      -> the list of words produced by splitting hypothesis sentence.
        result -> the rate calculated based on edit distance.
    '''
    print("REF:", end=" ")
    for i in range(len(list)):
        if list[i] == "i":
            count = 0
            for j in range(i):
                if list[j] == "d":
                    count += 1
            index = i - count
            print(" "*(len(h[index])), end=" ")
        elif list[i] == "s":
            count1 = 0
            for j in range(i):
                if list[j] == "i":
                    count1 += 1
            index1 = i - count1
            count2 = 0
            for j in range(i):
                if list[j] == "d":
                    count2 += 1
            index2 = i - count2
            if len(r[index1]) < len(h[index2]):
                print(r[index1] + " " * (len(h[index2])-len(r[index1])), end=" ")
            else:
                print(r[index1], end=" "),
        else:
            count = 0
            for j in range(i):
                if list[j] == "i":
                    count += 1
            index = i - count
            print(r[index], end=" "),
    print("\nHYP:", end=" ")
    for i in range(len(list)):
        if list[i] == "d":
            count = 0
            for j in range(i):
                if list[j] == "i":
                    count += 1
            index = i - count
            print(" " * (len(r[index])), end=" ")
        elif list[i] == "s":
            count1 = 0
            for j in range(i):
                if list[j] == "i":
                    count1 += 1
            index1 = i - count1
            count2 = 0
            for j in range(i):
                if list[j] == "d":
                    count2 += 1
            index2 = i - count2
            if len(r[index1]) > len(h[index2]):
                print(h[index2] + " " * (len(r[index1])-len(h[index2])), end=" ")
            else:
                print(h[index2], end=" ")
        else:
            count = 0
            for j in range(i):
                if list[j] == "d":
                    count += 1
            index = i - count
            print(h[index], end=" ")
    print("\nEVA:", end=" ")
    for i in range(len(list)):
        if list[i] == "d":
            count = 0
            for j in range(i):
                if list[j] == "i":
                    count += 1
            index = i - count
            print("D" + " " * (len(r[index])-1), end=" ")
        elif list[i] == "i":
            count = 0
            for j in range(i):
                if list[j] == "d":
                    count += 1
            index = i - count
            print("I" + " " * (len(h[index])-1), end=" ")
        elif list[i] == "s":
            count1 = 0
            for j in range(i):
                if list[j] == "i":
                    count1 += 1
            index1 = i - count1
            count2 = 0
            for j in range(i):
                if list[j] == "d":
                    count2 += 1
            index2 = i - count2
            if len(r[index1]) > len(h[index2]):
                print("S" + " " * (len(r[index1])-1), end=" ")
            else:
                print("S" + " " * (len(h[index2])-1), end=" ")
        else:
            count = 0
            for j in range(i):
                if list[j] == "i":
                    count += 1
            index = i - count
            print(" " * (len(r[index])), end=" ")
    print("\nWER: " + result)

def wer(r, h):
    """
    This is a function that calculate the word error rate in ASR.
    You can use it like this: wer("what is it".split(), "what is".split()) 
    """
    # build the matrix
    d = editDistance(r, h)

    # find out the manipulation steps
    list = getStepList(r, h, d)

    # print the result in aligned way
    result = float(d[len(r)][len(h)]) / len(r) * 100
    result = str("%.2f" % result) + "%"
    alignedPrint(list, r, h, result)


In [11]:
country = "pakistan"
#country = accent_classifier_model(src)

In [12]:
language_code = "en-US" # default
if country in available_languages.keys():
      language_code = available_languages[country]
      print(language_code)

en-PK


### Transcribe speech

In [19]:
r = sr.Recognizer()

with sr.AudioFile(dst) as source:
    audio = r.record(source)

sp2txt = r.recognize_google(audio, language=language_code)
sp2txt


'please call Stella ask her to bring this things with her from still sore 6 spoon of fresh snow piece 566 of blue cheese and maybe a snake for her brother Bob we also need a small plastic snake and a big toy frog for the kids she can scoop this things into 3 red bags and we will go meet her Wednesday at the train station'

### Find error

In [23]:
speech_accent_archieve_transcript = 'please call Stella and ask her to bring these things with her from the store 6 spoons of fresh snow peas 5 thick slabs of blue cheese and maybe a snack for her brother Bob we also need a small plastic snake and a big toy frog for the kids she can scoop these things into 3 red bags and we will go meet her Wednesday at the train station'


In [24]:
r = speech_accent_archieve_transcript
# copy text from transcribed function
h = sp2txt
wer(r, h)   # error is only 9.23%

REF: p l e a s e   c a l l   S t e l l a   a n d   a s k   h e r   t o   b r i n g   t h e s e   t h i n g s   w i t h   h e r   f r o m     t h e     s t o r e   6   s p o o n s   o f   f r e s h   s n o w   p e a s   5   t h i   c k   s l a b s   o f   b l u e   c h e e s e   a n d   m a y b e   a   s n a c k     f o r   h e r   b r o t h e r   B o b   w e   a l s o   n e e d   a   s m a l l   p l a s t i c   s n a k e   a n d   a   b i g   t o y   f r o g   f o r   t h e   k i d s   s h e   c a n   s c o o p   t h e s e   t h i n g s   i n t o   3   r e d   b a g s   a n d   w e   w i l l   g o   m e e t   h e r   W e d n e s d a y   a t   t h e   t r a i n   s t a t i o n 
HYP: p l e a s e   c a l l   S t e l l     a       a s k   h e r   t o   b r i n g   t h i s     t h i n g s   w i t h   h e r   f r o m   s t i l l   s   o r e   6   s p o o n     o f   f r e s h   s n o w   p                 i e c e       5 6 6   o f   b l u e   c h e e s e   a n d   m a y b e   a   s n a   k e