# Implementación del Quiz

In [None]:
# instalación
!pip install huggingsound

You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.[0m


In [None]:
# librerías
from huggingsound import SpeechRecognitionModel

# Spanish words that will be translated to Nahuatl
words = [
    'gato',
    'escuela',
    'perro',
    'niño',
    'casa',
]

In [None]:
# instalación
!pip install transformers

You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.[0m


In [None]:
# traducción a Nahuatl
from transformers import AutoModelForSeq2SeqLM
from tqdm.notebook import tqdm_notebook
from transformers import AutoTokenizer

# Load the Spanish -> Nahuatl seq2seq checkpoint together with its tokenizer.
checkpoint = 'hackathon-pln-es/t5-small-spanish-nahuatl'
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

model.eval()  # switch to inference mode before generating

# Smoke test: translate a single sentence using the task prefix.
sentence = 'Mi hermano es un ajolote'
prompt = 'translate Spanish to Nahuatl: ' + sentence
input_ids = tokenizer(prompt, return_tensors='pt').input_ids
generated_ids = model.generate(input_ids)
# batch_decode returns one string per generated sequence; keep the first.
decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
outputs = decoded[0]

Downloading:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

# Paso 1: conversión de texto en español a Nahuatl

In [None]:
# Nahuatl translation of every entry in `words`, in the same order
words_nahuatl = []

for word in tqdm_notebook(words):
    # Same task prefix as the single-sentence example.
    prompt = 'translate Spanish to Nahuatl: ' + word
    token_ids = tokenizer(prompt, return_tensors='pt').input_ids
    generated_ids = model.generate(token_ids)
    # Decode the generated ids back to text and keep the first sequence.
    translation = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    words_nahuatl.append(translation)
    print(f'"{word}" se dice "{translation}" en Nahuatl.')

  0%|          | 0/5 [00:00<?, ?it/s]

"gato" se dice "mistle" en Nahuatl.
"escuela" se dice "tlamachtiloyan" en Nahuatl.
"perro" se dice "chichi" en Nahuatl.
"niño" se dice "pili" en Nahuatl.
"casa" se dice "kali" en Nahuatl.


# Paso 2: grabación de palabra en Nahuatl

In [None]:
# all imports
from IPython.display import Javascript
#from google.colab import output
from base64 import b64decode

# JavaScript source injected into the notebook front end by record().
# It defines a global `record(time)` that captures `time` milliseconds of
# microphone audio and resolves to the recording encoded as a data URL.
RECORD = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = () => resolve(reader.result)
  reader.readAsDataURL(blob)
})
var record = async time => {
  // A rejected getUserMedia (e.g. permission denied) now rejects record()
  // instead of leaving the caller waiting forever.
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  const recorder = new MediaRecorder(stream)
  const chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.start()
  await sleep(time)
  // Register the stop handler before stopping so the event cannot be missed.
  const stopped = new Promise(resolve => { recorder.onstop = resolve })
  recorder.stop()
  await stopped
  // Release the microphone once the recording is complete.
  stream.getTracks().forEach(track => track.stop())
  return b2text(new Blob(chunks))
}
"""

def record(sec=3):
  """Record microphone audio in the browser and save it to 'audio.wav'.

  Injects the RECORD JavaScript into the notebook output, runs its
  `record(ms)` function through `google.colab.output.eval_js`, decodes the
  base64 payload of the returned data URL and writes the bytes to disk.

  Args:
    sec: Recording length in seconds (converted to milliseconds for JS).

  Returns:
    The relative path 'audio.wav' of the file that was written.

  Raises:
    RuntimeError: If `google.colab` is not importable, i.e. the notebook is
      not running inside Google Colab.
  """
  try:
    # Imported here because the module-level import is commented out; without
    # it the `output` name below does not exist and the call raises NameError.
    from google.colab import output
  except ImportError as exc:
    raise RuntimeError(
        "record() needs google.colab.output.eval_js and only works inside Google Colab"
    ) from exc

  display(Javascript(RECORD))
  s = output.eval_js('record(%d)' % (sec*1000))
  # The JS side returns a data URL; the base64 payload follows the first comma.
  b = b64decode(s.split(',')[1])
  # NOTE(review): the bytes are written exactly as MediaRecorder produced them;
  # the '.wav' name does not convert the container -- confirm the reader copes.
  with open('audio.wav','wb') as f:
    f.write(b)
  return 'audio.wav'

In [None]:
#record()

# Paso 3: de voz en Nahuatl a texto en Nahuatl

In [None]:
# Load the Nahuatl speech-recognition checkpoint.
# Note: this rebinds `model`, which previously held the translation model.
model = SpeechRecognitionModel("tyoc213/wav2vec2-large-xlsr-nahuatl")
# record() writes 'audio.wav' relative to the working directory, so use the
# same relative path instead of the Colab-only absolute '/content/audio.wav'.
audio_paths = ["audio.wav"]

04/30/2022 17:32:13 - INFO - huggingsound.speech_recognition.model - Loading model...


Downloading:   0%|          | 0.00/1.54k [00:00<?, ?B/s]

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/158 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/138 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/536 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Transcribe the recording and keep the first result.
nah_results = model.transcribe(audio_paths)
transcriptions_nah = nah_results[0]
transcriptions_nah['transcription']  # display the recognised text

  0%|          | 0/1 [00:00<?, ?it/s]


FileNotFoundError: [Errno 2] No such file or directory: '/content/audio.wav'

# Paso 4: análisis de similitud

Emplearemos la similitud de Jaccard que está dada por:

$$ \text{sim} _ {\text{jaccard}} (s_1, s_2) = \frac{\text{c}_{\text{common}}}{\text{c}_1 + \text{c}_2 - \text{c}_{\text{common}}}$$

donde:

- $\text{c}_{\text{common}} = $ número de bigramas en común.

- $\text{c}_i = \text{len}(s_i) - 1$ es el número de bigramas de caracteres de la cadena $s_i$.

In [None]:
def sim_jac(s1, s2):
    """Return the Jaccard similarity of two strings over character bigrams.

    Bigrams are treated as multisets: a bigram repeated in one string only
    counts as shared as many times as it also appears in the other string.
    This keeps the result inside [0, 1] and makes it symmetric in its
    arguments.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        c_common / (c1 + c2 - c_common) as a float, where c1 and c2 are the
        bigram counts of each string and c_common is the size of their
        multiset intersection. When neither string has a bigram (both are
        shorter than two characters) the result is 1.0 if the strings are
        equal and 0.0 otherwise.
    """
    from collections import Counter

    bigrams_s1 = Counter(s1[i:i + 2] for i in range(len(s1) - 1))
    bigrams_s2 = Counter(s2[i:i + 2] for i in range(len(s2) - 1))

    # Derive the counts from the bigrams so strings shorter than two
    # characters contribute 0 instead of a negative number.
    c1 = sum(bigrams_s1.values())
    c2 = sum(bigrams_s2.values())

    # No bigrams on either side: the ratio would be 0/0, so compare directly.
    if c1 == 0 and c2 == 0:
        return 1.0 if s1 == s2 else 0.0

    # Counter & Counter keeps the minimum count of every shared bigram.
    c_common = sum((bigrams_s1 & bigrams_s2).values())

    return c_common / (c1 + c2 - c_common)

In [None]:
# Similarity between the reference word "mistle" and the text recognised
# by the Nahuatl speech model.
spoken_nah = transcriptions_nah['transcription']
sim_jac('mistle', spoken_nah)

0.5

## Ahora con un modelo de voz a texto en español, para comparar su similitud

In [None]:
from ipywebrtc import AudioRecorder, CameraStream
import torchaudio
from IPython.display import Audio


# Request an audio-only media stream and attach a recorder widget to it.
audio_only = {'audio': True, 'video': False}
camera = CameraStream(constraints=audio_only)
recorder = AudioRecorder(stream=camera)
recorder  # last expression of the cell: renders the widget

AudioRecorder(audio=Audio(value=b'', format='webm'), stream=CameraStream(constraints={'audio': True, 'video': …

In [None]:
with open('./grabaciones-webm/test.webm', 'wb') as f:
    f.write(recorder.audio.value)

!ffmpeg -i ./grabaciones-webm/test.webm -ac 1 -f wav ./grabaciones-wav/test.wav -y -hide_banner -loglevel panic

In [None]:
from huggingsound import SpeechRecognitionModel

# Load the Spanish speech-to-text checkpoint (this rebinds `model`).
spanish_checkpoint = "patrickvonplaten/wav2vec2-large-xlsr-53-spanish-with-lm"
model = SpeechRecognitionModel(spanish_checkpoint)

# Transcribe the converted WAV recording and keep the first result.
audio_paths = ["./grabaciones-wav/test.wav"]
es_results = model.transcribe(audio_paths)
transcriptions_es = es_results[0]

04/29/2022 23:25:52 - INFO - huggingsound.speech_recognition.model - Loading model...


Downloading:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/262 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/412 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

100%|██████████| 1/1 [00:03<00:00,  3.05s/it]


In [None]:
# Display the text stored under the 'transcription' key of the result.
transcriptions_es['transcription']

'mistley'

In [None]:
# Similarity between the reference word "mistle" and the text recognised
# by the Spanish speech model.
spoken_es = transcriptions_es['transcription']
sim_jac('mistle', spoken_es)

0.8333333333333334

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=17740a58-e960-4de3-bb66-1575286930f1' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>