# Load Whisper

In [4]:
!pip install -q --upgrade torch torchvision torchaudio
!pip install -q git+https://github.com/huggingface/transformers
!pip install -q accelerate optimum
!pip install -q ipython-autotime

import torch
from transformers import pipeline

# Optional: load multiple versions of whisper
# models = ["openai/whisper-large","openai/whisper-large-v2","openai/whisper-large-v3"]
models = ["openai/whisper-large-v3"]

# Run on the first GPU when CUDA is available; otherwise fall back to the CPU
# so this cell does not fail on a runtime without a GPU.
use_cuda = torch.cuda.is_available()
model_device = "cuda:0" if use_cuda else "cpu"
# float16 is kept for the GPU path; the CPU fallback uses float32
# (NOTE(review): assumes half precision is not wanted on CPU - confirm).
model_dtype = torch.float16 if use_cuda else torch.float32

# Maps each checkpoint name in `models` to its loaded speech-recognition pipeline.
d_models = {}

for m in models:
  d_models[m] = pipeline("automatic-speech-recognition",
                m,
                torch_dtype=model_dtype,
                device=model_device)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.5/755.5 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m91.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m91.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m56.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m65.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m84.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.7/731.7 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.94k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

In [6]:
def transcribe(audio, chunk_length_s=30, batch_size=24):
    """Transcribe one audio input with the last model listed in `models`.

    Args:
        audio: Input handed unchanged to the speech-recognition pipeline
            (this notebook passes URLs of .mp3 files).
        chunk_length_s: Chunk length in seconds forwarded to the pipeline.
        batch_size: Batch size forwarded to the pipeline.

    Returns:
        The 'text' entry of the pipeline result.
    """
    asr = d_models[models[-1]]
    # Timestamps are requested from the pipeline, but only the text is returned.
    result = asr(audio,
                 chunk_length_s=chunk_length_s,
                 batch_size=batch_size,
                 return_timestamps=True)
    return result['text']

# Speech Accent Archive [link](https://accent.gmu.edu/browse_language.php)

In [7]:
!pip install jiwer

Collecting jiwer
  Downloading jiwer-3.0.3-py3-none-any.whl (21 kB)
Collecting rapidfuzz<4,>=3 (from jiwer)
  Downloading rapidfuzz-3.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.3 rapidfuzz-3.6.1


In [8]:
import pandas as pd
import pickle
from jiwer import wer, mer
from tqdm import tqdm

Extract file paths from SAA

In [9]:
# Read the first table on the soundtracks index page and keep the entries of
# its 'Name' column whose string form contains '.mp3'.
saa_files = [
    entry
    for entry in list(pd.read_html('https://accent.gmu.edu/soundtracks/')[0]['Name'])
    if '.mp3' in str(entry)
]

Define stem of audio files

In [10]:
# Base URL that each audio file name is appended to.
path_stem = "https://accent.gmu.edu/soundtracks/"

Transcribe first audio file

In [11]:
# Smoke test: transcribe the first file in the listing and display the text.
transcribe(path_stem + saa_files[0])

' Please call Stella, ask her to bring these things with her from the store. Six spoons of fresh snow peas, five thick slabs of blue cheese and maybe a snack for her brother Bob. We also need a small plastic snake and a big two-way frog for the kids. She can scoop these things into three red bags and we will go meet her Wednesday at the train station.'

Transcribe each file in the SAA

In [None]:
# Transcript text keyed by audio file name. A plain loop is used so that the
# entries collected so far remain in `out` if the run is interrupted.
out = {}

for file_name in tqdm(saa_files):
  audio_url = path_stem + file_name
  out[file_name] = transcribe(audio_url)

Save transcripts

In [None]:
# Persist the transcript dictionary so later sessions can skip transcription.
with open('SAA_Whisperv3.pkl', mode='wb') as handle:
  pickle.dump(out, handle)

## Load for SAA

Load transcripts

In [None]:
# Read the transcript dictionary back from disk.
# NOTE: unpickling can execute arbitrary code; only load a file you produced yourself.
with open('SAA_Whisperv3.pkl', mode='rb') as handle:
  saa_transcripts = pickle.load(handle)

View example

In [None]:
# Look up the transcript stored for one file and display it.
saa_transcripts['english489.mp3']

' Please call Stella. Ask her to bring these things with her from the store. Six spoons of fresh snow peas, five thick slabs of blue cheese, and maybe a snack for her brother Bob. We also need a small plastic snake and a big toy frog for the kids. She can school these things into three red bags and we will go meet her Wednesday at the train station.'

Define elicitation passage for WER/MER calculations

In [None]:
# Reference passage used as the ground truth for WER/MER.
# The double spaces after sentence-final punctuation are part of the string.
elicitation = (
    'Please call Stella.  '
    'Ask her to bring these things with her from the store:  '
    'Six spoons of fresh snow peas, five thick slabs of blue cheese, '
    'and maybe a snack for her brother Bob.  '
    'We also need a small plastic snake and a big toy frog for the kids.  '
    'She can scoop these things into three red bags, '
    'and we will go meet her Wednesday at the train station.'
)

Create a dataframe and add transcripts, WER, and MER

In [None]:
transcript_names = list(saa_transcripts.keys())
transcript_texts = list(saa_transcripts.values())

# One row per audio file: the transcript plus its WER and MER against the
# elicitation passage.
# NOTE(review): wer/mer receive the raw strings here, with no explicit
# case or punctuation normalisation in this cell - confirm that is intended.
df = pd.DataFrame(
    {
        'whisper': transcript_texts,
        'wer': [wer(elicitation, text) for text in transcript_texts],
        'mer': [mer(elicitation, text) for text in transcript_texts],
    },
    index=transcript_names,
)
df

Unnamed: 0,whisper,wer,mer
afrikaans1.mp3,"Please call Stella, ask her to bring these th...",0.072464,0.072464
afrikaans2.mp3,Please call Stella. Ask her to bring these th...,0.043478,0.043478
afrikaans3.mp3,Please call Stella. Ask her to bring these th...,0.043478,0.043478
afrikaans4.mp3,Please call Stella. Ask her to bring these th...,0.028986,0.028986
afrikaans5.mp3,Please call Stella. Ask her to bring these th...,0.057971,0.057143
...,...,...,...
yoruba8.mp3,Please call Stella. Ask her to bring these th...,0.115942,0.115942
yupik1.mp3,Please call Stella. Ask her to bring these th...,0.028986,0.028986
yupik2.mp3,Please call Stella. Ask her to bring these th...,0.217391,0.182927
yupik3.mp3,Please call Stella. Ask her to bring these th...,0.028986,0.028986


# SAA Metadata

In [None]:
import requests
from bs4 import BeautifulSoup
from tqdm import trange

In [None]:
outs = {}

# One detail page is requested per speaker id, for ids 1 through 3035.
for i in trange(1,3036):
  url = f'https://accent.gmu.edu/browse_language.php?function=detail&speakerid={i}'
  # The timeout stops a stalled connection from blocking the whole crawl;
  # raise_for_status reports an HTTP error here instead of letting an error
  # page reach the parsing code below.
  r = requests.get(url, timeout=30)
  r.raise_for_status()

  # NOTE(review): no parser is named, so BeautifulSoup chooses one itself;
  # consider pinning it so results do not depend on the installed parsers.
  html = BeautifulSoup(r.content)
  # Text of every <li> whose markup contains a closing </em> tag.
  fields = [x.text for x in html.find_all('li') if '</em>' in str(x)]
  # Split on the first colon only, so a value that itself contains ':' is kept
  # whole; [1:] drops the single character that follows the colon.
  t = {label: value[1:] for label, value in (x.split(':', 1) for x in fields)}
  # Link texts of anchors whose markup contains the detail-page query string;
  # the first match is skipped.
  t['features'] = [x.text for x in html.find_all('a') if 'browse_language.php?function=detail&amp' in str(x)][1:]

  t['index'] = i
  # The text of the <em> inside the second <h5> on the page is used as the key
  # (presumably the speaker label, e.g. 'afrikaans1' - verify against the site).
  n = html.find_all('h5')[1].em.text
  outs[n] = t

100%|██████████| 3035/3035 [46:19<00:00,  1.09it/s]


In [None]:
# Persist the scraped speaker metadata.
# NOTE(review): absolute Google Drive path; no drive mount call is visible in
# this part of the notebook - confirm the drive is mounted before running.
with open('/content/drive/Shareddrives/Ling Thesis/Data_Dir/SAA_metadata.pkl', mode='wb') as handle:
  pickle.dump(outs, handle)

In [None]:
# Join key: the part of each file name before its first '.'.
df['to_join'] = [file_name.partition('.')[0] for file_name in df.index]

In [None]:
# The outer keys of `outs` become the rows and the field names become the columns.
df_meta = pd.DataFrame(data=outs).transpose()

In [None]:
# Expose the row labels as an ordinary column so they can serve as the join key.
df_meta = df_meta.assign(to_join=df_meta.index)

In [None]:
# Left join: every transcript row is kept, with metadata attached where the key matches.
df_merged = pd.merge(df, df_meta, how='left', on='to_join')

In [None]:
# Use the join key as the row index; the column itself is kept for now.
df_merged = df_merged.set_index('to_join', drop=False)
# Missing values get the placeholder ' , ' in every column, so the 'age, sex'
# split below still yields two parts for rows without metadata.
df_merged = df_merged.fillna(' , ')

# Per-column text clean-up, applied element-wise to the string form of each value.
text_cleaners = {
  'birth place': lambda value: str(value).replace(' (map)', ''),
  'native language': lambda value: str(value).split('\n')[0],
  'length of english residence': lambda value: str(value).split(' ')[0],
}
for column, cleaner in text_cleaners.items():
  df_merged[column] = df_merged[column].map(cleaner)

# Split the combined 'age, sex' field once and take each half.
age_sex_parts = df_merged['age, sex'].map(lambda value: str(value).split(', '))
df_merged['age'] = age_sex_parts.map(lambda parts: parts[0])
df_merged['sex'] = age_sex_parts.map(lambda parts: parts[1])

In [None]:
# Drop the transcript text and the helper columns, then export the result.
# Note: re-running this cell without re-running the cells above fails, because
# the dropped columns are no longer present.
df_merged = df_merged.drop(columns=['whisper','age, sex', 'to_join'])
df_merged.to_csv('SAA_full.csv')

In [None]:
# Display the merged table.
df_merged

Unnamed: 0_level_0,wer,mer,birth place,native language,other language(s),age of english onset,english learning method,english residence,length of english residence,features,index,age,sex
to_join,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
afrikaans1,0.072464,0.072464,"virginia, south africa",afrikaans,tswana,9,academic,usa,0.5,"[final obstruent devoicing, interdental fricat...",1,27,female
afrikaans2,0.043478,0.043478,"pretoria, south africa",afrikaans,dutch german french,5,academic,usa,10,"[final obstruent devoicing, vowel shortening]",2,40,male
afrikaans3,0.043478,0.043478,"pretoria, transvaal, south africa",afrikaans,gujarati sindhi french,4,naturalistic,usa,20.7,"[final obstruent devoicing, r to trill, non as...",418,43,male
afrikaans4,0.028986,0.028986,"pretoria, south africa",afrikaans,flemish dutch,8,academic,usa,15,[],1159,26,male
afrikaans5,0.057971,0.057143,"cape town, south africa",afrikaans,none,6,academic,australia,9,[],1432,19,male
...,...,...,...,...,...,...,...,...,...,...,...,...,...
yoruba8,0.115942,0.115942,"ibadan, oyo, nigeria",yoruba,none,2,academic,"nigeria, usa",21,[],2985,21,male
yupik1,0.028986,0.028986,"bethel, alaska, usa",yupik,spanish,1,naturalistic,alaska,31,"[final obstruent devoicing, interdental fricat...",571,31,female
yupik2,0.217391,0.182927,"nome, alaska, usa",yupik,russian,6,academic,usa,40,[],2637,40,male
yupik3,0.028986,0.028986,"nome, alaska, usa",yupik,none,6,naturalistic,usa,42,[],2638,42,female
