# CTC Forced Alignment
This notebook demonstrates how to force-align the symbols of a wav2vec 2.0 model's transcription with the corresponding audio file.

In [1]:
from pathlib import Path

from ctc_forced_aligner import (
    load_audio,
    load_alignment_model,
    generate_emissions,
    get_alignments,
    get_spans,
    postprocess_results,
)
import datasets
import transformers
import torch

from multipa.data_utils import load_buckeye_split


# Model identifier passed to both the ASR pipeline and the alignment-model loader.
MODEL = "ginic/data_seed_bs64_1_wav2vec2-large-xlsr-53-buckeye-ipa"

# Directory handed to load_buckeye_split to locate the Buckeye dataset splits.
HF_DATA_DIR = Path("../data/buckeye")

# Device passed to both the ASR pipeline and the alignment-model loader.
DEVICE = torch.device("cpu")



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the ASR pipeline around MODEL on DEVICE; it produces the predicted transcriptions that get aligned later.
pipe = transformers.pipeline("automatic-speech-recognition", model=MODEL, device=DEVICE)

In [3]:
# Load the same checkpoint (MODEL) through ctc_forced_aligner to get the model and tokenizer used for alignment.
alignment_model, alignment_tokenizer = load_alignment_model(DEVICE, MODEL)

In [4]:
# Keep only the first 10 validation utterances and cast the audio column to a
# 16 kHz sampling rate, then show the selection as a table.
val_data = (
    load_buckeye_split(HF_DATA_DIR, "validation")
    .select(list(range(10)))
    .cast_column("audio", datasets.Audio(sampling_rate=16_000))
)
display(val_data.to_pandas())

Unnamed: 0,utterance_id,duration,buckeye_transcript,text,ipa,speaker_id,speaker_gender,speaker_age_range,interviewer_gender,file_path,audio,__index_level_0__
0,s1801a_Utt1,0.393962,ah m hh ah m,um-hum,ʌ m h ʌ m,S18,f,o,f,data/buckeye/validation/s1801a_Utt1.wav,"{'bytes': None, 'path': '/Users/virginia/works...",0
1,s1801a_Utt2,1.322374,U U ah m er r ih jh n l iy f r ah m k l ah m v...,VOCNOISE HES-um i'm originally from columbus,ʌ m ɹ̩ ɹ ɪ dʒ n l i f ɹ ʌ m k l ʌ m v ɪ s,S18,f,o,f,data/buckeye/validation/s1801a_Utt2.wav,"{'bytes': None, 'path': '/Users/virginia/works...",1
2,s1801a_Utt3,4.190445,U U U ah m ah nx ah v eh n k ow r n ey dx er f...,VOCNOISE EXT-and HES-um i'm an event coordinat...,ʌ m ʌ ɾ̃ ʌ v ɛ n k oʊ ɹ n eɪ ɾ ɹ̩ f ɹ̩ w ɪ tʃ ...,S18,f,o,f,data/buckeye/validation/s1801a_Utt3.wav,"{'bytes': None, 'path': '/Users/virginia/works...",2
3,s1801a_Utt4,3.829312,ih n jh er m ah n v ah l ey ch U U U U ay m,in german village NOISE VOCNOISE HES-um VOCNOI...,ɪ n dʒ ɹ̩ m ʌ n v ʌ l eɪ tʃ aɪ m,S18,f,o,f,data/buckeye/validation/s1801a_Utt4.wav,"{'bytes': None, 'path': '/Users/virginia/works...",3
4,s1801a_Utt5,1.974183,f ao r dx iy th r iy y iy r z ow l dx ah v jh ...,forty three years old i've just been married uh,f ɔ ɹ ɾ i θ ɹ i j i ɹ z oʊ l ɾ ʌ v dʒ ɪ s b n̩...,S18,f,o,f,data/buckeye/validation/s1801a_Utt5.wav,"{'bytes': None, 'path': '/Users/virginia/works...",4
5,s1801a_Utt6,1.583139,th r iy y iy r z ah hh ae n ow ch ih l d r ah n,three years i have no children,θ ɹ i j i ɹ z ʌ h æ n oʊ tʃ ɪ l d ɹ ʌ n,S18,f,o,f,data/buckeye/validation/s1801a_Utt6.wav,"{'bytes': None, 'path': '/Users/virginia/works...",5
6,s1801a_Utt7,0.434642,ah,uh,ʌ,S18,f,o,f,data/buckeye/validation/s1801a_Utt7.wav,"{'bytes': None, 'path': '/Users/virginia/works...",6
7,s1801a_Utt8,5.247608,ay v l ih v d en k l ah m b ah s m ow s t ah m...,i've lived in columbus most of my life althoug...,aɪ v l ɪ v d n̩ k l ʌ m b ʌ s m oʊ s t ʌ m aɪ ...,S18,f,o,f,data/buckeye/validation/s1801a_Utt8.wav,"{'bytes': None, 'path': '/Users/virginia/works...",7
8,s1801a_Utt9,5.143126,ay y uw z ch ah hh ae v ey ah en t iy er p l a...,i used to have a uh interior plantscape busine...,aɪ j u z tʃ ʌ h æ v eɪ ʌ n̩ t i ɹ̩ p l æ n s k...,S18,f,o,f,data/buckeye/validation/s1801a_Utt9.wav,"{'bytes': None, 'path': '/Users/virginia/works...",8
9,s1801a_Utt10,3.934375,U s ow ah m r ih l iy ih nx er eh s t ih d ih ...,VOCNOISE so i'm really interested in VOCNOISE ...,s oʊ ʌ m ɹ ɪ l i ɪ ɾ̃ ɹ̩ ɛ s t ɪ d ɪ d ɪ n h ɔ...,S18,f,o,f,data/buckeye/validation/s1801a_Utt10.wav,"{'bytes': None, 'path': '/Users/virginia/works...",9


In [5]:
# Dump every vocabulary entry alongside its id. Seeing the entries makes it clear
# which diacritics are assigned tokens of their own.
for token, token_id in alignment_tokenizer.get_vocab().items():
    print(f"{token} {token_id}")

A 194
E 200
I 108
O 14
[PAD] 308
[UNK] 307
_ 299
a 89
ã 269
b 213
b̪ 114
b̪͡v 101
b͡ꞵ 224
c 74
cʼ 145
c͡ç 274
d 188
d̼ 96
d͡z 228
d͡ð 234
d͡ɮ 253
d͡ʑ 286
d͡ʒ 21
e 31
ẽ 115
e̞ 182
ẽ̞ 12
f 22
fʼ 140
h 132
i 223
j 40
k 141
kxʼ 199
kǀ 135
kǁ 45
kǂ 79
kǃ 289
kʘ 73
kʼ 255
k̚ 97
k͡p 264
k͡x 144
l 225
m 102
m̥ 180
n 221
n̥ 26
n̼ 71
o 127
õ 275
o̞ 99
õ̞ 201
p 305
pʼ 290
p̚ 293
p̪ 281
p̪͡f 107
p͡f 112
p͡ɸ 65
q 121
qǀ 20
qǁ 137
qǂ 178
qǃ 236
qʘ 278
qʼ 206
q͡ʡ 227
q͡χʼ 146
q͡ꭓ 123
r 139
r̥ 270
s 294
sʼ 249
t 131
tʼ 231
t̚ 166
t̪͡θʼ 239
t̼ 257
t͡s 32
t͡sʼ 117
t͡ɕ 261
t͡ɬ 263
t͡ɬʼ 23
t͡ʃ 105
t͡ʃʼ 86
t͡θ 76
u 41
ũ 153
v 240
w 77
x 58
xʼ 262
y 230
ỹ 9
z 116
{ 244
} 272
ã 189
ä 304
ä̃ 246
æ 4
æ̃ 238
ç 161
ð 0
ð̠ 136
ð̼ 208
õ 149
ø 207
ø̃ 133
ø̞ 39
ø̞̃ 204
ħ 119
ĩ 38
ŋ 5
ŋǀ 163
ŋǁ 148
ŋǂ 155
ŋǃ 85
ŋʘ 92
ŋ̊ 301
ŋ͡m 214
œ 288
œ̃ 235
ũ 113
ɐ 198
ɐ̃ 104
ɑ 226
ɑ̃ 167
ɒ 46
ɒ̃ 217
ɓ 209
ɓ̥ 75
ɔ 138
ɔ̃ 110
ɕ 143
ɕʼ 36
ɖ 98
ɖ͡ʐ 103
ɗ 173
ɗ̥ 295
ɘ 172
ɘ̃ 184
ə 171
ə̃ 49
ɚ 24
ɛ 237
ɛ̃ 256
ɜ 125
ɜ̃ 63
ɝ 56
ɞ

In [6]:
# Settings forwarded to generate_emissions for every utterance.
EMISSION_BATCH_SIZE = 64
EMISSION_CONTEXT_LENGTH = 8


def split_into_vocab_tokens(text: str, vocab: dict) -> list:
    """Split ``text`` into tokens, preferring the longest matching ``vocab`` entry.

    The tokenizer vocabulary contains entries made of several code points (for
    example ``t͡ʃ``), so splitting per code point with ``list(text)`` separates a
    base symbol from its diacritic or tie bar even when the combination is a
    single vocabulary entry. Scanning left to right and taking the longest
    vocabulary entry at each position keeps such symbols together.

    Args:
        text: Transcription string to split.
        vocab: Container of known token strings; only membership and the
            length of its entries are used.

    Returns:
        List of token strings whose concatenation equals ``text``. A character
        that does not start any multi-character vocabulary entry is emitted on
        its own, exactly as ``list(text)`` would.
    """
    longest = max((len(entry) for entry in vocab), default=1)
    tokens = []
    position = 0
    while position < len(text):
        # Fall back to a single character when no longer entry matches.
        token = text[position]
        for width in range(min(longest, len(text) - position), 1, -1):
            candidate = text[position:position + width]
            if candidate in vocab:
                token = candidate
                break
        tokens.append(token)
        position += len(token)
    return tokens


# Fetch the vocabulary once; it does not change between utterances.
alignment_vocab = alignment_tokenizer.get_vocab()

for row in val_data:
    print("Attempting to align", row["file_path"])
    print("\tOrthographic transcription:", row["text"])
    print("\tOriginal buckeye:", row["buckeye_transcript"])
    print("\tBuckeye transcript converted to IPA:", row["ipa"])

    # Transcribe with the ASR pipeline, then split the prediction into vocabulary tokens.
    # NOTE(review): this assumes the model emits the longest matching vocabulary entry
    # for multi-character symbols - confirm against how the training labels were tokenized.
    # NOTE(review): whitespace in the prediction is still passed through as a token -
    # confirm that get_alignments/get_spans can handle it.
    transcription = pipe(row["audio"])["text"]
    split_tokens = split_into_vocab_tokens(transcription, alignment_vocab)

    # Place a "<star>" entry before every token.
    starred_tokens = [piece for token in split_tokens for piece in ("<star>", token)]
    print("\tPredicted transcription:", transcription)
    print("\tSplit and starred tokens:", starred_tokens)

    # Nothing can be aligned when the model predicted no symbols at all.
    if not starred_tokens:
        print("\tSkipping alignment because the predicted transcription is empty")
        continue

    # file_path is stored relative to the repository root, one level above this notebook.
    wav_path = Path("../") / row["file_path"]
    audio_waveform = load_audio(wav_path, alignment_model.dtype, alignment_model.device)
    emissions, stride = generate_emissions(
        alignment_model,
        audio_waveform,
        batch_size=EMISSION_BATCH_SIZE,
        context_length=EMISSION_CONTEXT_LENGTH,
    )
    print("\tEmissions shape:", emissions.shape)
    print("\tStride:", stride)

    segments, scores, blank_tokens = get_alignments(emissions, starred_tokens, alignment_tokenizer)
    print("\tSegments:", segments)

    # An AssertionError from get_spans is reported as a failed alignment for this
    # utterance; the loop then moves on to the next one.
    try:
        spans = get_spans(starred_tokens, segments, blank_tokens)
    except AssertionError as e:
        print("\tAlignment failed due to mismatched spans")
        print("\t", e)
    else:
        token_timestamps = postprocess_results(starred_tokens, spans, stride, scores)
        print("\tToken timestamps:", token_timestamps)



Attempting to align data/buckeye/validation/s1801a_Utt1.wav
	Orthographic transcription: um-hum
	Original buckeye: ah m hh ah m
	Buckeye transcript converted to IPA: ʌ m h ʌ m
	Predicted transcription: m̩hm̩
	Split and starred tokens: ['<star>', 'm', '<star>', '̩', '<star>', 'h', '<star>', 'm', '<star>', '̩']
	Emissions shape: torch.Size([19, 312])
	Stride: 21
	Segments: [[pad]: [    0,     1), <star>: [    2,     2), m: [    3,     3), <star>: [    4,     4), ̩: [    5,     5), [pad]: [    6,     7), <star>: [    8,     8), h: [    9,     9), [pad]: [   10,    11), <star>: [   12,    12), m: [   13,    13), <star>: [   14,    14), ̩: [   15,    15), [pad]: [   16,    18)]
	Token timestamps: [{'start': 0.063, 'end': 0.063, 'text': 'm', 'score': 0.0}, {'start': 0.105, 'end': 0.126, 'text': '̩', 'score': -0.43681490421295166}, {'start': 0.189, 'end': 0.21, 'text': 'h', 'score': -0.13337308168411255}, {'start': 0.273, 'end': 0.273, 'text': 'm', 'score': 0.0}, {'start': 0.315, 'end': 0.378

Initial observations:
- We're having some problems with diphthongs and IPA symbols that are represented by more than one Unicode character. This is probably my fault, due to faulty tokenization of the training data. Perhaps we should try preprocessing tokenization with something like https://pypi.org/project/ipatok/
- Some symbols (diphthong starts?) seem to have 0.0 second durations. Maybe we could do something in post-processing, e.g. if a symbol has zero duration, pair it with the next symbol in the output.
- I'm not sure what causes the span mismatch. That seems to be related to the batch size and context window in some way I don't understand yet.