# Forced Alignment with Vosk

Originally, we evaluated the tagger's F1-score by simply using indices, which may be too penalising. To properly evaluate the performance of our PII identification pipeline, we need to perform *forced alignment*, which maps each token of a transcript to its corresponding timestamps in the audio file.

For this, we shall be using *Vosk*, a toolkit which offers forced-alignment models. 

## Change Directory to Root Project

In [1]:
import os 

os.chdir('..')

In [489]:
os.getcwd()

'/Users/farhan/Desktop/Research'

## Helper functions

Helper function for sorting the audio file names by ID

In [490]:
def retrieve_key(file: str) -> int:
    """Return the numeric ID embedded in an audio file name such as ``id48.wav``.

    The first two characters (the ``id`` prefix) are skipped and the whole run
    of digits that follows is parsed, so IDs of any length are supported
    rather than only one to three digits.

    Args:
        file: Bare file name (no directory part), e.g. ``'id151.wav'``.

    Returns:
        The ID as an integer, suitable for use as a sort key.

    Raises:
        ValueError: If no digit follows the two-character prefix.
    """
    # Local import: this helper is defined before the notebook imports ``re``.
    import re

    match = re.match(r'[0-9]+', file[2:])
    if match is None:
        raise ValueError(f"no numeric ID found in file name: {file!r}")
    return int(match.group())

Helper function to embed the entities within the transcripts (given a dataframe)

In [817]:
import pandas as pd

def insert_entity_tags_to_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Inserts entity boundary tags into the 'text' column based on 'entities',
    and adds a new column 'tagged_text' with the result.

    Every (start, end) offset refers to the ORIGINAL text. All tags of a row
    are placed in a single left-to-right pass, so nested or overlapping
    entities are tagged at the correct positions as well.

    Args:
        df (pd.DataFrame): Must contain 'text' and 'entities' columns. Each
            'entities' value is an iterable of (start, end, label) triples.

    Returns:
        pd.DataFrame: Copy of the input with an additional 'tagged_text' column.
    """
    def insert_tags(row):
        text = row["text"]

        # One insertion event per tag: (offset, rank, tie-break, tag).
        # rank 0 (END) sorts before rank 1 (START) so that an entity ending
        # where another begins yields "...[A_END][B_START]...".
        # Tie-breaks: inner entities close first, outer entities open first.
        events = []
        for start, end, label in row["entities"]:
            tag_start = f"[{label}_START]"
            tag_end = f"[{label}_END]"
            if start == end:
                # Zero-length span: keep START immediately followed by END.
                events.append((start, 1, -end, tag_start + tag_end))
                continue
            events.append((end, 0, -start, tag_end))
            events.append((start, 1, -end, tag_start))

        pieces = []
        cursor = 0
        for position, _, _, tag in sorted(events):
            pieces.append(text[cursor:position])
            pieces.append(tag)
            cursor = position
        pieces.append(text[cursor:])
        return "".join(pieces)

    df = df.copy()
    df["tagged_text"] = df.apply(insert_tags, axis=1)
    return df

Helper function to unify whitespaces

In [818]:
import re

def unify_whitespace(s):
    """
    Normalise spacing in a (possibly tagged) transcript.

    - Leading/trailing whitespace is removed and every whitespace run becomes
      a single space.
    - A space is inserted wherever an [XXX_START] or [XXX_END] tag directly
      touches a non-space character, on either side of the tag.

    Non-string inputs are returned unchanged.
    """
    if not isinstance(s, str):
        return s

    result = re.sub(r'\s+', ' ', s.strip())

    # Applied in this exact order; each rule separates a tag from the
    # non-space character adjacent to it.
    spacing_rules = (
        r'(\[\w+_START\])(\S)',  # after a START tag
        r'(\S)(\[\w+_END\])',    # before an END tag
        r'(\[\w+_END\])(\S)',    # after an END tag
        r'(\S)(\[\w+_START\])',  # before a START tag
    )
    for pattern in spacing_rules:
        result = re.sub(pattern, r'\1 \2', result)

    return result

Helper function to run Vosk forced-alignment model

In [493]:
from vosk import Model, KaldiRecognizer
import soundfile as sf
import json

def run_vosk(audio_path: str, vosk_model: Model) -> dict:
    """Run a Vosk recognizer over one audio file and return its final result.

    Args:
        audio_path: Path of the audio file, read with ``soundfile``.
        vosk_model: A loaded Vosk ``Model``.

    Returns:
        The JSON string from ``KaldiRecognizer.FinalResult()`` parsed into a
        dict. Callers in this notebook read the word-level entries from its
        ``'result'`` key; NOTE(review): that key was missing for at least one
        file in the batch run, so callers should not assume it is present.
    """
    # Load audio
    audio_data, sample_rate = sf.read(audio_path)
    
    # Prepare recognizer
    rec = KaldiRecognizer(vosk_model, sample_rate)
    # Ask for per-word entries (with start/end times) in the result
    rec.SetWords(True)
    
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)  # Stereo to mono

    # Scale to 16-bit PCM bytes.
    # assumes sf.read returned float samples in [-1.0, 1.0] — TODO confirm
    pcm_data = (audio_data * 32767).astype("int16").tobytes()

    # The whole waveform is fed in a single call, then the final result is read
    rec.AcceptWaveform(pcm_data)
    result = json.loads(rec.FinalResult())
    
    return result

Helper function for tokenizing and standardizing the reference/hypothesis generated by LLM correction module / Gold data

In [856]:
import re
import string

digit_map = {
    '0': 'zero', '1': 'one', '2': 'two', '3': 'three', '4': 'four',
    '5': 'five', '6': 'six', '7': 'seven', '8': 'eight', '9': 'nine'
}

# Hyphenated digit groups that are spelled out digit by digit:
# credit card numbers (4-4-4-4) and bank account numbers (3-5-1).
_SPELLED_NUMBER = r'\d{4}-\d{4}-\d{4}-\d{4}|\d{3}-\d{5}-\d'
_SPELLED_NUMBER_RE = re.compile(_SPELLED_NUMBER)

# The hyphenated number alternative must come BEFORE the generic word
# alternative; otherwise a number is split at its hyphens and the
# digit-spelling branch below can never be reached.
_TOKEN_RE = re.compile(
    r'\[.*?\]|\b(?:' + _SPELLED_NUMBER + r')\b|[\w]+|[@._-]'
)


def tokenize_reference(text):
    """Split a reference transcript into normalised tokens.

    Token rules, in order:
      * bracketed tags (e.g. ``[EMAIL_START]``) are kept verbatim;
      * credit-card (``dddd-dddd-dddd-dddd``) and bank-account
        (``ddd-ddddd-d``) numbers become one spelled-out word per digit,
        with the hyphens dropped;
      * the symbols ``@ . - _`` are kept as individual tokens;
      * every other word is lowercased and stripped of leading/trailing
        punctuation; words that end up empty are dropped.

    Characters matching none of the rules (commas, quotes, ...) are skipped.

    Args:
        text: Reference transcript, optionally containing entity tags.

    Returns:
        List of token strings in transcript order.
    """
    tokens = []

    for part in _TOKEN_RE.findall(text):
        if re.fullmatch(r'\[.*?\]', part):
            tokens.append(part)

        elif _SPELLED_NUMBER_RE.fullmatch(part):
            # Spell each digit; hyphens are not in digit_map and are skipped
            tokens.extend(digit_map[char] for char in part if char in digit_map)

        elif part in {'@', '.', '-', '_'}:
            tokens.append(part)

        else:
            clean = part.lower().strip(string.punctuation)
            if clean:
                tokens.append(clean)

    return tokens

#### Edit-distance Heuristics

1. Equal: Exact match is found. Append the reference token and include the `start` and `end` times.
2. Deletion: Found in reference, but not in Vosk. Append the reference, but set `start` and `end` to `None`.
3. Insertion: Found in Vosk, but not in reference. Skip vosk word.
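4. Replace: The Vosk word(s) differ from the reference token(s). Keep the reference tokens; the first one takes the `start` of the first Vosk word in the span, the last one takes the `end` of the last Vosk word, and every other boundary is set to `None`.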

Helper function to align reference Whisper-generated transcript to forced-alignment model (edit-distance based)

In [740]:
from difflib import SequenceMatcher
import string

def clean_token(token):
    """Normalise a token for comparison.

    Bracketed tags (``[...]``) are only lowercased; any other token is
    lowercased and has every ASCII punctuation character removed.
    """
    lowered = token.lower()
    is_tag = token.startswith('[') and token.endswith(']')
    if is_tag:
        return lowered
    return ''.join(ch for ch in lowered if ch not in string.punctuation)

def is_start_tag(token):
    """Return True when the token ends with the ``_START]`` tag suffix."""
    start_suffix = '_START]'
    return token.endswith(start_suffix)

def is_end_tag(token):
    """Return True when the token ends with the ``_END]`` tag suffix."""
    end_suffix = '_END]'
    return token.endswith(end_suffix)

def align_transcript_with_vosk(vosk_words, transcript):
    """Attach Vosk word timestamps to the tokens of a reference transcript.

    The transcript is tokenised with ``tokenize_reference``; both token
    sequences are normalised with ``clean_token`` and diffed with
    ``difflib.SequenceMatcher``. Each opcode is handled as follows:

    * ``equal``   - the reference token takes the matching Vosk word's times.
    * ``delete``  - reference-only tokens are kept with ``None`` times.
    * ``replace`` - reference tokens are kept; the first takes the start of
      the first Vosk word in the span, the last takes the end of the last
      Vosk word, every other boundary is ``None``.
    * ``insert``  - Vosk-only words are dropped.

    Args:
        vosk_words: List of dicts with ``'word'``, ``'start'`` and ``'end'``
            keys (the ``'result'`` list of a Vosk recognition result).
        transcript: Reference transcript, optionally containing
            ``[XXX_START]`` / ``[XXX_END]`` tags.

    Returns:
        List of ``{'word', 'start', 'end'}`` dicts in reference order.
    """
    ref_tokens = tokenize_reference(transcript)
    ref_tokens_clean = [clean_token(t) for t in ref_tokens]
    vosk_tokens = [w['word'] for w in vosk_words]
    vosk_tokens_clean = [clean_token(t) for t in vosk_tokens]

    # autojunk=False: difflib's default heuristic stops matching words that
    # repeat often once the second sequence reaches 200 items, which would
    # degrade the alignment of long recordings.
    matcher = SequenceMatcher(None, ref_tokens_clean, vosk_tokens_clean, autojunk=False)
    aligned = []

    current_entity = None
    entity_buffer = []
    entity_start_time = None
    entity_end_time = None

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "equal":
            for r_i, v_i in zip(range(i1, i2), range(j1, j2)):
                token = ref_tokens[r_i]
                # NOTE: the two tag branches below only fire when the Vosk
                # output itself contains the same tag token; otherwise tags
                # arrive via "delete"/"replace" and are emitted individually.
                if is_start_tag(token):
                    current_entity = token.replace('[', '').replace(']', '').replace('_START', '')
                    entity_buffer = [token]
                    entity_start_time = vosk_words[v_i]['start']
                elif is_end_tag(token):
                    entity_buffer.append(token)
                    entity_end_time = vosk_words[v_i - 1]['end']
                    aligned.append({
                        'word': ' '.join(entity_buffer),
                        'start': entity_start_time,
                        'end': entity_end_time
                    })
                    current_entity = None
                    entity_buffer = []
                elif current_entity:
                    entity_buffer.append(token)
                else:
                    aligned.append({
                        'word': token,
                        'start': vosk_words[v_i]['start'],
                        'end': vosk_words[v_i]['end']
                    })

        elif tag == "delete":
            for r_i in range(i1, i2):
                aligned.append({
                    'word': ref_tokens[r_i],
                    'start': None,
                    'end': None
                })

        elif tag == "replace":
            # A "replace" span is non-empty on both sides, so j1 and j2 - 1
            # are always valid indices into vosk_words.
            vosk_start = vosk_words[j1]['start']
            vosk_end = vosk_words[j2 - 1]['end']
            for r_i in range(i1, i2):
                aligned.append({
                    'word': ref_tokens[r_i],
                    'start': vosk_start if r_i == i1 else None,
                    'end': vosk_end if r_i == i2 - 1 else None
                })

        # "insert": Vosk words that are not in the reference are ignored

    return aligned

Extract id from file name function

In [753]:
def extract_id_number(filename):
    """Return the integer N from the first ``id<N>.wav`` occurrence in
    *filename*, or ``None`` when the name contains no such pattern."""
    found = re.search(r'id(\d+)\.wav', filename)
    return int(found.group(1)) if found else None

## Load the dataset

In [23]:
import os

os.chdir('..')

Load batch 1 (150 samples)

In [820]:
import pandas as pd

batch_one_ref = pd.read_json('data/true_data_150.jsonl', lines=True)
batch_one_ref.head()

Unnamed: 0,text
0,"The day before [DATE_START] yesterday, [DATE_E..."
1,um my date of birth is uh second [DATE_START] ...
2,"she handed over a crumpled piece of paper, the..."
3,aglio olio and err uh [CARDINAL_START] three t...
4,[PERSON_START] Hong's [PERSON_END] email is [E...


In [821]:
import os

batch_one_files = sorted(os.listdir("data/Audio_Files_for_testing"), key=retrieve_key)
batch_one_files  = [f'data/Audio_Files_for_testing/{file}' for file in batch_one_files]

import pandas as pd

batch_one_df = pd.DataFrame(data=batch_one_files, columns=['file_name'])
batch_one_df.head()

Unnamed: 0,file_name
0,data/Audio_Files_for_testing/id1.wav
1,data/Audio_Files_for_testing/id2.wav
2,data/Audio_Files_for_testing/id3.wav
3,data/Audio_Files_for_testing/id4.wav
4,data/Audio_Files_for_testing/id5.wav


Load batch 2 (350 samples)

In [822]:
import pandas as pd

batch_two_ref = pd.read_json('data/newtest_151_500_updated_TTS.jsonl', lines=True)
batch_two_ref.head()

Unnamed: 0,id,text,entities
0,151,"456 729103 8 is Kaifu Lee's DBS bank account, ...","[[0, 12, BANK_ACCOUNT]]"
1,152,"Jacob's OCBC bank account is 192-58462-3, and ...","[[29, 40, BANK_ACCOUNT]]"
2,153,"788 305194 2 is Zheng Qi's POSB bank account, ...","[[0, 12, BANK_ACCOUNT]]"
3,154,"Geetha's UOB bank account is 341-92741-9, and ...","[[29, 40, BANK_ACCOUNT]]"
4,155,"623 481057 6 is Ah Seng's Maybank account, and...","[[0, 12, BANK_ACCOUNT]]"


In [823]:
import os

batch_two_files = sorted(os.listdir("data/newtest_151_500_updated_TTS"), key=retrieve_key)
batch_two_files  = [f'data/newtest_151_500_updated_TTS/{file}' for file in batch_two_files]

import pandas as pd

batch_two_df = pd.DataFrame(data=batch_two_files, columns=['file_name'])
batch_two_df.head()

Unnamed: 0,file_name
0,data/newtest_151_500_updated_TTS/id151.wav
1,data/newtest_151_500_updated_TTS/id152.wav
2,data/newtest_151_500_updated_TTS/id153.wav
3,data/newtest_151_500_updated_TTS/id154.wav
4,data/newtest_151_500_updated_TTS/id155.wav


As you can see, for batch 2 the entity tags are not embedded in the reference transcripts. Since we are given the entity indices, we can use a helper function to insert the tags into the transcripts. 

Preprocess transcripts

In [824]:
batch_two_ref = insert_entity_tags_to_df(batch_two_ref)

In [825]:
batch_two_ref.head()

Unnamed: 0,id,text,entities,tagged_text
0,151,"456 729103 8 is Kaifu Lee's DBS bank account, ...","[[0, 12, BANK_ACCOUNT]]",[BANK_ACCOUNT_START]456 729103 8[BANK_ACCOUNT_...
1,152,"Jacob's OCBC bank account is 192-58462-3, and ...","[[29, 40, BANK_ACCOUNT]]",Jacob's OCBC bank account is [BANK_ACCOUNT_STA...
2,153,"788 305194 2 is Zheng Qi's POSB bank account, ...","[[0, 12, BANK_ACCOUNT]]",[BANK_ACCOUNT_START]788 305194 2[BANK_ACCOUNT_...
3,154,"Geetha's UOB bank account is 341-92741-9, and ...","[[29, 40, BANK_ACCOUNT]]",Geetha's UOB bank account is [BANK_ACCOUNT_STA...
4,155,"623 481057 6 is Ah Seng's Maybank account, and...","[[0, 12, BANK_ACCOUNT]]",[BANK_ACCOUNT_START]623 481057 6[BANK_ACCOUNT_...


In [826]:
batch_two_ref.tail()

Unnamed: 0,id,text,entities,tagged_text
346,496,Patrick Loh boasting about his email patrick.l...,"[[37, 60, EMAIL]]",Patrick Loh boasting about his email [EMAIL_ST...
347,497,Jasmine Yeo got sian when someone spell her em...,"[[50, 69, EMAIL]]",Jasmine Yeo got sian when someone spell her em...
348,498,Bobby Tan write his email bobby.tan@gmail.com ...,"[[26, 45, EMAIL]]",Bobby Tan write his email [EMAIL_START]bobby.t...
349,499,Kamala Singh telling the IT guy her email kama...,"[[42, 60, EMAIL]]",Kamala Singh telling the IT guy her email [EMA...
350,500,Raymond Koh say his email raymond.k@singnet.co...,"[[26, 50, EMAIL]]",Raymond Koh say his email [EMAIL_START]raymond...


In [827]:
batch_two_ref = batch_two_ref.drop(columns=['entities', 'text', 'id'], axis=1)
batch_two_ref.rename(columns={'tagged_text': 'text'}, inplace=True)
batch_two_ref.head()

Unnamed: 0,text
0,[BANK_ACCOUNT_START]456 729103 8[BANK_ACCOUNT_...
1,Jacob's OCBC bank account is [BANK_ACCOUNT_STA...
2,[BANK_ACCOUNT_START]788 305194 2[BANK_ACCOUNT_...
3,Geetha's UOB bank account is [BANK_ACCOUNT_STA...
4,[BANK_ACCOUNT_START]623 481057 6[BANK_ACCOUNT_...


Combine the datasets [Run this when dataset not combined yet]

In [828]:
test_set_ref = pd.concat([batch_one_ref, batch_two_ref], ignore_index=True)

In [829]:
test_set_ref.head()

Unnamed: 0,text
0,"The day before [DATE_START] yesterday, [DATE_E..."
1,um my date of birth is uh second [DATE_START] ...
2,"she handed over a crumpled piece of paper, the..."
3,aglio olio and err uh [CARDINAL_START] three t...
4,[PERSON_START] Hong's [PERSON_END] email is [E...


In [830]:
test_set_ref.tail()

Unnamed: 0,text
496,Patrick Loh boasting about his email [EMAIL_ST...
497,Jasmine Yeo got sian when someone spell her em...
498,Bobby Tan write his email [EMAIL_START]bobby.t...
499,Kamala Singh telling the IT guy her email [EMA...
500,Raymond Koh say his email [EMAIL_START]raymond...


In [831]:
test_set_ref['text'] = test_set_ref['text'].apply(unify_whitespace)

In [None]:
test_set_ref.to_json('data/test_set_ref_combined.jsonl', lines=True, orient='records')

Load the combined processed dataset [When already combined]

In [837]:
test_set_ref = pd.read_json('data/test_set_ref_all.jsonl', lines=True)

In [838]:
test_set_ref.head()

Unnamed: 0,text
0,"The day before [DATE_START] yesterday, [DATE_E..."
1,um my date of birth is uh second [DATE_START] ...
2,"she handed over a crumpled piece of paper, the..."
3,aglio olio and err uh [CARDINAL_START] three t...
4,[PERSON_START] Hong's [PERSON_END] email is [E...


In [839]:
test_set_ref.tail()

Unnamed: 0,text
495,Patrick Loh boasting about his email [EMAIL_ST...
496,Jasmine Yeo got sian when someone spell her em...
497,Bobby Tan write his email [EMAIL_START] bobby....
498,Kamala Singh telling the IT guy her email [EMA...
499,Raymond Koh say his email [EMAIL_START] raymon...


## Load the model (Vosk)

Unfortunately, there are no current models that are tuned for Singaporean English (Singlish). As such, we shall use the `vosk-model-en-us-0.42-gigaspeech` model.

In [498]:
from vosk import Model, KaldiRecognizer
import soundfile as sf
import json

# Load model (replace path with your model directory)
model_path = "models/vosk-model-en-us-0.42-gigaspeech"
model = Model(model_path)

LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=13 max-active=7000 lattice-beam=8
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from models/vosk-model-en-us-0.42-gigaspeech/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:279) Loading HCLG from models/vosk-model-en-us-0.42-gigaspeech/graph/HCLG.fst
LOG (VoskAPI:ReadDataFiles():model.cc:294) Loading words from models/vosk-model-en-us-0.42-gigaspeech/graph/words.txt
LOG (VoskAPI:ReadDataFiles():model.cc:303) Loading winfo models/vosk-model-en-us-0.42-gigaspeech/graph/phones/word_bound

## Load and Read Audio (With just one sample)

So here, several things are happening:

1. We create a `KaldiRecognizer` instance and set `.SetWords()` to `True`, which means that we will get word-level timestamps.
2. The `.AcceptWaveform()` method is used to process the waveform
3. The `.FinalResult()` method is finally called to retrieve the word-level timestamps (transcribed from the Vosk Model - although with some inaccuracies, as Vosk is not a full-fledged ASR model)

Test on one audio sample (Simple example with Name and Email PIIs)

In [707]:
audio_path = "data/Audio_Files_for_testing/id48.wav"
audio_data, sample_rate = sf.read(audio_path)

In [708]:
sample = run_vosk(audio_path, model)

In [709]:
sample

{'result': [{'conf': 1.0, 'end': 0.9, 'start': 0.51, 'word': 'but'},
  {'conf': 1.0, 'end': 1.83, 'start': 1.5, 'word': 'each'},
  {'conf': 1.0, 'end': 2.28, 'start': 1.86, 'word': 'time'},
  {'conf': 1.0, 'end': 2.79, 'start': 2.43, 'word': 'the'},
  {'conf': 1.0, 'end': 3.06, 'start': 2.91, 'word': 'the'},
  {'conf': 1.0, 'end': 3.48, 'start': 3.09, 'word': 'guest'},
  {'conf': 1.0, 'end': 3.99, 'start': 3.48, 'word': 'inside'},
  {'conf': 1.0, 'end': 4.38, 'start': 3.99, 'word': 'rate'},
  {'conf': 1.0, 'end': 4.83, 'start': 4.59, 'word': 'they'},
  {'conf': 1.0, 'end': 5.07, 'start': 4.86, 'word': 'if'},
  {'conf': 0.642219, 'end': 5.25, 'start': 5.07, 'word': "they're"},
  {'conf': 1.0, 'end': 5.49, 'start': 5.25, 'word': 'not'},
  {'conf': 1.0, 'end': 5.700046, 'start': 5.52, 'word': 'doing'},
  {'conf': 0.996174, 'end': 5.81997, 'start': 5.700046, 'word': 'the'},
  {'conf': 0.897181, 'end': 6.21, 'start': 5.81997, 'word': 'running'},
  {'conf': 0.60384, 'end': 6.57, 'start': 6.2

In [713]:
len(sample['result'])

50

In [717]:
vosk_output = []

for item in sample['result']:
    vosk_output.append(item['word'])

print(vosk_output)

['but', 'each', 'time', 'the', 'the', 'guest', 'inside', 'rate', 'they', 'if', "they're", 'not', 'doing', 'the', 'running', 'track', 'thing', 'they', 'are', 'they', 'need', 'to', 'wear', 'masks', 'then', 'each', 'time', 'we', 'are', 'able', 'to', 'allow', 'for', 'fifteen', 'guests', 'inside', 'here', 'are', 'my', 'bank', 'account', 'details', 'seven', 'seventy', 'three', 'seven', 'eight', 'zero', 'seven', 'seven']


Reference Transcript

In [710]:
test_set_ref['text'].iloc[47]

'but uh each time the the guest inside right they if they are not doing the running track right they are they need to wear mask then uh each time we are able to allow for [CARDINAL_START] fifteen [CARDINAL_END] guests inside"Here are my bank account details: [BANK_ACCOUNT_START] 773-3780-7-7 [BANK_ACCOUNT_END]'

In [712]:
tokenized_reference = tokenize_reference(test_set_ref['text'].iloc[47])

print("Number of tokens:", len(tokenized_reference))
print(tokenized_reference)

Number of tokens: 58
['but', 'uh', 'each', 'time', 'the', 'the', 'guest', 'inside', 'right', 'they', 'if', 'they', 'are', 'not', 'doing', 'the', 'running', 'track', 'right', 'they', 'are', 'they', 'need', 'to', 'wear', 'mask', 'then', 'uh', 'each', 'time', 'we', 'are', 'able', 'to', 'allow', 'for', '[CARDINAL_START]', 'fifteen', '[CARDINAL_END]', 'guests', 'inside', 'here', 'are', 'my', 'bank', 'account', 'details', '[BANK_ACCOUNT_START]', 'seven', 'seven', 'three', 'three', 'seven', 'eight', 'zero', 'seven', 'seven', '[BANK_ACCOUNT_END]']


#### Using `SequenceMatcher` by `difflib` to perform edit-distance based heuristics

**Limitations of Greedy-alignment**

As we observed, greedy approaches are not feasible due to the variability of the dataset and the difference in output format between the Kaldi alignment model and the reference text. For example, the reference text includes fillers like "uh", whereas the Vosk model seems to omit them. Another example is the handling of contractions: the Vosk model outputs contractions as-is (one token), whereas the reference text separates the words (e.g., they're -> they are, two tokens). Lastly, Kaldi is designed foremost as a forced-alignment model and not a speech-recognition model, which means that it often outputs mistranscriptions (e.g., seventy vs seven three). This can have a severe impact on alignment quality. Therefore, we decided that greedy alignment is not sufficient for our use case.

**Edit Distance Heuristics with `SequenceMatcher`**

Edit-distance alignment approaches, such as those based on `SequenceMatcher`, offer a more robust alternative to greedy heuristics in aligning reference transcripts with forced alignment outputs. Unlike greedy methods that proceed sequentially and can easily desynchronize when encountering mismatches, edit-distance techniques globally evaluate the sequences and can better tolerate insertions, deletions, or substitutions. For example, in the reference transcript, the phrase "they are" may appear as two tokens, while Vosk outputs "they're" as a single token—this would break a greedy matcher, but an edit-distance algorithm would register this as a "replace" operation and continue aligning the rest correctly. Similarly, reference text may contain filler words like "uh" or structured tag blocks like `[EMAIL_START] foo bar [EMAIL_END]` that do not exist in the Vosk output. Edit-distance allows these to be treated as deletions while maintaining alignment integrity. Lastly, in numerical sequences, Vosk may transcribe "seven three" as "seventy", which would otherwise cause greedy approaches to misalign all following tokens. With edit-distance, such errors are localized and do not compromise the entire alignment.

In [719]:
from difflib import SequenceMatcher

ref = tokenize_reference(test_set_ref['text'].iloc[47])

vosk = ['but', 'each', 'time', 'the', 'the', 'guest', 'inside', 'rate', 'they', 'if', "they're", 'not', 'doing', 'the', 'running', 'track', 'thing', 'they', 'are', 'they', 'need', 'to', 'wear', 'masks', 'then', 'each', 'time', 'we', 'are', 'able', 'to', 'allow', 'for', 'fifteen', 'guests', 'inside', 'here', 'are', 'my', 'bank', 'account', 'details', 'seven', 'seventy', 'three', 'seven', 'eight', 'zero', 'seven', 'seven']

matcher = SequenceMatcher(None, ref, vosk)
for tag, i1, i2, j1, j2 in matcher.get_opcodes():
    print(f"{tag:10} ref[{i1}:{i2}] -> vosk[{j1}:{j2}]")
    print(f"  REF : {ref[i1:i2]}")
    print(f"  VOSK: {vosk[j1:j2]}")

equal      ref[0:1] -> vosk[0:1]
  REF : ['but']
  VOSK: ['but']
delete     ref[1:2] -> vosk[1:1]
  REF : ['uh']
  VOSK: []
equal      ref[2:8] -> vosk[1:7]
  REF : ['each', 'time', 'the', 'the', 'guest', 'inside']
  VOSK: ['each', 'time', 'the', 'the', 'guest', 'inside']
replace    ref[8:9] -> vosk[7:8]
  REF : ['right']
  VOSK: ['rate']
equal      ref[9:11] -> vosk[8:10]
  REF : ['they', 'if']
  VOSK: ['they', 'if']
replace    ref[11:13] -> vosk[10:11]
  REF : ['they', 'are']
  VOSK: ["they're"]
equal      ref[13:18] -> vosk[11:16]
  REF : ['not', 'doing', 'the', 'running', 'track']
  VOSK: ['not', 'doing', 'the', 'running', 'track']
replace    ref[18:19] -> vosk[16:17]
  REF : ['right']
  VOSK: ['thing']
equal      ref[19:25] -> vosk[17:23]
  REF : ['they', 'are', 'they', 'need', 'to', 'wear']
  VOSK: ['they', 'are', 'they', 'need', 'to', 'wear']
replace    ref[25:26] -> vosk[23:24]
  REF : ['mask']
  VOSK: ['masks']
equal      ref[26:27] -> vosk[24:25]
  REF : ['then']
  VOSK: ['th

Align vosk output with reference (edit-distance based approach)

In [741]:
align_transcript_with_vosk(sample['result'], test_set_ref['text'].iloc[47])

[{'word': 'but', 'start': 0.51, 'end': 0.9},
 {'word': 'uh', 'start': None, 'end': None},
 {'word': 'each', 'start': 1.5, 'end': 1.83},
 {'word': 'time', 'start': 1.86, 'end': 2.28},
 {'word': 'the', 'start': 2.43, 'end': 2.79},
 {'word': 'the', 'start': 2.91, 'end': 3.06},
 {'word': 'guest', 'start': 3.09, 'end': 3.48},
 {'word': 'inside', 'start': 3.48, 'end': 3.99},
 {'word': 'right', 'start': 3.99, 'end': 4.38},
 {'word': 'they', 'start': 4.59, 'end': 4.83},
 {'word': 'if', 'start': 4.86, 'end': 5.07},
 {'word': 'they', 'start': 5.07, 'end': None},
 {'word': 'are', 'start': None, 'end': 5.25},
 {'word': 'not', 'start': 5.25, 'end': 5.49},
 {'word': 'doing', 'start': 5.52, 'end': 5.700046},
 {'word': 'the', 'start': 5.700046, 'end': 5.81997},
 {'word': 'running', 'start': 5.81997, 'end': 6.21},
 {'word': 'track', 'start': 6.24, 'end': 6.57},
 {'word': 'right', 'start': 6.57, 'end': 6.87},
 {'word': 'they', 'start': 7.05, 'end': 7.29},
 {'word': 'are', 'start': 7.29, 'end': 7.59},
 {

Test on one audio sample (Numerical PIIs)

In [742]:
audio_path = "data/Audio_Files_for_testing/id52.wav"
audio_data, sample_rate = sf.read(audio_path)

In [743]:
sample2 = run_vosk(audio_path, model)

In [744]:
sample2

{'result': [{'conf': 0.634362, 'end': 1.14, 'start': 0.66, 'word': 'ok'},
  {'conf': 0.287375, 'end': 1.8, 'start': 1.5, 'word': 'ah'},
  {'conf': 1.0, 'end': 2.55, 'start': 2.01, 'word': 'contact'},
  {'conf': 1.0, 'end': 3.0, 'start': 2.55, 'word': 'number'},
  {'conf': 1.0, 'end': 3.48, 'start': 3.12, 'word': 'just'},
  {'conf': 1.0, 'end': 3.75, 'start': 3.48, 'word': 'put'},
  {'conf': 1.0, 'end': 4.38, 'start': 3.93, 'word': 'nine'},
  {'conf': 1.0, 'end': 4.71, 'start': 4.41, 'word': 'eight'},
  {'conf': 1.0, 'end': 5.07, 'start': 4.71, 'word': 'four'},
  {'conf': 1.0, 'end': 5.43, 'start': 5.07, 'word': 'zero'},
  {'conf': 1.0, 'end': 5.85, 'start': 5.43, 'word': 'six'},
  {'conf': 0.995706, 'end': 6.18, 'start': 5.85, 'word': 'four'},
  {'conf': 1.0, 'end': 6.48, 'start': 6.18, 'word': 'one'},
  {'conf': 1.0, 'end': 6.75, 'start': 6.51, 'word': 'three'}],
 'text': 'ok ah contact number just put nine eight four zero six four one three'}

Reference

In [745]:
test_set_ref['text'].iloc[51]

'okay uh contact number just put [PHONE_START] Nine eight four zero six four one three [PHONE_END]'

Force align

credit_card, car_plate, bank_account, nric, phone, passport_num

In [746]:
align_transcript_with_vosk(sample2['result'], test_set_ref['text'].iloc[51])

[{'word': 'okay', 'start': 0.66, 'end': None},
 {'word': 'uh', 'start': None, 'end': 1.8},
 {'word': 'contact', 'start': 2.01, 'end': 2.55},
 {'word': 'number', 'start': 2.55, 'end': 3.0},
 {'word': 'just', 'start': 3.12, 'end': 3.48},
 {'word': 'put', 'start': 3.48, 'end': 3.75},
 {'word': '[PHONE_START]', 'start': None, 'end': None},
 {'word': 'nine', 'start': 3.93, 'end': 4.38},
 {'word': 'eight', 'start': 4.41, 'end': 4.71},
 {'word': 'four', 'start': 4.71, 'end': 5.07},
 {'word': 'zero', 'start': 5.07, 'end': 5.43},
 {'word': 'six', 'start': 5.43, 'end': 5.85},
 {'word': 'four', 'start': 5.85, 'end': 6.18},
 {'word': 'one', 'start': 6.18, 'end': 6.48},
 {'word': 'three', 'start': 6.51, 'end': 6.75},
 {'word': '[PHONE_END]', 'start': None, 'end': None}]

Another example

In [747]:
sample3 = run_vosk("data/newtest_151_500_updated_TTS/id192.wav", model)

In [748]:
sample3

{'result': [{'conf': 1.0, 'end': 0.48, 'start': 0.09, 'word': 'john'},
  {'conf': 1.0, 'end': 0.78, 'start': 0.48, 'word': 'c'},
  {'conf': 1.0, 'end': 1.02, 'start': 0.78, 'word': 'as'},
  {'conf': 1.0, 'end': 1.86, 'start': 1.05, 'word': 'ocbc'},
  {'conf': 1.0, 'end': 2.28, 'start': 1.86, 'word': 'account'},
  {'conf': 1.0, 'end': 2.49, 'start': 2.31, 'word': 'is'},
  {'conf': 1.0, 'end': 2.79, 'start': 2.49, 'word': 'six'},
  {'conf': 1.0, 'end': 3.06, 'start': 2.79, 'word': 'five'},
  {'conf': 1.0, 'end': 3.27, 'start': 3.06, 'word': 'eight'},
  {'conf': 1.0, 'end': 3.69, 'start': 3.42, 'word': 'three'},
  {'conf': 1.0, 'end': 4.08, 'start': 3.69, 'word': 'seven'},
  {'conf': 0.948567, 'end': 4.29, 'start': 4.08, 'word': 'two'},
  {'conf': 1.0, 'end': 4.56, 'start': 4.29, 'word': 'nine'},
  {'conf': 0.90834, 'end': 4.86, 'start': 4.56, 'word': 'four'},
  {'conf': 1.0, 'end': 5.19, 'start': 4.86, 'word': 'one'},
  {'conf': 1.0, 'end': 5.61, 'start': 5.46, 'word': 'and'},
  {'conf':

In [None]:
# sample3 was produced from id192.wav; row position = id - 1
test_set_ref['text'].iloc[191]

"John Seah's OCBC account is [BANK_ACCOUNT_START] 658-37294-1 [BANK_ACCOUNT_END] , and he's been neglecting it since his move to Tampines"

In [None]:
# Align against the id192 reference (row position = id - 1), matching sample3
align_transcript_with_vosk(sample3['result'], test_set_ref['text'].iloc[191])

[{'word': 'john', 'start': 0.09, 'end': 0.48},
 {'word': 'seah', 'start': 0.48, 'end': None},
 {'word': 's', 'start': None, 'end': 1.02},
 {'word': 'ocbc', 'start': 1.05, 'end': 1.86},
 {'word': 'account', 'start': 1.86, 'end': 2.28},
 {'word': 'is', 'start': 2.31, 'end': 2.49},
 {'word': '[BANK_ACCOUNT_START]', 'start': None, 'end': None},
 {'word': 'six', 'start': 2.49, 'end': 2.79},
 {'word': 'five', 'start': 2.79, 'end': 3.06},
 {'word': 'eight', 'start': 3.06, 'end': 3.27},
 {'word': 'three', 'start': 3.42, 'end': 3.69},
 {'word': 'seven', 'start': 3.69, 'end': 4.08},
 {'word': 'two', 'start': 4.08, 'end': 4.29},
 {'word': 'nine', 'start': 4.29, 'end': 4.56},
 {'word': 'four', 'start': 4.56, 'end': 4.86},
 {'word': 'one', 'start': 4.86, 'end': 5.19},
 {'word': '[BANK_ACCOUNT_END]', 'start': None, 'end': None},
 {'word': 'and', 'start': 5.46, 'end': 5.61},
 {'word': 'he', 'start': 5.61, 'end': None},
 {'word': 's', 'start': None, 'end': 5.88},
 {'word': 'been', 'start': 5.88, 'end'

## Load and Read Audio (All 500 samples)

In [840]:
import os

# Keep only .wav files BEFORE sorting: retrieve_key raises on names that do
# not carry a numeric ID, so a stray non-audio file would abort the cell.
wav_names = [
    file
    for file in os.listdir("data/Audio_Files_for_testing") + os.listdir("data/newtest_151_500_updated_TTS")
    if file.endswith('.wav')
]
audio_paths = sorted(wav_names, key=retrieve_key)

# IDs below 151 belong to batch 1, the rest to batch 2
audio_paths_with_parent = [f'data/Audio_Files_for_testing/{file}' for file in audio_paths if retrieve_key(file) < 151]
audio_paths_with_parent += [f'data/newtest_151_500_updated_TTS/{file}' for file in audio_paths if retrieve_key(file) >= 151]

In [841]:
audio_paths_with_parent

['data/Audio_Files_for_testing/id1.wav',
 'data/Audio_Files_for_testing/id2.wav',
 'data/Audio_Files_for_testing/id3.wav',
 'data/Audio_Files_for_testing/id4.wav',
 'data/Audio_Files_for_testing/id5.wav',
 'data/Audio_Files_for_testing/id6.wav',
 'data/Audio_Files_for_testing/id7.wav',
 'data/Audio_Files_for_testing/id8.wav',
 'data/Audio_Files_for_testing/id9.wav',
 'data/Audio_Files_for_testing/id10.wav',
 'data/Audio_Files_for_testing/id11.wav',
 'data/Audio_Files_for_testing/id12.wav',
 'data/Audio_Files_for_testing/id13.wav',
 'data/Audio_Files_for_testing/id14.wav',
 'data/Audio_Files_for_testing/id15.wav',
 'data/Audio_Files_for_testing/id16.wav',
 'data/Audio_Files_for_testing/id17.wav',
 'data/Audio_Files_for_testing/id18.wav',
 'data/Audio_Files_for_testing/id19.wav',
 'data/Audio_Files_for_testing/id20.wav',
 'data/Audio_Files_for_testing/id21.wav',
 'data/Audio_Files_for_testing/id22.wav',
 'data/Audio_Files_for_testing/id23.wav',
 'data/Audio_Files_for_testing/id24.wav',
 

In [842]:
len(audio_paths_with_parent)

500

In [859]:
aligned_transcripts = []

In [860]:
from tqdm import tqdm

for audio in tqdm(audio_paths_with_parent, desc="Running forced alignment algorithm", unit="files", total=len(audio_paths_with_parent)):
    # Start from an all-None record so that a failure at any step still
    # contributes exactly one row for this file.
    record = {
        'file_name': audio,
        'vosk_output': None,
        'ref_text': None,
        'aligned_transcript': None
    }
    try:
        sample = run_vosk(audio, model)
        vosk_output = sample['result']
        # The number taken from the file name (presumably 48 for id48) is used
        # as a 1-based position into the reference set.
        idx = int(extract_id_number(audio))
        ref_text = test_set_ref['text'].iloc[idx - 1]
        ref_text_aligned = align_transcript_with_vosk(vosk_output, ref_text)
    except Exception as e:
        # Best effort: report the failure and keep the placeholder row.
        print(f"Error processing {audio}: {e}")
    else:
        # Only fill in the record once every step has succeeded.
        record.update(
            vosk_output=vosk_output,
            ref_text=ref_text,
            aligned_transcript=ref_text_aligned,
        )
    aligned_transcripts.append(record)

Running forced alignment algorithm:  48%|████▊     | 242/500 [06:09<02:29,  1.73files/s]

Error processing data/newtest_151_500_updated_TTS/id243.wav: 'result'


Running forced alignment algorithm: 100%|██████████| 500/500 [11:44<00:00,  1.41s/files]


In [861]:
processed_df = pd.DataFrame(aligned_transcripts)
processed_df.head()

Unnamed: 0,file_name,vosk_output,ref_text,aligned_transcript
0,data/Audio_Files_for_testing/id1.wav,"[{'conf': 1.0, 'end': 0.69, 'start': 0.51, 'wo...","The day before [DATE_START] yesterday, [DATE_E...","[{'word': 'the', 'start': 0.51, 'end': 0.69}, ..."
1,data/Audio_Files_for_testing/id2.wav,"[{'conf': 0.666652, 'end': 1.14, 'start': 0.69...",um my date of birth is uh second [DATE_START] ...,"[{'word': 'um', 'start': 0.69, 'end': 1.14}, {..."
2,data/Audio_Files_for_testing/id3.wav,"[{'conf': 1.0, 'end': 0.84, 'start': 0.48, 'wo...","she handed over a crumpled piece of paper, the...","[{'word': 'she', 'start': 0.48, 'end': 0.84}, ..."
3,data/Audio_Files_for_testing/id4.wav,"[{'conf': 1.0, 'end': 1.5, 'start': 1.11, 'wor...",aglio olio and err uh [CARDINAL_START] three t...,"[{'word': 'aglio', 'start': None, 'end': None}..."
4,data/Audio_Files_for_testing/id5.wav,"[{'conf': 1.0, 'end': 1.65, 'start': 1.14, 'wo...",[PERSON_START] Hong's [PERSON_END] email is [E...,"[{'word': '[PERSON_START]', 'start': 1.14, 'en..."


In [862]:
processed_df.tail()

Unnamed: 0,file_name,vosk_output,ref_text,aligned_transcript
495,data/newtest_151_500_updated_TTS/id496.wav,"[{'conf': 1.0, 'end': 0.45, 'start': 0.03, 'wo...",Patrick Loh boasting about his email [EMAIL_ST...,"[{'word': 'patrick', 'start': 0.03, 'end': 0.4..."
496,data/newtest_151_500_updated_TTS/id497.wav,"[{'conf': 0.990332, 'end': 0.45, 'start': 0.03...",Jasmine Yeo got sian when someone spell her em...,"[{'word': 'jasmine', 'start': 0.03, 'end': 0.4..."
497,data/newtest_151_500_updated_TTS/id498.wav,"[{'conf': 1.0, 'end': 0.45, 'start': 0.03, 'wo...",Bobby Tan write his email [EMAIL_START] bobby....,"[{'word': 'bobby', 'start': 0.03, 'end': 0.45}..."
498,data/newtest_151_500_updated_TTS/id499.wav,"[{'conf': 1.0, 'end': 0.48, 'start': 0.06, 'wo...",Kamala Singh telling the IT guy her email [EMA...,"[{'word': 'kamala', 'start': 0.06, 'end': 0.48..."
499,data/newtest_151_500_updated_TTS/id500.wav,"[{'conf': 1.0, 'end': 0.48, 'start': 0.09, 'wo...",Raymond Koh say his email [EMAIL_START] raymon...,"[{'word': 'raymond', 'start': 0.09, 'end': 0.4..."


In [866]:
processed_df['vosk_output'].iloc[332]

[{'conf': 0.84101, 'end': 0.36, 'start': 0.03, 'word': 'mark'},
 {'conf': 1.0, 'end': 0.81, 'start': 0.36, 'word': 'lee'},
 {'conf': 1.0, 'end': 1.14, 'start': 0.84, 'word': 'gave'},
 {'conf': 1.0, 'end': 1.47, 'start': 1.14, 'word': 'his'},
 {'conf': 1.0, 'end': 2.52, 'start': 1.89, 'word': 'icy'},
 {'conf': 0.855037, 'end': 3.51, 'start': 3.0, 'word': 'eso'},
 {'conf': 1.0, 'end': 3.75, 'start': 3.51, 'word': 'eight'},
 {'conf': 1.0, 'end': 4.23, 'start': 3.75, 'word': 'nine'},
 {'conf': 1.0, 'end': 4.5, 'start': 4.23, 'word': 'one'},
 {'conf': 1.0, 'end': 4.98, 'start': 4.5, 'word': 'seven'},
 {'conf': 1.0, 'end': 5.22, 'start': 4.98, 'word': 'one'},
 {'conf': 1.0, 'end': 5.7, 'start': 5.22, 'word': 'nine'},
 {'conf': 1.0, 'end': 6.27, 'start': 5.88, 'word': 'd'},
 {'conf': 1.0, 'end': 7.08, 'start': 6.9, 'word': 'to'},
 {'conf': 1.0, 'end': 7.2, 'start': 7.08, 'word': 'the'},
 {'conf': 1.0, 'end': 7.53, 'start': 7.2, 'word': 'bank'},
 {'conf': 1.0, 'end': 7.92, 'start': 7.53, 'word

In [867]:
processed_df['ref_text'].iloc[332]

'Mark Lee gave his I C [NRIC_START] S0891719D [NRIC_END] to the bank teller for the new account.'

In [868]:
processed_df['aligned_transcript'].iloc[332]

[{'word': 'mark', 'start': 0.03, 'end': 0.36},
 {'word': 'lee', 'start': 0.36, 'end': 0.81},
 {'word': 'gave', 'start': 0.84, 'end': 1.14},
 {'word': 'his', 'start': 1.14, 'end': 1.47},
 {'word': 'i', 'start': 1.89, 'end': None},
 {'word': 'c', 'start': None, 'end': None},
 {'word': '[NRIC_START]', 'start': None, 'end': None},
 {'word': 's0891719d', 'start': None, 'end': None},
 {'word': '[NRIC_END]', 'start': None, 'end': 6.27},
 {'word': 'to', 'start': 6.9, 'end': 7.08},
 {'word': 'the', 'start': 7.08, 'end': 7.2},
 {'word': 'bank', 'start': 7.2, 'end': 7.53},
 {'word': 'teller', 'start': 7.53, 'end': 7.92},
 {'word': 'for', 'start': 7.92, 'end': 8.1},
 {'word': 'the', 'start': 8.1, 'end': 8.22},
 {'word': 'new', 'start': 8.22, 'end': 8.4},
 {'word': 'account', 'start': 8.4, 'end': 9.06},
 {'word': '.', 'start': None, 'end': None}]

In [869]:
tokenize_reference(test_set_ref['text'].iloc[332])

['mark',
 'lee',
 'gave',
 'his',
 'i',
 'c',
 '[NRIC_START]',
 's0891719d',
 '[NRIC_END]',
 'to',
 'the',
 'bank',
 'teller',
 'for',
 'the',
 'new',
 'account',
 '.']

Current issues:

- Works very well for numbers.
- Need to convert PIIs in the reference to their spoken-word form when they contain digits, and treat each character as a separate token (e.g., S1234567B -> S one two three four five six seven B; applies to NRIC, CAR_PLATE, BANK_ACCOUNT).
- Strings that are written as one unit but spoken letter by letter need to be split into separate tokens (e.g., sg -> s g).

## [Archived]

### Heuristics Description (Greedy-based Alignment):

Forced-alignment heuristics are necessary because:
- The **Vosk model** may tokenize words differently compared to the reference transcript, especially for structured data like emails, phone numbers, and other PIIs (Personally Identifiable Information).
- **PII structures vary greatly** (e.g., "rendy.tan@hotmail.com" vs "rendy . tan at hotmail dot com"), and simple word-to-word alignment would fail.
- To achieve robust alignment and accurate timestamp mapping, **manual control** over token splitting and flattening is required based on the entity type.

These heuristics ensure that:
- Common free-text is aligned naturally,
- Structured PII is broken down appropriately for correct timestamp boundary matching.

#### 1. Outside of Entity Boundaries (General Case)

- Tokens are aligned **as-is** with Vosk words.
- No special splitting is done.
- Regular cleaning (punctuation removal except for `"."`) is applied when matching.
- **Example**:
  - Input Transcript: `"reach me at"`
  - Tokens: `["reach", "me", "at"]`
  - Aligned directly without splitting.

#### 2. Inside Entity Boundaries (e.g., [EMAIL_START], [PHONE_START], etc.)

- Special handling is done based on the entity type.

##### (A) EMAIL Entity (`current_entity == 'EMAIL'`)

- Split tokens based on `"."` and `"@"` separators.
- Words like `"at"` are **left intact**.
- **Example**:
  - Input: `"rendy.tan@hotmail.com"`
  - Split into: `["rendy", ".", "tan", "@", "hotmail", ".", "com"]`

##### (B) Other Entity Types (`CREDIT_CARD`, `CAR_PLATE`, `BANK_ACCOUNT`, `NRIC`, `PHONE`, `PASSPORT_NUM`)

- **If the token is a spelled-out number** (checked against a dictionary):
  - **Do not split**; keep the word as a single token.
  - **Example**:
    - Input: `"eight"`
    - Output: `["eight"]`

- **If the token is pure digits** (e.g., numbers like `"98005331"`):
  - **Split** into **individual characters**.
  - **Example**:
    - Input: `"98005331"`
    - Output: `["9", "8", "0", "0", "5", "3", "3", "1"]`

- **If the token is a mix of letters and numbers** (e.g., `"AB1234X"`):
  - **Split** into **individual characters** as well.
  - **Example**:
    - Input: `"AB1234X"`
    - Output: `["A", "B", "1", "2", "3", "4", "X"]`

#### 3. When Flattening Entity Tokens (Before Final Alignment)

- After all splitting:
  - **Spelled-out numbers** (like `"eight"`) and **email parts** (like `"hotmail"`) are **kept whole**.
  - Other tokens (numbers, single characters) appear **character-by-character**.

- **Flattening Examples**:
  - Tokens: `["eight", "5", "0"]`
    - Final output: `"eight 5 0"`
  
  - Tokens: `["hotmail", ".", "com"]`
    - Final output: `"hotmail . com"`

In [None]:
import json
import os
import tempfile
from pydub import AudioSegment
from vosk import Model, KaldiRecognizer
model_path = "vosk-model-en-us-0.42-gigaspeech" #model_new"
model = Model(model_path)
def align_audio_with_text(audio_path, transcription):
    """
    Run Vosk recognition on a WAV file and return word-level timestamps.

    The decoded PCM samples are streamed to the recognizer directly from
    memory in 4000-byte chunks, so no temporary file is written and no WAV
    header has to be skipped.

    Args:
        audio_path: Path to the WAV file to decode.
        transcription: Reference text for the file. It is accepted so callers
            keep working, but it is not used: the returned words are whatever
            Vosk recognises, not an alignment of this text.

    Returns:
        list[dict]: One ``{"start", "end", "word"}`` dict per recognised word,
        in the order Vosk reported them. Empty if nothing was recognised.
    """
    audio = AudioSegment.from_wav(audio_path)
    # NOTE(review): samples are passed through unconverted; Vosk presumably
    # expects mono 16-bit PCM — confirm for the audio files used here.
    recognizer = KaldiRecognizer(model, audio.frame_rate)
    recognizer.SetWords(True)  # request per-word timing in the results

    pcm = audio.raw_data
    results = []
    for offset in range(0, len(pcm), 4000):
        # AcceptWaveform returning true means a result is ready to be
        # collected; the remaining audio is flushed by FinalResult below.
        if recognizer.AcceptWaveform(pcm[offset:offset + 4000]):
            results.append(json.loads(recognizer.Result()))
    results.append(json.loads(recognizer.FinalResult()))

    # Results without a 'result' key carry no word timings and are skipped.
    return [
        {"start": word["start"], "end": word["end"], "word": word["word"]}
        for result in results
        for word in result.get('result', ())
    ]

# Input audio folder, input transcription file (JSON Lines) and output file.
audio_dir = "/content/drive/MyDrive/Share/Research/speechNER/finetune/Audio_Files_for_testing"
transcription_file = "/content/drive/MyDrive/Share/Research/speechNER/Alignement_data/Text_with_ids_temp_preprocessed.jsonl"
output_file = "/content/drive/MyDrive/Share/Research/speechNER/Alignement_data/tr_aligned_data_new.jsonl"

# Each line of the transcription file is one JSON object with 'id' and 'text'.
with open(transcription_file, 'r') as f:
    transcriptions = list(map(json.loads, f))

# Run the recogniser on the audio file that matches each transcription ID.
aligned_data = []
for item in transcriptions:
    audio_path = f"{audio_dir}/id{item['id']}.wav"
    aligned_transcription = align_audio_with_text(audio_path, item['text'])
    aligned_data.append(dict(id=item['id'], text=item['text'], align=aligned_transcription))

# Write the results back out, again one JSON object per line.
with open(output_file, 'w') as f:
    for item in aligned_data:
        f.write(f"{json.dumps(item)}\n")

Old vosk alignment function with greedy decoding (Archived)

In [682]:
import re
import string

# Number words mapped to their digit strings.
# align_transcript_with_vosk only tests membership in this mapping, to decide
# whether a token inside an entity is kept whole instead of being split into
# characters; the digit-string values are not read by the code in this cell.
spelled_out_numbers = {
    'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5',
    'six': '6', 'seven': '7', 'eight': '8', 'nine': '9', 'ten': '10',
    'eleven': '11', 'twelve': '12', 'thirteen': '13', 'fourteen': '14',
    'fifteen': '15', 'sixteen': '16', 'seventeen': '17', 'eighteen': '18',
    'nineteen': '19', 'twenty': '20', 'thirty': '30', 'forty': '40', 'fifty': '50',
    'sixty': '60', 'seventy': '70', 'eighty': '80', 'ninety': '90',
    'hundred': '100', 'thousand': '1000'
}

# Translation table that deletes every ASCII punctuation character except '.'.
# Built once here rather than on every clean_token call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation.replace('.', ''))


def clean_token(token):
    """
    Lowercase a token and strip all ASCII punctuation except '.'.

    Args:
        token: The text to normalise.

    Returns:
        str: The lowercased token with every character of
        ``string.punctuation`` other than '.' removed. May be empty when the
        token consisted only of removed punctuation.
    """
    return token.lower().translate(_PUNCTUATION_TABLE)

def process_entity_tokens(entity_tokens, char_tokens):
    """
    Append to ``entity_tokens`` each token of ``char_tokens`` it does not hold yet.

    ``entity_tokens`` is modified in place and nothing is returned. A token
    that is already present (including one added earlier in the same call)
    is skipped, so repeated values are stored only once.
    """
    for candidate in char_tokens:
        if candidate in entity_tokens:
            continue
        entity_tokens.append(candidate)

def align_transcript_with_vosk(vosk_words, transcript):
    """
    Greedily align a tagged reference transcript with Vosk word timestamps.

    Reference tokens are consumed left to right and each one is paired with
    the next unused Vosk word purely by position; the word texts are never
    compared. Tokens between ``[XXX_START]`` and ``[XXX_END]`` are collapsed
    into one output entry whose times come from the Vosk words consumed while
    inside that entity.

    Args:
        vosk_words: Sequence of dicts providing 'word', 'start' and 'end',
            in spoken order.
        transcript: Reference text, optionally containing
            ``[XXX_START]`` / ``[XXX_END]`` entity markers.

    Returns:
        list[dict]: Entries with 'word', 'start' and 'end' keys. A token
        outside an entity keeps its original text. Each entity becomes a single
        entry whose 'word' is ``"[XXX_START] <flattened tokens> [XXX_END]"``.

    NOTE(review): tokens outside an entity that remain once the Vosk words
    are exhausted are left out of the result.
    """
    # A bracketed marker is one token; everything else is split on whitespace.
    tokens = re.findall(r'\[.*?\]|\S+', transcript)
    aligned = []
    vosk_idx = 0  # next unused position in vosk_words
    current_entity = None  # entity label while between START and END markers
    entity_tokens = []
    entity_start_time = None
    entity_end_time = None

    # Entity types whose tokens are broken into single characters
    # (spelled-out number words excepted).
    entity_types_to_split = ['CREDIT_CARD', 'CAR_PLATE', 'BANK_ACCOUNT', 'NRIC', 'PHONE', 'PASSPORT_NUM']
    
    # Special case for emails: split on the dots (.) and @ but leave 'at' as-is
    def split_email(token):
        # A token containing '.' is split on '.' only. A piece that still
        # contains '@' is split later, when the main loop reaches that piece,
        # because the pieces are written back into `tokens`.
        if '.' in token:
            parts = re.split(r'([.])', token)
            # Drop the empty strings re.split produces around separators
            parts = [p for p in parts if p != '']
            return parts
        # No '.' left in the token: split on '@', keeping it as its own piece
        elif '@' in token:
            parts = re.split(r'([@.])', token)
            parts = [p for p in parts if p != '']
            return parts
        return [token]

    # Manual index instead of a for-loop: `tokens` is rewritten in place below,
    # so its length can change while it is being walked.
    i = 0

    while i < len(tokens):
        token = tokens[i]

        if token.endswith('_START]'):
            # Start a new entity: remember its label and reset the accumulators
            current_entity = token.replace('[', '').replace(']', '').replace('_START', '')
            entity_tokens = []
            entity_start_time = None
            entity_end_time = None
            i += 1
            continue

        if token.endswith('_END]'):
            # End the current entity (an END marker with no open entity is
            # skipped without emitting anything)
            if current_entity:
                # Flatten the collected tokens into one space-separated string
                flattened_entity = []
                
                for t in entity_tokens:
                    # Lowercase and strip punctuation other than '.'
                    clean_token_with_no_symbols = clean_token(t)
                    
                    # Number words and every EMAIL token are kept whole
                    if clean_token_with_no_symbols.lower() in spelled_out_numbers or current_entity == 'EMAIL':
                        flattened_entity.append(clean_token_with_no_symbols)
                    else:
                        # Anything else is emitted character by character
                        flattened_entity.extend(list(clean_token_with_no_symbols))
                    
                # One output entry for the whole entity, spanning the first
                # and last Vosk word consumed inside it
                aligned.append({
                    "word": f"[{current_entity}_START] {' '.join(flattened_entity)} [{current_entity}_END]",
                    "start": entity_start_time,
                    "end": entity_end_time
                })
            current_entity = None
            entity_tokens = []
            entity_start_time = None
            entity_end_time = None
            i += 1
            continue

        # NOTE(review): clean_ref_word is assigned but never read.
        clean_ref_word = clean_token(token)

        if current_entity:
            # Inside an entity: each token visited here consumes one Vosk word
            if vosk_idx < len(vosk_words):
                # NOTE(review): vosk_word is assigned but never read.
                vosk_word = vosk_words[vosk_idx]['word']
                # The start time is taken while nothing has been collected yet;
                # the end time tracks the most recently consumed Vosk word.
                # NOTE(review): for entity types handled by neither branch
                # below, entity_tokens stays empty, so the start time is
                # overwritten on every token and the entity text ends up empty.
                if not entity_tokens:
                    entity_start_time = vosk_words[vosk_idx]['start']
                entity_end_time = vosk_words[vosk_idx]['end']

                # Special handling for emails: split valid email format
                if current_entity == 'EMAIL':
                    char_tokens = split_email(token)  # Split email into parts
                    # Replace the current token with its parts; the loop then
                    # visits parts[1:] as tokens of their own
                    tokens[i:i+1] = char_tokens

                    # Parts are revisited by the loop, so only values not yet
                    # collected are added.
                    # NOTE(review): this also drops genuinely repeated parts
                    # (e.g. a second '.').
                    process_entity_tokens(entity_tokens, char_tokens)
                    print(f"Entity tokens after email split: {entity_tokens}")
                elif current_entity in entity_types_to_split:
                    clean_token_with_no_symbols = clean_token(token)  # Clean token

                    # Check if the token is a spelled-out number
                    if clean_token_with_no_symbols.lower() in spelled_out_numbers.keys():
                        # If it's a spelled-out number, don't split it
                        char_tokens = [clean_token_with_no_symbols]  # Keep the token as is
                    else:
                        # If it's not a spelled-out number, split it into characters
                        char_tokens = list(clean_token_with_no_symbols)

                    # Replace the current token with its characters.
                    # NOTE(review): when the cleaned token is empty this
                    # removes the token, and the `i += 1` below then skips the
                    # token that moved into its place.
                    tokens[i:i+1] = char_tokens

                    # Characters are revisited by the loop, so only values not
                    # yet collected are added.
                    # NOTE(review): repeated characters (e.g. the second '0'
                    # in "9800") are therefore dropped.
                    process_entity_tokens(entity_tokens, char_tokens)

                    print(f"Entity tokens after split: {entity_tokens}")

                vosk_idx += 1
            else:
                # No more Vosk words left (shouldn't happen usually): keep the
                # token, but no timestamps are updated
                entity_tokens.append(token)

        else:
            # Outside entity: pair the token with the next Vosk word by position.
            # The loop body always ends in `break`, so it runs at most once.
            while vosk_idx < len(vosk_words):
                # NOTE(review): clean_vosk_word is assigned but never read.
                clean_vosk_word = clean_token(vosk_words[vosk_idx]['word'])
                aligned.append({
                    "word": token,
                    "start": vosk_words[vosk_idx]['start'],
                    "end": vosk_words[vosk_idx]['end']
                })
                vosk_idx += 1
                break

        i += 1  # Move to the next token

    return aligned