# Forced Alignment with Vosk

Originally, we evaluated the tagger's F1-score by simply using indices, which may be too penalising. In order to properly evaluate the performance of our PII identification pipeline, we would need to perform *forced alignment*, which aligns token-level transcripts into their corresponding timestamps in the audio files.

For this, we shall be using *Vosk*, a toolkit which offers forced-alignment models. 

## Change Directory to Root Project

In [1]:
import os 

os.chdir('..')

In [489]:
os.getcwd()

'/Users/farhan/Desktop/Research'

## Helper functions

Helper function for sorting the audio file names by ID

In [490]:
def retrieve_key(file: str) -> int:
    try:
        # 3 digit
        key = int(file[2:5])
    except ValueError:
        # 1 digit
        if file[3] == '.':
            key = int(file[2])
        else:
            key = int(file[2:4])
    return key

Helper function to embed the entities within the transcripts (given a dataframe)

In [491]:
import pandas as pd

def insert_entity_tags_to_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Inserts entity boundary tags into the 'text' column based on 'entities',
    and adds a new column 'tagged_text' with the result.

    Args:
        df (pd.DataFrame): Must contain 'text' and 'entities' columns.

    Returns:
        pd.DataFrame: Same DataFrame with an additional 'tagged_text' column.
    """
    def insert_tags(row):
        text = row["text"]
        entities = row["entities"]

        # Sort entities in reverse order of start index to avoid offset issues
        entities_sorted = sorted(entities, key=lambda x: x[0], reverse=True)

        for start, end, label in entities_sorted:
            tag_start = f"[{label}_START]"
            tag_end = f"[{label}_END]"
            text = text[:end] + tag_end + text[end:]
            text = text[:start] + tag_start + text[start:]
        
        return text

    df = df.copy()
    df["tagged_text"] = df.apply(insert_tags, axis=1)
    return df

Helper function to unify whitespaces

In [492]:
import re

def unify_whitespace(s):
    """
    - Unify multiple spaces into one space across the text.
    - Ensure exactly one space after [XXX_START] and before [XXX_END].
    - Ensure one space after [XXX_END] if missing.
    - Ensure one space before [XXX_START] if missing.
    """
    if not isinstance(s, str):
        return s

    # Step 1: unify all whitespace
    s = re.sub(r'\s+', ' ', s.strip())

    # Step 2: ensure space after [XXX_START] and before [XXX_END]
    s = re.sub(r'(\[\w+_START\])(\S)', r'\1 \2', s)  # Add space after [START] if missing
    s = re.sub(r'(\S)(\[\w+_END\])', r'\1 \2', s)    # Add space before [END] if missing

    # Step 3: ensure space after [XXX_END] if missing
    s = re.sub(r'(\[\w+_END\])(\S)', r'\1 \2', s)

    # Step 4: ensure space before [XXX_START] if missing
    s = re.sub(r'(\S)(\[\w+_START\])', r'\1 \2', s)  # Add space before [START] if missing

    return s

Helper function to run Vosk forced-alignment model

In [493]:
from vosk import Model, KaldiRecognizer
import soundfile as sf
import json

def run_vosk(audio_path: str, vosk_model: Model) -> list:
    # Load audio
    audio_data, sample_rate = sf.read(audio_path)
    
    # Prepare recognizer
    rec = KaldiRecognizer(vosk_model, sample_rate)
    rec.SetWords(True)
    
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)  # Stereo to mono

    pcm_data = (audio_data * 32767).astype("int16").tobytes()

    rec.AcceptWaveform(pcm_data)
    result = json.loads(rec.FinalResult())
    
    return result

Helper function to align reference Whisper-generated transcript to forced-alignment model

In [682]:
import re
import string

spelled_out_numbers = {
    'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5',
    'six': '6', 'seven': '7', 'eight': '8', 'nine': '9', 'ten': '10',
    'eleven': '11', 'twelve': '12', 'thirteen': '13', 'fourteen': '14',
    'fifteen': '15', 'sixteen': '16', 'seventeen': '17', 'eighteen': '18',
    'nineteen': '19', 'twenty': '20', 'thirty': '30', 'forty': '40', 'fifty': '50',
    'sixty': '60', 'seventy': '70', 'eighty': '80', 'ninety': '90',
    'hundred': '100', 'thousand': '1000'
}

def clean_token(token):
    """Remove punctuation except '.' and lowercase."""
    allowed = '.'
    punctuation_to_remove = ''.join(c for c in string.punctuation if c not in allowed)
    return token.lower().translate(str.maketrans('', '', punctuation_to_remove))

def process_entity_tokens(entity_tokens, char_tokens):
    """Prevent duplicates and extend entity tokens list."""
    for token in char_tokens:
        if token not in entity_tokens:
            entity_tokens.append(token)

def align_transcript_with_vosk(vosk_words, transcript):
    """
    Aligns a reference transcript with Vosk timestamps.
    Handles [XXX_START]... [XXX_END] entities properly.
    """
    tokens = re.findall(r'\[.*?\]|\S+', transcript)  # Tokenize the transcript
    aligned = []
    vosk_idx = 0
    current_entity = None
    entity_tokens = []
    entity_start_time = None
    entity_end_time = None

    entity_types_to_split = ['CREDIT_CARD', 'CAR_PLATE', 'BANK_ACCOUNT', 'NRIC', 'PHONE', 'PASSPORT_NUM']
    
    # Special case for emails: split on the dots (.) and @ but leave 'at' as-is
    def split_email(token):
        # Case 1: email with spaces (no @)
        if '.' in token:
            parts = re.split(r'([.])', token)
            parts = [p for p in parts if p != '']
            # print(parts)
            return parts  # Remove any empty strings
        # Case 2: email with @
        elif '@' in token:
            parts = re.split(r'([@.])', token)
            parts = [p for p in parts if p != '']
            # print(parts)
            return parts
        return [token]

    i = 0  # Index to keep track of the current token in the list

    while i < len(tokens):
        token = tokens[i]

        # print(f"Current token: {token}")

        if token.endswith('_START]'):
            # Start a new entity
            current_entity = token.replace('[', '').replace(']', '').replace('_START', '')
            entity_tokens = []
            entity_start_time = None
            entity_end_time = None
            i += 1
            continue

        if token.endswith('_END]'):
            # End the current entity
            if current_entity:
                # Flatten the entity and align with timestamps
                flattened_entity = []
                # print(f"New entity tokens: {entity_tokens}")
                
                for t in entity_tokens:
                    # Clean the token
                    clean_token_with_no_symbols = clean_token(t)
                    
                    # Check if the token is a spelled-out number
                    if clean_token_with_no_symbols.lower() in spelled_out_numbers or current_entity == 'EMAIL':
                        # If it's a spelled-out number, don't split it into characters
                        flattened_entity.append(clean_token_with_no_symbols)
                    else:
                        # Otherwise, split the token into characters
                        flattened_entity.extend(list(clean_token_with_no_symbols))
                    
                # Join the characters and align timestamps
                aligned.append({
                    "word": f"[{current_entity}_START] {' '.join(flattened_entity)} [{current_entity}_END]",
                    "start": entity_start_time,
                    "end": entity_end_time
                })
            current_entity = None
            entity_tokens = []
            entity_start_time = None
            entity_end_time = None
            i += 1
            continue

        clean_ref_word = clean_token(token)

        if current_entity:
            # Inside an entity, split the token into characters and modify the tokens list
            if vosk_idx < len(vosk_words):
                vosk_word = vosk_words[vosk_idx]['word']
                if not entity_tokens:
                    entity_start_time = vosk_words[vosk_idx]['start']
                entity_end_time = vosk_words[vosk_idx]['end']

                # Special handling for emails: split valid email format
                if current_entity == 'EMAIL':
                    char_tokens = split_email(token)  # Split email into parts
                    tokens[i:i+1] = char_tokens  # Replace the current token with the split characters

                    # Prevent duplicate tokens and extend entity tokens list
                    process_entity_tokens(entity_tokens, char_tokens)
                    print(f"Entity tokens after email split: {entity_tokens}")
                # Inside the loop where you handle the token splitting:
                elif current_entity in entity_types_to_split:
                    clean_token_with_no_symbols = clean_token(token)  # Clean token

                    # Check if the token is a spelled-out number
                    if clean_token_with_no_symbols.lower() in spelled_out_numbers.keys():
                        # If it's a spelled-out number, don't split it
                        char_tokens = [clean_token_with_no_symbols]  # Keep the token as is
                    else:
                        # If it's not a spelled-out number, split it into characters
                        char_tokens = list(clean_token_with_no_symbols)

                    # Modify the tokens list in place by extending with the character tokens
                    tokens[i:i+1] = char_tokens

                    # Prevent duplicates and extend entity tokens list
                    process_entity_tokens(entity_tokens, char_tokens)

                    print(f"Entity tokens after split: {entity_tokens}")

                vosk_idx += 1
            else:
                # No more Vosk words left (shouldn't happen usually)
                entity_tokens.append(token)

        else:
            # Outside entity, normal matching
            while vosk_idx < len(vosk_words):
                clean_vosk_word = clean_token(vosk_words[vosk_idx]['word'])
                aligned.append({
                    "word": token,
                    "start": vosk_words[vosk_idx]['start'],
                    "end": vosk_words[vosk_idx]['end']
                })
                vosk_idx += 1
                break

        i += 1  # Move to the next token
        # print(tokens)

    return aligned

## Load the dataset

In [23]:
import os

os.chdir('..')

Load batch 1 (150 samples)

In [283]:
import pandas as pd

batch_one_ref = pd.read_json('data/true_data_150.jsonl', lines=True)
batch_one_ref.head()

Unnamed: 0,text
0,"The day before [DATE_START] yesterday, [DATE_E..."
1,um my date of birth is uh second [DATE_START] ...
2,"she handed over a crumpled piece of paper, the..."
3,aglio olio and err uh [CARDINAL_START] three t...
4,[PERSON_START] Hong's [PERSON_END] email is [E...


In [284]:
import os

batch_one_files = sorted(os.listdir("data/Audio_Files_for_testing"), key=retrieve_key)
batch_one_files  = [f'data/Audio_Files_for_testing/{file}' for file in batch_one_files]

import pandas as pd

batch_one_df = pd.DataFrame(data=batch_one_files, columns=['file_name'])
batch_one_df.head()

Unnamed: 0,file_name
0,data/Audio_Files_for_testing/id1.wav
1,data/Audio_Files_for_testing/id2.wav
2,data/Audio_Files_for_testing/id3.wav
3,data/Audio_Files_for_testing/id4.wav
4,data/Audio_Files_for_testing/id5.wav


Load batch 2 (350 samples)

In [285]:
import pandas as pd

batch_two_ref = pd.read_json('data/newtest_151_500_updated_TTS.jsonl', lines=True)
batch_two_ref.head()

Unnamed: 0,id,text,entities
0,151,"456 729103 8 is Kaifu Lee's DBS bank account, ...","[[0, 12, BANK_ACCOUNT]]"
1,152,"Jacob's OCBC bank account is 192-58462-3, and ...","[[29, 40, BANK_ACCOUNT]]"
2,153,"788 305194 2 is Zheng Qi's POSB bank account, ...","[[0, 12, BANK_ACCOUNT]]"
3,154,"Geetha's UOB bank account is 341-92741-9, and ...","[[29, 40, BANK_ACCOUNT]]"
4,155,"623 481057 6 is Ah Seng's Maybank account, and...","[[0, 12, BANK_ACCOUNT]]"


In [286]:
import os

batch_two_files = sorted(os.listdir("data/newtest_151_500_updated_TTS"), key=retrieve_key)
batch_two_files  = [f'data/newtest_151_500_updated_TTS/{file}' for file in batch_two_files]

import pandas as pd

batch_two_df = pd.DataFrame(data=batch_two_files, columns=['file_name'])
batch_two_df.head()

Unnamed: 0,file_name
0,data/newtest_151_500_updated_TTS/id151.wav
1,data/newtest_151_500_updated_TTS/id152.wav
2,data/newtest_151_500_updated_TTS/id153.wav
3,data/newtest_151_500_updated_TTS/id154.wav
4,data/newtest_151_500_updated_TTS/id155.wav


As you can see, for batch 2, the entities enclosed are not in the reference transcripts. As we are given the indices, we can write a helper function to include them within the transcripts. 

Preprocess transcripts

In [287]:
batch_two_ref = insert_entity_tags_to_df(batch_two_ref)

In [288]:
batch_two_ref.head()

Unnamed: 0,id,text,entities,tagged_text
0,151,"456 729103 8 is Kaifu Lee's DBS bank account, ...","[[0, 12, BANK_ACCOUNT]]",[BANK_ACCOUNT_START]456 729103 8[BANK_ACCOUNT_...
1,152,"Jacob's OCBC bank account is 192-58462-3, and ...","[[29, 40, BANK_ACCOUNT]]",Jacob's OCBC bank account is [BANK_ACCOUNT_STA...
2,153,"788 305194 2 is Zheng Qi's POSB bank account, ...","[[0, 12, BANK_ACCOUNT]]",[BANK_ACCOUNT_START]788 305194 2[BANK_ACCOUNT_...
3,154,"Geetha's UOB bank account is 341-92741-9, and ...","[[29, 40, BANK_ACCOUNT]]",Geetha's UOB bank account is [BANK_ACCOUNT_STA...
4,155,"623 481057 6 is Ah Seng's Maybank account, and...","[[0, 12, BANK_ACCOUNT]]",[BANK_ACCOUNT_START]623 481057 6[BANK_ACCOUNT_...


In [289]:
batch_two_ref.tail()

Unnamed: 0,id,text,entities,tagged_text
346,496,Patrick Loh boasting about his email patrick.l...,"[[37, 60, EMAIL]]",Patrick Loh boasting about his email [EMAIL_ST...
347,497,Jasmine Yeo got sian when someone spell her em...,"[[50, 69, EMAIL]]",Jasmine Yeo got sian when someone spell her em...
348,498,Bobby Tan write his email bobby.tan@gmail.com ...,"[[26, 45, EMAIL]]",Bobby Tan write his email [EMAIL_START]bobby.t...
349,499,Kamala Singh telling the IT guy her email kama...,"[[42, 60, EMAIL]]",Kamala Singh telling the IT guy her email [EMA...
350,500,Raymond Koh say his email raymond.k@singnet.co...,"[[26, 50, EMAIL]]",Raymond Koh say his email [EMAIL_START]raymond...


In [290]:
batch_two_ref = batch_two_ref.drop(columns=['entities', 'text', 'id'], axis=1)
batch_two_ref.rename(columns={'tagged_text': 'text'}, inplace=True)
batch_two_ref.head()

Unnamed: 0,text
0,[BANK_ACCOUNT_START]456 729103 8[BANK_ACCOUNT_...
1,Jacob's OCBC bank account is [BANK_ACCOUNT_STA...
2,[BANK_ACCOUNT_START]788 305194 2[BANK_ACCOUNT_...
3,Geetha's UOB bank account is [BANK_ACCOUNT_STA...
4,[BANK_ACCOUNT_START]623 481057 6[BANK_ACCOUNT_...


Combine the datasets [Run this when dataset not combined yet]

In [291]:
test_set_ref = pd.concat([batch_one_ref, batch_two_ref], ignore_index=True)

In [292]:
test_set_ref.head()

Unnamed: 0,text
0,"The day before [DATE_START] yesterday, [DATE_E..."
1,um my date of birth is uh second [DATE_START] ...
2,"she handed over a crumpled piece of paper, the..."
3,aglio olio and err uh [CARDINAL_START] three t...
4,[PERSON_START] Hong's [PERSON_END] email is [E...


In [293]:
test_set_ref.tail()

Unnamed: 0,text
496,Patrick Loh boasting about his email [EMAIL_ST...
497,Jasmine Yeo got sian when someone spell her em...
498,Bobby Tan write his email [EMAIL_START]bobby.t...
499,Kamala Singh telling the IT guy her email [EMA...
500,Raymond Koh say his email [EMAIL_START]raymond...


In [294]:
test_set_ref['text'] = test_set_ref['text'].apply(unify_whitespace)

In [295]:
test_set_ref.to_json('data/test_set_ref_all.jsonl', lines=True, orient='records')

Load the combined processed dataset [When already combined]

In [495]:
test_set_ref = pd.read_json('data/test_set_ref_all.jsonl', lines=True)

In [496]:
test_set_ref.head()

Unnamed: 0,text
0,"The day before [DATE_START] yesterday, [DATE_E..."
1,um my date of birth is uh second [DATE_START] ...
2,"she handed over a crumpled piece of paper, the..."
3,aglio olio and err uh [CARDINAL_START] three t...
4,[PERSON_START] Hong's [PERSON_END] email is [E...


In [497]:
test_set_ref.tail()

Unnamed: 0,text
496,Patrick Loh boasting about his email [EMAIL_ST...
497,Jasmine Yeo got sian when someone spell her em...
498,Bobby Tan write his email [EMAIL_START] bobby....
499,Kamala Singh telling the IT guy her email [EMA...
500,Raymond Koh say his email [EMAIL_START] raymon...


## Load the model (Vosk)

Unfortunately, there are no current models that are tuned for Singaporean English (Singlish). As such, we shall use the `vosk-model-en-us-0.42-gigaspeech` model.

In [498]:
from vosk import Model, KaldiRecognizer
import soundfile as sf
import json

# Load model (replace path with your model directory)
model_path = "models/vosk-model-en-us-0.42-gigaspeech"
model = Model(model_path)

LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=13 max-active=7000 lattice-beam=8
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from models/vosk-model-en-us-0.42-gigaspeech/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:279) Loading HCLG from models/vosk-model-en-us-0.42-gigaspeech/graph/HCLG.fst
LOG (VoskAPI:ReadDataFiles():model.cc:294) Loading words from models/vosk-model-en-us-0.42-gigaspeech/graph/words.txt
LOG (VoskAPI:ReadDataFiles():model.cc:303) Loading winfo models/vosk-model-en-us-0.42-gigaspeech/graph/phones/word_bound

## Load and Read Audio (With just one sample)

So here, several things are happening:

1. We create a `KaldiRecognizer` instance and set `.SetWords()` to `True`, which means that we will get word-level timestamps.
2. The `.AcceptWaveform()` method is used to process the waveform
3. The `.FinalResult()` method is finally called to retrieve the word-level timestamps (transcribed from the Vosk Model - although with some innacurracies, as Vosk is not a full-fledged ASR model)

Test on one audio sample (Simple example with Name and Email PIIs)

In [499]:
audio_path = "data/Audio_Files_for_testing/id1.wav"
audio_data, sample_rate = sf.read(audio_path)

In [500]:
sample = run_vosk(audio_path, model)

In [501]:
sample

{'result': [{'conf': 1.0, 'end': 0.69, 'start': 0.51, 'word': 'the'},
  {'conf': 1.0, 'end': 0.87, 'start': 0.69, 'word': 'day'},
  {'conf': 1.0, 'end': 1.2, 'start': 0.87, 'word': 'before'},
  {'conf': 1.0, 'end': 1.68, 'start': 1.2, 'word': 'yesterday'},
  {'conf': 0.558658, 'end': 2.04, 'start': 1.77, 'word': 'ram'},
  {'conf': 1.0, 'end': 2.46, 'start': 2.04, 'word': 'received'},
  {'conf': 1.0, 'end': 2.88, 'start': 2.49, 'word': 'another'},
  {'conf': 1.0, 'end': 3.27, 'start': 2.94, 'word': 'email'},
  {'conf': 1.0, 'end': 3.6, 'start': 3.27, 'word': 'from'},
  {'conf': 0.974828, 'end': 4.111625, 'start': 3.69, 'word': 'ahri'},
  {'conf': 0.54586, 'end': 4.236342, 'start': 4.111625, 'word': 'and'},
  {'conf': 0.276569, 'end': 4.53, 'start': 4.236342, 'word': 'envoy'},
  {'conf': 1.0, 'end': 4.74, 'start': 4.546376, 'word': 'it'},
  {'conf': 1.0, 'end': 5.16, 'start': 4.95, 'word': 'club'},
  {'conf': 1.0, 'end': 5.43, 'start': 5.16, 'word': 'dot'},
  {'conf': 0.605647, 'end': 5.

Reference Transcript

In [502]:
test_set_ref['text'].iloc[0]

'The day before [DATE_START] yesterday, [DATE_END] [PERSON_START] Ram [PERSON_END] received another email from [EMAIL_START] r e m y at outlook dot sg [EMAIL_END]'

Align method usage

In [562]:
align_transcript_with_vosk(sample['result'], test_set_ref['text'].iloc[0])

['r']
['e']
['m']
['y']
['at']
['outlook']
['dot']
['sg']


[{'word': 'The', 'start': 0.51, 'end': 0.69},
 {'word': 'day', 'start': 0.69, 'end': 0.87},
 {'word': 'before', 'start': 0.87, 'end': 1.2},
 {'word': '[DATE_START] y e s t e r d a y [DATE_END]',
  'start': 1.2,
  'end': 1.68},
 {'word': '[PERSON_START] r a m [PERSON_END]', 'start': 1.77, 'end': 2.04},
 {'word': 'received', 'start': 2.04, 'end': 2.46},
 {'word': 'another', 'start': 2.49, 'end': 2.88},
 {'word': 'email', 'start': 2.94, 'end': 3.27},
 {'word': 'from', 'start': 3.27, 'end': 3.6},
 {'word': '[EMAIL_START] r e m y a t o u t l o o k d o t s g [EMAIL_END]',
  'start': 3.69,
  'end': 5.94}]

Test on one audio sample (Numerical PIIs)

In [684]:
audio_path = "data/Audio_Files_for_testing/id52.wav"
audio_data, sample_rate = sf.read(audio_path)

In [685]:
sample2 = run_vosk(audio_path, model)

In [686]:
sample2

{'result': [{'conf': 0.633794, 'end': 1.17, 'start': 0.66, 'word': 'ok'},
  {'conf': 0.292178, 'end': 1.8, 'start': 1.5, 'word': 'ah'},
  {'conf': 1.0, 'end': 2.55, 'start': 2.01, 'word': 'contact'},
  {'conf': 1.0, 'end': 3.0, 'start': 2.55, 'word': 'number'},
  {'conf': 1.0, 'end': 3.48, 'start': 3.12, 'word': 'just'},
  {'conf': 1.0, 'end': 3.75, 'start': 3.48, 'word': 'put'},
  {'conf': 1.0, 'end': 4.38, 'start': 3.93, 'word': 'nine'},
  {'conf': 1.0, 'end': 4.71, 'start': 4.41, 'word': 'eight'},
  {'conf': 1.0, 'end': 5.07, 'start': 4.71, 'word': 'four'},
  {'conf': 1.0, 'end': 5.43, 'start': 5.07, 'word': 'zero'},
  {'conf': 1.0, 'end': 5.85, 'start': 5.43, 'word': 'six'},
  {'conf': 0.995706, 'end': 6.18, 'start': 5.85, 'word': 'four'},
  {'conf': 1.0, 'end': 6.48, 'start': 6.18, 'word': 'one'},
  {'conf': 1.0, 'end': 6.75, 'start': 6.51, 'word': 'three'}],
 'text': 'ok ah contact number just put nine eight four zero six four one three'}

Reference

In [687]:
test_set_ref['text'].iloc[51]

'okay uh contact number just put [PHONE_START] Nine eight four zero six four one three [PHONE_END]'

Force align

credit_card, car_plate, bank_account, nric, phone, passport_num

In [688]:
align_transcript_with_vosk(sample2['result'], test_set_ref['text'].iloc[51])

Entity tokens after split: ['nine']
Entity tokens after split: ['nine', 'eight']
Entity tokens after split: ['nine', 'eight', 'four']
Entity tokens after split: ['nine', 'eight', 'four', 'zero']
Entity tokens after split: ['nine', 'eight', 'four', 'zero', 'six']
Entity tokens after split: ['nine', 'eight', 'four', 'zero', 'six']
Entity tokens after split: ['nine', 'eight', 'four', 'zero', 'six', 'one']
Entity tokens after split: ['nine', 'eight', 'four', 'zero', 'six', 'one', 'three']


[{'word': 'okay', 'start': 0.66, 'end': 1.17},
 {'word': 'uh', 'start': 1.5, 'end': 1.8},
 {'word': 'contact', 'start': 2.01, 'end': 2.55},
 {'word': 'number', 'start': 2.55, 'end': 3.0},
 {'word': 'just', 'start': 3.12, 'end': 3.48},
 {'word': 'put', 'start': 3.48, 'end': 3.75},
 {'word': '[PHONE_START] nine eight four zero six one three [PHONE_END]',
  'start': 3.93,
  'end': 6.75}]

### Heuristics Description:

Forced-alignment heuristics are necessary because:
- The **Vosk model** may tokenize words differently compared to the reference transcript, especially for structured data like emails, phone numbers, and other PIIs (Personally Identifiable Information).
- **PII structures vary greatly** (e.g., "rendy.tan@hotmail.com" vs "rendy . tan at hotmail dot com"), and simple word-to-word alignment would fail.
- To achieve robust alignment and accurate timestamp mapping, **manual control** over token splitting and flattening is required based on the entity type.

These heuristics ensure that:
- Common free-text is aligned naturally,
- Structured PII is broken down appropriately for correct timestamp boundary matching.

#### 1. Outside of Entity Boundaries (General Case)

- Tokens are aligned **as-is** with Vosk words.
- No special splitting is done.
- Regular cleaning (punctuation removal except for `"."`) is applied when matching.
- **Example**:
  - Input Transcript: `"reach me at"`
  - Tokens: `["reach", "me", "at"]`
  - Aligned directly without splitting.

#### 2. Inside Entity Boundaries (e.g., [EMAIL_START], [PHONE_START], etc.)

- Special handling is done based on the entity type.

##### (A) EMAIL Entity (`current_entity == 'EMAIL'`)

- Split tokens based on `"."` and `"@"` separators.
- Words like `"at"` are **left intact**.
- **Example**:
  - Input: `"rendy.tan@hotmail.com"`
  - Split into: `["rendy", ".", "tan", "@", "hotmail", ".", "com"]`

##### (B) Other Entity Types (`CREDIT_CARD`, `CAR_PLATE`, `BANK_ACCOUNT`, `NRIC`, `PHONE`, `PASSPORT_NUM`)

- **If the token is a spelled-out number** (checked against a dictionary):
  - **Do not split**; keep the word as a single token.
  - **Example**:
    - Input: `"eight"`
    - Output: `["eight"]`

- **If the token is pure digits** (e.g., numbers like `"98005331"`):
  - **Split** into **individual characters**.
  - **Example**:
    - Input: `"98005331"`
    - Output: `["9", "8", "0", "0", "5", "3", "3", "1"]`

- **If the token is a mix of letters and numbers** (e.g., `"AB1234X"`):
  - **Split** into **individual characters** as well.
  - **Example**:
    - Input: `"AB1234X"`
    - Output: `["A", "B", "1", "2", "3", "4", "X"]`

#### 3. When Flattening Entity Tokens (Before Final Alignment)

- After all splitting:
  - **Spelled-out numbers** (like `"eight"`) and **email parts** (like `"hotmail"`) are **kept whole**.
  - Other tokens (numbers, single characters) appear **character-by-character**.

- **Flattening Examples**:
  - Tokens: `["eight", "5", "0"]`
    - Final output: `"eight 5 0"`
  
  - Tokens: `["hotmail", ".", "com"]`
    - Final output: `"hotmail . com"`

## Load and Read Audio (All 500 samples)

In [None]:
# TODO

## [Archived]

In [None]:
import json
import os
import tempfile
from pydub import AudioSegment
from vosk import Model, KaldiRecognizer
model_path = "vosk-model-en-us-0.42-gigaspeech" #model_new"
model = Model(model_path)
def align_audio_with_text(audio_path, transcription):
    audio = AudioSegment.from_wav(audio_path)
    recognizer = KaldiRecognizer(model, audio.frame_rate)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_wav:
        temp_wav_path = temp_wav.name
        audio.export(temp_wav_path, format="wav")
    results = []
    try:
        with open(temp_wav_path, "rb") as wf:
            wf.read(44)
            recognizer.SetWords(True)
            while True:
                data = wf.read(4000)
                if len(data) == 0:
                    break
                if recognizer.AcceptWaveform(data):
                    results.append(json.loads(recognizer.Result()))
            results.append(json.loads(recognizer.FinalResult()))
    finally:
        if os.path.exists(temp_wav_path):
            os.remove(temp_wav_path)
    words = []
    for result in results:
        if 'result' in result:
            for word in result['result']:
                words.append(word)
    aligned_segments = []
    for word in words:
        aligned_segments.append({
            "start": word["start"],
            "end": word["end"],
            "word": word["word"]
        })
    return aligned_segments

audio_dir = "/content/drive/MyDrive/Share/Research/speechNER/finetune/Audio_Files_for_testing"
transcription_file = "/content/drive/MyDrive/Share/Research/speechNER/Alignement_data/Text_with_ids_temp_preprocessed.jsonl"
output_file = "/content/drive/MyDrive/Share/Research/speechNER/Alignement_data/tr_aligned_data_new.jsonl"
with open(transcription_file, 'r') as f:
    transcriptions = [json.loads(line) for line in f]
aligned_data = []
for item in transcriptions:
    audio_path = f"{audio_dir}/id{item['id']}.wav"
    aligned_transcription = align_audio_with_text(audio_path, item['text'])
    aligned_data.append({
        "id": item['id'],
        "text": item['text'],
        "align": aligned_transcription
    })
with open(output_file, 'w') as f:
    for item in aligned_data:
        f.write(json.dumps(item) + '\n')