# Forced Alignment with Vosk

Originally, we evaluated the tagger's F1-score by simply using indices, which may be too penalising. In order to properly evaluate the performance of our PII identification pipeline, we would need to perform *forced alignment*, which aligns token-level transcripts into their corresponding timestamps in the audio files.

For this, we shall be using *Vosk*, a toolkit which offers forced-alignment models. 

## Change Directory to Root Project

In [1]:
import os 

os.chdir('..')

In [2]:
os.getcwd()

'/Users/farhan/Desktop/Research'

## Helper functions

Helper function for sorting the audio file names by ID

In [3]:
def retrieve_key(file: str) -> int:
    try:
        # 3 digit
        key = int(file[2:5])
    except ValueError:
        # 1 digit
        if file[3] == '.':
            key = int(file[2])
        else:
            key = int(file[2:4])
    return key

Helper function to embed the entities within the transcripts (given a dataframe)

In [258]:
import pandas as pd

def insert_entity_tags_to_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Inserts entity boundary tags into the 'text' column based on 'entities',
    and adds a new column 'tagged_text' with the result.

    Args:
        df (pd.DataFrame): Must contain 'text' and 'entities' columns.

    Returns:
        pd.DataFrame: Same DataFrame with an additional 'tagged_text' column.
    """
    def insert_tags(row):
        text = row["text"]
        entities = row["entities"]

        # Sort entities in reverse order of start index to avoid offset issues
        entities_sorted = sorted(entities, key=lambda x: x[0], reverse=True)

        for start, end, label in entities_sorted:
            tag_start = f"[{label}_START]"
            tag_end = f"[{label}_END]"
            text = text[:end] + tag_end + text[end:]
            text = text[:start] + tag_start + text[start:]
        
        return text

    df = df.copy()
    df["tagged_text"] = df.apply(insert_tags, axis=1)
    return df

Helper function to unify whitespaces

In [259]:
import re

def unify_whitespace(s):
    """
    - Unify multiple spaces into one space across the text.
    - Ensure exactly one space after [XXX_START] and before [XXX_END].
    - Ensure one space after [XXX_END] if missing.
    - Ensure one space before [XXX_START] if missing.
    """
    if not isinstance(s, str):
        return s

    # Step 1: unify all whitespace
    s = re.sub(r'\s+', ' ', s.strip())

    # Step 2: ensure space after [XXX_START] and before [XXX_END]
    s = re.sub(r'(\[\w+_START\])(\S)', r'\1 \2', s)  # Add space after [START] if missing
    s = re.sub(r'(\S)(\[\w+_END\])', r'\1 \2', s)    # Add space before [END] if missing

    # Step 3: ensure space after [XXX_END] if missing
    s = re.sub(r'(\[\w+_END\])(\S)', r'\1 \2', s)

    # Step 4: ensure space before [XXX_START] if missing
    s = re.sub(r'(\S)(\[\w+_START\])', r'\1 \2', s)  # Add space before [START] if missing

    return s

Helper function to run Vosk forced-alignment model

In [33]:
from vosk import Model, KaldiRecognizer
import soundfile as sf
import json

def run_vosk(audio_path: str, vosk_model: Model) -> list:
    # Load audio
    audio_data, sample_rate = sf.read(audio_path)
    
    # Prepare recognizer
    rec = KaldiRecognizer(vosk_model, sample_rate)
    rec.SetWords(True)
    
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)  # Stereo to mono

    pcm_data = (audio_data * 32767).astype("int16").tobytes()

    rec.AcceptWaveform(pcm_data)
    result = json.loads(rec.FinalResult())
    
    return result

Helper function to align reference Whisper-generated transcript to forced-alignment model

In [462]:
import re
import string

def clean_token(token):
    """Remove punctuation and lowercase."""
    return token.lower().translate(str.maketrans('', '', string.punctuation))

def align_transcript_with_vosk(vosk_words, transcript):
    """
    Aligns a reference transcript with Vosk timestamps.
    Handles [XXX_START]... [XXX_END] entities properly.
    """
    tokens = re.findall(r'\[.*?\]|\S+', transcript)  # Tokenize the transcript
    aligned = []
    vosk_idx = 0
    current_entity = None
    entity_tokens = []
    entity_start_time = None
    entity_end_time = None

    entity_types_to_split = ['CREDIT_CARD', 'CAR_PLATE', 'BANK_ACCOUNT', 'NRIC', 'PHONE', 'PASSPORT_NUM']
    
    i = 0  # Index to keep track of the current token in the list

    while i < len(tokens):
        token = tokens[i]

        print(f"Current token: {token}")

        if token.endswith('_START]'):
            # Start a new entity
            current_entity = token.replace('[', '').replace(']', '').replace('_START', '')
            entity_tokens = []
            entity_start_time = None
            entity_end_time = None
            i += 1
            continue

        if token.endswith('_END]'):
            # End the current entity
            if current_entity:
                # Flatten the entity and align with timestamps
                flattened_entity = []
                for t in entity_tokens:
                    clean_token_with_no_symbols = clean_token(t)  # Clean token
                    flattened_entity.extend(list(clean_token_with_no_symbols))  # Split into characters
                # Join the characters and align timestamps
                aligned.append({
                    "word": f"[{current_entity}_START] {' '.join(flattened_entity)} [{current_entity}_END]",
                    "start": entity_start_time,
                    "end": entity_end_time
                })
            current_entity = None
            entity_tokens = []
            entity_start_time = None
            entity_end_time = None
            i += 1
            continue

        clean_ref_word = clean_token(token)

        if current_entity:
            # Inside an entity, split the token into characters and modify the tokens list
            if vosk_idx < len(vosk_words):
                vosk_word = vosk_words[vosk_idx]['word']
                if not entity_tokens:
                    entity_start_time = vosk_words[vosk_idx]['start']
                entity_end_time = vosk_words[vosk_idx]['end']

                # Only split tokens into characters if the entity is in the defined list
                if current_entity in entity_types_to_split:
                    clean_token_with_no_symbols = clean_token(token)  # Clean token
                    char_tokens = list(clean_token_with_no_symbols)  # Split into characters

                    # Modify the tokens list in place by extending with the character tokens
                    tokens[i:i+1] = char_tokens  # Replace the current token with the split characters
                    entity_tokens.extend(char_tokens)  # Add characters to entity tokens list
                else:
                    # If not a specified entity type, just add the token as is
                    entity_tokens.append(token)

                vosk_idx += 1
            else:
                # No more Vosk words left (shouldn't happen usually)
                entity_tokens.append(token)

        else:
            # Outside entity, normal matching
            while vosk_idx < len(vosk_words):
                clean_vosk_word = clean_token(vosk_words[vosk_idx]['word'])
                aligned.append({
                    "word": token,
                    "start": vosk_words[vosk_idx]['start'],
                    "end": vosk_words[vosk_idx]['end']
                })
                vosk_idx += 1
                break

        i += 1  # Move to the next token

    return aligned

## Load the dataset

In [23]:
import os

os.chdir('..')

Load batch 1 (150 samples)

In [283]:
import pandas as pd

batch_one_ref = pd.read_json('data/true_data_150.jsonl', lines=True)
batch_one_ref.head()

Unnamed: 0,text
0,"The day before [DATE_START] yesterday, [DATE_E..."
1,um my date of birth is uh second [DATE_START] ...
2,"she handed over a crumpled piece of paper, the..."
3,aglio olio and err uh [CARDINAL_START] three t...
4,[PERSON_START] Hong's [PERSON_END] email is [E...


In [284]:
import os

batch_one_files = sorted(os.listdir("data/Audio_Files_for_testing"), key=retrieve_key)
batch_one_files  = [f'data/Audio_Files_for_testing/{file}' for file in batch_one_files]

import pandas as pd

batch_one_df = pd.DataFrame(data=batch_one_files, columns=['file_name'])
batch_one_df.head()

Unnamed: 0,file_name
0,data/Audio_Files_for_testing/id1.wav
1,data/Audio_Files_for_testing/id2.wav
2,data/Audio_Files_for_testing/id3.wav
3,data/Audio_Files_for_testing/id4.wav
4,data/Audio_Files_for_testing/id5.wav


Load batch 2 (350 samples)

In [285]:
import pandas as pd

batch_two_ref = pd.read_json('data/newtest_151_500_updated_TTS.jsonl', lines=True)
batch_two_ref.head()

Unnamed: 0,id,text,entities
0,151,"456 729103 8 is Kaifu Lee's DBS bank account, ...","[[0, 12, BANK_ACCOUNT]]"
1,152,"Jacob's OCBC bank account is 192-58462-3, and ...","[[29, 40, BANK_ACCOUNT]]"
2,153,"788 305194 2 is Zheng Qi's POSB bank account, ...","[[0, 12, BANK_ACCOUNT]]"
3,154,"Geetha's UOB bank account is 341-92741-9, and ...","[[29, 40, BANK_ACCOUNT]]"
4,155,"623 481057 6 is Ah Seng's Maybank account, and...","[[0, 12, BANK_ACCOUNT]]"


In [286]:
import os

batch_two_files = sorted(os.listdir("data/newtest_151_500_updated_TTS"), key=retrieve_key)
batch_two_files  = [f'data/newtest_151_500_updated_TTS/{file}' for file in batch_two_files]

import pandas as pd

batch_two_df = pd.DataFrame(data=batch_two_files, columns=['file_name'])
batch_two_df.head()

Unnamed: 0,file_name
0,data/newtest_151_500_updated_TTS/id151.wav
1,data/newtest_151_500_updated_TTS/id152.wav
2,data/newtest_151_500_updated_TTS/id153.wav
3,data/newtest_151_500_updated_TTS/id154.wav
4,data/newtest_151_500_updated_TTS/id155.wav


As you can see, for batch 2, the entities enclosed are not in the reference transcripts. As we are given the indices, we can write a helper function to include them within the transcripts. 

Preprocess transcripts

In [287]:
batch_two_ref = insert_entity_tags_to_df(batch_two_ref)

In [288]:
batch_two_ref.head()

Unnamed: 0,id,text,entities,tagged_text
0,151,"456 729103 8 is Kaifu Lee's DBS bank account, ...","[[0, 12, BANK_ACCOUNT]]",[BANK_ACCOUNT_START]456 729103 8[BANK_ACCOUNT_...
1,152,"Jacob's OCBC bank account is 192-58462-3, and ...","[[29, 40, BANK_ACCOUNT]]",Jacob's OCBC bank account is [BANK_ACCOUNT_STA...
2,153,"788 305194 2 is Zheng Qi's POSB bank account, ...","[[0, 12, BANK_ACCOUNT]]",[BANK_ACCOUNT_START]788 305194 2[BANK_ACCOUNT_...
3,154,"Geetha's UOB bank account is 341-92741-9, and ...","[[29, 40, BANK_ACCOUNT]]",Geetha's UOB bank account is [BANK_ACCOUNT_STA...
4,155,"623 481057 6 is Ah Seng's Maybank account, and...","[[0, 12, BANK_ACCOUNT]]",[BANK_ACCOUNT_START]623 481057 6[BANK_ACCOUNT_...


In [289]:
batch_two_ref.tail()

Unnamed: 0,id,text,entities,tagged_text
346,496,Patrick Loh boasting about his email patrick.l...,"[[37, 60, EMAIL]]",Patrick Loh boasting about his email [EMAIL_ST...
347,497,Jasmine Yeo got sian when someone spell her em...,"[[50, 69, EMAIL]]",Jasmine Yeo got sian when someone spell her em...
348,498,Bobby Tan write his email bobby.tan@gmail.com ...,"[[26, 45, EMAIL]]",Bobby Tan write his email [EMAIL_START]bobby.t...
349,499,Kamala Singh telling the IT guy her email kama...,"[[42, 60, EMAIL]]",Kamala Singh telling the IT guy her email [EMA...
350,500,Raymond Koh say his email raymond.k@singnet.co...,"[[26, 50, EMAIL]]",Raymond Koh say his email [EMAIL_START]raymond...


In [290]:
batch_two_ref = batch_two_ref.drop(columns=['entities', 'text', 'id'], axis=1)
batch_two_ref.rename(columns={'tagged_text': 'text'}, inplace=True)
batch_two_ref.head()

Unnamed: 0,text
0,[BANK_ACCOUNT_START]456 729103 8[BANK_ACCOUNT_...
1,Jacob's OCBC bank account is [BANK_ACCOUNT_STA...
2,[BANK_ACCOUNT_START]788 305194 2[BANK_ACCOUNT_...
3,Geetha's UOB bank account is [BANK_ACCOUNT_STA...
4,[BANK_ACCOUNT_START]623 481057 6[BANK_ACCOUNT_...


Combine the datasets [Run this when dataset not combined yet]

In [291]:
test_set_ref = pd.concat([batch_one_ref, batch_two_ref], ignore_index=True)

In [292]:
test_set_ref.head()

Unnamed: 0,text
0,"The day before [DATE_START] yesterday, [DATE_E..."
1,um my date of birth is uh second [DATE_START] ...
2,"she handed over a crumpled piece of paper, the..."
3,aglio olio and err uh [CARDINAL_START] three t...
4,[PERSON_START] Hong's [PERSON_END] email is [E...


In [293]:
test_set_ref.tail()

Unnamed: 0,text
496,Patrick Loh boasting about his email [EMAIL_ST...
497,Jasmine Yeo got sian when someone spell her em...
498,Bobby Tan write his email [EMAIL_START]bobby.t...
499,Kamala Singh telling the IT guy her email [EMA...
500,Raymond Koh say his email [EMAIL_START]raymond...


In [294]:
test_set_ref['text'] = test_set_ref['text'].apply(unify_whitespace)

In [295]:
test_set_ref.to_json('data/test_set_ref_all.jsonl', lines=True, orient='records')

Load the combined processed dataset [When already combined]

In [296]:
test_set_ref = pd.read_json('data/test_set_ref_all.jsonl', lines=True)

In [297]:
test_set_ref.head()

Unnamed: 0,text
0,"The day before [DATE_START] yesterday, [DATE_E..."
1,um my date of birth is uh second [DATE_START] ...
2,"she handed over a crumpled piece of paper, the..."
3,aglio olio and err uh [CARDINAL_START] three t...
4,[PERSON_START] Hong's [PERSON_END] email is [E...


In [298]:
test_set_ref.tail()

Unnamed: 0,text
496,Patrick Loh boasting about his email [EMAIL_ST...
497,Jasmine Yeo got sian when someone spell her em...
498,Bobby Tan write his email [EMAIL_START] bobby....
499,Kamala Singh telling the IT guy her email [EMA...
500,Raymond Koh say his email [EMAIL_START] raymon...


## Load the model (Vosk)

Unfortunately, there are no current models that are tuned for Singaporean English (Singlish). As such, we shall use the `vosk-model-en-us-0.42-gigaspeech` model.

In [277]:
from vosk import Model, KaldiRecognizer
import soundfile as sf
import json

# Load model (replace path with your model directory)
model_path = "models/vosk-model-en-us-0.42-gigaspeech"
model = Model(model_path)

LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=13 max-active=7000 lattice-beam=8
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from models/vosk-model-en-us-0.42-gigaspeech/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:279) Loading HCLG from models/vosk-model-en-us-0.42-gigaspeech/graph/HCLG.fst
LOG (VoskAPI:ReadDataFiles():model.cc:294) Loading words from models/vosk-model-en-us-0.42-gigaspeech/graph/words.txt
LOG (VoskAPI:ReadDataFiles():model.cc:303) Loading winfo models/vosk-model-en-us-0.42-gigaspeech/graph/phones/word_bound

## Load and Read Audio (With just one sample)

So here, several things are happening:

1. We create a `KaldiRecognizer` instance and set `.SetWords()` to `True`, which means that we will get word-level timestamps.
2. The `.AcceptWaveform()` method is used to process the waveform
3. The `.FinalResult()` method is finally called to retrieve the word-level timestamps (transcribed from the Vosk Model - although with some innacurracies, as Vosk is not a full-fledged ASR model)

Test on one audio sample (Simple example with Name and Email PIIs)

In [448]:
audio_path = "data/Audio_Files_for_testing/id1.wav"
audio_data, sample_rate = sf.read(audio_path)

In [449]:
sample = run_vosk(audio_path, model)

In [450]:
sample

{'result': [{'conf': 1.0, 'end': 0.69, 'start': 0.51, 'word': 'the'},
  {'conf': 1.0, 'end': 0.87, 'start': 0.69, 'word': 'day'},
  {'conf': 1.0, 'end': 1.2, 'start': 0.87, 'word': 'before'},
  {'conf': 1.0, 'end': 1.68, 'start': 1.2, 'word': 'yesterday'},
  {'conf': 0.561098, 'end': 2.04, 'start': 1.74, 'word': 'ram'},
  {'conf': 1.0, 'end': 2.46, 'start': 2.04, 'word': 'received'},
  {'conf': 1.0, 'end': 2.88, 'start': 2.49, 'word': 'another'},
  {'conf': 1.0, 'end': 3.27, 'start': 2.94, 'word': 'email'},
  {'conf': 1.0, 'end': 3.6, 'start': 3.27, 'word': 'from'},
  {'conf': 0.92733, 'end': 4.110592, 'start': 3.69, 'word': 'ahri'},
  {'conf': 0.682733, 'end': 4.29, 'start': 4.110592, 'word': 'and'},
  {'conf': 0.996653, 'end': 4.74, 'start': 4.540672, 'word': 'it'},
  {'conf': 0.996653, 'end': 5.16, 'start': 4.95, 'word': 'club'},
  {'conf': 0.999769, 'end': 5.43, 'start': 5.16, 'word': 'dot'},
  {'conf': 0.997014, 'end': 5.64, 'start': 5.46, 'word': 'is'},
  {'conf': 0.999999, 'end'

Reference Transcript

In [451]:
test_set_ref['text'].iloc[0]

'The day before [DATE_START] yesterday, [DATE_END] [PERSON_START] Ram [PERSON_END] received another email from [EMAIL_START] r e m y at outlook dot sg [EMAIL_END]'

Align method usage

In [452]:
align_transcript_with_vosk(sample['result'], test_set_ref['text'].iloc[0])

Current token: The
Current token: day
Current token: before
Current token: [DATE_START]
Current token: yesterday,
Current token: [DATE_END]
Current token: [PERSON_START]
Current token: Ram
Current token: [PERSON_END]
Current token: received
Current token: another
Current token: email
Current token: from
Current token: [EMAIL_START]
Current token: r
Current token: e
Current token: m
Current token: y
Current token: at
Current token: outlook
Current token: dot
Current token: sg
Current token: [EMAIL_END]


[{'word': 'The', 'start': 0.51, 'end': 0.69},
 {'word': 'day', 'start': 0.69, 'end': 0.87},
 {'word': 'before', 'start': 0.87, 'end': 1.2},
 {'word': '[DATE_START] y e s t e r d a y [DATE_END]',
  'start': 1.2,
  'end': 1.68},
 {'word': '[PERSON_START] r a m [PERSON_END]', 'start': 1.74, 'end': 2.04},
 {'word': 'received', 'start': 2.04, 'end': 2.46},
 {'word': 'another', 'start': 2.49, 'end': 2.88},
 {'word': 'email', 'start': 2.94, 'end': 3.27},
 {'word': 'from', 'start': 3.27, 'end': 3.6},
 {'word': '[EMAIL_START] r e m y a t o u t l o o k d o t s g [EMAIL_END]',
  'start': 3.69,
  'end': 5.94}]

Test on one audio sample (Numerical PIIs)

In [453]:
audio_path = "data/Audio_Files_for_testing/id50.wav"
audio_data, sample_rate = sf.read(audio_path)

In [454]:
sample2 = run_vosk(audio_path, model)

In [455]:
sample2

{'result': [{'conf': 0.551495, 'end': 1.08, 'start': 0.48, 'word': 'museum'},
  {'conf': 1.0, 'end': 1.62, 'start': 1.14, 'word': 'glimpse'},
  {'conf': 1.0, 'end': 1.8, 'start': 1.65, 'word': 'the'},
  {'conf': 1.0, 'end': 2.13, 'start': 1.8, 'word': 'credit'},
  {'conf': 1.0, 'end': 2.34, 'start': 2.13, 'word': 'card'},
  {'conf': 1.0, 'end': 3.03, 'start': 2.34, 'word': 'information'},
  {'conf': 1.0, 'end': 3.48, 'start': 3.3, 'word': 'and'},
  {'conf': 1.0, 'end': 3.63, 'start': 3.48, 'word': 'it'},
  {'conf': 1.0, 'end': 3.78, 'start': 3.63, 'word': 'is'},
  {'conf': 1.0, 'end': 4.02, 'start': 3.78, 'word': 'from'},
  {'conf': 0.810728, 'end': 4.38, 'start': 4.02, 'word': 'visa'},
  {'conf': 0.886088, 'end': 4.77, 'start': 4.41, 'word': 'card'},
  {'conf': 1.0, 'end': 5.25, 'start': 5.04, 'word': 'and'},
  {'conf': 1.0, 'end': 5.73, 'start': 5.25, 'word': 'numbered'},
  {'conf': 1.0, 'end': 6.48, 'start': 6.15, 'word': 'eight'},
  {'conf': 1.0, 'end': 6.81, 'start': 6.48, 'word':

Reference

In [456]:
test_set_ref['text'].iloc[49]

'[PERSON_START] Raam [PERSON_END] glimpsed the credit card information and it is from [ORG_START] VISA [ORG_END] card and number [CREDIT_CARD_START] 8888-4249-9427-1019 [CREDIT_CARD_END]'

Force align

credit_card, car_plate, bank_account, nric, phone, passport_num

In [463]:
align_transcript_with_vosk(sample2['result'], test_set_ref['text'].iloc[49])

Current token: [PERSON_START]
Current token: Raam
Current token: [PERSON_END]
Current token: glimpsed
Current token: the
Current token: credit
Current token: card
Current token: information
Current token: and
Current token: it
Current token: is
Current token: from
Current token: [ORG_START]
Current token: VISA
Current token: [ORG_END]
Current token: card
Current token: and
Current token: number
Current token: [CREDIT_CARD_START]
Current token: 8888-4249-9427-1019
Current token: 8
Current token: 8
Current token: 8
Current token: 4
Current token: 2
Current token: 4
Current token: 9
Current token: 9
Current token: 4
Current token: 2
Current token: 7
Current token: 1
Current token: 0
Current token: 1
Current token: 9
Current token: [CREDIT_CARD_END]


[{'word': '[PERSON_START] r a a m [PERSON_END]', 'start': 0.48, 'end': 1.08},
 {'word': 'glimpsed', 'start': 1.14, 'end': 1.62},
 {'word': 'the', 'start': 1.65, 'end': 1.8},
 {'word': 'credit', 'start': 1.8, 'end': 2.13},
 {'word': 'card', 'start': 2.13, 'end': 2.34},
 {'word': 'information', 'start': 2.34, 'end': 3.03},
 {'word': 'and', 'start': 3.3, 'end': 3.48},
 {'word': 'it', 'start': 3.48, 'end': 3.63},
 {'word': 'is', 'start': 3.63, 'end': 3.78},
 {'word': 'from', 'start': 3.78, 'end': 4.02},
 {'word': '[ORG_START] v i s a [ORG_END]', 'start': 4.02, 'end': 4.38},
 {'word': 'card', 'start': 4.41, 'end': 4.77},
 {'word': 'and', 'start': 5.04, 'end': 5.25},
 {'word': 'number', 'start': 5.25, 'end': 5.73},
 {'word': '[CREDIT_CARD_START] 8 8 8 8 4 2 4 9 9 4 2 7 1 0 1 9 8 8 8 4 2 4 9 9 4 2 7 1 0 1 9 [CREDIT_CARD_END]',
  'start': 6.15,
  'end': 11.61}]

### Heuristics for Forced-Alignment

1. **General Matching**:
   - The system aligns reference tokens with Vosk output by directly matching words and their timestamps.

2. **Flattening Numbers**:
   - For entities with alphanumeric characters (e.g., credit cards, phone numbers, passport numbers, car plates, etc.), the system retains the start time of the first token and the end time of the last token, without converting words to numbers.
   
3. **Special Cases**:
   - **Text-based Entities**: Entities like names or organizations are matched normally without changes.
   - **Number-based Entities**: For entities like `credit_card`, `car_plate`, `bank_account`, `nric`, `phone`, `passport_num`, the system:
     - Removes special characters (e.g., hyphens or spaces).
     - Splits the token into individual characters.
     - Modifies the token list to reflect these splits while retaining the entity’s timestamps.

## Load and Read Audio (All 500 samples)

In [None]:
# TODO

## [Archived]

In [None]:
import json
import os
import tempfile
from pydub import AudioSegment
from vosk import Model, KaldiRecognizer
model_path = "vosk-model-en-us-0.42-gigaspeech" #model_new"
model = Model(model_path)
def align_audio_with_text(audio_path, transcription):
    audio = AudioSegment.from_wav(audio_path)
    recognizer = KaldiRecognizer(model, audio.frame_rate)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_wav:
        temp_wav_path = temp_wav.name
        audio.export(temp_wav_path, format="wav")
    results = []
    try:
        with open(temp_wav_path, "rb") as wf:
            wf.read(44)
            recognizer.SetWords(True)
            while True:
                data = wf.read(4000)
                if len(data) == 0:
                    break
                if recognizer.AcceptWaveform(data):
                    results.append(json.loads(recognizer.Result()))
            results.append(json.loads(recognizer.FinalResult()))
    finally:
        if os.path.exists(temp_wav_path):
            os.remove(temp_wav_path)
    words = []
    for result in results:
        if 'result' in result:
            for word in result['result']:
                words.append(word)
    aligned_segments = []
    for word in words:
        aligned_segments.append({
            "start": word["start"],
            "end": word["end"],
            "word": word["word"]
        })
    return aligned_segments

audio_dir = "/content/drive/MyDrive/Share/Research/speechNER/finetune/Audio_Files_for_testing"
transcription_file = "/content/drive/MyDrive/Share/Research/speechNER/Alignement_data/Text_with_ids_temp_preprocessed.jsonl"
output_file = "/content/drive/MyDrive/Share/Research/speechNER/Alignement_data/tr_aligned_data_new.jsonl"
with open(transcription_file, 'r') as f:
    transcriptions = [json.loads(line) for line in f]
aligned_data = []
for item in transcriptions:
    audio_path = f"{audio_dir}/id{item['id']}.wav"
    aligned_transcription = align_audio_with_text(audio_path, item['text'])
    aligned_data.append({
        "id": item['id'],
        "text": item['text'],
        "align": aligned_transcription
    })
with open(output_file, 'w') as f:
    for item in aligned_data:
        f.write(json.dumps(item) + '\n')