In [70]:
# import difflib

# def align_transcripts(transcript1, transcript2):
#     words1 = transcript1.split()
#     words2 = transcript2.split()
#     # Use the difflib library to create a sequence matcher object
#     matcher = difflib.SequenceMatcher(None, words1, words2)
#     # Create an empty list to store the aligned transcripts
#     aligned_transcripts = []
#     for tag, i1, i2, j1, j2 in matcher.get_opcodes():
#         if tag == 'equal':
#             for i in range(i1,i2):
#                 aligned_transcripts.append([words1[i], words2[i]])
#         elif tag == 'replace':
#             for i in range(i1,i2):
#                 aligned_transcripts.append([words1[i], words2[i]])
#         elif tag == 'delete':
#             for i in range(i1,i2):
#                 aligned_transcripts.append([words1[i], ''])
#         elif tag == 'insert':
#             for i in range(j1,j2):
#                 aligned_transcripts.append(['', words2[i]])
#     return aligned_transcripts

In [127]:
import re
import json
import numpy as np

# Pre-processing functions

In [132]:
def load_transcript(file_path, header_lines=3):
    """Read a transcript file and return its text without the leading header.

    Args:
        file_path: Path of the UTF-8 encoded transcript file.
        header_lines: Number of lines to discard from the top of the file.
            Defaults to 3, the size of the header the original notebook
            describes as the "Microsoft timestamp".

    Returns:
        The remaining lines concatenated into a single string with their
        line endings kept. Empty string when the file has ``header_lines``
        lines or fewer.

    Raises:
        ValueError: If ``header_lines`` is negative.
    """
    if header_lines < 0:
        # A negative slice start would silently keep only the *last* lines.
        raise ValueError("header_lines must be non-negative")
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    return "".join(lines[header_lines:])


def remove_time_stamps(text):
    """Return ``text`` with every ``[dd:dd:dd:dd]`` marker deleted.

    A marker is four two-digit fields separated by colons and wrapped in
    square brackets. Surrounding whitespace is left untouched.
    """
    time_stamp = re.compile(r'\[\d{2}:\d{2}:\d{2}:\d{2}\]')
    return time_stamp.sub('', text)


def remove_words_in_angle_brackets(text):
    """Return ``text`` with every ``<word>`` tag deleted.

    A tag is one or more word characters enclosed in angle brackets; the
    original notebook uses this to drop hesitation markers. Tags containing
    spaces or punctuation are not matched, and whitespace around a removed
    tag is kept.
    """
    angle_tag = re.compile(r"<\w+>")
    return angle_tag.sub("", text)


def replace_newlines(text):
    """Collapse each run of consecutive ``\\n`` characters into one space."""
    # Splitting on newline runs and re-joining with a space is equivalent to
    # substituting every run with a single space.
    pieces = re.split(r'\n+', text)
    return ' '.join(pieces)


def npsc_json_to_bigdoc(file_path):
    """Flatten a sentence-data JSON file into one string.

    Loads the UTF-8 JSON document at ``file_path``, takes the
    ``sentence_text`` value of every entry under the top-level
    ``sentences`` key, and joins them in order with ". " as separator.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        document = json.load(handle)
    return ". ".join(entry['sentence_text'] for entry in document['sentences'])


def split_string(text):
    """Tokenize ``text`` into words and punctuation runs.

    A token is either a maximal run of word characters or a maximal run of
    characters that are neither word characters nor whitespace. Whitespace
    only separates tokens and is never returned.
    """
    token_pattern = re.compile(r'\b\w+\b|[^\w\s]+')
    return [found.group() for found in token_pattern.finditer(text)]

def write_to_json(data, file_path):
    """Serialise ``data`` as JSON and write it to ``file_path`` (UTF-8).

    Non-ASCII characters are written as-is rather than as ``\\uXXXX``
    escapes. An existing file at ``file_path`` is overwritten.

    Raises:
        TypeError: If ``data`` contains an object the ``json`` module cannot
            serialise. The destination file is left untouched in that case.
    """
    # Serialise before opening the destination: opening in 'w' mode truncates
    # the file, so a failure halfway through json.dump would otherwise
    # destroy a previously written result.
    payload = json.dumps(data, ensure_ascii=False)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(payload)


# Needleman-Wunsch Algorithm

In [131]:
def align_lists(list1, list2, match_score=1, mismatch_score=-1, gap_score=-1):
    """Globally align two sequences with the Needleman-Wunsch algorithm.

    Args:
        list1: First sequence (e.g. a list of tokens).
        list2: Second sequence.
        match_score: Score added when two aligned items compare equal.
        mismatch_score: Score added when two aligned items differ.
        gap_score: Score added when an item is aligned against a gap.

    Returns:
        A list of ``[item1, item2]`` pairs in sequence order. An item with no
        counterpart is paired with the empty string ``''``: ``[item1, '']``
        means ``item1`` is unmatched in ``list2``; ``['', item2]`` means
        ``item2`` is unmatched in ``list1``. Among equally scored alignments
        the traceback prefers pairing two items, then a gap in ``list2``,
        then a gap in ``list1``.

    Note:
        The score table has ``(len(list1) + 1) * (len(list2) + 1)`` cells and
        is filled cell by cell, so time and memory grow with the product of
        the two lengths.
    """
    n_rows, n_cols = len(list1), len(list2)

    # score[i, j] is the best score for aligning list1[:i] with list2[:j].
    score = np.zeros((n_rows + 1, n_cols + 1))
    # Aligning a prefix with the empty sequence costs one gap per item.
    score[:, 0] = np.arange(n_rows + 1) * gap_score
    score[0, :] = np.arange(n_cols + 1) * gap_score

    def substitution(i, j):
        # Score for pairing the i-th item of list1 with the j-th of list2.
        return match_score if list1[i - 1] == list2[j - 1] else mismatch_score

    for i in range(1, n_rows + 1):
        for j in range(1, n_cols + 1):
            score[i, j] = max(
                score[i - 1, j - 1] + substitution(i, j),  # pair both items
                score[i - 1, j] + gap_score,               # gap in list2
                score[i, j - 1] + gap_score,               # gap in list1
            )

    # Walk back from the bottom-right corner, re-deriving which move produced
    # each cell. Pairs are collected end-to-start and reversed on return.
    aligned = []
    i, j = n_rows, n_cols
    while i > 0 or j > 0:
        if i > 0 and j > 0 and score[i, j] == score[i - 1, j - 1] + substitution(i, j):
            aligned.append([list1[i - 1], list2[j - 1]])
            i -= 1
            j -= 1
        elif i > 0 and (j == 0 or score[i, j] == score[i - 1, j] + gap_score):
            # On the left border only gaps in list2 remain; this is taken
            # unconditionally instead of relying on a score-equality check.
            aligned.append([list1[i - 1], ''])
            i -= 1
        else:
            aligned.append(['', list2[j - 1]])
            j -= 1
    return aligned[::-1]

# Cleaning transcript, aligning with gold label, and writing to JSON

In [122]:
# Session identifier; also used as the NPSC sub-folder name and file prefix.
folder_name = '20170207'
# Forward slashes are accepted by open() on Windows as well as on POSIX
# systems, whereas backslash-separated paths only resolve on Windows.
asr_file_path = '../Data/asr_generated/20170207-095506-parallel-aihub.txt'
json_file_path = f'../Data/NPSC/{folder_name}/{folder_name}_sentence_data.json'

# ASR side: drop the file header, strip [hh:mm:ss:ff]-style markers, then
# flatten the text to a single line.
transcript = replace_newlines(remove_time_stamps(load_transcript(asr_file_path)))

# Gold side: join all sentences into one document and strip <word> tags.
goldlabel = remove_words_in_angle_brackets(npsc_json_to_bigdoc(json_file_path))

# Token-level global alignment; each entry is an [asr_token, gold_token] pair.
alignment = align_lists(split_string(transcript), split_string(goldlabel))


In [124]:
# Persist the alignment; forward slashes keep the path valid on both Windows
# and POSIX systems.
write_to_json(alignment, f'../Data/aligned/{folder_name}_test_alignment.json')
