In [76]:
# Install the XML-parsing dependencies into the running kernel's environment.
# The explicit %pip magic is used because a bare `pip ...` line only works via
# IPython's automagic, which can be disabled and only applies to one-line cells.
%pip install -q beautifulsoup4 lxml

In [77]:
import os
import re
import json
from bs4 import BeautifulSoup
from string import punctuation
from tqdm.notebook import tqdm

In [59]:
def parse_one_file(filepath):
    """Extract the timed word tokens from one XML file.

    Args:
        filepath: Path of the XML file to read.

    Returns:
        A list with one ``(starttime, endtime, text)`` tuple per ``<w>``
        element, in document order. ``starttime`` and ``endtime`` are the raw
        attribute values (``None`` when the attribute is absent) and ``text``
        is the element's ``.string``.
    """
    # Read raw bytes rather than text: this lets the parser pick the encoding
    # (including one named in the XML declaration) instead of decoding with
    # whatever the platform's default text encoding happens to be.
    with open(filepath, 'rb') as f:
        raw_xml = f.read()
    soup = BeautifulSoup(raw_xml, "xml")
    return [
        (w_tag.get('starttime'), w_tag.get('endtime'), w_tag.string)
        for w_tag in soup.find_all('w')
    ]

In [60]:
def get_separate_utterances(word_tuples, speaker, meeting_part):
    """Merge one speaker's timed words into contiguous utterances.

    Words are appended to the current utterance for as long as each word's
    start time equals the previous word's end time. Any other start time
    closes the current utterance and opens a new one. The utterance that is
    still open when the input ends is emitted as well.

    Args:
        word_tuples: Iterable of ``(start, end, text)`` triples in spoken order.
            ``start`` and ``end`` must be accepted by ``float()``.
        speaker: Value stored under the ``"person"`` key of every utterance.
        meeting_part: Value stored under the ``"meeting_part"`` key.

    Returns:
        A list of dicts with keys ``"meeting_part"``, ``"person"``,
        ``"starttime"``, ``"endtime"`` (both floats) and ``"sentences"``
        (the joined text). Empty input yields an empty list.
    """
    utt_list = []

    def close_utterance(first_start, last_end, text):
        # Remove an underscore that directly follows a word character.
        text = re.sub(r'(\w)_', r'\1', text)
        utt_list.append({"meeting_part": meeting_part, "person": speaker,
                         "starttime": float(first_start), "endtime": float(last_end),
                         "sentences": text})

    # Explicit flag instead of a magic "0" end time to mark "nothing open yet".
    in_utterance = False
    sent_start_time = None
    prev_end_time = None
    oneutt = ""
    for start_time, end_time, word in word_tuples:
        if in_utterance and prev_end_time == start_time:
            # `in` on a str is a substring test: only tokens found inside
            # string.punctuation are attached without a leading space.
            if word not in punctuation:
                word = f" {word}"
            oneutt += word
        else:
            if in_utterance:
                # A start time that differs from the previous end closes the
                # utterance collected so far.
                close_utterance(sent_start_time, prev_end_time, oneutt)
            sent_start_time = start_time
            oneutt = word
            in_utterance = True
        prev_end_time = end_time

    # Flush the trailing utterance; without this the last one is lost.
    if in_utterance:
        close_utterance(sent_start_time, prev_end_time, oneutt)
    return utt_list

In [61]:
def get_utt_list_for_meeting(filepath):
    """Build the utterance list for one words file.

    The speaker label and the meeting part are taken from the file *name*
    (not the whole path): the second dot-separated field is the speaker and
    the last character of the first field is the meeting part. For a name
    starting ``EN2001a.A.`` that gives speaker ``'A'`` and part ``'a'``.

    Args:
        filepath: Path of the XML file to process.

    Returns:
        The list of utterance dicts produced by ``get_separate_utterances``.

    Raises:
        IndexError: If the file name has no dot, or starts with one.
    """
    # Split only the base name so that dots in directory names (e.g. "./data")
    # cannot shift the fields.
    name_fields = os.path.basename(filepath).split('.')
    speaker = name_fields[1]
    meeting_part = name_fields[0][-1]
    word_tuples = parse_one_file(filepath)
    return get_separate_utterances(word_tuples, speaker=speaker, meeting_part=meeting_part)

In [62]:
def process_all_meetings(filepaths):
    """Collect the utterances of every file, grouped by meeting and sorted.

    The meeting name is the first dot-separated field of the file name with
    its final character removed. Files that share a meeting name have their
    utterances pooled together.

    Args:
        filepaths: Iterable of '/'-separated paths to process.

    Returns:
        A dict mapping each meeting name to its utterance dicts, ordered by
        ``(meeting_part, starttime)``.
    """
    utts_by_meeting = {}
    for path in tqdm(filepaths):
        file_name = path.split('/')[-1]
        # Drop the trailing character of the first name field to get the meeting name.
        meeting_id = file_name.split('.')[0][:-1]
        file_utts = get_utt_list_for_meeting(path)
        utts_by_meeting.setdefault(meeting_id, []).extend(file_utts)

    return {
        meeting_id: sorted(utts, key=lambda utt: (utt['meeting_part'], utt['starttime']))
        for meeting_id, utts in utts_by_meeting.items()
    }

In [65]:
# Folder holding the words files; change this to wherever they live for you.
folder_path = '/content/drive/MyDrive/AMI_corpus_words'
# Full path of every directory entry, in sorted file-name order.
test_meeting = [f"{folder_path}/{file_name}" for file_name in sorted(os.listdir(folder_path))]
# Parse every file and group the utterances by meeting.
dict_test_meeting = process_all_meetings(test_meeting)

  0%|          | 0/5 [00:00<?, ?it/s]

In [73]:
# Spot-check: display the entry at index 4 of the 'EN2001' utterance list.
dict_test_meeting['EN2001'][4]

{'meeting_part': 'a',
 'person': 'A',
 'starttime': 11.09,
 'endtime': 15.53,
 'sentences': "Does anyone want to see uh Steve's feedback from the specification?"}

In [74]:
# Save the per-meeting utterance lists as a single JSON document.
with open("AMI_corpus_transcripts.json", "w") as outfile:
    json.dump(dict_test_meeting, fp=outfile)