In [1]:
# Mount Google Drive at /content/drive; every path used below lives under it.
# The google.colab import means this cell only runs inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Process Individual Transcripts

In [2]:
import re

In [3]:
# Change the working directory to the training split. The transcript path in
# the next cell is relative, so it is resolved against this directory.
%cd /content/drive/MyDrive/automin-2021-confindential-data-main/task-A-elitr-minuting-corpus-en/train/

/content/drive/MyDrive/automin-2021-confindential-data-main/task-A-elitr-minuting-corpus-en/train


In [4]:
# Path of the single transcript used to demonstrate the pipeline; it is
# relative, so it is resolved against the current working directory.
file_path = "meeting_en_train_003/transcript_MAN2_annot13.deidentified.txt"
# Read all lines (newline terminators are kept). The context manager closes
# the handle, which a bare open(...).readlines() never does.
with open(file_path, "r") as transcript_file:
  transcripts = transcript_file.readlines()

In [5]:
# Regular expressions to be purged from the transcripts, mapped to their
# replacement text. rem_ntok walks the dict with .items(), i.e. in the order
# the entries are written here, so the order is part of the behaviour.
reg_ex = {
      r"( *)\<(.*?)\>": '',   # drop <...> tags together with any spaces before them
      r"\n": '',              # drop newline characters
      r"(  +)": ' ',          # collapse runs of two or more spaces into one
      r"\-": '',              # delete every hyphen
      r"(,+)": ',',           # collapse repeated commas into a single comma
      # NOTE(review): every '-' has already been deleted by the r"\-" entry
      # above, so this pattern cannot match when the entries run in order.
      r"( *)(-+)": '',
}       

In [6]:
# utility methods 

# apply every clean-up substitution to a piece of text
def rem_ntok(reg_ex, text):
  """Run each pattern -> replacement pair of ``reg_ex`` over ``text``.

  Args:
    reg_ex: mapping of regular-expression pattern to replacement string.
      Pairs are applied one after another in the mapping's iteration order,
      each one working on the output of the previous one.
    text: the string to clean.

  Returns:
    The string left after all substitutions have been applied.
  """
  cleaned = text
  for pattern in reg_ex:
    cleaned = re.sub(pattern, reg_ex[pattern], cleaned)
  return cleaned

# make sure the first (PERSON..) speaker tag is followed by a colon
def add_colon(sentence):
  """Insert ':' right after the first ``(PERSON...)`` tag when it is missing.

  Args:
    sentence: one transcript line, normally containing a ``(PERSON...)`` tag.

  Returns:
    ``sentence`` with a ':' directly after the first tag. The input is
    returned unchanged when the colon is already there or when the line
    contains no ``(PERSON...)`` tag at all.
  """
  match = re.search(r'\(PERSON(.*?)\)', sentence)
  if match is None:
    # No tag: nothing to anchor the colon to (previously an AttributeError).
    return sentence
  eidx = match.end()
  # startswith with an offset is safe when the tag is the very last thing in
  # the string; indexing sentence[eidx] raised IndexError in that case.
  if sentence.startswith(':', eidx):
    return sentence
  return sentence[:eidx] + ':' + sentence[eidx:]

# strip "(" and ")" from a role label
def process_roles(role):
  """Return ``role`` with every '(' and ')' character removed.

  Args:
    role: a speaker label such as ``"(PERSON6)"``.

  Returns:
    The label without any parentheses, e.g. ``"PERSON6"``.
  """
  return re.sub(r"[()]", '', role)

# strip stray punctuation from the very start of an utterance
def remove_special_tokens(utterance):
  """Remove leading punctuation debris from ``utterance``.

  The anchored patterns are tried one after another, each on the result of
  the previous one, so a single call can peel off several leading characters.
  Because ``^,`` runs before ``^, ,``, a leading ", ," loses only its first
  comma in one call.

  Args:
    utterance: the text of one speaker turn.

  Returns:
    The text with the matched leading characters removed.
  """
  leading_patterns = (
      r'^\.\',',
      r'^\.\'',
      r'^\',',
      r'^,',
      r'^\'',
      r'^\.',
      r'^, ,',
      r'^\?',
  )
  cleaned = utterance
  for pattern in leading_patterns:
    cleaned = re.sub(pattern, '', cleaned)
  return cleaned

In [7]:
# drop blank lines and replace newlines with one trailing space
def preprocess_transcripts(document):
  """Prepare raw transcript lines for cleaning.

  Args:
    document: iterable of raw lines, as returned by ``readlines()``.

  Returns:
    A list with one entry per line that is not exactly ``"\\n"``; every
    newline in the line is removed and a single space is appended.
  """
  return [
      line.replace("\n", "") + " "
      for line in document
      if line != "\n"
  ]

# clean every line, then merge the lines into one string per speaker turn
def parse_transcript(reg_ex, transcript):
  """Clean transcript lines and group them into speaker turns.

  A line that contains a ``(PERSON...)`` tag starts a new turn (a ':' is
  added after the tag when missing). Any other line is treated as a
  continuation and appended, stripped and followed by a space, to the
  previous turn.

  Args:
    reg_ex: pattern -> replacement mapping handed to ``rem_ntok``.
    transcript: list of lines, e.g. the output of ``preprocess_transcripts``.

  Returns:
    A list of strings, one per speaker turn. Continuation lines that appear
    before the first speaker tag have no turn to join and are dropped.
  """
  # apply the clean-up substitutions to every line
  cleaned_lines = [rem_ntok(reg_ex=reg_ex, text=text) for text in transcript]

  # greedy on purpose: one line yields at most one match, found anywhere in it
  person_regex = r'\(PERSON(.*)\)'

  utteranceList = []
  for text in cleaned_lines:
    if len(re.findall(person_regex, text)) == 1:
      utteranceList.append(add_colon(text))
    elif utteranceList:
      # An explicit guard replaces the former `except Exception: pass`, which
      # existed for the empty-list case but silenced every other error too.
      utteranceList[-1] += text.strip() + " "
  return utteranceList

# bifurcate transcripts into roles and utterances.
def split_transcripts(processed_transcript):
  """Split ``"ROLE: utterance"`` lines into parallel role/utterance lists.

  Args:
    processed_transcript: iterable of strings of the form
      ``"(PERSONn): spoken text"``.

  Returns:
    ``(roles, utterances)``: two lists of equal length. Roles have their
    parentheses removed; utterances are stripped of leading punctuation
    debris. Turns whose cleaned text is two characters or shorter, and lines
    that contain no ':' at all, are skipped.
  """
  roles, utterances = [], []
  for text in processed_transcript:
    # Split on the FIRST colon only: split(':')[1] cut an utterance such as
    # "meet at 10:30" off at its second colon.
    role, separator, spoken = text.partition(':')
    if not separator:
      # No colon means no role/utterance boundary (previously an IndexError).
      continue
    tune = spoken.strip()
    # Three passes, as before: one pass removes at most one layer of debris.
    for _ in range(3):
      tune = remove_special_tokens(tune).strip()
    # The length test also covers the empty string; the former `is not ''`
    # compared object identity against a literal and was never reliable.
    if len(tune) > 2:
      utterances.append(tune)
      roles.append(process_roles(role))
  return roles, utterances

In [8]:
# Run the single-transcript pipeline. Each stage gets its own name so the raw
# lines in `transcripts` are left untouched and the cell can be re-run without
# feeding already-processed text back into the first stage.
transcript_lines = preprocess_transcripts(transcripts)
processed_transcript = parse_transcript(reg_ex, transcript_lines)
roles, utterances = split_transcripts(processed_transcript)

In [9]:
# Preview the first five speaker turns as "ROLE: utterance" lines.
# zip over slices (rather than indexing range(5)) so a transcript with fewer
# than five turns prints what it has instead of raising IndexError.
example = ''.join(
    f"{role}: {utterance}\n"
    for role, utterance in zip(roles[:5], utterances[:5])
)
print(example)

PERSON6: Try to record it if it works. Now it seems that it is working. Yeah, I see that it is recorded. And now I will try to share my screen. And share. And, can you see that?
PERSON7: Yes.
PERSON4: Yes, mhm.
PERSON6: Uh, I just wanted to show, uh, [PERSON7]. Uh, and could you look at the data at my screen now? Uh, uh, [PERSON7]?
PERSON4: I think he is there.



----

## Process Automin Datasets

In [10]:
# Move up one directory level.
# NOTE(review): this is relative to the current working directory, so the
# result depends on which %cd cells ran before it and how many times this
# cell itself has been executed.
%cd ..

/content/drive/My Drive/automin-2021-confindential-data-main/task-A-elitr-minuting-corpus-en


In [11]:
import os
# Absolute corpus root. The previous relative root
# ('./automin-2021-confindential-data-main/task-A-elitr-minuting-corpus-en/')
# was resolved against the working directory, which by this point is already
# the corpus root, so it pointed at a nested path.
directory = '/content/drive/MyDrive/automin-2021-confindential-data-main/task-A-elitr-minuting-corpus-en'
# dev split: raw meetings are read from dev/ ...
test_dir = os.path.join(directory, 'dev/')
# ... and their preprocessed transcripts are written to preprocessed_dev/
desired_dev_dir = os.path.join(directory, 'preprocessed_dev/')

In [12]:
import os
# Absolute corpus root; this assignment replaces any earlier value of `directory`.
directory = '/content/drive/MyDrive/automin-2021-confindential-data-main/task-A-elitr-minuting-corpus-en'
# train split: the train routine below reads the raw meetings from train/ ...
train_dir = os.path.join(directory, 'train/')
# ... and writes one preprocessed transcript per meeting into preprocessed/
desired_train_dir = os.path.join(directory, 'preprocessed/')

In [12]:
# train routine: preprocess every training meeting and save it as plain text

# make sure the output folder exists before the first write
os.makedirs(desired_train_dir, exist_ok=True)

train_fold = os.listdir(train_dir)
for folder in train_fold:
  rootfile = os.path.join(train_dir, folder, f'{folder}_transcript.txt')
  try:
    with open(rootfile, 'r') as transcript_file:
      transcripts = transcript_file.readlines()
  except OSError as e:
    # Only a missing/unreadable transcript is treated as "not found"; the
    # former blanket `except Exception` reported every failure this way.
    print(f'{folder} not found! ({e})')
    continue
  # NOTE(review): pp_utterances is not defined in any cell visible in this
  # notebook; it must be defined before this cell can run on a fresh kernel.
  utterancelist = pp_utterances(transcripts)
  print(utterancelist)
  # save txt file under preprocessed/{folder}_transcript.txt
  desfile = os.path.join(desired_train_dir, f'{folder}_transcript.txt')
  with open(desfile, 'w') as filehandle:
    # loop variable renamed so it no longer overwrites the global `utterances`
    for utterance_line in utterancelist:
      filehandle.write('%s\n' % utterance_line)

In [12]:
# dev routine: preprocess every dev meeting and save it as plain text

# make sure the output folder exists before the first write
os.makedirs(desired_dev_dir, exist_ok=True)

# list the dev split itself; the former os.listdir(dev_fold) read `dev_fold`
# before it had been assigned
dev_fold = os.listdir(test_dir)
for folder in dev_fold:
  rootfile = os.path.join(test_dir, folder, f'{folder}_transcript.txt')
  try:
    with open(rootfile, 'r') as transcript_file:
      transcripts = transcript_file.readlines()
  except OSError as e:
    # Only a missing/unreadable transcript is treated as "not found"; the
    # former blanket `except Exception` reported every failure this way.
    print(f'{folder} not found! ({e})')
    continue
  # NOTE(review): pp_utterances is not defined in any cell visible in this
  # notebook; it must be defined before this cell can run on a fresh kernel.
  utterancelist = pp_utterances(transcripts)
  print(utterancelist)
  # save txt file under preprocessed_dev/{folder}_transcript.txt
  desfile = os.path.join(desired_dev_dir, f'{folder}_transcript.txt')
  with open(desfile, 'w') as filehandle:
    # loop variable renamed so it no longer overwrites the global `utterances`
    for utterance_line in utterancelist:
      filehandle.write('%s\n' % utterance_line)

----

# Convert to JSON file

In [13]:
import os
# NOTE(review): root_dir is assigned here but not read by any cell visible in
# this notebook, and its path (extra 'AutoMin-2021/' component) differs from
# `directory` — confirm which root is the intended one.
root_dir = '/content/drive/MyDrive/AutoMin-2021/automin-2021-confindential-data-main/task-A-elitr-minuting-corpus-en/'
# Folder holding the preprocessed transcripts. It is built from `directory`,
# which must already have been defined by an earlier cell.
main_dir = os.path.join(directory, 'preprocessed/')

In [14]:
# Iterate over the preprocessed transcripts and collect them in main_dict,
# keyed by file name with '.txt' removed. Each value holds two parallel lists
# under 'roles' and 'utterances'.
main_dict = dict()

prepro_fold = os.listdir(main_dir)

for File in prepro_fold:
  print(f"Iterating over {File}")
  dict_name = File.replace('.txt', '')
  root_file = os.path.join(main_dir, File)
  # the context manager closes each file; the handles were previously leaked
  with open(root_file, 'r') as transcript_file:
    pp_transcripts = transcript_file.readlines()
  # distinct names so the global roles/utterances of other cells are not clobbered
  file_roles, file_utterances = split_transcripts(pp_transcripts)
  main_dict[dict_name] = {
      'roles': file_roles,
      'utterances': file_utterances,
  }

Iterating over meeting_en_dev_001_transcript.txt
Iterating over meeting_en_dev_010_transcript.txt
Iterating over meeting_en_dev_009_transcript.txt
Iterating over meeting_en_dev_005_transcript.txt
Iterating over meeting_en_dev_006_transcript.txt
Iterating over meeting_en_dev_002_transcript.txt
Iterating over meeting_en_dev_004_transcript.txt
Iterating over meeting_en_dev_007_transcript.txt
Iterating over meeting_en_dev_008_transcript.txt
Iterating over meeting_en_dev_003_transcript.txt
Iterating over meeting_en_train_056_transcript.txt
Iterating over meeting_en_train_060_transcript.txt
Iterating over meeting_en_train_005_transcript.txt
Iterating over meeting_en_train_067_transcript.txt
Iterating over meeting_en_train_033_transcript.txt
Iterating over meeting_en_train_058_transcript.txt
Iterating over meeting_en_train_069_transcript.txt
Iterating over meeting_en_train_051_transcript.txt
Iterating over meeting_en_train_034_transcript.txt
Iterating over meeting_en_train_066_transcript.txt


In [15]:
# example: preview the first five turns of one processed meeting
example_key = 'meeting_en_train_003_transcript'
roles, utterances = main_dict[example_key]["roles"], main_dict[example_key]["utterances"]

# zip over slices (rather than indexing range(5)) so a meeting with fewer than
# five turns prints what it has instead of raising IndexError
example = ''.join(
    f"{role}: {utterance}\n"
    for role, utterance in zip(roles[:5], utterances[:5])
)
print(example)

PERSON6: Try to record it if it works.Now it sees that it is working.Yeah, I see that it is recorded.And now I will try to share y screen.And share.And, can you see that?
PERSON7: Yes.
PERSON4: Yes, .
PERSON6: h, I just wanted to show, uh,.h, and could you look at the data at y screen now?h, uh,?
PERSON4: I think he is there.



In [17]:
# Switch to the corpus root. The path is absolute, so the result does not
# depend on the directory the kernel is currently in.
%cd /content/drive/MyDrive/automin-2021-confindential-data-main/task-A-elitr-minuting-corpus-en/

/content/drive/MyDrive/automin-2021-confindential-data-main/task-A-elitr-minuting-corpus-en


In [18]:
# extra utility functions to remove un-wanted sentences.
def max_length(text_list):
  """Return the largest word count among the strings in ``text_list``.

  Words are counted by splitting on single spaces, so an empty string counts
  as one token and consecutive spaces produce empty tokens.

  Args:
    text_list: iterable of strings.

  Returns:
    The maximum token count, or 0 when ``text_list`` is empty (a bare
    ``max()`` raised ValueError on an empty list).
  """
  return max((len(text.split(' ')) for text in text_list), default=0)

def preprocess_utterance(sequence):
  """Keep only the entries of ``sequence`` longer than four characters.

  Args:
    sequence: iterable of strings (sentence chunks).

  Returns:
    A new list with the entries whose ``len`` is greater than 4, in their
    original order.
  """
  return list(filter(lambda chunk: len(chunk) > 4, sequence))

def insert_to_roles(roles, len, idx, role, con_index):
  """Insert ``role`` ``len`` times into ``roles`` at ``idx + con_index``.

  Args:
    roles: list of role labels; modified in place.
    len: number of copies to insert (this parameter shadows the built-in
      ``len`` inside the function).
    idx: base position of the insertion.
    role: the label to insert.
    con_index: offset added to ``idx``.

  Returns:
    The same ``roles`` list object, after insertion.
  """
  position = idx + con_index
  # one slice assignment places all copies at once
  roles[position:position] = [role] * len
  return roles

def insert_text(utterances, sequences, idx, con_index):
  """Insert the items of ``sequences`` into ``utterances`` at ``idx + con_index``.

  The items are inserted last-to-first at the same position, so for a
  non-negative position they end up in their original order.

  Args:
    utterances: list of utterance strings; modified in place.
    sequences: the chunks to insert.
    idx: base position of the insertion.
    con_index: offset added to ``idx``.

  Returns:
    The same ``utterances`` list object, after insertion.
  """
  position = idx + con_index
  for chunk in reversed(list(sequences)):
    utterances.insert(position, chunk)
  return utterances

In [19]:
# main_dict keys that belong to the dev set, in the order they were listed
dev_keys = [
    'meeting_en_dev_001_transcript',
    'meeting_en_dev_010_transcript',
    'meeting_en_dev_009_transcript',
    'meeting_en_dev_005_transcript',
    'meeting_en_dev_006_transcript',
    'meeting_en_dev_002_transcript',
    'meeting_en_dev_004_transcript',
    'meeting_en_dev_007_transcript',
    'meeting_en_dev_008_transcript',
    'meeting_en_dev_003_transcript',
]
dev_keys

['meeting_en_dev_001_transcript',
 'meeting_en_dev_010_transcript',
 'meeting_en_dev_009_transcript',
 'meeting_en_dev_005_transcript',
 'meeting_en_dev_006_transcript',
 'meeting_en_dev_002_transcript',
 'meeting_en_dev_004_transcript',
 'meeting_en_dev_007_transcript',
 'meeting_en_dev_008_transcript',
 'meeting_en_dev_003_transcript']

In [20]:
# dev set contains transcripts which are supposed to be handled manually
# delete dev keys including roles and utterances

for key in dev_keys:
  # pop with a default keeps the cell re-runnable: `del` raised KeyError as
  # soon as a key had already been removed
  main_dict.pop(key, None)

In [21]:
# Split every utterance longer than MAX_WORDS space-separated tokens into
# sentence-aligned chunks; each chunk keeps the role of the utterance it came
# from. New lists are built in a single pass: the earlier version deleted
# from `Utterances` while enumerating it, which skipped the element that
# followed each long utterance.
MAX_WORDS = 150

for key in main_dict.keys():

  print(f'currently parsing {key}')

  Utterances = main_dict[key]['utterances']
  roles = main_dict[key]['roles']

  # Explicit guards replace the former `except Exception: pass`.
  if not Utterances:
    # nothing to split; max_length has nothing to measure either
    continue
  if len(roles) != len(Utterances):
    # zip below would silently truncate to the shorter list; leave the entry alone
    print(f'skipping {key}: {len(roles)} roles vs {len(Utterances)} utterances')
    continue

  # print max length
  print(f'max word length in {key} is {max_length(Utterances)}')

  # original length of both lists
  print(f'Original lists length {len(roles), len(Utterances)}')

  new_utterances, new_roles = [], []
  insert_positions = []   # index in the new lists where each chunk group starts
  kept = 0                # utterances that were short enough to keep as they are

  for utterance, role in zip(Utterances, roles):
    # short enough: keep untouched
    if len(utterance.split(' ')) <= MAX_WORDS:
      new_utterances.append(utterance)
      new_roles.append(role)
      kept += 1
      continue

    # Re-assemble the utterance sentence by sentence and close a chunk as
    # soon as it exceeds the limit, so a chunk can end slightly above MAX_WORDS.
    sequence = []
    temp = ''
    for sentence in utterance.split('.'):
      temp = f'{temp} {sentence.strip()}.'
      if len(temp.split(' ')) > MAX_WORDS:
        sequence.append(temp.strip())
        temp = ''
    sequence.append(temp.strip())

    # drop fragments of four characters or fewer (e.g. a lone '.')
    sequence = preprocess_utterance(sequence)

    insert_positions.append(len(new_utterances))
    new_utterances.extend(sequence)
    new_roles.extend([role] * len(sequence))

  # Number of utterances left once the long ones are taken out
  print(f'Length of lists after preprocessing {kept, kept}')

  # Position at which each group of chunks was placed
  for position in insert_positions:
    print(f'Inserted @ {position}')

  # New length after insertion
  print(f'Length of lists after insertion {len(new_roles), len(new_utterances)}')

  # Applying changes to main dictionary. The result goes under 'utterances';
  # the earlier code wrote it to a misspelled 'utterance' key, which left a
  # second copy of the data in every entry.
  main_dict[key]['utterances'] = new_utterances
  main_dict[key]['roles'] = new_roles

  print('----------------------------------------------------------------------------------------------------------------------------------------------------------------------')

currently parsing meeting_en_train_056_transcript
max word length in meeting_en_train_056_transcript is 597
Original lists length (340, 340)
Length of lists after preprocessing (324, 324)
Inserted @ 54
Inserted @ 59
Inserted @ 68
Inserted @ 71
Inserted @ 112
Inserted @ 145
Inserted @ 155
Inserted @ 172
Inserted @ 177
Inserted @ 195
Inserted @ 200
Inserted @ 204
Inserted @ 208
Inserted @ 240
Inserted @ 270
Inserted @ 276
Length of lists after insertion (371, 371)
----------------------------------------------------------------------------------------------------------------------------------------------------------------------
currently parsing meeting_en_train_060_transcript
max word length in meeting_en_train_060_transcript is 381
Original lists length (224, 224)
Length of lists after preprocessing (221, 221)
Inserted @ 67
Inserted @ 74
Inserted @ 79
Length of lists after insertion (228, 228)
---------------------------------------------------------------------------------------------

In [22]:
# Persist main_dict (meeting key -> roles/utterances) as a single JSON file
import json

output_path = '/content/drive/MyDrive/automin-2021-confindential-data-main/task-A-elitr-minuting-corpus-en/train_processed.json'
with open(output_path, 'w') as json_file:
  json.dump(main_dict, json_file)

In [23]:
# *Note: some transcripts may use a different annotation for roles, e.g. [PERSON 1] instead of (PERSON 1); these anomalies should be handled manually, depending on the circumstances and how often they occur.