# Task 1 Reconstruct the Original Meeting Transcripts

**Title**: FIT5196_S1_2018_Assignment3_task1  
**Student ID**: 27771539  
**Student Name**: Chuangfu Xie  
**Date**: 03/06/2018  
**Develop-Env**: MacOS - Jupyter Notebook - Python3

In [None]:
# Print the version of the interpreter that is running this notebook.
import sys
print(sys.version_info)

In [None]:
import xml.etree.ElementTree as ET
import re
from os import listdir
from os.path import isfile, join
from pprint import pprint 
import time

# Parsing topic files

In [None]:
def get_top_id(topic):
    '''
    : get_top_id(topic)
    Return the trailing part of a topic element's identifier.

    Arguments:
        > topic: ET.Element. The identifier is read from the value of its
                 first attribute and must look like '<word>.<part>.<rest>'.
    Return:
        > str. The '<rest>' part of the identifier.

    Raises IndexError if the element has no attributes, and ValueError if
    the identifier does not match the pattern exactly once.
    '''
    attribute_values = list(topic.attrib.values())
    identifier = attribute_values[0]
    # The leading '<word>' group is captured by the pattern but not needed.
    [(_, tail)] = re.findall(r'(\w+)\..*?\.(.*)', identifier)
    return tail

def get_top_type(ET_element_pointer):
    '''
    : get_top_type(ET_element_pointer)
    Extract the target id referenced by a topic pointer element.

    Arguments:
        > ET_element_pointer: ET.Element with exactly two attributes; the
                              value of the second one must contain
                              '<word>#id(<target>)'.
    Return:
        > str. The '<target>' text found between the parentheses.
    '''
    # Unpacking enforces "exactly two attributes"; only the second value is used.
    _first, (_name, reference) = ET_element_pointer.attrib.items()
    matches = re.findall(r'\w+#id\((.*)\)', reference)
    return matches[0]

def get_child_data(ET_element_child):
    '''
    : get_child_data(ET_element_child)
    Decode the word reference carried by a child element.

    Arguments:
        > ET_element_child: ET.Element with exactly one attribute whose value
                            looks like '<file>.<X>...(id1)..id(id2)' (or a
                            single '(id1)').
    Return:
        > (k, data): k is the single character following the first dot of
                     the reference; data is the list of integer suffixes of
                     the referenced ids, in order.
    '''
    [(_, reference)] = ET_element_child.attrib.items()
    [(speaker, id_span)] = re.findall(r'.*?\.(\w).*?\((.*)\)', reference)

    # NOTE(review): '[a-zA-z]' also matches the characters between 'Z' and 'a'
    # in ASCII ('[', '\\', ']', '^', '_', '`'); kept unchanged to preserve behaviour.
    number_ptn = r'.*?.\.\w\.[a-zA-z]+(\d*)'
    indices = [
        int(re.findall(number_ptn, word_id)[0])
        for word_id in id_span.split(')..id(')
    ]
    return speaker, indices

def parse_topic(topic):
    '''
    : parse_topic(topic)
    Collect the word references of one topic element.

    Arguments:
        > topic: ET.Element. Its first child is skipped; every other child
                 is decoded with get_child_data.
    Return:
        > dict with a single entry {topic id: [(k, data), ...]}.
    '''
    topic_key = get_top_id(topic)
    # Child 0 is not passed to get_child_data; only the remaining children are.
    remaining_children = list(topic)[1:]
    return {topic_key: [get_child_data(child) for child in remaining_children]}

def parse_element_list(element_list):
    '''
    : parse_element_list(element_list)
    Same job as parse_topic, but for a pre-built list.

    Arguments:
        > element_list: list. Item 0 is the topic id; the rest are elements.
                        NOTE: the id is popped, so the list is modified
                        in place.
    Return:
        > dict with a single entry {topic id: [(k, data), ...]}.
    '''
    key = element_list.pop(0)
    parsed = {key: []}
    # After the pop, the first remaining element is skipped as well.
    parsed[key].extend(get_child_data(node) for node in element_list[1:])
    return parsed

def parse_entire_tree(tree):
    '''
    : parse_entire_tree(tree)
    Flatten one parsed topic file into a list of topics.

    Arguments:
        > tree: ET.ElementTree of a topic file.
    Return:
        > topic_list: list. One entry per topic; each entry is the list of
                      (k, data) tuples produced by get_child_data.

    For a top-level topic that directly contains 'topic' children, every
    sub-topic is emitted first (via parse_topic) and the parent's own
    non-topic children are emitted afterwards (via parse_element_list).
    '''
    topic_list = []
    # list(element) replaces Element.getchildren(), which was removed in
    # Python 3.9.
    for topic in list(tree.getroot()):
        if topic.findall('topic'):
            # id first, then the parent's non-topic children, as expected
            # by parse_element_list
            own_elements = [get_top_id(topic)]
            for element in list(topic):
                if element.tag == 'topic':
                    topic_list += list(parse_topic(element).values())
                else:
                    own_elements.append(element)
            topic_list += list(parse_element_list(own_elements).values())
        else:
            topic_list += list(parse_topic(topic).values())
    return topic_list

def parse_all_topic_files(mypath=None):
    '''
    : parse_all_topic_files(mypath=None)
    Parse every topic file found in the target directory.

    Arguments:
        > mypath: str. By default is None, in which case './topics' is used.
    Return:
        > entire_topics_dict: dict. Maps the file-name prefix (the text
                  before '.<word>.<rest>') to the list returned by
                  parse_entire_tree for that file.

    Files whose name does not match the expected pattern are ignored.
    A file that cannot be parsed is reported and skipped.
    '''
    if not mypath:
        mypath = './topics'
    # load the sorted list of file names from the directory
    file_list = sorted(f for f in listdir(mypath) if isfile(join(mypath, f)))
    print(len(file_list), 'files in total from', mypath) # for checking

    entire_topics_dict = {}
    ptn = r'(.*)\.\w+\..*' # extracts the file-name prefix

    for file in file_list:
        matched = re.findall(ptn, file)
        if not matched:
            # Without this guard a stray file would reuse the previous
            # file's prefix and overwrite its entry.
            continue
        fname = matched[0]
        try:
            tree = ET.parse(join(mypath, file))
            entire_topics_dict[fname] = parse_entire_tree(tree)
        except Exception as err:
            # best effort: report the file and carry on with the rest
            print('skipped:', file, '-', repr(err))
    return entire_topics_dict

# Parsing words files

In [None]:
def parse_word(words_list):
    '''
    : parse_word(words_list)
    Turn the elements of a single words file into a list of tokens.

    Arguments:
        > words_list: list. a list of ET.elements
    Return:
        > vocas: list with one token per element (same order, so tokens can
                 be fetched by index):
                   - '(laugh)' if the element has type == 'laugh'
                   - '_'       if it has any other type
                   - the element text (may be None) if it has no type
    '''
    vocas = []
    for element in words_list:
        attributes = element.attrib
        if 'type' not in attributes:
            token = element.text
        elif attributes['type'] == 'laugh':
            token = '(laugh)'
        else:
            token = '_'
        vocas.append(token)
    return vocas

def parse_all_wordfiles(mypath=None):
    '''
    : parse_all_wordfiles(mypath=None)
    This function is to parse all words files in target directory.

    Arguments:
        > mypath: str. By default is None, in which case './words' is used.
    Return:
        > entire_words_dict: dict. a dict of dict of all words files

    HELP - How to extract target words:
    { 'filename1':{'A':[...],
                   'B':[...],
                   'C':[...],
                   'D':[...],},
      'filename2':{'A':[...],
                   'B':[...],
                   'C':[...],
                   'D':[...],},
         ...
    }

    Files whose name does not match '<filename>.<X>.<rest>' are ignored.
    A file that cannot be parsed is reported and keeps whatever was
    collected for that speaker so far (an empty list if nothing was).
    '''
    if not mypath:
        mypath = './words'
    # load the sorted list of file names from the directory
    file_list = sorted(f for f in listdir(mypath) if isfile(join(mypath, f)))
    print(len(file_list), 'files in total from', mypath) # for checking

    entire_words_dict = {}
    ptn = r'(.*)\.(\w)\..*' # extracts filename and who (A,B,C,D)
    for file in file_list:
        matched = re.findall(ptn, file)
        if not matched:
            # stray files (e.g. hidden OS files) used to raise IndexError here
            continue
        fname, who = matched[0]
        speaker_words = entire_words_dict.setdefault(fname, {}).setdefault(who, [])
        try:
            words_tree = ET.parse(join(mypath, file))
            # list(element) replaces Element.getchildren(), removed in Python 3.9
            words_elemt_list = list(words_tree.getroot())
            speaker_words.extend(parse_word(words_elemt_list))
        except Exception as err:
            print('missing:', who, '-', repr(err))
    return entire_words_dict

# Reminder: multi-processing to accelerate the process

# Parsing segment files

In [None]:
def parse_segment(segments_list, target_dict):
    '''
    : parse_segment(segments_list, target_dict)
    Record the word range of every segment element in target_dict.

    Arguments:
        > segments_list: list of ET.elements. Each must have exactly four
                         attributes (the values of the 3rd and 4th form the
                         key) and a first child with exactly one attribute
                         holding the word reference.
        > target_dict: dict. Updated in place:
                       (start, end) -> [who, first_word(, last_word)]
    Return:
        > None
    '''
    data_ptn = r'.*?\.(\w).*?\((.*)\)'
    # NOTE(review): '[a-zA-z]' also matches a few punctuation characters
    # between 'Z' and 'a'; kept unchanged to preserve behaviour.
    words_ptn = r'.*?.\.\w\.[a-zA-z]+(\d*)' # extract number only

    for segment in segments_list:
        # time range: values of the third and fourth attribute
        [_, _chn, start, end] = segment.attrib.items()
        key = (start[1], end[1])

        # word range: segment[0] replaces segment.getchildren()[0], since
        # Element.getchildren() was removed in Python 3.9
        [(_, raw)] = segment[0].attrib.items()
        [(who, _range)] = re.findall(data_ptn, raw)
        word_range = [who]
        for each in _range.split(')..id('):
            word_range.append(int(re.findall(words_ptn, each)[0]))
        target_dict[key] = word_range

def parse_all_segmentfiles(mypath=None):
    '''
    : parse_all_segmentfiles(mypath=None)
    This function is to parse all segment files in target directory.

    Arguments:
        > mypath: str. By default is None, in which case './segments' is used.
    Return:
        > entire_segments_dict: dict. a dict of dict of all segment files

    HELP - How to extract target segment:
    { 'filename1':{(start, end): ['A', word0, word15],
                    ...,
                   (start, end): ['B', word0, word15],
                    ...,
                   (start, end): ['C', word0, word15],
                    ...,
                   (start, end): ['D', word0, word15],
                    ...
                    },
      'filename2':{(start, end): ['A', word0, word15],
                    ...
                    },
         ...
    }

    Files whose name does not match '<filename>.<X>.<rest>' are ignored.
    A file that fails to parse is reported; segments read from it before
    the failure stay in the result.
    '''
    if not mypath:
        mypath = './segments'
    file_list = sorted(f for f in listdir(mypath) if isfile(join(mypath, f)))
    print(len(file_list), 'files in total from', mypath) # for checking

    entire_segments_dict = {}
    ptn = r'(.*)\.\w\..*' # extracts filename; the speaker letter is not captured

    for file in file_list:
        matched = re.findall(ptn, file)
        if not matched:
            # stray files (e.g. hidden OS files) used to raise IndexError here
            continue
        fname = matched[0]
        file_segments = entire_segments_dict.setdefault(fname, {})
        try:
            segments_tree = ET.parse(join(mypath, file))
            # list(element) replaces Element.getchildren(), removed in Python 3.9
            segments_elemt_list = list(segments_tree.getroot())
            parse_segment(segments_elemt_list, file_segments)
        except Exception as err:
            print('missing:', fname, '-', repr(err))
    return entire_segments_dict

# Other functions for converting to txt

In [None]:
def get_topic_range(subtopic):
    '''
    : get_topic_range(subtopic)
    Work out, per speaker, the overall word range covered by a sub-topic.

    Arguments:
        > subtopic: iterable of (who, range) pairs, where range is a
                    non-empty list of word indices.
    Return:
        > dict {who: [lowest, highest]} computed from the first and last
          index of every range belonging to that speaker.
    '''
    endpoints = {}
    for who, span in subtopic:
        # only the two ends of each range matter
        endpoints.setdefault(who, []).extend((span[-1], span[0]))
    return {who: [min(ends), max(ends)] for who, ends in endpoints.items()}

def segment_in_topic(speaker_range, line_range, who):
    '''
    : segment_in_topic(speaker_range, line_range, who)
    Clip a segment's word range to the speaker's range inside a sub-topic.

    Arguments:
        > speaker_range: dict. {who: [start, end]} word range per speaker
        > line_range: list. word range of the segment
        > who: str. which speaker
    Return:
        > line_range itself      if both ends lie inside the speaker range
        > [first, speaker end]   if only the first index lies inside
        > [speaker start, last]  if only the last index lies inside
        > None                   otherwise
    '''
    low, high = speaker_range[who]
    window = range(low, high + 1)
    starts_inside = line_range[0] in window
    ends_inside = line_range[-1] in window

    if starts_inside and ends_inside:
        return line_range
    if starts_inside:
        return [line_range[0], high]
    if ends_inside:
        return [low, line_range[-1]]
    # NOTE(review): a segment that starts before and ends after the window
    # also lands here - confirm that dropping it is intended.
    return None

def preprocess_segments(top, segments_list):
    '''
    : preprocess_segments(top, segments_list)
    Select the segments that belong to one sub-topic.

    Arguments:
        > top: list of (who, range) pairs describing the sub-topic
        > segments_list: list of [who, first(, last)] segments
    Return:
        > list of segments for this sub-topic:
            - segments lying fully inside the speaker's range are kept as is
            - a segment running past the end of the range is emitted as two
              entries: the inside part, then the remainder after the range
            - a segment starting before the range keeps only its inside part
            - segments of speakers absent from the sub-topic, or lying
              outside the range, are left out
    '''
    topic_bounds = get_topic_range(top)
    kept = []
    for segment in segments_list:
        who, original = segment[0], segment[1:]
        if who not in topic_bounds:
            continue
        clipped = segment_in_topic(topic_bounds, original, who)
        if not clipped:
            continue
        if clipped == original:
            kept.append(segment)
            continue

        same_head = original[0] == clipped[0]
        same_tail = original[-1] == clipped[-1]
        # tail was trimmed: keep the inside part, then the trimmed remainder
        if same_head and original[-1] > clipped[-1]:
            kept.append([who, clipped[0], clipped[1]])
            kept.append([who, clipped[-1] + 1, original[-1]])
        # head was trimmed: keep the inside part only
        if same_tail and original[0] < clipped[0]:
            kept.append([who, clipped[0], clipped[1]])
    return kept

def convert_to_txt(topics_list, all_words):
    '''
    : convert_to_txt(topics_list, all_words)
    Render the segments of every topic as text lines.

    Arguments:
        > topics_list: list of topics; each topic is a list of
                       [who, first(, last)] segments
        > all_words: dict {who: [word, ...]} for one meeting
    Return:
        > list with one list of strings per topic. Each segment becomes
          '<who>: w1 w2 ... ' (every word followed by a space; falsy words
          such as None or '' are skipped) and every topic is closed by a
          '**********' line.
    '''
    rendered_topics = []
    for topic in topics_list:
        lines = []
        for segment in topic:
            who, span = segment[0], segment[1:]
            if len(span) > 1:
                first, last = span
            else:
                first = last = span[0]
            looked_up = (all_words[who][position] for position in range(first, last + 1))
            lines.append(who + ': ' + ''.join(word + ' ' for word in looked_up if word))
        lines.append('**********')
        rendered_topics.append(lines)
    return rendered_topics

def write_file(file_txt, filename):
    '''
    : write_file(file_txt, filename)
    Write the rendered lines of one meeting to './txt_files/<filename>.txt'.

    Arguments:
        > file_txt: list of topics; each topic is a list of str lines
        > filename: str. Output file name without the '.txt' extension
    Return:
        > None

    The output directory is created when missing, and the file is closed
    even if a write fails.
    '''
    import os  # local import: the notebook only imports selected names from os

    out_dir = './txt_files'
    # Without this a missing directory raised FileNotFoundError on open().
    os.makedirs(out_dir, exist_ok=True)
    with open(out_dir + '/' + filename + '.txt', 'w') as f:
        f.writelines(line + '\n' for topic in file_txt for line in topic)

## Load all files:

In [None]:
# Start the timer that the final cell reports as total run time.
start = time.time()
#All topic
all_topic_files = parse_all_topic_files()

#All Words
all_wordfiles = parse_all_wordfiles()
# Sample meeting kept for inspection; .get() avoids a KeyError (and an
# aborted load) when that meeting is not part of the data set.
test_words = all_wordfiles.get('ES2002a')

#All segments
all_segments = parse_all_segmentfiles()

* * *

# Parse and Write to txt file

In [None]:
mypath = './topics'
# load the sorted list of file names from the directory
file_list = sorted(f for f in listdir(mypath) if isfile(join(mypath, f)))
entire_words_dict = {} #initialise a dict for storing
ptn = r'(.*)\.\w+\..*' # extracts the file-name prefix
filename_list = []

for file in file_list:
    matched = re.findall(ptn, file)
    if not matched:
        # stray files (e.g. hidden OS files) used to raise IndexError here
        continue
    filename_list.append(matched[0])

for filename in filename_list:
    try:
        # load target topic file
        topic_file = all_topic_files[filename]

        # order the segments of this meeting by start time
        _temp = all_segments[filename]
        segment_list = list(_temp.keys())
        segment_list.sort(key=lambda x: float(x[0]))
        scripts_list = [_temp[key] for key in segment_list]

        # assign segments to each sub-topic
        new_topic_list = []
        for subtopic in topic_file:
            new_topic_list.append(preprocess_segments(subtopic, scripts_list))

        words_file = all_wordfiles[filename]
        all_txt = convert_to_txt(new_topic_list, words_file)
        write_file(all_txt, filename)
    except Exception as err:
        # best effort: report the meeting that failed instead of dropping it
        # silently, then continue with the next one
        print('skipped:', filename, '-', repr(err))
end = time.time()
print('Total use time:', round(end-start,4),'s')