In [1]:
import xml.etree.ElementTree as ET
import re
from os import listdir
from os.path import isfile, join
from pprint import pprint 

In [2]:
def get_top_id(topic):
    '''
    Extract the topic-specific tail of a topic element's identifier.

    The identifier is expected to look like ``<file>.<kind>.<rest>``; the
    part after the second dot (``<rest>``) is returned.

    The identifier is read from the attribute whose local name is ``id``
    (either plain ``id`` or namespace-qualified ``{uri}id``), so the result
    does not depend on the order in which attributes were written.  When no
    such attribute exists, the first attribute's value is used instead.

    topic: element exposing an ``attrib`` mapping (ET.Element)
    return: str
    raises: ValueError if the element has no attributes or the identifier
            does not have the expected shape
    '''
    ptn_topic_id = r'(\w+)\..*?\.(.*)'
    attrib = topic.attrib
    if not attrib:
        raise ValueError('topic element has no attributes to read an id from')

    # Prefer the attribute that is actually named "id" over positional lookup.
    id_values = [value for key, value in attrib.items()
                 if key == 'id' or key.endswith('}id')]
    topic_id = id_values[0] if id_values else next(iter(attrib.values()))

    matches = re.findall(ptn_topic_id, topic_id)
    if len(matches) != 1:
        raise ValueError('cannot parse topic id: %r' % (topic_id,))
    _file_name, top_id = matches[0]
    return top_id

def get_top_type(ET_element_pointer):
    '''
    Read the topic type referenced by a topic pointer (ET.Element).

    The element must carry exactly two attributes.  The value of the second
    one is searched for ``<word>#id(<type>)`` and ``<type>`` is returned (str).
    '''
    # Unpacking enforces the two-attribute shape; only the second value is used.
    _first_attr, (_second_name, target) = ET_element_pointer.attrib.items()
    found = re.findall(r'\w+#id\((.*)\)', target)
    return found[0]

def get_child_data(ET_element_child):
    '''
    Decode the single attribute of a child element.

    The attribute value is split into a one-character key (the word
    character following the first dot) and the text between the first "("
    and the last ")".  That text is cut at every ")..id(" and the digits
    that end each piece are converted to int.

    return (key, [int, ...])
    '''
    # The element must have exactly one attribute; its name is irrelevant.
    [(_attr_name, target)] = ET_element_child.attrib.items()
    [(key, id_range)] = re.findall(r'.*?\.(\w).*?\((.*)\)', target)

    # NOTE: the class [a-zA-z] also admits the punctuation that sits between
    # "Z" and "a" in ASCII; it is kept exactly as it was.
    number_ptn = r'.*?.\.\w\.[a-zA-z]+(\d*)'
    numbers = [int(re.findall(number_ptn, piece)[0])
               for piece in id_range.split(')..id(')]
    return key, numbers

def parse_topic(topic):
    '''
    Build ``{topic id: [(key, numbers), ...]}`` for one topic element.

    The id comes from get_top_id.  The first child of the topic is skipped;
    every other child is decoded with get_child_data, in document order.
    '''
    topic_id = get_top_id(topic)
    entries = []
    for child in list(topic)[1:]:  # child 0 is not a data element
        key, numbers = get_child_data(child)
        entries.append((key, numbers))
    return {topic_id: entries}

def parse_element_list(element_list):
    '''
    Build ``{id: [(key, numbers), ...]}`` from a list shaped as
    ``[id, first_element, data_element, ...]``.

    The leading id is popped off ``element_list`` (the caller's list is
    modified in place).  Of what remains, the first element is skipped and
    each following one is decoded with get_child_data.
    '''
    head_id = element_list.pop(0)
    collected = []
    for element in element_list[1:]:  # position 0 is not a data element
        key, numbers = get_child_data(element)
        collected.append((key, numbers))
    return {head_id: collected}

def parse_entire_tree(tree):
    '''
    Collect the data of every topic found in a parsed topic file.

    Each child of the tree's root is treated as a topic:
      * a topic without direct ``topic`` children is handed to parse_topic;
      * otherwise each direct ``topic`` child is handed to parse_topic, and
        the remaining children, prefixed with the outer topic's id, are
        handed to parse_element_list.  The nested topics' results are added
        before the outer topic's own result.

    tree: ET.ElementTree
    return: list with one ``[(key, numbers), ...]`` list per topic
    '''
    topic_list = []
    # list(element) replaces Element.getchildren(), which no longer exists
    # in Python 3.9 and later.
    for topic in list(tree.getroot()):
        if topic.findall('topic'):
            own_elements = [get_top_id(topic)]
            for element in list(topic):
                if element.tag == 'topic':
                    topic_list.extend(parse_topic(element).values())
                else:
                    own_elements.append(element)
            topic_list.extend(parse_element_list(own_elements).values())
        else:
            topic_list.extend(parse_topic(topic).values())
    return topic_list

In [16]:
def parse_all_topic_files(mypath=None):
    '''
    Parse every regular file found directly inside a directory.

    mypath: directory to read; './topics' is used when it is None or empty.
    return: dict mapping a file-name prefix (the text before the last two
            dot-separated parts, e.g. 'ES2002a' for 'ES2002a.topic.xml') to
            the list produced by parse_entire_tree for that file.

    Processing is best effort: a file that cannot be parsed keeps an empty
    list, and a file whose name has no such prefix is ignored.  Every file
    that was skipped is reported with the reason instead of being dropped
    silently.
    '''
    if not mypath:
        mypath = './topics'
    # load the sorted list of file names from the directory
    file_list = sorted(f for f in listdir(mypath) if isfile(join(mypath, f)))
    print(len(file_list), 'files in total from', mypath) # for checking

    entire_topics_dict = {}
    ptn = r'(.*)\.\w+\..*' # extracts the file-name prefix
    skipped = [] # (file name, reason) pairs

    for file_name in file_list:
        prefix_match = re.findall(ptn, file_name)
        if not prefix_match:
            # e.g. a stray file without two dots in its name
            skipped.append((file_name, 'unexpected file name'))
            continue
        fname = prefix_match[0]
        entire_topics_dict.setdefault(fname, [])
        path = join(mypath, file_name)
        try:
            entire_topics_dict[fname] = parse_entire_tree(ET.parse(path))
        except Exception as exc:
            # keep going, but remember why this file produced nothing
            skipped.append((file_name, repr(exc)))

    for file_name, reason in skipped:
        print('Skipped', file_name, '-', reason)
    return entire_topics_dict

In [17]:
# Parse everything under the default directory ('./topics'); the result maps
# each file-name prefix to that file's list of topic data.
topic_all = parse_all_topic_files()

139 files in total from ./topics


In [13]:
# Show the file-name prefixes that were collected.
pprint(list(topic_all))

['ES2002a',
 'ES2002b',
 'ES2002c',
 'ES2002d',
 'ES2003a',
 'ES2003b',
 'ES2003c',
 'ES2003d',
 'ES2004a',
 'ES2004b',
 'ES2004c',
 'ES2004d',
 'ES2005a',
 'ES2005b',
 'ES2005c',
 'ES2005d',
 'ES2006a',
 'ES2006b',
 'ES2006d',
 'ES2007a',
 'ES2007b',
 'ES2007c',
 'ES2007d',
 'ES2008a',
 'ES2008b',
 'ES2008c',
 'ES2008d',
 'ES2009a',
 'ES2009b',
 'ES2009c',
 'ES2009d',
 'ES2010a',
 'ES2010b',
 'ES2010c',
 'ES2010d',
 'ES2011a',
 'ES2011b',
 'ES2011c',
 'ES2011d',
 'ES2012a',
 'ES2012b',
 'ES2012c',
 'ES2012d',
 'ES2013a',
 'ES2013b',
 'ES2013c',
 'ES2013d',
 'ES2014a',
 'ES2014b',
 'ES2014c',
 'ES2014d',
 'ES2015a',
 'ES2015d',
 'ES2016a',
 'ES2016b',
 'ES2016c',
 'ES2016d',
 'IB4003',
 'IB4005',
 'IB4010',
 'IB4011',
 'IS1000a',
 'IS1000b',
 'IS1000c',
 'IS1000d',
 'IS1001a',
 'IS1001b',
 'IS1001c',
 'IS1001d',
 'IS1002b',
 'IS1002c',
 'IS1002d',
 'IS1003a',
 'IS1003b',
 'IS1003c',
 'IS1003d',
 'IS1004a',
 'IS1004b',
 'IS1004c',
 'IS1004d',
 'IS1005a',
 'IS1005b',
 'IS1005c',
 'IS1006

# Load a single topic file so parse_entire_tree can be tried on one tree.
tree = ET.parse('./topics/ES2002c.topic.xml')
def parse_entire_tree(tree):
    '''
    Collect the data of every topic found in a parsed topic file.

    NOTE: this repeats the parse_entire_tree defined in an earlier cell and
    replaces it once this cell has run.

    Each child of the tree's root is treated as a topic:
      * a topic without direct ``topic`` children is handed to parse_topic;
      * otherwise each direct ``topic`` child is handed to parse_topic, and
        the remaining children, prefixed with the outer topic's id, are
        handed to parse_element_list.  The nested topics' results are added
        before the outer topic's own result.

    tree: ET.ElementTree
    return: list with one ``[(key, numbers), ...]`` list per topic
    '''
    topic_list = []
    # list(element) replaces Element.getchildren(), which no longer exists
    # in Python 3.9 and later.
    for topic in list(tree.getroot()):
        if topic.findall('topic'):
            own_elements = [get_top_id(topic)]
            for element in list(topic):
                if element.tag == 'topic':
                    topic_list.extend(parse_topic(element).values())
                else:
                    own_elements.append(element)
            topic_list.extend(parse_element_list(own_elements).values())
        else:
            topic_list.extend(parse_topic(topic).values())
    return topic_list

In [None]:
# Run the parser on the single tree loaded earlier; the returned list is the
# cell's displayed value.
parse_entire_tree(tree)