### Loading the XES file as a dict with lxml

In [1]:
from lxml import objectify
import dateutil.parser
import pickle
import os


def load_as_xml():
    """Parse the BPI Challenge 2018 XES log with lxml.objectify.

    Returns:
        The objectified root element of "BPI Challenge 2018.xes", read from
        the current working directory.

    Raises:
        IOError: if the XES file cannot be opened.
    """
    # Read inside a context manager so the handle is closed right away, and
    # read bytes so lxml resolves the XML encoding declaration itself.
    with open("BPI Challenge 2018.xes", 'rb') as xes_file:
        xes_as_xml = objectify.fromstring(xes_file.read())
    return xes_as_xml


def load_as_dict(year):
    """Return the traces of one year as a dict, using a pickle cache.

    If "BP_Challenge_2018-<year>.pickle" exists in the working directory it
    is loaded and returned. Otherwise "BPI Challenge 2018.xes" is parsed,
    the traces whose 'year' attribute equals ``year`` are converted with
    ``extract_basic_elements``, and the result is pickled to that file.

    Args:
        year: year to keep (an int, or anything ``int()`` accepts).

    Returns:
        dict mapping each trace's 'identity:id' to its attribute dict. Each
        attribute dict also has an 'event' key holding the list of event
        dicts sorted by 'time:timestamp' (an empty list when the trace has
        no events).
    """
    cache_path = "BP_Challenge_2018-%s.pickle" % year

    # if the dict already exists
    if os.path.isfile(cache_path):
        print("Loading data from file")
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # cache files that were produced by this notebook.
        # Pickles are binary data: open in 'rb' and close the handle.
        with open(cache_path, 'rb') as cache_file:
            return pickle.load(cache_file)

    # loading the xes file (bytes, so lxml handles the encoding declaration)
    with open("BPI Challenge 2018.xes", 'rb') as xes_file:
        xes_xml = objectify.fromstring(xes_file.read())
    print("XES file loaded")

    wanted_year = int(year)
    xes_as_dict = {}
    for trace in xes_xml.trace:
        # converting basic elements
        trace_dict = extract_basic_elements(trace)

        # skip traces from other years (or without a 'year' attribute)
        trace_year = trace_dict.get('year')
        if trace_year is None or int(trace_year) != wanted_year:
            continue

        # converting events and saving in list; objectify raises
        # AttributeError for a missing child, so default to no events
        trace_dict['event'] = [extract_basic_elements(e)
                               for e in getattr(trace, 'event', [])]
        # ordering event list by date
        trace_dict['event'].sort(key=lambda x: x['time:timestamp'])

        # appending trace to dict
        xes_as_dict[trace_dict['identity:id']] = trace_dict

    print("Dict file created")
    # to free up memory
    del xes_xml

    # saving dict into file (binary mode, matching the 'rb' read above)
    with open(cache_path, 'wb') as cache_file:
        pickle.dump(xes_as_dict, cache_file)

    print("Dict file saved as %s" % cache_path)
    return xes_as_dict


def extract_basic_elements(element):
    """Collect the typed attribute children of ``element`` into a dict.

    For each of the tags id, int, string, boolean, date and float (in that
    order), every child with that tag contributes one entry: its 'key'
    attribute maps to its converted 'value' attribute. When the same key
    appears under several tags, the tag processed last wins.

    Conversions: id/string keep the raw value, int -> int, float -> float,
    boolean -> 1 when the value is exactly 'true' else 0, date ->
    ``dateutil.parser.parse``.
    """
    # (tag, converter) pairs, in processing order. Lambdas keep every
    # converter lazy, so nothing is resolved until a matching child exists.
    converters = (
        ('id', lambda raw: raw),
        ('int', lambda raw: int(raw)),
        ('string', lambda raw: raw),
        ('boolean', lambda raw: 1 if raw == 'true' else 0),
        ('date', lambda raw: dateutil.parser.parse(raw)),
        ('float', lambda raw: float(raw)),
    )

    d_elem = {}
    for tag, convert in converters:
        # only touch the attribute when at least one such child exists
        if not element.xpath(tag):
            continue
        for child in getattr(element, tag):
            d_elem[child.get('key')] = convert(child.get('value'))

    return d_elem


## Apenas 2015

In [64]:
# Load the 2015 traces; load_as_dict reads its pickle cache when the cache
# file already exists, otherwise it parses the XES log and writes the cache.
xes_as_dict_2015 = load_as_dict(2015)

Loading data from file


In [None]:
qtd_trace = len(xes_as_dict_2015.keys())
print "Total de casos: ", qtd_trace

In [66]:
# convertendo em lista de casos e filtrando
casos_2015 = []
for key, caso in xes_as_dict_2015.items():
#     if caso['event'][-1]['activity'] == 'finish payment':
    # adiciona a data do primeiro, do ultimo evento e o interval em dias
    if caso.get('event'):
        caso['finish payment'] = 1 if caso['event'][-1]['activity'] == 'finish payment' else 0
        caso['start_time'] = caso['event'][0]['time:timestamp']
        caso['end_time'] = caso['event'][-1]['time:timestamp']
        caso['tot_events'] = len(caso['event'])
        delta = caso['event'][-1]['time:timestamp'] - caso['event'][0]['time:timestamp']
        caso['delta_time'] = delta.days
        casos_2015.append(caso)

# ordenando os casos por data de inicio
casos_2015.sort(key=lambda x: x['start_time'])
print "Total de casos apos remocao: ", len(casos_2015)

Total de casos apos remocao:  14750


#### Coletando apenas os eventos

In [67]:
eventos = []
for caso in casos_2015:
    eventos += caso['event']
print "Total de eventos: ", len(eventos)

Total de eventos:  897813


#### Coletando apenas os atributos dos casos

In [68]:
for caso in casos_2015:
    caso.pop('event')
print casos_2015[0]

{'program-id': '215', 'applicant': '88dc21ca7a587c35', 'penalty_AGP': False, 'penalty_BGK': False, 'penalty_AVJLP': False, 'year': '2015', 'finish payment': 1, 'concept:name': 'cb3425ce193199d7', 'penalty_BGP': False, 'basic payment': True, 'penalty_AJLP': False, 'greening': True, 'penalty_AVGP': False, 'penalty_C16': False, 'penalty_B16': False, 'penalty_AVBP': False, 'area': 6.8887, 'risk_factor': 1.0, 'selected_manually': False, 'application': 'cb3425ce193199d7', 'penalty_C9': False, 'cross_compliance': 0.0, 'tot_events': 40, 'department': 'e7', 'penalty_C4': False, 'selected_random': False, 'amount_applied0': 2371.85, 'penalty_AUVP': False, 'penalty_B5': False, 'start_time': datetime.datetime(2014, 5, 4, 0, 0, tzinfo=tzoffset(None, 7200)), 'delta_time': 655, 'penalty_amount0': 0.0, 'rejected': False, 'penalty_AVUVP': False, 'penalty_CC': False, 'penalty_V5': False, 'penalty_BGKV': False, 'penalty_B5F': False, 'selected_risk': False, 'penalty_GP1': False, 'small farmer': False, 'pen

In [69]:
# Number of attributes on each case, in the same order as casos_2015.
count_attr = [len(j) for j in casos_2015]

In [72]:
# Largest number of attributes found on any single case.
max(count_attr)

67

In [70]:
# Position (in count_attr, which parallels casos_2015) of the first case
# that has the largest attribute count.
print count_attr.index(max(count_attr))

4886


In [71]:
import csv

# Build the CSV header from the data instead of a hard-coded row index.
# Start with the keys of the first case that has the most attributes (the
# case the previous cells located), then append any key that appears only
# on other cases. DictWriter raises ValueError for a row key that is missing
# from the header, so the header must cover every case.
keys = list(max(casos_2015, key=len).keys())
seen_keys = set(keys)
for caso in casos_2015:
    for attr in caso:
        if attr not in seen_keys:
            seen_keys.add(attr)
            keys.append(attr)

# 'wb' because this notebook runs on Python 2, where the csv module expects
# a binary-mode file. Cases lacking a column get an empty cell (DictWriter's
# default restval).
with open('casos_attr_2015.csv', 'wb') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(casos_2015)