In [1]:
import time
import os
from bs4 import BeautifulSoup
import pandas as pd
import re
import json
import xmltodict
import csv
from functools import reduce

In [2]:
def retrieve_sequence(node_dict, first_node):
    ''' Computes a sequence of nodes given the first node
        and a dictionary with all the transitions
    '''
    sequence = []
    current = first_node
    sequence.append(current)
    while current in node_dict:
        next_ = node_dict[current]
        current = next_
        sequence.append(next_)
    
    return sequence    

In [3]:
def get_detailed_sequence(sequence, id_type_dict, id_tool_dict):
    ''' Enrich a sequence of acitvities with the acitvity type information
        or the tool type when an acitivity represents a tool
    '''
    sequence_detailed = [(x, id_tool_dict[x]) if x in id_tool_dict 
                                  else (x, id_type_dict[x]) for x in sequence]
    return sequence_detailed

In [4]:
def get_subdirectories(input_dir):
    return [name for name in os.listdir(input_dir)
            if os.path.isdir(os.path.join(input_dir, name))]

In [5]:
def get_tool_paths(graph_folder):
    '''  Get all the paths of the xml files that refer to Tools '''
    tool_xml_list = []    
    subdirs = [x[0] for x in os.walk(graph_folder)][1:]
    if len(subdirs) != 0:
        tool_xml_list = [x + '/tool.xml' for x in subdirs]
    else: 
        tool_xml_list = [x[2] for x in os.walk(graph_folder)][0]
        tool_xml_list = [graph_folder + '/' + x for x in tool_xml_list if x.endswith('tool.xml')]
    
    return tool_xml_list

In [6]:
def create_tool_jsonb(lesson_id, id_tool_dict, subdir, matching_dict):
    ''' Creates the json representation for the content of a Tool
        and add a new row in the Activities_Info.csv
    '''
    activities_info_dict = {}
    tool_path_list = get_tool_paths(subdir)
    for tool_path in tool_path_list: 
        if '__MACOSX' in tool_path or 'Image' in tool_path: 
            continue
        filename, _ = os.path.splitext(tool_path)
        filename = re.split('\\\\|/',filename)[-2]
        with open(tool_path, 'r') as f:
            xml_string = f.read()
        # temp solution 
        try:
            data = xmltodict.parse(xml_string)
            assert len(data.keys())==1, 'Tool json files has not only one primary key'
            parent_key = next(iter(data))
            new_data = data[parent_key]
            new_data['tool_type']= parent_key
            new_data.move_to_end('tool_type', last=False)          
            jsonString = json.dumps(new_data, indent=4)
        except:
            print('not well-formed (invalid token):', tool_path)
            invalid_tool_xmls.append(tool_path)
            jsonString = 'null'
#         jsonString = json.dumps(xmltodict.parse(xml_string), indent=4)
        activity_id = int(matching_dict[filename])
        activities_info_dict[activity_id] =  [lesson_id, id_tool_dict[activity_id], jsonString]
                                     
    Activities_Info = pd.DataFrame.from_dict(activities_info_dict, orient='index', columns=[
        'lesson_id', 'tool_type', 'data'])
    Activities_Info.to_csv('data/Activities_Info.csv', sep=',', 
                           encoding='utf-8', mode='a', header=False)

In [7]:
def initialize_csv_outputs(filename, column_list):
    output_df = pd.DataFrame(columns=column_list)
    output_df.set_index('id', inplace=True)
    output_df.to_csv('data/' + filename + '.csv', sep=',', encoding='utf-8')

In [8]:
def append_to_csv(data_dict, filename, column_list):
    output_df = pd.DataFrame.from_dict(data_dict, orient='index', columns=column_list[1:])
    output_df.to_csv('data/' + filename + '.csv', sep=',', encoding='utf-8', mode='a', header=False)

In [9]:
with open('data/activity_types.csv', 'r') as act_in, open('data/tool_types.csv', 'r') as tool_in:        
    act_reader = csv.reader(act_in)
    tool_reader = csv.reader(tool_in)
    activity_types_dict = {int(row[0]):row[1] for row in act_reader}
    tool_names_dict = {row[0]:row[1] for row in  tool_reader}
    act_in.close()
    tool_in.close()


complex_activity_types = [6, 7, 8, 10, 11, 12, 13, 15]
complex_followed_by_single = ['PARALLEL_ACTIVITY_TYPE', 'OPTIONS_ACTIVITY_TYPE', 'FLOATING_ACTIVITY_TYPE']

In [10]:
tables_info = [
    # test_lessons is used for debugging
    ('test_lessons', ['id', 'title', 'userID', 'length', 'total_activities', 'filename']),
    ('Lessons', ['id', 'title', 'userID', 'length', 'total_activities']),
    ('Sequences', ['id', 'lessons_id', 'parent_id', 'activities', 'main']),
    ('Activities', ['id', 'lessons_id', 'type', 'title', 'subsequences']),
    ('Activities_Info', ['id', 'lessons_id', 'tool_type', 'data'])
]
# TODO: uncomment
# for output_file, column_list in tables_info:
#     initialize_csv_outputs(output_file, column_list)

In [23]:
def sequence_analysis(graph_folder):
    
    print('Analyzing:', graph_folder)   
    xml_path = graph_folder + '/learning_design.xml'
    filename, _ = os.path.splitext(xml_path)
    filename = re.split('\\\\|/',filename)[-2]
    with open(xml_path) as fp:
        soup = BeautifulSoup(fp, 'xml')
    if soup.find('validDesign').text == 'false':
        print("---> Invalid Learning Design - Graph was not analyzed")
        print('-----------------------------------------------------------------------------')
        invalid_designs.append(filename)
        return

    title = soup.find('title').text
    userID = soup.find('userID').text
    global lesson_id
    last_lesson_id = lesson_id
    lesson_id += 1
    global sequence_id
    last_sequence_id = sequence_id 
    global activity_id
    last_activity_id = activity_id 
    
    floatingActivity = None
    
    '''
    Parse the activities
    '''
    complex_activities = [] 
    child_activities = [] #TODO: can delete this and use child_parent_dict keys
    desingID_manualID = {}
    manualID_desingID = {}
    id_type_dict = dict()
    uiid_id_dict = dict()
    id_tool_dict = dict ()
    # toolContentID is the same ID used in the xml file
    toolContentID_id_dict = dict()
    id_title_dict = dict()
    child_parent_dict = dict()
    parent_children_dict = dict()   
    
    activity_id += 1
    first_activity = activity_id
    desingID_manualID[soup.find('firstActivityID').text] = first_activity    
    activities = soup.find('activities').findAll(
        'org.lamsfoundation.lams.learningdesign.dto.AuthoringActivityDTO') 
    for act in activities:
        #store activity fields
        if act.find('activityID').text not in desingID_manualID:
            activity_id += 1 
            activity_ID = activity_id
            desingID_manualID[act.find('activityID').text] = activity_id
        else:
            activity_ID = first_activity
#         activity_ID = act.find('activityID').text
        activity_UIID = act.find('activityUIID').text    
        activityType_ID = int(act.find('activityTypeID').text)
        parentID = act.find('parentActivityID')
        act_title = act.find('activityTitle').text if act.find('activityTitle') else 'null'
        id_title_dict[activity_ID] = act_title
        # store info derived from the activity fields
        activity_type = activity_types_dict[activityType_ID]
        #
        if activity_type == 'FLOATING_ACTIVITY_TYPE':
            if lesson_id not in floating_dict:
                floating_dict[lesson_id] = 1
            else:
                floating_dict[lesson_id] += 1
                print(title)
                print(activity_ID)
            
            
        
        uiid_id_dict[activity_UIID] = activity_ID
        id_type_dict[activity_ID] = activity_type
        if activity_type == 'TOOL_ACTIVITY_TYPE':
            toolContentID_id_dict[act.find('toolContentID').text] = activity_ID 
        if parentID:
            child_parent_dict[activity_ID] = parentID.text
        if activityType_ID in complex_activity_types: 
            complex_activities.append(activity_ID)       
        if act.find('parentActivityID'): 
            child_activities.append(activity_ID)
        if act.find('toolSignature'):
            id_tool_dict[activity_ID] = tool_names_dict[act.find('toolSignature').text]
    
    for key, value in desingID_manualID.items():
        manualID_desingID[value] = key
    
    child_parent_dict_translated = {}
    for key, value in child_parent_dict.items():
        child_parent_dict_translated[key] = desingID_manualID[value]
    child_parent_dict = child_parent_dict_translated
    
    # length of the graph (num of displayed nodes)
    numOfNodes = len(activities) - len(child_activities)
    complex_nodes = [act_id for act_id in complex_activities if act_id not in child_activities]
    last_children = [child_id for child_id in child_activities 
                     if child_id not in list(child_parent_dict.values())]
    # create dict storing the parent and all its children
    for value in child_parent_dict.values():
        parent_children_dict[value] = []
    for key,value in child_parent_dict.items():
        parent_children_dict[value].append(key)
            
    '''
    Parse the transitions
    '''    
    transition_dict = dict()
    transitions = soup.find('transitions').findAll(
        'org.lamsfoundation.lams.learningdesign.dto.TransitionDTO')
    for tran in transitions:
        from_tran = tran.find('fromActivityID').text
        to_tran = tran.find('toActivityID').text
        transition_dict[ desingID_manualID[from_tran] ] = desingID_manualID[ to_tran ]
        
    '''
    Compute main sequence and subsequences
    '''    
    main_sequence = retrieve_sequence(transition_dict, first_activity)
    # store all the sub-sequences constructed by an activity of type SEQUENCE_ACTIVITY_TYPE
    sequence_pointers = dict()
    for act in complex_activities:
        # 2nd condition id to deal with empty sequences
        if id_type_dict[act] == 'SEQUENCE_ACTIVITY_TYPE' and (act in parent_children_dict):   
            seq_activities = parent_children_dict[act]
            first_child = list(filter(lambda x: x not in transition_dict.values(), seq_activities))
            assert len(first_child) == 1, 'SEQUENCE_ACTIVITY_TYPE has more than one first children' 
            first_child = first_child[0]
            subseq = retrieve_sequence(transition_dict, first_child)
            subseq_detailed = [(x, id_tool_dict[x]) if x in id_tool_dict 
               else (x, id_type_dict[x]) for x in subseq]
            
            parent_act = child_parent_dict[act]
            if parent_act in sequence_pointers:
                sequence_pointers[parent_act].append(subseq)
            else:
                sequence_pointers[parent_act] = []
                sequence_pointers[parent_act].append(subseq)
    
    # store all the sub-sequences
    for act in complex_activities:
        if id_type_dict[act] in complex_followed_by_single:
            act_children = parent_children_dict[act]
            for child in act_children:
                # TODO: investigate if this check is required
                assert id_type_dict[child] != 'SEQUENCE_ACTIVITY_TYPE', ('EROOR:'
                                        ' {} leads to a SEQUENCE_ACTIVITY_TYPE').format(id_type_dict[act]) 
                next_acts = [child]                
                if act in sequence_pointers:
                    sequence_pointers[act].append(next_acts)
                else:
                    sequence_pointers[act] = []
                    sequence_pointers[act].append(next_acts)
            
                

                


    '''
    Display general info
    '''
    sequences_dict = {}
#     curr_id = last_seq_index + 1    
    sequence_id += 1
    sequences_dict[sequence_id] = [lesson_id, 'null', {int(x) for x in main_sequence}, True]
    # TODO: comment out
    print("Length of Sequence:", len(main_sequence), ", Nodes:", numOfNodes, 
          ", Number of transitions:" , len(transition_dict),
          ", Activities:", len(activities) , ", Complex Nodes:", len(complex_nodes),
          ", Complex Activities:",len(complex_activities), "\n")
    
    total_act = len(main_sequence)
    total_act += list(id_type_dict.values()).count("SEQUENCE_ACTIVITY_TYPE")
    # TODO: comment out
    print("Main sequence:\n", get_detailed_sequence(main_sequence, id_type_dict, id_tool_dict), "\n")
    print('----->sequence_pointers:')
    
    floatingActivity_list = []
    for key, value in sequence_pointers.items():
        if id_type_dict[key] == 'FLOATING_ACTIVITY_TYPE': 
            total_act += 1
        for subval in value:
            int_subval = [int(x) for x in subval]
            assert len(int_subval) == len(set(int_subval)),('There are missing values in the ' + 
                    'activities int[] column from table sequences')
            if key not in main_sequence and not any(
            key in sublist for list_ in sequence_pointers.values() for sublist in list_):
                # TODO: comment out
                print('--------------------------')
                total_act += len(subval)
                floatingActivity_list.append(subval)
                # TODO: comment out
                print('--------------------------\n', key, "-", id_type_dict[key], "---->",
                      get_detailed_sequence(subval, id_type_dict, id_tool_dict), '\n--------------------------')
                sequence_id += 1
                sequences_dict[sequence_id] = [lesson_id, 'null', set(int_subval), False]
            else:
                # TODO: comment out
                print(key, "-", id_type_dict[key], "---->",
                      get_detailed_sequence(subval, id_type_dict, id_tool_dict))
                sequence_id += 1
                sequences_dict[sequence_id] = [lesson_id, int(key), set(int_subval), False]
                total_act +=len(subval)

    assert total_act == len(activities), ('Error: Some activities were not analyzed, total_act:' + 
                                str(total_act) + ' - len(activities):' + str(len(activities)))
    if floatingActivity_list:
        for floating in floatingActivity_list:
            assert len(floating) == 1, 'Wrong assumption that floating activity children are not sequences'

    total_activities = len([x for x in id_type_dict.values() if x != 'SEQUENCE_ACTIVITY_TYPE'])
    lessons_dict = {'id': lesson_id, 'title' : title, 'userID': int(userID), 'length': numOfNodes,
                    'total_activities': total_activities,}
    
    id_subsequences_dict2 = dict()
    for key, value in sequences_dict.items():
        parent_id = value[1]
        if parent_id not in id_subsequences_dict2:
            id_subsequences_dict2[parent_id] = []
        id_subsequences_dict2[parent_id].append(key)
            
    activities_dict = {}
    # TODO:
    # replace this with another iteration in the xml for storing all the info apart from id-type-suseq 
    # and delete the unecessary datasructures
    for act_id, act_type in id_type_dict.items():
        if act_type == 'SEQUENCE_ACTIVITY_TYPE':
            continue
        _id = int(act_id)
        sub_seq = set(id_subsequences_dict2[_id]) if _id in id_subsequences_dict2 else 'null'
        if sub_seq != 'null':
            assert len(sub_seq) == len(id_subsequences_dict2[_id]),('There are missing values in the ' + 
                    'subsequences int[] column from table activities')
        activities_dict[act_id] = [lesson_id, act_type, id_title_dict[act_id], sub_seq]

    
    test_lessons_dict = {lesson_id: [title, int(userID), numOfNodes, total_activities, filename]}
    lessons_dict = {lesson_id: [title, int(userID), numOfNodes, total_activities]}
    # TODO: comment out
    print(lessons_dict)
    
    # avoid lesson dublicates
    lesson_sign = ' - '.join([str(x) for x in lessons_dict[lesson_id]])  
    if lesson_sign in lesson_sign_dict:
        floating_dict[lesson_id] = 0
        lesson_sign_dict[lesson_sign] += 1
        lesson_id = last_lesson_id
        sequence_id = last_sequence_id
        activity_id = last_activity_id
        print('---> File skipped because it is a copy of an already existent learning design')
        print('-----------------------------------------------------------------------------')
        return
    else:
        lesson_sign_dict[lesson_sign] = 1
#         lesson_sign_list.append(lesson_sign)
    
    # the order must be the same with the tables_info 
    output_dict_list = [test_lessons_dict, lessons_dict, sequences_dict, activities_dict, None]
# TODO: uncomment
#     for i in range(len(tables_info)):
#         if tables_info[i][0] == 'Activities_Info': 
#              create_tool_jsonb(lesson_id, id_tool_dict, graph_folder, toolContentID_id_dict)
#         else:
#             append_to_csv(output_dict_list[i], tables_info[i][0], tables_info[i][1])

            
    print('-----------------------------------------------------------------------------')        

In [24]:
floating_dict = {}
lesson_sign_dict = {}
invalid_designs = []
invalid_tool_xmls = []

paths = {}
with open('data/file_paths.csv', mode='r') as infile:
    reader = csv.reader(infile)
    paths = {rows[0]:rows[1] for rows in reader}

start_time = time.time()

subdirs = [paths['test_folder'] + x for x in get_subdirectories(paths['test_folder'])]
lesson_id = -1
sequence_id = -1
activity_id = -1
for _dir in subdirs:
    sequence_analysis(_dir)
    
elapsed_time = divmod(round((time.time() - start_time)), 60)

Analyzing: /home/foivos/Desktop/EPFL/OptionalProject/test_folder/project2013
Length of Sequence: 41 , Nodes: 42 , Number of transitions: 40 , Activities: 83 , Complex Nodes: 10 , Complex Activities: 22 

Main sequence:
 [(0, 'Noticeboard'), (1, 'Image Gallery'), (60, 'Share Resources'), (4, 'GROUPING_ACTIVITY_TYPE'), (32, 'PARALLEL_ACTIVITY_TYPE'), (2, 'Mind Map'), (65, 'PERMISSION_GATE_ACTIVITY_TYPE'), (7, 'Wiki'), (59, 'Noticeboard'), (3, 'GROUPING_ACTIVITY_TYPE'), (6, 'PARALLEL_ACTIVITY_TYPE'), (61, 'PERMISSION_GATE_ACTIVITY_TYPE'), (33, 'Noticeboard'), (66, 'TOOL_BRANCHING_ACTIVITY_TYPE'), (55, 'PARALLEL_ACTIVITY_TYPE'), (10, 'PARALLEL_ACTIVITY_TYPE'), (80, 'Noticeboard'), (81, 'PARALLEL_ACTIVITY_TYPE'), (11, 'Pixlr'), (12, 'Wiki'), (14, 'Data Collection'), (15, 'Noticeboard'), (58, 'PARALLEL_ACTIVITY_TYPE'), (13, 'Video Recorder'), (16, 'Noticeboard'), (63, 'Submit Files'), (62, 'PERMISSION_GATE_ACTIVITY_TYPE'), (67, 'Web Conferencing'), (17, 'Noticeboard'), (79, 'Forum'), (18, 'V

In [19]:
''' Show Statistics '''

print('------\nElapsed time: {m} min {s} sec\n'.format(m=elapsed_time[0], s=elapsed_time[1]))

print('--> {num} invalid learning designs\n'.format(num=len(invalid_designs)))
# print(*invalid_designs, '\n', sep='\n')

dublicates = {key:value for key,value in lesson_sign_dict.items() if value > 1}
sorted_keys = sorted(dublicates, key=lambda k: dublicates[k] , reverse=True)
total_dublicates = reduce((lambda x,y: x + y), [dublicates[k] for k in sorted_keys]) - len(sorted_keys)
print('--> {num} design dubilcates\n'.format(num=total_dublicates))
# print(*[(k + " ==> " + str(dublicates[k])) for k in sorted_keys], '\n', sep='\n')

print('--> {num} invalid tool xml files\n'.format(num=len(invalid_tool_xmls)))
# print(*invalid_tool_xmls, '\n', sep='\n')

with open('data/Lessons.csv') as l_in:
    stored_lessons = len(l_in.readlines()) -1 
    skipped_desings = len(invalid_designs) + total_dublicates
    print('\nTotal files parsed: {files}, stored files: {stored}, skipped files: {skipped}\n------'.format(
            files=len(subdirs), stored=stored_lessons, skipped=skipped_desings))

------
Elapsed time: 0 min 0 sec

--> 0 invalid learning designs



TypeError: reduce() of empty sequence with no initial value

#### Sanity check

In [14]:
''' Check if code changes code produce different results '''

# TODO: before_path is the folder with the initial csv files that are loaded into the DB
before_path = ''
after_path = 'data/'
csv_files = [ 'Lessons.csv', 'Sequences.csv', 'Activities.csv', 'Activities_Info.csv']
for file in csv_files:
    before_file = before_path + file
    after_file = after_path + file
    with open(before_file, 'r') as before_in, open(after_file, 'r') as after_in: 
        print('Check:\n', before_file, '\n', after_file)
        before_lines = before_in.readlines()
        after_lines = after_in.readlines()
        assert len(before_lines) == len(after_lines)
        for i in range(len(before_lines)):
            assert before_lines[i] == after_lines[i], str(i) + ' : ' + before_lines[i]
        if file == 'Lessons.csv':
            print('Total entries:',  len(after_lines) - 1, '\n')

Check:
 /home/foivos/Desktop/EPFL/OptionalProject/outputs/psql_tables/Lessons.csv 
 data/Lessons.csv
Total entries: 1576 

Check:
 /home/foivos/Desktop/EPFL/OptionalProject/outputs/psql_tables/Sequences.csv 
 data/Sequences.csv
Check:
 /home/foivos/Desktop/EPFL/OptionalProject/outputs/psql_tables/Activities.csv 
 data/Activities.csv
Check:
 /home/foivos/Desktop/EPFL/OptionalProject/outputs/psql_tables/Activities_Info.csv 
 data/Activities_Info.csv


In [15]:
for key, value in floating_dict.items():
    if value > 1:
        print(key, '--', value)