In [1]:
import psycopg2
import pandas as pd
import numpy as np
import itertools
from functools import reduce
# from tqdm import tqdm
from tqdm import tqdm_notebook as tqdm
import os
from pathlib import Path
import re
import csv

from collections import Counter
from nltk import ngrams

  """)


In [2]:
def dict_from_csv(filename):
    """Read a two-column CSV file into a dictionary.

    The first column of every row becomes the key and the second column the
    value; any further columns are ignored.  Rows are inserted in file order,
    so the order of ``.values()`` follows the order of the rows.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file to read.

    Returns
    -------
    dict
        Mapping ``first column -> second column`` (both as strings).

    Raises
    ------
    IndexError
        If a non-empty row has fewer than two columns.
    """
    # newline='' lets the csv module do its own newline handling, as the csv
    # documentation requires for files passed to csv.reader.
    with open(filename, mode='r', newline='') as f:
        # csv.reader yields an empty list for a blank line; skip those instead
        # of failing on rows[0].
        return {rows[0]: rows[1] for rows in csv.reader(f) if rows}

def connect_to_db(user, password, host, port, database):
    """Open a database connection through ``psycopg2.connect``.

    Parameters
    ----------
    user, password, host, port, database
        Forwarded unchanged as keyword arguments to ``psycopg2.connect``.

    Returns
    -------
    The connection object returned by ``psycopg2.connect``.

    Raises
    ------
    Exception
        Whatever ``psycopg2.connect`` raised.  The error is printed and then
        re-raised so the caller sees the real cause; without the re-raise the
        function would fall through to returning a name that was never bound.
    """
    try:
        return psycopg2.connect(
            user=user, password=password, host=host, port=port, database=database)
    except Exception as error:
        print("Error: ", error)
        raise

def execute_query(connection, query_string):
    """Run a query on an open connection and return every fetched row.

    Parameters
    ----------
    connection
        Object exposing ``cursor()``; the returned cursor must provide
        ``execute``, ``fetchall`` and ``close``.
    query_string : str
        Query text handed to ``cursor.execute`` as-is.

    Returns
    -------
    The value returned by ``cursor.fetchall()``.

    Raises
    ------
    Exception
        Any error raised while creating the cursor, executing the query or
        fetching the rows.  It is printed and re-raised so the caller sees the
        real cause rather than a failure on an unbound result variable.
    """
    try:
        cursor = connection.cursor()
        try:
            cursor.execute(query_string)
            records = cursor.fetchall()
        finally:
            # Release the cursor whether or not the query succeeded.
            cursor.close()
    except Exception as error:
        print("Error: ", error)
        raise

    print('Records fetched: ', len(records))
    return records

In [3]:
# database connection
# The values read from the CSV are unpacked positionally, so the rows of
# connection_params.csv must appear in the order connect_to_db expects:
# user, password, host, port, database.
params_dict = dict_from_csv('connection_params.csv')
connection = connect_to_db(*params_dict.values())

In [4]:
# Query the database.  The connection is closed in `finally`, so it is released
# even when one of the two queries raises.
try:
    sequence_records = execute_query(connection, "select * from subsequences;")
    # Each activity is labelled with its tool when one is set, otherwise its type.
    activity_records = execute_query(
        connection,
        "SELECT id, (CASE WHEN tool::text IS NULL THEN type::text ELSE tool::text END) as type FROM activities;"
    )
finally:
    if connection:
        connection.close()

Records fetched:  7229
Records fetched:  32559


In [5]:
# Load the fetched subsequence rows into a frame indexed by subsequence id.
subsequences = pd.DataFrame(
    sequence_records,
    columns=['id', 'sequence_id', 'parent_id', 'activities', 'main'],
).set_index('id')

In [6]:
# Preview the first two rows of the subsequences frame.
subsequences.head(2)

Unnamed: 0_level_0,sequence_id,parent_id,activities,main
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,,"[0, 1, 2, 3, 4, 5, 7, 6, 9, 10, 8]",True
1,1,,"[11, 12, 13]",True


In [7]:
# Load the fetched activity rows into a frame indexed by activity id and
# preview the first two rows.
activities = pd.DataFrame(
    activity_records,
    columns=['id', 'type'],
).set_index('id')
activities.head(2)

Unnamed: 0_level_0,type
id,Unnamed: 1_level_1
0,Question and Answer
1,Multiple Choice


In [17]:
# Build the lookup tables that map an activity type to its category.

# simple_categories.txt: one `category:key1,key2,...` entry per line.
with open('data/simple_categories.txt') as f:
    lines = [line.replace('\n', '') for line in f]

simple_matching = dict()
for line in lines:
    if not line.strip():
        # Tolerate blank lines (e.g. an empty last line) instead of failing
        # on the missing ':' separator.
        continue
    fields = line.split(':')
    category, keys_string = fields[0], fields[1]
    for key in keys_string.split(','):
        simple_matching[key] = category

# branching_categories.csv: first column is the activity type, second column
# the split category it maps to.  newline='' is what the csv module expects.
with open('data/branching_categories.csv', mode='r', newline='') as infile:
    reader = csv.reader(infile)
    # csv.reader yields [] for a blank line; skip those rows.
    complex_matching = {rows[0]: rows[1] for rows in reader if rows}

# Combined lookup; on duplicate keys the branching categories win.
activity_matching = dict(simple_matching)
activity_matching.update(complex_matching)

In [120]:
def _branching_label(split_type, branches):
    """Return the token that describes one branching activity.

    The token has the form ``<split_type>_<long|short>_<same|diff>``: ``long``
    when the longest branch holds more than one activity, ``diff`` when the
    branches are not all identical.
    """
    # TODO: decide form: and_split VS and_split_short_same
    longest = max(len(branch) for branch in branches)
    length_cat = 'long' if longest > 1 else 'short'

    distinct_branches = {tuple(branch) for branch in branches}
    # In a few lessons one out of two branches might be empty; a lone branch is
    # therefore considered different from its sibling.
    if len(distinct_branches) > 1 or len(branches) == 1:
        similarity = 'diff'
    else:
        similarity = 'same'

    return split_type + '_' + length_cat + '_' + similarity


def create_all_paths(sequences_in_types, sequences_in_groups):
    """Write the main sequence of every lesson to two comma-separated files.

    One line per ``sequence_id`` is written to each file: the sequence id
    followed by one token per activity of the lesson's main sequence.

    * ``sequences_in_types`` receives the activity type of each activity.
    * ``sequences_in_groups`` receives the category looked up in
      ``activity_matching`` for each activity type.

    In both files an activity that is the parent of at least one branch is
    replaced by a label ``<split_type>_<long|short>_<same|diff>``, where the
    split type is looked up in ``complex_matching``.  The same label format is
    used for every split type.

    The function reads the notebook globals ``subsequences``, ``activities``,
    ``complex_matching`` and ``activity_matching``.

    Parameters
    ----------
    sequences_in_types : str or path-like
        Output file for the per-activity types; overwritten.
    sequences_in_groups : str or path-like
        Output file for the per-activity categories; overwritten.

    Raises
    ------
    AssertionError
        If a lesson does not have exactly one main subsequence.
    KeyError
        If an activity id or activity type is missing from the lookup tables.
    """
    grouped_subsequences = subsequences.groupby('sequence_id')

    # Both files are opened once for the whole run and closed by the `with`
    # block, also when there are no lessons or an error interrupts the loop.
    with open(sequences_in_types, 'w') as f_types, open(sequences_in_groups, 'w') as f_groups:
        for sequence_id, lesson_subseqs in tqdm(grouped_subsequences,
                                                total=grouped_subsequences.ngroups):
            # The explicit comparisons with True/False are kept from the
            # original filter rather than switching to `~` / truthiness.
            is_main = lesson_subseqs['main'] == True

            # get the main sequence of the learning design
            main_rows = lesson_subseqs.loc[is_main, 'activities']
            assert len(main_rows) == 1
            main_seq = main_rows.values[0]

            # Get all the branches, excluding floating activities (no parent).
            # notna() is used instead of the truth value of parent_id so that
            # branches of the activity with id 0 are not discarded.
            branch_rows = lesson_subseqs[
                (lesson_subseqs['main'] == False) & lesson_subseqs['parent_id'].notna()]

            # key -> branching activity id : value -> list of its branches,
            # each branch given as the list of its activity types
            subseq_dict = {}
            for row in branch_rows.itertuples():
                branch_types = [activities.loc[int(act), 'type'] for act in row.activities]
                subseq_dict.setdefault(int(row.parent_id), []).append(branch_types)

            # branching activity id -> label used in both output files
            complex_act_info = {
                branching_act: _branching_label(
                    complex_matching[activities.loc[branching_act, 'type']], branches)
                for branching_act, branches in subseq_dict.items()
            }

            main_in_types = []
            main_in_groups = []
            for act in main_seq:
                act_id = int(act)
                if act_id in complex_act_info:
                    main_in_types.append(complex_act_info[act_id])
                    main_in_groups.append(complex_act_info[act_id])
                else:
                    act_type = activities.loc[act_id, 'type']
                    main_in_types.append(act_type)
                    main_in_groups.append(activity_matching[act_type])

            f_types.write(
                str(sequence_id) + ','
                + ','.join(str(act_type) for act_type in main_in_types) + '\n')
            f_groups.write(
                str(sequence_id) + ','
                + ','.join(str(act_group) for act_group in main_in_groups) + '\n')

In [121]:
# Regenerate both sequence files, then read them back as lists of tokens.
create_all_paths('data/sequences_in_types.txt', 'data/sequences_in_groups.txt')

# Each line becomes [sequence id, token, token, ...].  The `with` block closes
# both files on exit, so no explicit close() calls are needed.
with open('data/sequences_in_types.txt') as in_types, open('data/sequences_in_groups.txt') as in_groups:
    sequences_in_types = [line.replace('\n', '').split(',') for line in in_types]
    sequences_in_groups = [line.replace('\n', '').split(',') for line in in_groups]

HBox(children=(IntProgress(value=0, max=2512), HTML(value='')))




In [122]:
# Show the first two parsed lessons of each file; every list starts with the
# sequence id, followed by one token per activity of the main sequence.
print(sequences_in_types[:2])
print(sequences_in_groups[:2])

[['0', 'Question and Answer', 'Multiple Choice', 'Question and Answer', 'Question and Answer', 'Share Resources', 'Question and Answer', 'Share Resources', 'Question and Answer', 'Question and Answer', 'Multiple Choice', 'Noticeboard'], ['1', 'Noticeboard', 'Multiple Choice', 'Question and Answer']]
[['0', 'Reflective Activities', 'Assessment Activities', 'Reflective Activities', 'Reflective Activities', 'Informative Activities', 'Reflective Activities', 'Informative Activities', 'Reflective Activities', 'Reflective Activities', 'Assessment Activities', 'Informative Activities'], ['1', 'Informative Activities', 'Assessment Activities', 'Reflective Activities']]


### N-grams

In [124]:
# Length of the n-grams.
# NOTE(review): the get_common_ngrams calls visible in this notebook do not pass
# this variable; they rely on the function's own default (ngram_num=3). Confirm
# whether this global is still needed.
ngram_num = 3

In [132]:
def get_common_ngrams(content_list, ngram_num=3, common_num=10, include_split=False):
    """Return the most frequent n-grams found across all lessons.

    Parameters
    ----------
    content_list : list of list of str
        One list per lesson: the lesson id (convertible with ``int``) followed
        by the lesson's tokens.
    ngram_num : int
        Length of the n-grams.
    common_num : int
        How many of the most frequent n-grams to return.
    include_split : bool
        When true, only n-grams in which at least one token contains the
        substring ``'split'`` are counted.

    Returns
    -------
    list of (tuple, int)
        ``(ngram, count)`` pairs as produced by ``Counter.most_common``.
    """
    def lesson_ngrams(tokens):
        # n-grams of a single lesson, filtered when include_split is set
        grams = ngrams(tokens, ngram_num)
        if include_split:
            return [gram for gram in grams
                    if any('split' in token for token in gram)]
        return list(grams)

    # lesson id -> lesson n-grams (a repeated id keeps the last lesson seen)
    per_lesson = {int(seq[0]): lesson_ngrams(seq[1:]) for seq in content_list}

    totals = Counter()
    for grams in per_lesson.values():
        totals.update(grams)

    return totals.most_common(common_num)

In [135]:
# Three most frequent 3-grams (default n-gram length) over the activity-type sequences.
print(*get_common_ngrams(sequences_in_types, common_num=3), sep='\n')

  del sys.path[0]


(('Noticeboard', 'Noticeboard', 'Noticeboard'), 685)
(('Noticeboard', 'Noticeboard', 'Share Resources'), 216)
(('Noticeboard', 'Noticeboard', 'Multiple Choice'), 213)


In [136]:
# Three most frequent activity-type 3-grams that contain at least one token with 'split' in it.
print(*get_common_ngrams(sequences_in_types, common_num=3, include_split=True), sep='\n')

(('Noticeboard', 'GROUPING_ACTIVITY_TYPE', 'xor_split_long_same'), 71)
(('Noticeboard', 'GROUPING_ACTIVITY_TYPE', 'xor_split_short_same'), 52)
(('Noticeboard', 'GROUPING_ACTIVITY_TYPE', 'and_split_short_diff'), 46)


  import sys


In [137]:
# Three most frequent 3-grams (default n-gram length) over the activity-category sequences.
print(*get_common_ngrams(sequences_in_groups, common_num=3), sep='\n')

(('Informative Activities', 'Informative Activities', 'Informative Activities'), 1785)
(('Informative Activities', 'Reflective Activities', 'Informative Activities'), 934)
(('Informative Activities', 'Informative Activities', 'Reflective Activities'), 788)


  del sys.path[0]


In [138]:
# Three most frequent activity-category 3-grams that contain at least one token with 'split' in it.
print(*get_common_ngrams(sequences_in_groups, common_num=3, include_split=True), sep='\n')

(('Informative Activities', 'Grouping', 'xor_split_long_same'), 83)
(('Informative Activities', 'Assessment Activities', 'xor_split_long_diff'), 72)
(('Informative Activities', 'Grouping', 'and_split_short_diff'), 69)


  import sys


In [129]:
# Ids of the lessons whose main sequence contains at least one 'and_split'
# token; print the first five.
mylist = [
    lesson[0]
    for lesson in sequences_in_types
    if any('and_split' in token for token in lesson[1:])
]
print(mylist[:5])

['15', '24', '26', '30', '41']
