In [1]:
import psycopg2
import pandas as pd
import numpy as np
import itertools
from functools import reduce
# from tqdm import tqdm
from tqdm import tqdm_notebook as tqdm
import os
from pathlib import Path
import re
import csv

from collections import Counter
from nltk import ngrams

  """)


In [2]:
def dict_from_csv(filename):
    with open(filename, mode='r') as f:
        reader = csv.reader(f)
        csv_dict = {rows[0]:rows[1] for rows in reader}
        return csv_dict

In [3]:
def connect_to_db(user, password, host, port, database):
    try:
        connection = psycopg2.connect(
            user = user, password = password, host = host, port = port, database = database)
    except (Exception, psycopg2.Error) as error :
        print("Error: ", error)
        
    return connection

In [4]:
def execute_query(connection, query_string):
    try:
        cursor = connection.cursor()
        cursor.execute(query_string)
        records = cursor.fetchall()
        print('Records fetched: ', len(records))
    except (Exception, psycopg2.Error) as error :
        print("Error: ", error)
        
    return records

In [11]:
def create_all_paths(all_sequences_path):
    open(all_sequences_path, 'w')
    grouped_subsequences = subsequences.groupby('sequence_id')

    for sequence_id in tqdm(grouped_subsequences.sequence_id.unique()):
        with open(all_sequences_path, 'a') as f:
            paths_list = []
            lesson_subseqs = grouped_subsequences.get_group(sequence_id[0])
            
            # get the main sequence of the learning desing
            main_seq = lesson_subseqs.loc[lesson_subseqs['main']==True]['activities']#values[0]
            assert len(main_seq) == 1
            main_seq = main_seq.values[0]
            
            # get all the branches excluding floating activities
            subseq_dict = {}
            lesson_subseqs = lesson_subseqs[(lesson_subseqs['main']==False) & (lesson_subseqs['parent_id'])]
            for complex_act in lesson_subseqs.parent_id.unique():
                subseq_dict[int(complex_act)] = []
            
            # replace complex activities with the relative branches 
            for row in lesson_subseqs.itertuples():
                subseq_dict[int(row.parent_id)].append([row.parent_id] + row.activities)
            subseq_dict
            combinations = list(itertools.product(*list(subseq_dict.values())))
            paths_list = []
            for comp in combinations:
                main_seq_copy = main_seq[:]
                for subpath in comp:
                    if subpath[0] in main_seq_copy:
                        position = main_seq_copy.index(subpath[0]) + 1
                        main_seq_copy[position: position] = subpath[1:]
                if main_seq_copy not in paths_list:
                    paths_list.append(main_seq_copy)

            for path in paths_list:
                f.write(str(sequence_id[0]) + ',')
                for activity in path:
                    f.write(str(activity) + ' ')
                f.write('\n')
    f.close()

### Data retrieval

In [14]:
# database connection
params_dict = dict_from_csv('connection_params.csv')
connection = connect_to_db(*params_dict.values())

In [16]:
# query the database
sequence_records = execute_query(connection, "select * from subsequences;")
activity_records = execute_query( 
    connection,
    "SELECT id, (CASE WHEN tool::text IS NULL THEN type::text ELSE tool::text END) as type FROM activities;"
)
if connection:
    connection.close()

Error:  connection already closed


UnboundLocalError: local variable 'records' referenced before assignment

In [17]:
subsequences = pd.DataFrame(sequence_records, columns=['id', 'sequence_id', 'parent_id', 'activities', 'main'])
subsequences.set_index('id', inplace=True)

In [18]:
subsequences.head(2)

Unnamed: 0_level_0,sequence_id,parent_id,activities,main
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,,"[0, 1, 2, 3, 4, 5, 7, 6, 9, 10, 8]",True
1,1,,"[11, 12, 13]",True


In [19]:
activities = pd.DataFrame(activity_records, columns=['id', 'type'])
activities.set_index('id', inplace=True)
activities.head(2)

Unnamed: 0_level_0,type
id,Unnamed: 1_level_1
0,Question and Answer
1,Multiple Choice


In [20]:
paths_file = Path("data/paths.txt")

if paths_file.is_file():
    with open(paths_file, 'r') as f:
        all_sequences = f.readlines()
else:
    create_all_paths(paths_file)
    with open(paths_file, 'r') as f:
        all_sequences = f.readlines()

print(len(all_sequences), 'paths were loaded')
# TODO f not found locally, create and keep it in memory without storing it

11240 paths were loaded


In [21]:
all_sequences = [re.sub('\n', '', line.strip()) for line in all_sequences]
all_sequences[0]

'0,0 1 2 3 4 5 7 6 9 10 8'

In [22]:
text_sequences = list()
for sequence in tqdm(all_sequences):
    text_seq = []
    lesson_id = sequence.split(',')[0]
    text_seq.append(lesson_id)
    num_seq = sequence.split(',')[1]
    for num in num_seq.split():
        text_seq.append(activities.loc[int(num)].type)
    text_sequences.append(text_seq)
len(text_sequences)

HBox(children=(IntProgress(value=0, max=11240), HTML(value='')))




11240

In [23]:
# eliminate dublicates within the same learning desing
text_sequences_set = set(tuple(seq) for seq in text_sequences)
text_sequences_unique = [list(seq) for seq in text_sequences_set]
text_sequences_unique = sorted(text_sequences_unique, key=lambda x: int(x[0]))
len(text_sequences_unique)

4270

### N-grams

In [24]:
ngram_num = 3

In [25]:
# create dictionary -> lesson id: lesson n-grams
# max n-gram counter per lesson = 1 
ngram_dict = dict()
text_seq_dict = dict()
for seq in text_sequences_unique:
    lesson_id = int(seq[0])
    if lesson_id not in ngram_dict:
        ngram_dict[lesson_id] = set()
        text_seq_dict[lesson_id] = list()
    ngram_dict[lesson_id].update(list(ngrams(seq[1:], ngram_num)))
    text_seq_dict[lesson_id].append(seq[1:])

  # Remove the CWD from sys.path while we load stuff.


In [26]:
ngram_counts = Counter()
for ngram in ngram_dict.values():
    ngram_counts += Counter(ngram)

In [27]:
common_ngrams = ngram_counts.most_common(10) 
print(*common_ngrams, sep='\n')

(('Noticeboard', 'Noticeboard', 'Noticeboard'), 359)
(('Noticeboard', 'Noticeboard', 'Multiple Choice'), 207)
(('Noticeboard', 'Noticeboard', 'Share Resources'), 198)
(('Noticeboard', 'Multiple Choice', 'Noticeboard'), 186)
(('Noticeboard', 'Question and Answer', 'Noticeboard'), 175)
(('Noticeboard', 'Noticeboard', 'Question and Answer'), 162)
(('Noticeboard', 'Share Resources', 'Question and Answer'), 144)
(('Noticeboard', 'Share Resources', 'Noticeboard'), 142)
(('Noticeboard', 'GROUPING_ACTIVITY_TYPE', 'Chat'), 139)
(('Noticeboard', 'Noticeboard', 'GROUPING_ACTIVITY_TYPE'), 133)


In [28]:
# for each common ngram store the lesson ids where they appear
ngram_set_list = []
for ngram_tuple in common_ngrams:
    ngram_set = set() 
    for seq in text_sequences_unique:
        if ngram_tuple[0] in list(ngrams(seq[1:],ngram_num)):
            ngram_set.add(int(seq[0]))
    ngram_set_list.append(ngram_set)
    
for idx, lessons_set in enumerate(ngram_set_list):
    common_ngrams[idx] = common_ngrams[idx] + (sorted(list(lessons_set)),)

  


##### Locate n-grams

In [29]:
import numpy as np

In [30]:
def ngram_index(tokenized_sentence, requested_ngram, ngram_num=ngram_num):
    try:
        return list(ngrams(tokenized_sentence, ngram_num)).index(requested_ngram)
    except:
        return -1

In [32]:
# # find the position of ngram in every lesson
# # if it is the last ngram, assigns -1
# for ngram in [common_ngrams[9], ()]:
#     query_ngram = ngram[0]
#     print(query_ngram, '\n')
#     for lesson_id in ngram[2]:
#         ngram_positions = np.empty(len(text_seq_dict[lesson_id]))
#         for idx, seq in enumerate(text_seq_dict[lesson_id]):
#             ngram_positions[idx] = ngram_index(seq, query_ngram)
#         lesson_position = np.floor(ngram_positions[ngram_positions >= 0].mean())
#         if lesson_position + ngram_num == len(seq):
#             print(lesson_position)
#             lesson_position = -1
#         print(lesson_id, '---->' , int(lesson_position))     

### Statistics

In [33]:
#investigate the groupings
# whether they are followed by branchings or not

In [34]:
group_set = set()
group_branching_set = set()
for seq in text_sequences:
    if "GROUP_BRANCHING_ACTIVITY_TYPE" in seq[1:]:
        group_branching_set.add(seq[0])
    if "GROUPING_ACTIVITY_TYPE" in seq[1:]:
        group_set.add(seq[0])

In [35]:
print("GROUPING_ACTIVITY_TYPE :", len(group_set), "GROUP_BRANCHING_ACTIVITY_TYPE :", len(group_branching_set))
print("Difference: ", group_branching_set - group_set)
group_no_branching_list = sorted([int(x) for x in (group_set - group_branching_set)])
group_list = sorted([int(x) for x in group_set])        
group_branching_list = sorted([int(x) for x in group_branching_set])

GROUPING_ACTIVITY_TYPE : 999 GROUP_BRANCHING_ACTIVITY_TYPE : 301
Difference:  set()


In [None]:
# Find which activities cause tool branches

#### Testing

In [45]:
myset = set()
for seq in text_sequences:
    if "TOOL_BRANCHING_ACTIVITY_TYPE" in seq[1:]:
        myset.add(seq[0])
mylist = sorted([int(x) for x in myset])
print(mylist[:5])

[17, 42, 47, 51, 67]


In [38]:
for seq in text_sequences:
    if '881' in seq:
        print(seq)

['881', 'Image Gallery', 'eAdventure', 'TOOL_BRANCHING_ACTIVITY_TYPE', 'Image Gallery', 'eAdventure', 'Notebook']
['881', 'Image Gallery', 'eAdventure', 'TOOL_BRANCHING_ACTIVITY_TYPE', 'Notebook']
