This notebook's goal is to parse the json files which were extracted from the database onto a nice dataframe where all the information about the experiment (outside the simulation logs) are recorded as is.

In [1]:
import os
import re
import yaml
import json
import pickle

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns

# Data

In [2]:
root = '../../data/temp/'

In [3]:
sessions = {
    'Session 1': 1,
    'Session 2': 1,
    'Session 3': 2,
    'Session 4': 2,
    'Session 5': 2,
    'Session 6': 2,
    'Session 7': 2,
    'Session 8': 2,
    'Session 9': 2,
    'Session 10': 2,
    'Session 11': 2,
    'Session 12': 2,
    'Session 13': 3,
    'Session 14': 3,
    'Session 15': 3,
    'Session 16': 3,
    'Session 17': 3,
    'Session 18': 3,
    'Session 19': 3,
    'Session 20': 3,
    'Session 21': 3,
    'Session 22': 3,
    'Session 23': 3,
    'Session 24': 3,
    'Session 25': 3,
    'Session 26': 3,
    'Session 27': 3,
    'Session 28': 3,
    'Session 29': 3,
    'Session 30': 3,
    'Session 31': 3,
    'Session 32': 3,
    'Session 33': 3,
    'Session 34': 3,
    'Session 35': 3,
    'Session 36': 3,
    'Session 37': 3,
    'Session 38': 4,
    'Session 39': 4,
    'Session 40': 4,
    'Session 41': 4
}

# Versions

## [Version 1]

In [4]:
session = 'Session 1'
with open(root + session + '/cdq4drkk-q.json', 'r') as fp:
    sesh1 = json.load(fp)

In [5]:
def extract_version1(sesh):
    row = []
    try:
        username = sesh['name']
        row.append(username)

        ranking = ''.join(sesh['0']['ranks'])
        row.append(ranking)
        ranking_confidence = sesh['0']['conf']
        row.append(ranking_confidence)
        ranking_time = sesh['0']['time']
        row.append(ranking_time)

        q1 = sesh['1']['text']
        row.append(q1)
        q1_confidence = sesh['1']['conf']
        row.append(q1_confidence)
        q1_time = sesh['1']['time']
        row.append(q1_time)

        q2 = sesh['2']['text']
        row.append(q2)
        q2_conf = sesh['2']['conf']
        row.append(q2_conf)
        q2_time = sesh['2']['time']
        row.append(q2_time)

        q3 = sesh['3']['text']
        row.append(q3)
        q3_conf = sesh['3']['conf']
        row.append(q3_conf)
        q3_time = sesh['3']['time']
        row.append(q3_time)

        q4 = sesh['4']['text']
        row.append(q4)
        q4_conf = sesh['4']['conf']
        row.append(q4_conf)
        q4_time = sesh['4']['time']
        row.append(q4_time)

        q5_colour0 = sesh['5']['sliders'][0]
        q5_colour1 = sesh['5']['sliders'][1]
        q5_colour2 = sesh['5']['sliders'][2]
        q5_colour3 = sesh['5']['sliders'][3]
        q5_time = sesh['5']['time']
        row.append(q5_colour0)
        row.append(q5_colour1)
        row.append(q5_colour2)
        row.append(q5_colour3)
        row.append(q5_time)

        q6_colour0 = sesh['6']['sliders'][0]
        q6_colour1 = sesh['6']['sliders'][1]
        q6_colour2 = sesh['6']['sliders'][2]
        q6_colour3 = sesh['6']['sliders'][3]
        q6_time = sesh['6']['time']
        row.append(q6_colour0)
        row.append(q6_colour1)
        row.append(q6_colour2)
        row.append(q6_colour3)
        row.append(q6_time)

        formula = sesh['7']['text']
        formula_conf = sesh['7']['conf']
        formula_time = sesh['7']['time']
        row.append(formula)
        row.append(formula_conf)
        row.append(formula_time)

        notes_text = ''
        notes_math = ''
        notes_table = ''
        notes_diag = ''
        notes_time = ''
        row.append(notes_text)
        row.append(notes_math)
        row.append(notes_table)
        row.append(notes_diag)
        row.append(notes_time)

        beerslaw = sesh['8']['text']
        beerslaw_conf = sesh['8']['conf']
        beerslaw_time = sesh['8']['time']
        row.append(beerslaw)
        row.append(beerslaw_conf)
        row.append(beerslaw_time)

        feedback_entertain = sesh['feedback']['s_entertain']
        feedback_difficult = sesh['feedback']['s_difficult']
        feedback_good = sesh['feedback']['t_good']
        feedback_bad = sesh['feedback']['t_bad']
        feedback = sesh['feedback']['t_what']
        feedback_time = sesh['feedback']['time']
        row.append(feedback_entertain)
        row.append(feedback_difficult)
        row.append(feedback_good)
        row.append(feedback_bad)
        row.append(feedback)
        row.append(feedback_time)

        experiment_time = ''
        row.append(experiment_time)
        
        row.append('complete')
    except KeyError as ke:
        missing = 49 - len(row)
        row = row + ['uncomplete' for m in range(missing)]
        row.append(ke)
    
    return row

## [Version 2]

In [6]:
session = 'Session 3'
with open(root + session + '/4vtjgtja-q.json', 'r') as fp:
    sesh3 = json.load(fp)

In [7]:
def extract_version2(sesh):
    row = []
    try:
        username = sesh['name']
        row.append(username)

        start_time = sesh['1']
        exploration_time = sesh['2']
        ranking_time = sesh['3']
        row.append(start_time)
        row.append(exploration_time)
        row.append(ranking_time)

        ranking = ''.join(sesh['4']['ranks'])
        ranking_confidence = ''
        ranking_time = sesh['4']['time']
        row.append(ranking)
        row.append(ranking_confidence)
        row.append(ranking_time)

        q1 = sesh['5']['text']
        q1_conf = sesh['5']['conf']
        q1_time = sesh['5']['time']
        row.append(q1)
        row.append(q1_conf)
        row.append(q1_time)

        q2 = sesh['6']['text']
        q2_conf = sesh['6']['conf']
        q2_time = sesh['6']['time']
        row.append(q2)
        row.append(q2_conf)
        row.append(q2_time)

        q3 = sesh['7']['text']
        q3_conf = sesh['7']['conf']
        q3_time = sesh['7']['time']
        row.append(q3)
        row.append(q3_conf)
        row.append(q3_time)

        q4 = sesh['8']['text']
        q4_conf = sesh['8']['conf']
        q4_time = sesh['8']['time']
        row.append(q4)
        row.append(q4_conf)
        row.append(q4_time)

        q5_colour0 = sesh['9']['sliders'][0]
        q5_colour1 = sesh['9']['sliders'][1]
        q5_colour2 = sesh['9']['sliders'][2]
        q5_colour3 = sesh['9']['sliders'][3]
        q5_time = sesh['9']['time']
        row.append(q5_colour0)
        row.append(q5_colour1)
        row.append(q5_colour2)
        row.append(q5_colour3)
        row.append(q5_time)

        q6_colour0 = sesh['10']['sliders'][0]
        q6_colour1 = sesh['10']['sliders'][1]
        q6_colour2 = sesh['10']['sliders'][2]
        q6_colour3 = sesh['10']['sliders'][3]
        q6_time = sesh['10']['time']
        row.append(q6_colour0)
        row.append(q6_colour1)
        row.append(q6_colour2)
        row.append(q6_colour3)
        row.append(q6_time)

        formula = sesh['11']['text']
        formula_conf = sesh['11']['conf']
        formula_time = sesh['11']['time']
        row.append(formula)
        row.append(formula_conf)
        row.append(formula_time)

        notes_text = sesh['12']['choices'][0]
        notes_math = sesh['12']['choices'][1]
        notes_table = sesh['12']['choices'][2]
        notes_diag = sesh['12']['choices'][3]
        notes_time = sesh['12']['time']
        row.append(notes_text)
        row.append(notes_math)
        row.append(notes_table)
        row.append(notes_diag)
        row.append(notes_time)

        beerslaw = sesh['13']['text']
        beerslaw_conf = sesh['13']['conf']
        beerslaw_time = sesh['13']['time']
        row.append(beerslaw)
        row.append(beerslaw_conf)
        row.append(beerslaw_time)

        #posttest_time = sesh['14']

        problem_ranking = ''.join(sesh['15']['choices'])
        problem_conf = sesh['15']['confidence']
        problem_time = sesh['15']['time']
        row.append(problem_ranking)
        row.append(problem_conf)
        row.append(problem_time)

        feedback_entertain = sesh['feedback']['s_entertain']
        feedback_difficult = sesh['feedback']['s_difficult']
        feedback_good = sesh['feedback']['t_good']
        feedback_bad = sesh['feedback']['t_bad']
        feedback = sesh['feedback']['t_what']
        feedback_time = sesh['feedback']['time']
        row.append(feedback_entertain)
        row.append(feedback_difficult)
        row.append(feedback_good)
        row.append(feedback_bad)
        row.append(feedback)
        row.append(feedback_time)

        experiment_time = sesh['17']['time']
        row.append(experiment_time)

        row.append('complete')
        
    except KeyError as ke:
        missing = 49 - len(row)
        row = row + ['uncomplete' for m in range(missing)]
        row.append(ke)
        
    return row

## [Version 3]

In [8]:
session = 'Session 13'
with open(root + session + '/4aaspwgs-q.json', 'r') as fp:
    sesh13 = json.load(fp)

In [9]:
def extract_version3(sesh):
    row = []
    try:
        username = sesh['name']
        row.append(username)

        start_time = sesh['1']
        exploration_time = sesh['2']
        ranking_time = sesh['3']
        row.append(start_time)
        row.append(exploration_time)
        row.append(ranking_time)

        if '4' in sesh:
            ranking = ''.join(sesh['4']['ranks'])
            if 'conf' in sesh['4']:
                ranking_confidence = sesh['4']['conf']
            else:
                ranking_confidence = -1
            ranking_time = sesh['4']['time']
        else:
            ranking = 'missing'
            ranking_confidence = -1
            ranking_time = -1
        row.append(ranking)
        row.append(ranking_confidence)
        row.append(ranking_time)

        q1 = sesh['5']['text']
        q1_conf = sesh['5']['conf']
        q1_time = sesh['5']['time']
        row.append(q1)
        row.append(q1_conf)
        row.append(q1_time)

        q2 = sesh['6']['text']
        q2_conf = sesh['6']['conf']
        q2_time = sesh['6']['time']
        row.append(q2)
        row.append(q2_conf)
        row.append(q2_time)

        q3 = sesh['7']['text']
        q3_conf = sesh['7']['conf']
        q3_time = sesh['7']['time']
        row.append(q3)
        row.append(q3_conf)
        row.append(q3_time)

        q4 = sesh['8']['text']
        q4_conf = sesh['8']['conf']
        q4_time = sesh['8']['time']
        row.append(q4)
        row.append(q4_conf)
        row.append(q4_time)

        q5_colour0 = sesh['9']['sliders'][0]
        q5_colour1 = sesh['9']['sliders'][1]
        q5_colour2 = sesh['9']['sliders'][2]
        q5_colour3 = sesh['9']['sliders'][3]
        q5_time = sesh['9']['time']
        row.append(q5_colour0)
        row.append(q5_colour1)
        row.append(q5_colour2)
        row.append(q5_colour3)
        row.append(q5_time)

        q6_colour0 = sesh['10']['sliders'][0]
        q6_colour1 = sesh['10']['sliders'][1]
        q6_colour2 = sesh['10']['sliders'][2]
        q6_colour3 = sesh['10']['sliders'][3]
        q6_time = sesh['10']['time']
        row.append(q6_colour0)
        row.append(q6_colour1)
        row.append(q6_colour2)
        row.append(q6_colour3)
        row.append(q6_time)

        formula = sesh['11']['text']
        formula_conf = sesh['11']['conf']
        formula_time = sesh['11']['time']
        row.append(formula)
        row.append(formula_conf)
        row.append(formula_time)

        notes_text = sesh['12']['choices'][0]
        notes_math = sesh['12']['choices'][1]
        notes_table = sesh['12']['choices'][2]
        notes_diag = sesh['12']['choices'][3]
        notes_time = sesh['12']['time']
        row.append(notes_text)
        row.append(notes_math)
        row.append(notes_table)
        row.append(notes_diag)
        row.append(notes_time)

        beerslaw = sesh['13']['text']
        beerslaw_time = sesh['13']['time']
        row.append(beerslaw)
        row.append(beerslaw_time)

        #posttest_time = sesh['14']

        problem_ranking = ''.join(sesh['15']['choices'])
        problem_conf = sesh['15']['conf']
        problem_time = sesh['15']['time']
        row.append(problem_ranking)
        row.append(problem_conf)
        row.append(problem_time)

        feedback_entertain = sesh['feedback']['s_entertain']
        feedback_difficult = sesh['feedback']['s_difficult']
        feedback_good = sesh['feedback']['t_good']
        feedback_bad = sesh['feedback']['t_bad']
        feedback = sesh['feedback']['t_what']
        feedback_time = sesh['feedback']['time']
        row.append(feedback_entertain)
        row.append(feedback_difficult)
        row.append(feedback_good)
        row.append(feedback_bad)
        row.append(feedback)
        row.append(feedback_time)

        experiment_time = sesh['17']['time']
        row.append(experiment_time)

        row.append('complete')
        
    except KeyError as ke:
        missing = 49 - len(row)
        row = row + ['uncomplete' for m in range(missing)]
        row.append(ke)
    
    return row

## [Version 4]

In [10]:
def extract_version4(sesh):
    raise NotImplementedError
    start_time = sesh['1']
    exploration_time = sesh['2']
    ranking_time = sesh['3']
    
    ranking = ''.join(sesh['4']['ranks'])
    ranking_confidence = sesh['4']['conf']
    ranking_time = sesh['4']['time']
    
    q1 = sesh['5']['text']
    q1_conf = sesh['5']['conf']
    q1_time = sesh['5']['time']
    
    q2 = sesh['6']['text']
    q2_conf = sesh['6']['conf']
    q2_time = sesh['6']['time']
    
    q3 = sesh['7']['text']
    q3_conf = sesh['7']['conf']
    q3_time = sesh['7']['time']
    
    q4 = sesh['8']['text']
    q4_conf = sesh['8']['conf']
    q4_time = sesh['8']['time']
    
    q5_colour0 = sesh['9']['sliders'][0]
    q5_colour1 = sesh['9']['sliders'][1]
    q5_colour2 = sesh['9']['sliders'][2]
    q5_colour3 = sesh['9']['sliders'][3]
    q5_time = sesh['9']['time']
    
    q6_colour0 = sesh['10']['sliders'][0]
    q6_colour1 = sesh['10']['sliders'][1]
    q6_colour2 = sesh['10']['sliders'][2]
    q6_colour3 = sesh['10']['sliders'][3]
    q6_time = sesh['10']['time']
    
    formula = sesh['11']['text']
    formula_conf = sesh['11']['conf']
    formula_time = sesh['11']['time']
    
    notes_text = sesh['12']['choices'][0]
    notes_math = sesh['12']['choices'][1]
    notes_table = sesh['12']['choices'][2]
    notes_diag = sesh['12']['choices'][3]
    notes_time = sesh['12']['time']
    
    beerslaw = sesh['13']['text']
    beerslaw_time = sesh['13']['time']
    
    #posttest_time = sesh['14']
    
    problem_ranking = ''.join(sesh['15']['choices'])
    problem_conf = sesh['15']['confidence']
    problem_time = sesh['15']['time']
    
    feedback_entertain = sesh['feedback']['s_entertain']
    feedback_difficult = sesh['feedback']['s_difficult']
    feedback_good = sesh['feedback']['t_good']
    feedback_bad = sesh['feedback']['t_bad']
    feedback = sesh['feedback']['t_what']
    feedback_time = sesh['feedback']['time']
    
    experiment_time = sesh['17']['time']
    
    row = [
        username,
        ranking, ranking_confidence, ranking_time,
        q1, q1_conf, q1_time,
        q2, q2_conf, q2_time,
        q3, q3_conf, q3_time,
        q4, q4_conf, q4_time,
        q5_colour0, q5_colour1, q5_colour2, q5_colour3, q5_time,
        q6_colour0, q6_colour1, q6_colour2, q6_colour3, q6_time,
        formula, formula_conf, formula_time,
        notes_text, notes_math, notes_table, notes_diag, notes_time,
        beerslaw, beerslaw_time,
        problem_ranking, problem_conf, problem_time,
        feedback_entertain, feedback_difficult, feedback_good, feedback_bad, feedback, feedback_time,
        experiment_time
    ]
    return row

# Parsing

In [11]:
# Version to  Version Functions
version_function = {
    1: extract_version1,
    2: extract_version2,
    3: extract_version3,
    4: extract_version4
}

# get the session number
def get_session_number(path: str):
    reg = '(Session [0-9]+)'
    reg = re.compile(reg)
    reg = reg.findall('../../data/temp/Session 29/bezdgyya-2.log')
    return reg[0]

# reads the post test files and extract the information
def process_version(path: str, sessions: dict, version_function: dict):
    session = get_session_number(path)
    version = sessions[session]
    fun = version_function[version]
    
    with open(path, 'r') as fp:
        sesh = json.load(fp)
    try:
        row = fun(sesh)
    except KeyError as ke:
        return []
    return row
        
    

In [12]:
# Crawl all of the files
repo = '../../data/temp/'
files = []
for r, d, f in os.walk(repo):
    for file in f:
        if file.endswith(".json"):
            files.append(os.path.join(r, file))

In [13]:
# Pull the information and create the dataframe
post_test = []
for file in files:
    row = process_version(file, sessions, version_function)
    post_test.append(row)
    
# Columns
columns = [
    'username',
    'start_time', 'exploration_time', 'ranking_time',
    'ranking', 'ranking_confidence', 'ranking_time',
    'q1', 'q1_conf', 'q1_time',
    'q2', 'q2_conf', 'q2_time',
    'q3', 'q3_conf', 'q3_time',
    'q4', 'q4_conf', 'q4_time',
    'q5_colour0', 'q5_colour1', 'q5_colour2', 'q5_colour3', 'q5_time',
    'q6_colour0', 'q6_colour1', 'q6_colour2', 'q6_colour3', 'q6_time',
    'formula', 'fomula_conf', 'formula_time',
    'notes_text', 'notes_math', 'notes_table', 'notes_diag', 'notes_time',
    'beerslaw', 'beerslaw_time',
    'problem_ranking', 'problem_conf', 'problem_time', 
    'feedback_entertain', 'feedback_difficult', 'feedback_good', 'feedback_bad', 'feedback', 'feedback_time',
    'experiment_time', 'status'
]

# DataFrame
post_test = pd.DataFrame(post_test, columns=columns)

In [14]:
post_test

Unnamed: 0,username,start_time,exploration_time,ranking_time,ranking,ranking_confidence,ranking_time.1,q1,q1_conf,q1_time,...,problem_conf,problem_time,feedback_entertain,feedback_difficult,feedback_good,feedback_bad,feedback,feedback_time,experiment_time,status
0,uncomplete,uncomplete,uncomplete,uncomplete,uncomplete,uncomplete,uncomplete,uncomplete,uncomplete,uncomplete,...,uncomplete,uncomplete,uncomplete,uncomplete,uncomplete,uncomplete,uncomplete,uncomplete,uncomplete,'name'
1,y8qbtkmf,{'time': 1624975991},{'time': 1624976130},{'time': 1624976138},,100,1624976297,0.37,50,1624976447,...,100,1624976799,80,100,Es hat spass gemacht und ich fand sehr gut das...,"Das einem nicht so ganz klargestellt wird, das...","Ich würde gleich vorgehen, jedoch würde ich mi...",1624976966,1624977058,complete
2,sr34qyfx,{'time': 1624975979},{'time': 1624976049},{'time': 1624976060},,50,1624976281,1.48,85,1624976386,...,100,1624976972,35,100,Bilder,"Es sind immer die gleichen Fragen, wenn man ni...","Nein, ich könnte das nächste mal mir die Resul...",1624977497,1624978348,complete
3,ntqpqkpq,{'time': 1624976038},{'time': 1624976175},{'time': 1624976616},,50,1624976676,0.37,75,1624976802,...,35,1624978467,75,50,Aufgaben mit den Extinktionen herausfinden,Nicht schlecht jedoch fand ich die Aufgabe 11 ...,"Ich finde, dafür dass das Thema schon eine wei...",1624978602,1624978648,complete
4,r29wzm6f,{'time': 1624976041},{'time': 1624976055},{'time': 1624976976},,70,1624976990,1.59,50,1624977018,...,0,1624978054,40,70,Die zum selbst-ausprobierende Grafiken.,Zu viele Fragen in Aufgabe 1 zu den Extinktion...,ja durch ausprobieren,1624978149,1624978242,complete
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
464,ky8kttpv,{'time': 1620375958},{'time': 1620376249},{'time': 1620377389},0213,-1,1620377415,49.90901496,10,1620377618,...,5,1620379358,0,0,die laborsimulation,die Übersetzung,wieder gleich,1620379477,1620379480,complete
465,9bp6yz7m,{'time': 1620375936},{'time': 1620376155},{'time': 1620377162},0231,-1,1620377179,0.37,25,1620377251,...,85,1620378206,60,50,das Ausprobieren,"Aufgabe 2 war zu lang, Aufgabe 2 war schwerer ...","Aufgabe 2: anderes Vorgehen, jedoch weiss ich ...",1620378405,1620378511,complete
466,ke6msbfr,{'time': 1620375949},{'time': 1620376384},{'time': 1620376999},3120,-1,1620377047,"weniger Absorption, wie links im Bild, da die ...",100,1620377095,...,100,1620378919,0,40,Aufgabe 3,immer wieder Wellenlänge einstellen bei Aufgabe 3,anfangs die Aufgabe besser studieren.,1620379034,1620379117,complete
467,fj5tdybn,{'time': 1620376155},{'time': 1620376211},{'time': 1620376588},0231,-1,1620376617,1.48,75,1620376678,...,80,1620379326,uncomplete,uncomplete,uncomplete,uncomplete,uncomplete,uncomplete,uncomplete,'feedback'


In [15]:
post_test.to_csv('../../data/post_test/extracted.csv', sep='\t')