<h1> Data Analysis for REYeker</h1>

In [11]:
# lib for dataframes
import pandas as pd

# lib for saving np images
from PIL import Image

# lib for plotting
%matplotlib inline
import matplotlib.pyplot as plt

# lib for numerical computations
import numpy as np

# lib for crerating paths
from pathlib import Path

# REYeker lib
import modules.rEYEkerAnalysis as rEYEker

<h2>1. Configuration</h2>

<h5>Database configuration </h5>

In [12]:
# path to the datafile
config_datasheet_path = r'./results/preprocessed.xlsx'

# columns with visual stimulus data
config_visual_stimulus_variable = "ClickData"

# columns with programming style
config_programming_style_variable = "ProgrammingStyle"

# columns with comprehension style
config_comprehension_variable = "Comprehension"

# columns with names of the algo
config_algo_name_variable = "Algorithm"

# columns with correctness value
config_corectness_variable = "Correctness"

# columns with time data of visual stimulus
config_time_variable_array = []

# columns with the given answers of the studen
config_flag_variable = "Flag"

# colums of response time
config_response_time_variable = "ResponseTime"

<h5>Configuration for REYEker data </h5>

In [13]:
# file for loading rEYEker settings
config_reyeker_settings_path = "data/used.json"

<h5>Import the preprocessed dataframe</h5>

In [21]:
needed_columns = ["Subject",
                  config_programming_style_variable, 
                  config_comprehension_variable, 
                  config_algo_name_variable,
                  config_flag_variable,
                  config_corectness_variable,
                  config_visual_stimulus_variable]

raw = pd.read_excel(config_datasheet_path)
df = pd.DataFrame(raw, columns=needed_columns)
df = df.loc[df["Correctness"]==True]
algo_name_array = [name for name in df[config_algo_name_variable].unique()]

df_tensor = []

for algo_name in algo_name_array:
    algo_df = df.loc[df[config_algo_name_variable]==algo_name]
    df_array = [algo_df.loc[(df[config_programming_style_variable]=="R") & (algo_df[config_comprehension_variable]=="BU")],
                algo_df.loc[(df[config_programming_style_variable]=="I") & (algo_df[config_comprehension_variable]=="BU")],
                algo_df.loc[(df[config_programming_style_variable]=="R") & (algo_df[config_comprehension_variable]=="TD")],
                algo_df.loc[(df[config_programming_style_variable]=="I") & (algo_df[config_comprehension_variable]=="TD")],]
    df_tensor.append(df_array)

Unnamed: 0,Subject,ProgrammingStyle,Comprehension,Algorithm,Flag,Correctness,ClickData
0,638,R,BU,BinarySearch,,True,136-143 182-47 475-78 284-96 190-147 437-143 1...
1,712,R,BU,BinarySearch,,True,149-33 282-94 218-194 513-176 224-204 487-235 ...
2,750,R,BU,BinarySearch,,True,464-76 144-124 406-176 534-235 307-273 541-370...
3,784,R,BU,BinarySearch,,True,176-32 413-60 216-110 426-77 209-116 286-166 5...
5,841,R,BU,BinarySearch,,True,195-40 450-92 201-104 196-137 535-171 208-217 ...
...,...,...,...,...,...,...,...
644,875,I,TD,ReverseString,,True,298-61 329-137 313-288 358-408 629-400 264-439...
645,876,I,TD,ReverseString,,True,162-49 260-83 317-36 187-10 392-20 50-55 178-7...
647,900,I,TD,ReverseString,,True,86-51 381-36 146-150 349-149 165-154 138-536 4...
648,939,I,TD,ReverseString,,True,200-3 322-30 318-31 372-81 127-127 284-134 348...


In [15]:
# data for loading the images
image_path_tensor = []

for algo_name in algo_name_array:
    image_path_array =[
        'images/BR/BR_' + algo_name + '.png',
        'images/BI/BI_' + algo_name + '.png',
        'images/TR/TR_' + algo_name + '.png',
        'images/TI/TI_' + algo_name + '.png',
    ]
    image_path_tensor.append(image_path_array)
    
# where to save to heatmaps and sequence diagrams
config_folder_prefix_array = ['BR/','BI/', 'TR/','TI/']

# used for saving the heatmaps and sequence diagrams
config_image_prefix_tensor = []
for algo_name in algo_name_array:
    image_prefix_array =[
        'BR_' + algo_name + '_',
        'BI_' + algo_name + '_',
        'TR_' + algo_name + '_',
        'TI_' + algo_name + '_',
    ]
    config_image_prefix_tensor.append(image_prefix_array)

<h4>Splitting Dataframes in right and wrong answers.</h4>

In [16]:
#df_tensor_right = [[df.loc[df[config_corectness_variable]==True] for df in df_array] for df_array in df_tensor]

<h4>Remove Outliers</h4>

In [17]:
#df_tensor = [[df.loc[df[config_flag_variable]!="outlier"]  for df in df_array] for df_array in df_tensor_right]

<h4>Import REYeker Settings</h4>

In [18]:
(_data, _times, click_setting) = rEYEker.load_data_from_json(config_reyeker_settings_path)

<h4>Import Images Settings</h4>

In [19]:
image_tensor = []

# read in every image
for image_path_array in image_path_tensor:
    image_array = []
    for image_path in image_path_array:
        img = rEYEker.load_image(image_path)
        image_array.append(img)
    image_tensor.append(image_array)

<h4> Cast Data to Valid format</h4>

Import the visual stimulus measured Data

In [20]:
visual_stimulus_data_tensor = []

#iter over every dataframe
for df_array in df_tensor: 
    
    visual_stimulus_data_matrix = []
    for idx, dataframe in enumerate(df_array):
        visual_stimulus_array = []
    
        #iter over every row 
        for _idx, row in dataframe.iterrows():
            data_str = row[config_visual_stimulus_variable]
            data_str = data_str.strip()
            coordinates_str = data_str.split(" ")
            coordinates = []
           
            # iter over every coordinate pair x-y
            for coordinate_str in coordinates_str:
                try:
                    coordinate = coordinate_str.split("-")
                    coordinate = (int(coordinate[0]), int(coordinate[1]))
                    coordinates.append(coordinate)
                except:
                    print(coordinate_str)
                
                
            visual_stimulus_array.append(coordinates)
            
        visual_stimulus_data_matrix.append(visual_stimulus_array)
        
    visual_stimulus_data_tensor.append(visual_stimulus_data_matrix)

<h4>Helper Functions</h4>

In [26]:
def save_images(image_array, folder, image_name):
    """
    :brief saves an array of images to a certain location incrementing the postfix by a number
    :param image_array:        array of images (np.ndarray)
    :param folder:     prefix of image/ folder location
    :param image_name: prefix for the image
    """
    
    Path(folder).mkdir(parents=True, exist_ok=True)
    
    prefix = folder + image_name
    
    #TODO create folders if there are none present
    for idx, data in enumerate(image_array):
        data = data*255
        data = np.uint8(data)
        im = Image.fromarray(data)
        im.save(prefix + str(idx) + '.png')
        
def compare_for_h0(arr_1, arr_2, alpha):
    t, p = stats.ttest_ind(arr_1, arr_2)
    if p > alpha:
        return True, t, p
    else:
        return False, t, p
    
def is_in(value, tup):
    return tup[0] <= value <= tup[1]

def get_0_offset(number):
    i = 0
    number = int(number)
    while number != 0:
        number = int(number / 10)
        i = i + 1
    return i

<h2>2. Create Single Heatmaps</h2>

create heatmaps

In [12]:
heatmap_tensor = []

print("Going to process " + str(len(visual_stimulus_data_tensor)) + " datatables: ")
for idx, visual_stimulus_data_matrix in enumerate(visual_stimulus_data_tensor):
    heatmaps_matrix = [] 
    print("\tGoing to process datatable #" + str(idx) + " with " + str(len(visual_stimulus_data_matrix)) + " datasets: ")
    
    # iterate over all the datasets
    for dataset_idx, stimulus_dataset in enumerate(visual_stimulus_data_matrix):
        
        print("\t\tdataset #" + str(dataset_idx) + " (up to "+ str(len(stimulus_dataset)) + "): [", end='')
        heatmap_array = []
    
        # iterate over all the measurements of the dataset
        for visual_idx, stimulus_measurement in enumerate(stimulus_dataset):
            
            print(str(visual_idx), end=";")
            
            im = rEYEker.draw_shape_heat_map(image_tensor[idx][dataset_idx], stimulus_measurement, click_setting, should_copy=True)
            heatmap_array.append(im)
       
        print("]")
        heatmaps_matrix.append(heatmap_array)
        
    heatmap_tensor.append(heatmaps_matrix)

Going to process 8 datatables: 
	Going to process datatable #0 with 4 datasets: 
		dataset #0 (up to 7): [0;1;2;3;4;5;6;]
		dataset #1 (up to 3): [0;1;2;]
		dataset #2 (up to 3): [0;1;2;]
		dataset #3 (up to 9): [0;1;2;3;4;5;6;7;8;]
	Going to process datatable #1 with 4 datasets: 
		dataset #0 (up to 13): [0;1;2;3;4;5;6;7;8;9;10;11;12;]
		dataset #1 (up to 8): [0;1;2;3;4;5;6;7;]
		dataset #2 (up to 17): [0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;]
		dataset #3 (up to 16): [0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;]
	Going to process datatable #2 with 4 datasets: 
		dataset #0 (up to 24): [0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;20;21;22;

KeyboardInterrupt: 


save Heatmaps

In [13]:
for algo_idx, heatmaps_matrix in enumerate(heatmap_tensor):
    for idx, heatmap_array in enumerate(heatmaps_matrix):
        path = "./results/" + str(algo_name_array[algo_idx]) + "/heatmaps/heatmaps/" +  config_folder_prefix_array[idx]
        print("Writing to:" + path)
        save_images(heatmap_array, path, config_image_prefix_tensor[algo_idx][idx])

Writing to:./results/BinarySearch/heatmaps/heatmaps/BR/
Writing to:./results/BinarySearch/heatmaps/heatmaps/BI/
Writing to:./results/BinarySearch/heatmaps/heatmaps/TR/
Writing to:./results/BinarySearch/heatmaps/heatmaps/TI/
Writing to:./results/BubbleSort/heatmaps/heatmaps/BR/
Writing to:./results/BubbleSort/heatmaps/heatmaps/BI/
Writing to:./results/BubbleSort/heatmaps/heatmaps/TR/
Writing to:./results/BubbleSort/heatmaps/heatmaps/TI/
Writing to:./results/Factorial/heatmaps/heatmaps/BR/
Writing to:./results/Factorial/heatmaps/heatmaps/BI/
Writing to:./results/Factorial/heatmaps/heatmaps/TR/
Writing to:./results/Factorial/heatmaps/heatmaps/TI/
Writing to:./results/Fibonacci/heatmaps/heatmaps/BR/
Writing to:./results/Fibonacci/heatmaps/heatmaps/BI/
Writing to:./results/Fibonacci/heatmaps/heatmaps/TR/
Writing to:./results/Fibonacci/heatmaps/heatmaps/TI/
Writing to:./results/IntegerBinary/heatmaps/heatmaps/BR/
Writing to:./results/IntegerBinary/heatmaps/heatmaps/BI/
Writing to:./results/I

<h2>3. Create Average Heatmaps</h2>

create heatmaps

In [71]:
heatmap_tensor = []
mask_tensor = []
shape_array = []

print("Going to process " + str(len(visual_stimulus_data_tensor)) + " datatables: ")
for algo_idx, visual_stimulus_data_matrix in enumerate(visual_stimulus_data_tensor):
    average_heatmap_array = []
    mask_array = []
    print("\tGoing to process datatable #" + str(algo_idx) + " with " + str(len(visual_stimulus_data_matrix)) + " datasets: ")
    print("\t\t", end="")
    
    # iterate over all the datasets
    for idx, stimulus_dataset in enumerate(visual_stimulus_data_matrix):
        print("#" + str(idx), end="")
        image = image_tensor[algo_idx][idx]
        shape_array.append(image.shape)
        visual_measurements = visual_stimulus_data_matrix[idx]
        im, mask = rEYEker.draw_average_shape_heat_map_rel(image, visual_measurements, click_setting, 1.0, 0.7, None, should_copy=True)
        average_heatmap_array.append(im)
        mask_array.append(mask)
        
    print()
        
        
    heatmap_tensor.append(average_heatmap_array)
    mask_tensor.append(mask_array)

Going to process 8 datatables: 
	Going to process datatable #0 with 4 datasets: 
		#0#1#2#3
	Going to process datatable #1 with 4 datasets: 
		#0#1#2#3
	Going to process datatable #2 with 4 datasets: 
		#0#1#2#3
	Going to process datatable #3 with 4 datasets: 
		#0#1#2#3
	Going to process datatable #4 with 4 datasets: 
		#0#1#2#3
	Going to process datatable #5 with 4 datasets: 
		#0#1#2#3
	Going to process datatable #6 with 4 datasets: 
		#0#1#2#3
	Going to process datatable #7 with 4 datasets: 
		#0#1#2#3


save heatmaps

In [127]:
for algo_idx, heatmaps_matrix in enumerate(heatmap_tensor):
    for idx, heatmap in enumerate(heatmaps_matrix):
        #path = "./results/" + str(algo_name_array[algo_idx]) + "/heatmaps/average_heatmap/"
        path = "./results/averageHeatMaps/"
        print("Writing to:" + path)
        save_images([heatmap], path, config_image_prefix_tensor[algo_idx][idx])

Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/
Writing to:./07/


<h2>4. Create Sequence diagramms</h2>

create sequence diagrams

In [16]:
sequence_diagrams_tensor = []

print("Going to process " + str(len(visual_stimulus_data_tensor)) + " datatables: ")
for algo_idx, visual_stimulus_data_matrix in enumerate(visual_stimulus_data_tensor):
    sequence_diagrams_matrix = []
    print("\tGoing to process datatable #" + str(idx) + " with " + str(len(visual_stimulus_data_matrix)) + " datasets: ")
    
    # iterate over all the datasets
    for dataset_idx, stimulus_dataset in enumerate(visual_stimulus_data_matrix):
        sequence_diagram_array = []
        print("\t\tdataset #" + str(dataset_idx) + " (up to "+ str(len(stimulus_dataset)) + "): [", end='')
    
        # iterate over all the measurements of the dataset
        for visual_idx, stimulus_measurement in enumerate(stimulus_dataset):
            print(str(visual_idx), end=";")
            im = image_tensor[algo_idx][dataset_idx]
            try:
                im = rEYEker.draw_vertical_line_diagram(im, stimulus_measurement, should_copy=True)
                sequence_diagram_array.append(im)
                
            except:
                #TODO
                sequence_diagram_array.append(im.copy())
                #print("W.I.P.:", end='')
                #print("to many clicks for dataset " + str(dataset_idx) + " datset " + str(visual_idx))
                
        print("]")
        sequence_diagrams_matrix.append(sequence_diagram_array)
    sequence_diagrams_tensor.append(sequence_diagrams_matrix)

Going to process 8 datatables: 
	Going to process datatable #3 with 4 datasets: 
		dataset #0 (up to 5): [0;1;2;3;4;]
		dataset #1 (up to 2): [0;1;]
		dataset #2 (up to 3): [0;1;2;]
		dataset #3 (up to 8): [0;1;2;3;4;5;6;7;]
	Going to process datatable #3 with 4 datasets: 
		dataset #0 (up to 12): [0;1;2;3;4;5;6;7;8;9;10;11;]
		dataset #1 (up to 7): [0;1;2;3;4;5;6;]
		dataset #2 (up to 5): [0;1;2;3;4;]
		dataset #3 (up to 14): [0;1;2;3;4;5;6;7;8;9;10;11;12;13;]
	Going to process datatable #3 with 4 datasets: 
		dataset #0 (up to 17): [0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;]
		dataset #1 (up to 25): [0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;20;21;22;23;24;]
		dataset #2 (up to 24): [0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16;17;18;19;20;21;22;23;]
		dataset #3 (up to 16): [0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;]
	Going to process datatable #3 with 4 datasets: 
		dataset #0 (up to 12): [0;1;2;3;4;5;6;7;8;9;10;11;]
		dataset #1 (up to 17): [0;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;

save sequence diagrams

In [17]:
for algo_idx, sequence_diagrams_matrix in enumerate(sequence_diagrams_tensor):
    for idx, sequence_diagram_array in enumerate(sequence_diagrams_matrix):
        path = "./results/" + str(algo_name_array[algo_idx]) + "/sequence_diagrams/" +  config_folder_prefix_array[idx]
        print("Writing to:" + path)
        save_images(sequence_diagram_array, path, config_image_prefix_tensor[algo_idx][idx])

Writing to:./results/BinarySearch/sequence_diagrams/BR/
Writing to:./results/BinarySearch/sequence_diagrams/BI/
Writing to:./results/BinarySearch/sequence_diagrams/TR/
Writing to:./results/BinarySearch/sequence_diagrams/TI/
Writing to:./results/BubbleSort/sequence_diagrams/BR/
Writing to:./results/BubbleSort/sequence_diagrams/BI/
Writing to:./results/BubbleSort/sequence_diagrams/TR/
Writing to:./results/BubbleSort/sequence_diagrams/TI/
Writing to:./results/Factorial/sequence_diagrams/BR/
Writing to:./results/Factorial/sequence_diagrams/BI/
Writing to:./results/Factorial/sequence_diagrams/TR/
Writing to:./results/Factorial/sequence_diagrams/TI/
Writing to:./results/Fibonacci/sequence_diagrams/BR/
Writing to:./results/Fibonacci/sequence_diagrams/BI/
Writing to:./results/Fibonacci/sequence_diagrams/TR/
Writing to:./results/Fibonacci/sequence_diagrams/TI/
Writing to:./results/IntegerBinary/sequence_diagrams/BR/
Writing to:./results/IntegerBinary/sequence_diagrams/BI/
Writing to:./results/I

# AOI categorization

In [62]:
config_prefix = ['BR', 'BI', 'TR', 'TI']
aoi_cat_path_matrix = []

for algo_name in algo_name_array:
    aoi_array =[
        'data/aoi_categorized/AOI_BR_' + algo_name + '.xlsx',
        'data/aoi_categorized/AOI_BI_' + algo_name + '.xlsx',
        'data/aoi_categorized/AOI_TR_' + algo_name + '.xlsx',
        'data/aoi_categorized/AOI_TI_' + algo_name + '.xlsx',
    ]
    aoi_cat_path_matrix.append(aoi_array)
    
aoi_df_matrix = []
for path_array in aoi_cat_path_matrix:
    aoi_df_array = []
    for path in path_array:
        raw = pd.read_excel(path)
        tmp_df = pd.DataFrame(raw)
        aoi_df_array.append(tmp_df)
    aoi_df_matrix.append(aoi_df_array) 
    
def is_in(df, y):
    for _idx, row in df.iterrows():
        if row["startHeight"] <= y <= row["stopHeight"]:
            return row["Name"]
    return "none"

iterative = ['none', 'main', 'Iterative definition', 'Pre calculation', 'Iteration Condition', 'Iteration Step', 'Return Result']
recursive = ['none','main', 'Recursive definition', 'Pre calculation', 'Recursive Condition', 'Recursive Step', 'Return Result']
order = ['0', '1', '2', '3', '4', '5', '6']

In [106]:
mask_array = []
for mask_2d in mask_tensor:
    for mask_1d in mask_2d:
        mask_array.append(mask_1d)
        
aoi_df_array = []
for aoi_df_1d in aoi_df_matrix:
    for df in aoi_df_1d:
        aoi_df_array.append(df)

algo_df = pd.DataFrame([], columns=["Comprehension", "Programming", "Algorithm"])
for algo in algo_name_array:
    for idx, prefix in enumerate(config_prefix):
        comprehension = "BU"
        if idx >= 2:
            comprehension = "TD"
        
        programming = "R"
        if idx%2==1:
            programming = "I"
        
        algo_df = algo_df.append(pd.DataFrame(
            [[comprehension, programming, algo]],
            columns=["Comprehension", "Programming", "Algorithm"]))
algo_df = algo_df.reset_index()
algo_df = algo_df.drop("index", axis=1)
    
additional = iterative.copy()
for element in recursive:
    additional.append(element)

additional = list(set(additional))
for element in additional:
    algo_df.insert(loc=3, column=element, value=0)

for idx, mask in enumerate(mask_array):
    height = shape_array[idx][0]
    width = shape_array[idx][1]
    aoi_df = aoi_df_array[idx]
    print(idx)
    for h in range(height):
        for w in range(width):
            mask_idx = h*width+w
            if mask[mask_idx] != 0:
                name = is_in(aoi_df, h)
                algo_df.at[idx, name] += 1


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31


In [129]:
#algo_df = algo_df.reset_index()
pattern = []
for idx, row in algo_df.iterrows():
    algo_pattern = [row["Algorithm"], row["Comprehension"], row["Programming"]]
    for element in additional:
        if row[element] >= 500:
            algo_pattern.append(element)
    pattern.append(algo_pattern)  
    
for data in pattern:
    print(str(data))

['BinarySearch', 'BU', 'R', 'main', 'Recursive Step', 'Pre calculation', 'Recursive Condition']
['BinarySearch', 'BU', 'I', 'main']
['BinarySearch', 'TD', 'R', 'main']
['BinarySearch', 'TD', 'I', 'Iteration Condition', 'main', 'Pre calculation', 'Iteration Step']
['BubbleSort', 'BU', 'R', 'main']
['BubbleSort', 'BU', 'I', 'main']
['BubbleSort', 'TD', 'R', 'main']
['BubbleSort', 'TD', 'I', 'Iteration Condition', 'main', 'Iteration Step']
['Factorial', 'BU', 'R', 'main', 'Recursive Step', 'Recursive definition']
['Factorial', 'BU', 'I', 'Iteration Condition', 'Iterative definition', 'Pre calculation']
['Factorial', 'TD', 'R', 'main', 'Recursive Step', 'Recursive definition']
['Factorial', 'TD', 'I', 'Iteration Condition', 'Iterative definition', 'Pre calculation']
['Fibonacci', 'BU', 'R', 'none', 'Recursive Step']
['Fibonacci', 'BU', 'I', 'Iteration Condition', 'Pre calculation', 'Iteration Step']
['Fibonacci', 'TD', 'R', 'Recursive Step']
['Fibonacci', 'TD', 'I', 'Iteration Condition', 