In [12]:
# If true, data is exported from the data acquisition database to csv files.
# If false, data is read from csv files
DB_Available=False

if DB_Available:
    from django.db.models import Model
    from django.core.management.base import BaseCommand
    from apps.construction import models
import csv
import pandas as pd
import numpy as np
import os
import json
import datetime
import traceback
import sys
import django
import scipy.stats
import statistics
import matplotlib.pyplot as plt
import scikit_posthocs
from autorank import autorank, plot_stats, create_report, latex_table

# READ DATA
If the script is run in the django environment with database and access via model class ORM, DB_Available is set to true.
In this case, data is extracted from the database and saved to csv files.
Otherwise, DB_Availabe is set to False, which leads to data being read from the previously exported csv
files. The resulting dataframes are equivalent.

In [13]:
# Directory to which results and figures are saved
EXPORT_PATH = "{}/".format(os.getcwd())



if DB_Available:
    # loads project settings from django and makes models accessible

    from django_for_jupyter import init_django
    init_django("goaltrees")

    # names of the studies to be included
    studies=["pilot_study","big5_study"]

    
    # Get participant data and save as csv
    
    participants = models.Participant.objects.filter(study__name__in=studies,
                                                     exclude_from_analyses=False,
                                                     finished__isnull=False).order_by('id')

    study_names=[]
    for p in participants:
        study_names.append(p.study.name)
   
    df_participants=pd.DataFrame(list(participants.values()))
    df_participants=df_participants.set_index("id")
    df_participants["study_name"]=study_names
    df_participants.to_csv("{}/participants.csv".format(EXPORT_PATH),
                sep=";")

 
            
    # Get questions and save as csv

    questions=models.Question.objects.filter(participant__in=participants).order_by("id")
    df_questions=pd.DataFrame(list(questions.values()))
    df_questions=df_questions.set_index("id")
    
    for q in questions:
        # translate answering codes
        translation=[0,4,3,2,1]
        if (q.participant.study.name=="pilot_study") and ("condition" in q.question):
            df_questions.at[q.pk, "answer"]= translation[int(q.answer)]
    df_questions.to_csv("{}/questions.csv".format(EXPORT_PATH),
                sep=";")

    
    
    ##########   export open questions   ##########
    
    questions = models.Question.objects.filter(participant__study__name__in=studies)

    columns=["participant","question","answer"]
    
    data={}
    
    for q in questions:
        if "condition" not in q.question and q.answer:
            data[q.pk]={"participant":q.participant.pk,
                        "question":q.question,
                        "answer":q.answer,}

    df_open_questions=pd.DataFrame.from_dict(columns=columns,
                              orient="index",
                              data=data,)

    df_open_questions.to_csv("{}/open_questions.csv".format(EXPORT_PATH),
                   sep=";")




    

else: 
    df_participants=pd.read_csv(filepath_or_buffer="{}/participants.csv".format(EXPORT_PATH), 
                                sep=";", 
                                index_col="id",
                               )
    df_questions=pd.read_csv(filepath_or_buffer="{}/questions.csv".format(EXPORT_PATH), 
                                sep=";",
                            )

# Participant Data

In [14]:
data = {}
columns = ["participant_id", "age", "gender", "semester","subject","degree","screen_size","operating_system","browser_language"]
    
    
conditions=["ranking condition: {}".format(x) for x in range(1,5)]

for c in conditions:
    columns.append(c)


ids=df_participants.index

for p_index, p_row in df_participants.iterrows():
    data[p_index]={"participant_id":p_index,
                    "age":p_row["age"],
                    "gender":p_row["gender"],
                    "semester":p_row["semester"],
                    "subject":p_row["subject"],
                    "degree":p_row["degree"],
                    "screen_size":p_row["screen_size"],
                    "operating_system":p_row["operating_system"],
                    "browser_language":p_row["browser_language"],
    }
    for q_index, q_row in df_questions.iterrows():
        if "condition" in q_row["question"]:
            for c in columns:
                if c[-10:] == q_row["question"][-10:]:
                    data[p_index][c]=q_row["answer"]


                        
                        
df_study = pd.DataFrame.from_dict(data=data,
                                     orient="index",
                                     columns=columns)

print(df_study.describe())
print("total: {}".format(len(df_study)))
print("male: {}".format(len(df_study.loc[df_study['gender'] == "male"])))
print("female: {}".format(len(df_study.loc[df_study['gender'] == "female"])))

for s in df_study["subject"].unique():
    print("{}: {}".format(s, len(df_study.loc[df_study['subject'] == s])))

df_study.to_csv("{}/merged_visu_data.csv".format(EXPORT_PATH),
                   sep=";")

       participant_id         age    semester  degree
count      120.000000  119.000000  118.000000     0.0
mean       200.058333   23.529412    5.076271     NaN
std        141.308608    6.422713    3.379569     NaN
min         22.000000   16.000000    1.000000     NaN
25%         87.250000   20.000000    2.000000     NaN
50%        157.500000   22.000000    4.000000     NaN
75%        301.750000   24.000000    6.000000     NaN
max        542.000000   58.000000   20.000000     NaN
total: 120
male: 43
female: 77
Cognitive Science: 59
Coxi Science: 1
Psychologie: 30
Psychologie : 2
Biologie, Spanisch: 1
Medieninformatik: 1
cognitive science : 1
Cogsci: 2
psychologie: 1
Cognitive science: 1
International Business : 2
Politikwissenschaft: 1
B.Sc. Life Science: 1
Maschinenbau Wirtschaft und Management: 1
keines: 1
Architektur: 1
Rechtswissenschaft : 1
Ev. Theologie/ Politik-Wirtschaft und DaF auf Gymnasiallehramt: 1
Pädagogik der Kindheit: 1
Lehramt: 1
Anglistik und katholische Theologie: 1

# Visu Comparison

In [15]:
condition_1= [int(x) for x in df_study["ranking condition: 1"]]
condition_2 = [int(x) for x in df_study["ranking condition: 2"]]
condition_3 = [int(x) for x in df_study["ranking condition: 3"]]
condition_4 = [int(x) for x in df_study["ranking condition: 4"]]

samples = {"condition 1": condition_1,
               "condition 2": condition_2,
               "condition 3": condition_3,
               "condition 4": condition_4,
               }

visualizations={"condition 1": "sunburst",
               "condition 2": "treemap",
               "condition 3": "dendrogram",
               "condition 4": "circlepacking",
               }

columns=["participant"]
data={}
for v in visualizations.values():
    columns.append(v)
for i in range(0, len(samples["condition 1"])):
    data[i]={"participant":i,
                 visualizations["condition 1"]:samples["condition 1"][i],
                 visualizations["condition 2"]:samples["condition 2"][i],
                 visualizations["condition 3"]:samples["condition 3"][i],
                 visualizations["condition 4"]:samples["condition 4"][i],}

df_rankings = pd.DataFrame.from_dict(data=data,
                                     orient="index",
                                     columns=columns)

df_rankings.to_csv("{}/rankings.csv".format(EXPORT_PATH),
                   sep=";")




columns=["visu 1","visu 2","first","second"]
data={}
for i in range(1,5):

    for j in range(i + 1,5):

        s1="condition {}".format(i)
        s2 ="condition {}".format(j)

        data["{}{}".format(s1,s2)]={"visu 1":visualizations[s1],
                                        "visu 2":visualizations[s2],}

    
        # percentage

        first=0
        second=0
        for x in range(0, len(samples[s1])):
            if samples[s1][x] > samples[s2][x]:
                second+=1
            elif samples[s1][x] < samples[s2][x]:
                first+=1

        data["{}{}".format(s1,s2)]["first"]="{:,.8f}".format(first/len(samples[s1]))
        data["{}{}".format(s1,s2)]["second"]="{:,.8f}".format(second/len(samples[s1]))

        
df_post_hoc = pd.DataFrame.from_dict(data=data,
                                     orient="index",
                                     columns=columns)

df_post_hoc.to_csv("{}/post_hoc.csv".format(EXPORT_PATH),
                   sep=";")


# Friedman Test

In [16]:
##########   Friedman test   ##########

result=scipy.stats.friedmanchisquare(condition_1,
                        condition_2,
                        condition_3,
                        condition_4)

print("Friedmann statistic: {:,.4f}".format(result[0]))
print("Friedmann p: {:,.12f}".format(result[1]))


Friedmann statistic: 360.0000
Friedmann p: 0.000000000000


# Nemenyi post-hoc

In [17]:
##########   Nemenyi post-hoc test  ##########

print("Nemenyi post-hoc Test for {} and {}".format(visualizations[s1], visualizations[s2]))


data = np.array([samples["condition 1"],
                     samples["condition 2"],
                     samples["condition 3"],
                     samples["condition 4"],
                    ])

result = scikit_posthocs.posthoc_nemenyi_friedman(data.T)

print("Nemenyi post-hoc test")
print(result)




##########   descriptive statistics   ##########



columns=["visualization","R 1", "R 2", "R 3", "R 4", "rank sum","average", "SD"]
data={}

for s in samples.keys():
    R1= 0
    R2 = 0
    R3 = 0
    R4 = 0
    sum = 0
    for value in samples[s]:
        sum+=value
        if value==1:
            R1+=1
        elif value==2:
            R2+=1
        elif value==3:
            R3+=1
        elif value==4:
            R4+=1
        else:
            print("ERROR: value is {}".format(value))

    data[visualizations[s]] = {}
    data[visualizations[s]]["visualization"] = visualizations[s]
    data[visualizations[s]]["R 1"] = "{0:.0%}".format(R1/len(samples[s]))
    data[visualizations[s]]["R 2"] = "{0:.0%}".format(R2/len(samples[s]))
    data[visualizations[s]]["R 3"] = "{0:.0%}".format(R3/len(samples[s]))
    data[visualizations[s]]["R 4"] = "{0:.0%}".format(R4/len(samples[s]))
    data[visualizations[s]]["average"] = "{:,.2f}".format(statistics.mean(samples[s]))
    data[visualizations[s]]["rank sum"] = sum
    data[visualizations[s]]["SD"] = "{:,.2f}".format(statistics.stdev(samples[s]))




df_sample = pd.DataFrame.from_dict(data=data,
                                     orient="index",
                                     columns=columns)

df_sample.to_csv("{}/merged_visu_data_statistics.csv".format(EXPORT_PATH),
                   sep=";")



data = pd.DataFrame()

for c in samples.keys():
    data[visualizations[c]]=[np.float64(x) for x in samples[c]]

result = autorank(data, alpha=0.05, verbose=False)
print(result)

result = autorank(data, alpha=0.05, verbose=False, approach='bayesian')
print(result)

create_report(result)

latex_table(result)









Nemenyi post-hoc Test for dendrogram and circlepacking
Nemenyi post-hoc test
       0      1      2      3
0  1.000  0.001  0.001  0.001
1  0.001  1.000  0.001  0.001
2  0.001  0.001  1.000  0.001
3  0.001  0.001  0.001  1.000
RankResult(rankdf=
               meanrank  mean  std ci_lower ci_upper effect_size magnitude
treemap             4.0   1.0  0.0        1        1         NaN     large
dendrogram          3.0   2.0  0.0        2        2        -inf     large
sunburst            2.0   3.0  0.0        3        3        -inf     large
circlepacking       1.0   4.0  0.0        4        4        -inf     large
pvalue=1.0192622884900308e-77
cd=0.428169268246638
omnibus=friedman
posthoc=nemenyi
all_normal=True
pvals_shapiro=[1.0, 1.0, 1.0, 1.0]
homoscedastic=False
pval_homogeneity=nan
homogeneity_test=bartlett
alpha=0.05
alpha_normality=0.0125
num_samples=120
posterior_matrix=
None
decision_matrix=
None
rope=None
rope_mode=None
effect_size=cohen_d)


  numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0)
  numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0)
  return (np.mean(x) - np.mean(y)) / _pooled_std(x, y)
  return (np.mean(x) - np.mean(y)) / _pooled_std(x, y)
  return (np.mean(x) - np.mean(y)) / _pooled_std(x, y)
  return (np.mean(x) - np.mean(y)) / _pooled_std(x, y)


RankResult(rankdf=
               mean  std ci_lower ci_upper effect_size magnitude  p_equal  \
circlepacking   4.0  0.0        4        4         NaN     large      NaN   
sunburst        3.0  0.0        3        3         inf     large      0.0   
dendrogram      2.0  0.0        2        2         inf     large      0.0   
treemap         1.0  0.0        1        1         inf     large      0.0   

               p_smaller decision  
circlepacking        NaN       NA  
sunburst             1.0  smaller  
dendrogram           1.0  smaller  
treemap              1.0  smaller  
pvalue=None
cd=None
omnibus=bayes
posthoc=bayes
all_normal=True
pvals_shapiro=[1.0, 1.0, 1.0, 1.0]
homoscedastic=None
pval_homogeneity=None
homogeneity_test=None
alpha=0.05
alpha_normality=0.0125
num_samples=120
posterior_matrix=
              circlepacking         sunburst       dendrogram          treemap
circlepacking           NaN  (1.0, 0.0, 0.0)  (1.0, 0.0, 0.0)  (1.0, 0.0, 0.0)
sunburst                NaN