In [1]:
import pandas as pd
import json
import os
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import numpy as np
from scipy import stats
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import scipy.stats
from statsmodels.stats.power import TTestIndPower
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning

# Data Preprocessing

In [2]:
def cleanup(data):
    """Split one raw CSV line into fields with quoting/JSON punctuation removed.

    The line is stripped of surrounding whitespace and split on every comma.
    Each field then loses its leading/trailing double quotes, followed by every
    backslash, brace, square bracket, single quote and double quote it contains.
    Whitespace inside an individual field is left untouched.

    Returns:
        list[str]: the cleaned fields, in their original order.
    """
    # Deletion table for the characters: \ { } ' [ ] "
    unwanted = str.maketrans('', '', '\\{}\'[]"')
    cleaned = []
    for raw_field in data.strip().split(','):
        cleaned.append(raw_field.strip('"').translate(unwanted))
    return cleaned

In [3]:
def parseCSV(filepath):
    """Parse the raw experiment export into one DataFrame row per subject.

    The first line of the file is treated as the CSV header and skipped, as
    are blank lines. Every other line is split into fields by ``cleanup``:

    * fields 0-4 become ``worker``, ``assignment``, ``hit``, ``version`` and
      ``timestamp``;
    * the remaining fields are scanned as ``key:value`` items. A ``response``
      item is remembered, and the following ``image_name`` item stores that
      response under the image's base file name. Any other key is kept as a
      post-survey column;
    * a field without ``:`` that directly follows a post-survey item is
      treated as a continuation of that answer (for example a free-text
      comment that itself contained a comma) and is appended to it,
      separated by a space. Fragments with no preceding post-survey item
      are ignored.

    Args:
        filepath: path of the CSV file to read (opened as UTF-8).

    Returns:
        pandas.DataFrame: one row per subject; all values are strings.
    """
    processed_data = []

    with open(filepath, 'r', encoding='utf-8') as file:
        for i, subj_data in enumerate(file):

            if i == 0:
                continue  # CSV headers
            if not subj_data.strip():
                continue  # blank line: no subject fields to index

            processed_subj = cleanup(subj_data)

            subject_info = {
                'worker': processed_subj[0],
                'assignment': processed_subj[1],
                'hit': processed_subj[2],
                'version': processed_subj[3],
                'timestamp': processed_subj[4]
                }

            post_survey = {}
            resp = None
            # Most recent post-survey key, so that comma-split fragments of a
            # free-text answer can be stitched back onto it.
            last_key = None

            for item in processed_subj[5:]:
                if ':' in item:
                    key, value = item.split(':', 1)

                    if key == 'response':
                        resp = value
                        last_key = None
                    elif key == 'image_name':
                        subject_info[os.path.basename(value)] = resp
                        last_key = None
                    else:
                        post_survey[key] = value
                        last_key = key

                # Fragment of a post-survey answer that was split apart
                elif last_key is not None:
                    post_survey[last_key] += ' ' + item

            subject_info.update(post_survey)
            processed_data.append(subject_info)

    return pd.DataFrame(processed_data)

In [9]:
def filter_subjects(df, attentioncheck):
    """
    Drop test runs and, optionally, subjects who failed the attention check.

    Rows whose 'comments' contain "TEST" are always removed (missing comments
    count as not being a test run). When attentioncheck is truthy, only rows
    with plates == 3 are kept as well.

    Note: the 'agents' and 'plates' columns of the DataFrame passed in are
    converted to numeric in place (unparseable values become NaN).
    """
    for survey_col in ('agents', 'plates'):
        df[survey_col] = pd.to_numeric(df[survey_col], errors='coerce')

    keep = ~df['comments'].str.contains("TEST", na=False)
    if attentioncheck:
        keep = (df['plates'] == 3) & keep

    return df[keep]

In [12]:
# Raw export to analyse; the path is relative, so it is resolved against the
# notebook's current working directory.
filepath = '../rawdata/socloaf_expt2.csv'
# One row per subject: identifiers, per-image responses and post-survey answers.
df = parseCSV(filepath)
# attentioncheck=True keeps only subjects with plates == 3, in addition to
# removing runs whose comments contain "TEST".
filtered_df = filter_subjects(df, attentioncheck=True)

# Building & fitting the mixed linear model

In [13]:
def effect_code_workload(image_name):
    """Return the two workload indicator codes for an image file name.

    The name is split on '_' and the third piece is read as the workload
    level. Levels map to 0/1 indicators with '4' as the all-zero reference
    (note: despite the function name, no -1 codes are produced):

        '1' -> [1, 0]
        '4' -> [0, 0]
        '8' -> [0, 1]

    Any other level, or a name with fewer than three '_'-separated pieces,
    yields [None, None] so the caller can drop the row instead of failing
    with an IndexError.

    Args:
        image_name: image file name string.

    Returns:
        list: a new two-element list [x_1_1, x_1_2].
    """
    parts = image_name.split('_')
    if len(parts) < 3:
        # Name does not carry a workload field at all.
        return [None, None]

    codes = {
        '1': [1, 0],
        '4': [0, 0],
        '8': [0, 1],
    }
    return codes.get(parts[2], [None, None])

def effect_code_bottleneck(image_name):
    """Return the bottleneck indicator for an image file name.

    0 when the name contains '_1.png', 1 when it contains '_6.png'
    (checked in that order), otherwise None.
    """
    for marker, code in (('_1.png', 0), ('_6.png', 1)):
        if marker in image_name:
            return code
    return None

def effect_code_groupsize(image_name):
    """Return the two group-size indicator codes for an image file name.

    The name is searched for 'cartoon_2', 'cartoon_4' and 'cartoon_6', in
    that order; the first substring found decides the code:

        'cartoon_2' -> [1, 0]
        'cartoon_4' -> [0, 0]
        'cartoon_6' -> [0, 1]

    If none is present the result is [None, None].
    """
    # Built per call so every caller receives its own list objects.
    markers = (
        ('cartoon_2', [1, 0]),
        ('cartoon_4', [0, 0]),
        ('cartoon_6', [0, 1]),
    )
    for marker, code in markers:
        if marker in image_name:
            return code
    return [None, None]

def decode_categories(row):
    """Add readable factor labels to a row that carries the indicator codes.

    Reads 'x_1_1'/'x_1_2', 'x_2_1'/'x_2_2' and 'x_3' from ``row`` and writes:

    * 'workload'   -- '1', '8' or '4' for the pairs (1, 0), (0, 1), (0, 0)
    * 'bottleneck' -- 'High' when x_3 == 1, otherwise 'Low'
    * 'group_size' -- '2', '6' or '4' for the pairs (1, 0), (0, 1), (0, 0)

    'workload' / 'group_size' are left unset when the pair matches none of
    the combinations above. ``row`` is modified in place and also returned.
    """
    # ((first indicator, second indicator), label), tried in this order.
    workload_levels = (((1, 0), '1'), ((0, 1), '8'), ((0, 0), '4'))
    group_size_levels = (((1, 0), '2'), ((0, 1), '6'), ((0, 0), '4'))

    for (first, second), label in workload_levels:
        if row['x_1_1'] == first and row['x_1_2'] == second:
            row['workload'] = label
            break

    if row['x_3'] == 1:
        row['bottleneck'] = 'High'
    else:
        row['bottleneck'] = 'Low'

    for (first, second), label in group_size_levels:
        if row['x_2_1'] == first and row['x_2_2'] == second:
            row['group_size'] = label
            break

    return row

In [14]:
# Reshape to long format: one row per (worker, image) response.
image_columns = [name for name in filtered_df.columns if name.endswith('.png')]
long_df = pd.melt(
    filtered_df,
    id_vars=['worker'],
    value_vars=image_columns,
    var_name='image',
    value_name='resp',
)

# Derive the design indicators from each image name:
# x_1_* = workload, x_2_* = group size, x_3 = bottleneck.
design_coders = (
    (['x_1_1', 'x_1_2'], effect_code_workload),
    (['x_2_1', 'x_2_2'], effect_code_groupsize),
    (['x_3'], effect_code_bottleneck),
)
for target_cols, coder in design_coders:
    long_df[target_cols] = pd.DataFrame(
        long_df['image'].apply(coder).tolist(),
        index=long_df.index,
    )

# Discard images whose name could not be coded, then rows whose response
# is not numeric.
long_df = long_df.dropna(subset=['x_3', 'x_2_1', 'x_2_2', 'x_1_1', 'x_1_2'])
long_df['response'] = pd.to_numeric(long_df['resp'], errors='coerce')
long_df = long_df.dropna(subset=['response'])

# Main effects of the three factors plus every two-way interaction between
# them; observations are grouped by worker.
model = smf.mixedlm(
    "response ~ x_1_1 + x_1_2 + x_2_1 + x_2_2 + x_3 + x_1_1:x_2_1 + x_1_1:x_2_2 + x_1_2:x_2_1 + x_1_2:x_2_2 + x_2_1:x_3 + x_2_2:x_3 + x_1_1:x_3 + x_1_2:x_3",
    long_df,
    groups=long_df["worker"],
)
result = model.fit()

print(result.summary())

          Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: response   
No. Observations: 9643    Method:             REML       
No. Groups:       271     Scale:              3.9801     
Min. group size:  22      Log-Likelihood:     -20804.0030
Max. group size:  36      Converged:          Yes        
Mean group size:  35.6                                   
---------------------------------------------------------
              Coef.  Std.Err.    z    P>|z| [0.025 0.975]
---------------------------------------------------------
Intercept      5.821    0.128  45.460 0.000  5.570  6.072
x_1_1          1.430    0.100  14.358 0.000  1.234  1.625
x_1_2         -0.230    0.100  -2.310 0.021 -0.425 -0.035
x_2_1         -1.301    0.100 -13.056 0.000 -1.496 -1.106
x_2_2          0.624    0.100   6.259 0.000  0.428  0.819
x_3           -0.732    0.091  -8.053 0.000 -0.911 -0.554
x_1_1:x_2_1    0.226    0.122   1.851 0.064 -0.013  0.465
x_1_1:x_2_2   -0.329    

# Visualizi