# Data Analysis for Relational AI Group Experiment

Authors: Elijah Claggett, Faria Huq

In [None]:
# Important Variables

# Location of the Empirica data files: this directory is scanned for `.json`
# files when listing trials and is also passed to readDataFiles.
data_path = './data/'

In [None]:
# Imports
from scipy.stats import ttest_ind, bootstrap
from convokit import PolitenessStrategies
import matplotlib.pyplot as plt
from colorama import Fore
from scipy import stats
import seaborn as sns
import pandas as pd
import numpy as np
import requests
import spacy
import json
import re
import math
import os
from utils import prettyPrintMulti, prettyPrintList, prettyPrintChats, bf, readDataFiles, empiricaColumnExists, getMultiGameData

## 1) Load data

In [None]:
# List the trial data files: every `.json` file in data_path, in sorted order
trials = sorted(
    file_name
    for file_name in os.listdir(data_path)
    if file_name.endswith('.json')
)

print(bf('All trials:'))
prettyPrintList(trials)

In [None]:
# Load all trial files. readDataFiles returns four collections (game, player,
# stage and player-stage data); the first three are indexed by trial/game ID
# in the cells that follow.
game_dict_multi, player_dict_multi, stage_dict_multi, playerStage_dict_multi = readDataFiles(trials, data_path)

In [None]:
from datetime import datetime

def truncate_to_microseconds(ts):
    """Normalize an ISO-8601 timestamp string for `datetime.fromisoformat`.

    The fractional-seconds part is cut (or zero-padded) to exactly six digits
    and a trailing ``Z`` is rewritten as ``+00:00``.

    Args:
        ts: Timestamp string such as ``"2025-05-08T13:42:36.197123456Z"``.

    Returns:
        The normalized timestamp string. When the fractional part is followed
        by an explicit UTC offset (e.g. ``+00:00`` or ``-05:00``) that offset
        is preserved; when it is followed by ``Z`` or nothing, ``+00:00`` is
        used. Timestamps without a fractional part only get ``Z`` replaced.
    """
    base, dot, rest = ts.partition(".")
    if not dot:
        return ts.replace("Z", "+00:00")

    # Separate the leading run of fractional digits from whatever follows it
    # (a "Z", an explicit offset, or nothing). Stripping only "Z" would leave
    # an explicit offset glued to the digits and corrupt short fractions.
    n_digits = len(rest) - len(rest.lstrip("0123456789"))
    digits, suffix = rest[:n_digits], rest[n_digits:]

    # Exactly six digits: truncates nanosecond precision and pads short
    # fractions, which older `fromisoformat` versions (< 3.11) reject.
    micro = digits[:6].ljust(6, "0")

    offset = "+00:00" if suffix in ("", "Z") else suffix
    return f"{base}.{micro}{offset}"

In [None]:
# For each trial, build a list of stage names with their start times (Unix timestamps in milliseconds)
import numpy as np

# Maps each trial to a list of {'name', 'startTime'} dicts, where startTime is
# the stage's start as a Unix timestamp in milliseconds (np.inf when the start
# entry is NaN). Trials lacking a 'name', 'ended' or 'started' column are skipped.
updated_stage_dict_multi = {}
required_columns = ('name', 'ended', 'started')
for each_trial in stage_dict_multi:
    stage_list = []
    trial_stages = stage_dict_multi[each_trial]
    if any(column not in trial_stages.keys() for column in required_columns):
        continue

    for each_entry_name, each_entry_time in zip(trial_stages['name'], trial_stages['started']):
        start_is_nan = isinstance(each_entry_time, float) and math.isnan(each_entry_time)
        if start_is_nan:
            # No recorded start: use infinity as the start time
            iso_unix_ms = np.inf
        else:
            iso_dt = datetime.fromisoformat(truncate_to_microseconds(each_entry_time[0]['dt']))
            iso_unix_ms = int(iso_dt.timestamp() * 1000)
        stage_list.append({'name': each_entry_name[0]['val'], 'startTime': iso_unix_ms})

    updated_stage_dict_multi[each_trial] = stage_list
# print(updated_stage_dict_multi)

## 2) Clean data

Remove:

- People who didn't finish the entire experiment
- Test data (Eli / Faria) — note: this removal is currently commented out in the data-cleaning cell below

In [None]:
# Game/trial IDs that the reporting cells skip (presumably local test runs,
# going by the variable name -- confirm).
exclude_local_trails = [
    '01JTP5XG9V373HW0JF06P59GNZ',
    '01JTP48A46PS57YC8B2MC6T1MT',
    '01JTP4B3CMETA57BXMM3E66WFN',
    '01JTQ4YH098ZARPFM1TSYTR1XP',
    '01JTQ6FRYG87XERGETTKVQW098',
    '01JTQ7FBBK75WQ7RXXFVDRT1NV',
]

In [None]:
# Data cleaning
#
# Counts every recruited participant and drops, in place, the rows of
# participants who never submitted a summary. Results:
#   totalRecruited   - number of participant rows seen across all games
#   notFinishedStudy - number of rows dropped for lacking a summary

notFinishedStudy = 0
totalRecruited = 0
for gameID in player_dict_multi:
    players = player_dict_multi[gameID]
    # Iterate over a snapshot of the index because rows are dropped inside the loop
    for participantID in list(players.index):
        totalRecruited += 1
        participant = players.loc[participantID]
        if isinstance(participant['participantIdentifier'], list):
            eID = json.loads(participant['participantIdentifier'][-1]['val'])

            # Remove test data (only if we want to isolate real experiment data)
            # if eID[0:3].lower() == 'eli':
            #     totalRecruited -= 1
            #     player_dict_multi[gameID].drop(participantID, inplace=True)
            #     continue

        # Remove anyone who didn't complete the entire experiment (aka didn't submit a summary).
        # `'summary' in participant` only tests whether the column label exists for the
        # game, not whether this participant has a value. Elsewhere in this notebook
        # recorded attributes are lists of entries, so anything that is not a
        # non-empty list is treated as "no summary submitted".
        summary = participant['summary'] if 'summary' in participant else None
        if not isinstance(summary, list) or len(summary) == 0:
            players.drop([participantID], inplace=True)
            notFinishedStudy += 1


In [None]:
# Summarize the data cleaning process

def getSummaries(p):
    """Return the JSON-decoded value of the last 'summary' entry of participant `p`.

    Returns None when empiricaColumnExists reports no 'summary' column for `p`.
    """
    if not empiricaColumnExists(p, 'summary'):
        return None
    latest_entry = p['summary'][-1]
    return json.loads(latest_entry['val'])

summaries = getMultiGameData(getSummaries, game_dict_multi, player_dict_multi)

# Count the participants that have summary data, ignoring the excluded trials
numParticipants = 0
for gameID in summaries:
    if gameID not in exclude_local_trails:
        for participantID in summaries[gameID]:
            numParticipants += 1

report_lines = [
    f'Total participants recruited: {totalRecruited}',
    f'Total participants kept: {numParticipants}',
    '---------------------------',
    f'Total participants not finish study: {notFinishedStudy}',
]
print('\n'.join(report_lines))

In [None]:
# Get Prolific metadata

def getParticipantIdentifier(p):
    """Return the JSON-decoded value of the last 'participantIdentifier' entry of `p`.

    Returns None when empiricaColumnExists reports no such column for `p`.
    """
    if not empiricaColumnExists(p, 'participantIdentifier'):
        return None
    latest_entry = p['participantIdentifier'][-1]
    return json.loads(latest_entry['val'])

def getProlificSession(p):
    """Return the JSON-decoded value of the last 'sessionID' entry of `p`.

    Returns None when empiricaColumnExists reports no such column for `p`.
    """
    if not empiricaColumnExists(p, 'sessionID'):
        return None
    latest_entry = p['sessionID'][-1]
    return json.loads(latest_entry['val'])
def getProlificStudy(p):
    """Return the JSON-decoded value of the last 'studyID' entry of `p`.

    Returns None when empiricaColumnExists reports no such column for `p`.
    """
    if not empiricaColumnExists(p, 'studyID'):
        return None
    latest_entry = p['studyID'][-1]
    return json.loads(latest_entry['val'])


pIDs = getMultiGameData(getParticipantIdentifier, game_dict_multi, player_dict_multi)
sessionIDs = getMultiGameData(getProlificSession, game_dict_multi, player_dict_multi)
studyIDs = getMultiGameData(getProlificStudy, game_dict_multi, player_dict_multi)

# Lookup tables keyed by the participant ID used in the per-game data:
# participant identifier, Prolific session ID and Prolific study ID.
e2p, e2session, e2study = {}, {}, {}

for gameID in pIDs:
    game_identifiers = pIDs[gameID]
    for participantID in game_identifiers:
        e2p[participantID] = game_identifiers[participantID][0]

for gameID in pIDs:
    game_sessions = sessionIDs[gameID]
    for participantID in game_sessions:
        session_values = game_sessions[participantID]
        # Only participants with at least one recorded session ID get an entry
        if len(session_values) > 0:
            e2session[participantID] = session_values[0]

for gameID in pIDs:
    game_studies = studyIDs[gameID]
    for participantID in game_studies:
        study_values = game_studies[participantID]
        # Only participants with at least one recorded study ID get an entry
        if len(study_values) > 0:
            e2study[participantID] = study_values[0]


## 3) Select and format data

In [None]:
# Participants

def getChatIdentities(p):
    """Return the JSON-decoded value of the last 'selfIdentity' entry of `p`.

    Returns None when empiricaColumnExists reports no such column for `p`.
    """
    if not empiricaColumnExists(p, 'selfIdentity'):
        return None
    latest_entry = p['selfIdentity'][-1]
    return json.loads(latest_entry['val'])

chatIdentities = getMultiGameData(getChatIdentities, game_dict_multi, player_dict_multi)

# Print each participant's most recent chat identity, one trial at a time
for gameID in chatIdentities:
    if gameID not in exclude_local_trails:
        print(bf('Trial:'), gameID)
        game_identities = chatIdentities[gameID]
        for participantID in game_identities:
            print(bf('Participant:'), e2p[participantID], bf('Chat Identity:'), game_identities[participantID][-1])

In [None]:
# Joined chat rooms

def getjoinedrooms(p):
    """Return the JSON-decoded value of the last 'joinedRooms' entry of `p`.

    Returns None when empiricaColumnExists reports no such column for `p`.
    """
    if not empiricaColumnExists(p, 'joinedRooms'):
        return None
    latest_entry = p['joinedRooms'][-1]
    joined_rooms = json.loads(latest_entry['val'])
    return joined_rooms

joinedrooms = getMultiGameData(getjoinedrooms, game_dict_multi, player_dict_multi)

# Print the last joined-rooms value collected for each participant, per trial
for gameID in joinedrooms:
    if gameID not in exclude_local_trails:
        print(bf('Trial:'), gameID)
        game_joined_rooms = joinedrooms[gameID]
        for participantID in game_joined_rooms:
            print(bf('Participant:'), e2p[participantID], bf('joined room:'), game_joined_rooms[participantID][-1])


In [None]:
# Tutorial Duration

def getTutorialDuration(p):
    """Return the minutes between the last 'participantID' and 'passedTutorial' entries.

    The duration is computed from the 'dt' timestamps of those two entries and
    rounded to two decimals. Returns None when empiricaColumnExists reports
    that either column is missing for `p`.
    """
    if not (empiricaColumnExists(p, 'participantID') and empiricaColumnExists(p, 'passedTutorial')):
        return None

    join_stamp = p['participantID'][-1]['dt']
    pass_stamp = p['passedTutorial'][-1]['dt']

    joined_at = datetime.fromisoformat(truncate_to_microseconds(join_stamp))
    passed_at = datetime.fromisoformat(truncate_to_microseconds(pass_stamp))

    elapsed = passed_at - joined_at
    return round(elapsed.total_seconds() / 60, 2)

TutorialDurations = getMultiGameData(getTutorialDuration, game_dict_multi, player_dict_multi)

# Print the collected tutorial durations for each participant, per trial
for gameID in TutorialDurations:
    if gameID not in exclude_local_trails:
        print(bf('Trial:'), gameID)
        game_durations = TutorialDurations[gameID]
        for participantID in game_durations:
            print(bf('Participant:'), e2p[participantID], bf('Tutorial Duration (minute):'), game_durations[participantID])

In [None]:
# Initial survey results
def getSurveyResults(p):
    """Return the JSON-decoded value of the last 'surveyAnswers' entry of `p`.

    Returns None when empiricaColumnExists reports no such column for `p`.
    """
    if not empiricaColumnExists(p, 'surveyAnswers'):
        return None
    latest_entry = p['surveyAnswers'][-1]
    return json.loads(latest_entry['val'])

surveyResults = getMultiGameData(getSurveyResults, game_dict_multi, player_dict_multi)

# Print the collected survey results for each participant, per trial.
# Iterate over surveyResults itself (the data being printed) rather than
# another cell's chatIdentities, so every key looked up is guaranteed to exist
# and this cell does not depend on the chat-identity cell having run.
for gameID in surveyResults:
    if gameID in exclude_local_trails:  continue
    print(bf('Trial:'), gameID)
    for participantID in surveyResults[gameID]:
        print(bf('Participant:'), e2p[participantID], bf('Survey Results:'), surveyResults[gameID][participantID])

In [None]:
# Sanity check: print the name of every stage whose start time and the next
# stage's start time bracket a sample timestamp (Unix milliseconds).
dummy_time = 1746711756197
all_times = updated_stage_dict_multi['01JTR1DWPVWF94FNXGNS546P2E']
for entry_id, (current_stage, next_stage) in enumerate(zip(all_times, all_times[1:])):
    if current_stage['startTime'] <= dummy_time <= next_stage['startTime']:
        print(current_stage['name'])

In [None]:
# Display the stage list of one trial.
# NOTE(review): `gameID` is not assigned in this cell; it holds whatever value
# the most recently executed `for gameID in ...` loop left behind -- confirm
# this is the trial you intend to inspect.
updated_stage_dict_multi[gameID]

In [None]:
# Chat messages

# Chat room creation times
def getCreatedRooms(p):
    """Return the JSON-decoded value of the last 'createRoom' entry of `p`.

    Returns None when empiricaColumnExists reports no such column for `p`.
    """
    if not empiricaColumnExists(p, 'createRoom'):
        return None
    latest_entry = p['createRoom'][-1]
    return json.loads(latest_entry['val'])
createdRooms = getMultiGameData(getCreatedRooms, game_dict_multi, player_dict_multi)

# Chat rooms: for each non-excluded trial print its rooms, who created a room,
# and every room's messages.
for gameID in game_dict_multi:
    if gameID in exclude_local_trails:
        continue

    game_data = game_dict_multi[gameID]
    # No chat rooms when the column is absent or its first entry is a float
    # (presumably NaN for a missing value -- confirm).
    if 'chatRooms' not in game_data.keys() or isinstance(game_data['chatRooms'][0], float):
        print('no chatroom found for', gameID)
        continue

    rooms = json.loads(game_data['chatRooms'][0][-1]['val'])
    print(bf('Trial:'), gameID)
    print(bf('Chat rooms:'), rooms)

    game_created_rooms = createdRooms[gameID]
    for participantID in game_created_rooms:
        creation_events = game_created_rooms[participantID]
        if len(creation_events) > 0:
            print('Participant', bf(e2p[participantID]), 'created room at', creation_events[-1])

    for roomID in rooms:
        print()
        print(bf('#'+rooms[roomID]['title']), 'messages:')
        channel_key = 'chatChannel-'+roomID
        roomMessages = json.loads(game_data[channel_key][0][-1]['val'])
        # Stage start times are passed along with the messages to prettyPrintChats
        stagetimeStamps = updated_stage_dict_multi[gameID]
        prettyPrintChats(roomMessages, stagetimeStamps)

In [None]:
# Suggestions

def getProvidedSuggestions(p):
    """Return the 'val' of every 'suggestedReply' entry of `p`, in order.

    Returns None when empiricaColumnExists reports no such column for `p`.
    """
    if not empiricaColumnExists(p, 'suggestedReply'):
        return None
    return [entry['val'] for entry in p['suggestedReply']]
def getCopiedSuggestions(p):
    """Return the 'val' of every 'copySuggestion' entry of `p`, in order.

    Returns None when empiricaColumnExists reports no such column for `p`.
    """
    if empiricaColumnExists(p, 'copySuggestion'):
        # The list must be created here; appending to an undefined `results`
        # raised NameError as soon as a participant had copy events.
        results = []
        for each_suggestion in p['copySuggestion']:
            results.append(each_suggestion['val'])
        return results
def getAcceptedSuggestions(p):
    """Return the 'val' of every 'sendSuggestion' entry of `p`, in order.

    Returns None when empiricaColumnExists reports no such column for `p`.
    """
    if empiricaColumnExists(p, 'sendSuggestion'):
        # The list must be created here; appending to an undefined `results`
        # raised NameError as soon as a participant had send events.
        results = []
        for each_suggestion in p['sendSuggestion']:
            results.append(each_suggestion['val'])
        return results
    
providedSuggestions = getMultiGameData(getProvidedSuggestions, game_dict_multi, player_dict_multi)
copiedSuggestions = getMultiGameData(getCopiedSuggestions, game_dict_multi, player_dict_multi)
acceptedSuggestions = getMultiGameData(getAcceptedSuggestions, game_dict_multi, player_dict_multi)

# Print the provided, copied and sent suggestions of each participant, per trial
for gameID in providedSuggestions:
    if gameID not in exclude_local_trails:
        print(bf('Trial:'), gameID)
        for participantID in providedSuggestions[gameID]:
            print(bf('Participant:'), e2p[participantID])
            print('\t', bf('Suggestions provided:'), providedSuggestions[gameID][participantID])
            print('\t', bf('Suggestions edited:'), copiedSuggestions[gameID][participantID])
            print('\t', bf('Suggestions accepted:'), acceptedSuggestions[gameID][participantID])

In [None]:
# Summaries

def getSummaryText(p):
    """Return the JSON-decoded value of the last 'summaryText' entry of `p`.

    Returns None when empiricaColumnExists reports no such column for `p`.
    """
    if not empiricaColumnExists(p, 'summaryText'):
        return None
    latest_entry = p['summaryText'][-1]
    return json.loads(latest_entry['val'])

def getSummaryAgreement(p):
    """Return the JSON-decoded value of the last 'summaryAgreement' entry of `p`.

    Returns None when empiricaColumnExists reports no such column for `p`.
    """
    if not empiricaColumnExists(p, 'summaryAgreement'):
        return None
    latest_entry = p['summaryAgreement'][-1]
    return json.loads(latest_entry['val'])
def getSuggestionRating(p):
    """Return the JSON-decoded value of the last 'suggestionRating' entry of `p`.

    Returns None when empiricaColumnExists reports no such column for `p`.
    """
    if not empiricaColumnExists(p, 'suggestionRating'):
        return None
    latest_entry = p['suggestionRating'][-1]
    return json.loads(latest_entry['val'])
    
def getSuggestionExplanation(p):
    """Return the JSON-decoded value of the last 'suggestionExplanation' entry of `p`.

    Returns None when empiricaColumnExists reports no such column for `p`.
    """
    if not empiricaColumnExists(p, 'suggestionExplanation'):
        return None
    latest_entry = p['suggestionExplanation'][-1]
    return json.loads(latest_entry['val'])

summaryText = getMultiGameData(getSummaryText, game_dict_multi, player_dict_multi)
summaryAgreement = getMultiGameData(getSummaryAgreement, game_dict_multi, player_dict_multi)
suggestionRating = getMultiGameData(getSuggestionRating, game_dict_multi, player_dict_multi)
suggestionExplanation = getMultiGameData(getSuggestionExplanation, game_dict_multi, player_dict_multi)

# Print each participant's summary text, agreement, suggestion rating and
# suggestion explanation, per trial
for gameID in summaryText:
    if gameID not in exclude_local_trails:
        print(bf('Trial:'), gameID)
        for participantID in summaryText[gameID]:
            print(bf('Participant:'), e2p[participantID])
            print('\t', bf('Summary Text:'), summaryText[gameID][participantID])
            print('\t', bf('Summary Agreement:'), summaryAgreement[gameID][participantID])
            print('\t', bf('Suggestion Rating:'), suggestionRating[gameID][participantID])
            print('\t', bf('Suggestion Explanation:'), suggestionExplanation[gameID][participantID])

## 4) Figures

In [None]:
# Put figure code here