In [2]:
'''
Process human plan data for Conditions 2 and 3 (constrained)
'''

import numpy as np
import pandas as pd
import json 
import os
import sys
import random
import math
import matplotlib.pylab as plt 
import seaborn as sns
from matplotlib.font_manager import FontProperties
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.gridspec as gridspec
from transformers import GPT2Tokenizer

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [5]:
task = "constrained"

data_pth = f"../data/generations/generate-explanations-{task}.csv"


In [None]:
df = pd.read_csv(data_pth)

# extract subj ids
all_subjs = set(df.PROLIFIC_PID)

print("Num subjs: ", len(all_subjs), ", Num rows: ", len(df))

In [None]:
all_subjs

In [8]:
# check num subjs per condition 
subjs_per_condition = {}
for subj_id in all_subjs: 
    subj_df = df.loc[df.PROLIFIC_PID == subj_id].reset_index()
    condition_num = int(subj_df.condition[0])
    if condition_num not in subjs_per_condition: subjs_per_condition[condition_num] = [subj_id]
    else: subjs_per_condition[condition_num].append(subj_id)

for cond_num in sorted(subjs_per_condition.keys()): 
    print("Condition Num: ", cond_num, " Count: ", len(subjs_per_condition[cond_num]))

Condition Num:  1  Count:  22
Condition Num:  2  Count:  16
Condition Num:  3  Count:  14
Condition Num:  4  Count:  13
Condition Num:  5  Count:  15
Condition Num:  6  Count:  15
Condition Num:  7  Count:  15
Condition Num:  8  Count:  15


In [9]:
tasks = ["single-constraint",  "all-constraints"]
fragment = "This could have happened because"
constraint_fragment = "However, the reason this happened was not that"
random.seed(10)

save_dir = f"../exp_results/"

if not os.path.exists(save_dir): os.makedirs(save_dir)
 

def parse_explanation(raw_explanation): 
    # remove extraneous symbols from explanation 
    # (hacky b/c of way original data was saved)
    explanation = raw_explanation.split("\"Explanation\":")[-1].split("}")[0]
    return explanation.split(":\"")[-1].split("\"}")[0].replace("\\n", "<br />") 

def save_subj_data(subj_id, data_df, f, constraint_per_scenario): 
    # extract data only for subject
    subj_df = data_df.loc[data_df.PROLIFIC_PID == subj_id]#.reset_index()
    f.write(f'\nPID: {subj_id}\n')
    for goal, raw_explanation in zip(subj_df.prompt, subj_df.response): 
        explanation = parse_explanation(raw_explanation)[1:] # remove starting "
        pre_frag = f"{explanation[0].lower()}{explanation[1:]}"
        if pre_frag[:32] == "this could have happened because": 
            pre_frag = pre_frag[32:]
        explanation = f"\"{fragment} {pre_frag}" # include the starter text
        constraint = constraint_per_scenario[goal]
        f.write(
            f'\n\tScenario: {goal} {constraint_fragment} {constraint}.\n\tExplanation: {explanation}\n'
        )

def save_goal_data(goal, data_df, f): 
    # extract data only corresponding to specific goal
    goal_df = data_df.loc[data_df.prompt == goal]#.reset_index()
    constraint = goal_df.constraint.iloc[0]
    f.write(f'\nScenario: {goal}\n')
    if len(goal_df.response) < 10: 
        print(goal, len(goal_df.response))
    for raw_explanation in goal_df.response: 
        explanation = parse_explanation(raw_explanation)[1:] # remove starting "
        pre_frag = f"{explanation[0].lower()}{explanation[1:]}"
        if pre_frag[:32] == "this could have happened because this could have happened because": 
            print(pre_frag)
            pre_frag = pre_frag[32:]
        pre_frags.append(pre_frag)
        explanation = f"\"{fragment} {pre_frag}" # include the starter text
#         explanation = f"\"{fragment} {explanation[0].lower()}p{explanation[1:]}" # include the starter text
        f.write(
            f'\n\tExplanation: {explanation}\n'
        )

for task in tasks: 
    # filter out trial type to only include goal text + ratings
    # key columns: 
    # - "PROLIFIC_PID": subj_id
    # - "prompt": goal
    # - "responses": plan or rating (depending on task)
    # - "rt": reaction time (in milliseconds)
    goal_ratings_df = df[(df.trial_type == 'survey-likert')].reset_index()
    generated_explanations_df = df[(df.trial_type == 'survey-text') 
                         & (df.task==f"generate, {task}")].reset_index() # remove comments (b/c same data type)
#     print(generated_explanations_df.head(3))
    
    # get a mapping of scenario to its associated constraint
    constraint_per_scenario = {}
    for scenario, constraint in zip(generated_explanations_df.prompt, generated_explanations_df.constraint): 
        if scenario not in constraint_per_scenario: 
            constraint_per_scenario[scenario] = constraint
    print("Num scenarios: ", len(constraint_per_scenario.keys()))
    
    # store the full prompts for each batch using a sample subj per
    # helpful for downstream plotting/decomp. in analysis
    batched_stim = {cond_num: [] for cond_num in subjs_per_condition.keys()}
    for cond_num in subjs_per_condition.keys():
        sample_subj = subjs_per_condition[cond_num][0]
        prompts = list(generated_explanations_df.loc[generated_explanations_df.PROLIFIC_PID == sample_subj].prompt)
        batched_stim[cond_num] = prompts
        

    subj_ids = sorted(list(all_subjs)) # ensure same order for consistency
    filepth = f"{save_dir}/exp_per_subj_{task}.txt"
    f = open(filepth, 'w')
    f.write("Generated Explanations per Subject\n")

    for subj_id in subj_ids: 
        save_subj_data(subj_id, generated_explanations_df, f, constraint_per_scenario)
    f.close()
    
    # get a list of explanations per goal 
    parsed_explanations_per_goal = {}
    all_goals = sorted(list(set(generated_explanations_df.prompt)))
    n_keep = 10
    pre_frags=[]
    for full_goal in all_goals:
        goal_df = generated_explanations_df.loc[generated_explanations_df.prompt == full_goal]
        if len(goal_df.response) < 10: 
            print(full_goal, len(goal_df.response), " cond: ", goal_df.condition.iloc[0])
        parsed_explanations = []
        for raw_explanation in goal_df.response: 
            explanation = parse_explanation(raw_explanation)[1:]
            pre_frag = f"{explanation[0].lower()}{explanation[1:]}"
            if pre_frag[:32] == "this could have happened because": 
                pre_frag = pre_frag[32:]
            pre_frags.append(pre_frag)
            explanation = f"\"{fragment} {pre_frag}" # include the starter text
            explanation = explanation.replace("  ", " ") # remove double-spaces that may have been introduced w/ parse

    #         explanation = f"\"{fragment} {explanation[0].lower()}{explanation[1:]}" # include the starter text
            parsed_explanations.append(explanation)
        # subsample down to 10
        parsed_explanations_per_goal[full_goal] = random.sample(parsed_explanations, n_keep) 
        
    # save out plans per goal (aggregate over subjects)
    pre_frags = []



    filepth = f"{save_dir}/exp_per_scenario_{task}.txt"
    f = open(filepth, 'w')
    f.write("Generated Explanations per Scenario\n")
    full_goals = sorted(parsed_explanations_per_goal.keys())
    for goal in full_goals: 
        constraint = constraint_per_scenario[goal]
        f.write(f'\nScenario: {goal} {constraint_fragment} {constraint}.\n')
        for plan in parsed_explanations_per_goal[goal]: 
            f.write(
                f'\n\tExplanation: {plan}\n'
            )

    f.close()    
        
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # compute number of tokens per plan
    n_tokens = []
    tot_plans = 0
    for goal, plans in parsed_explanations_per_goal.items(): 
        plan = plan[34:] # remove "This could have happened because" # since added
        n_tokens.extend([len(tokenizer(plan)['input_ids']) for plan in plans])

    print(f"{task} -- Mean num tokens: {np.mean(n_tokens)}") 
    print(f"  Min num tokens: {np.min(n_tokens)}") 
    print(f"  Max num tokens: {np.max(n_tokens)}") 

Num scenarios:  28
single-constraint -- Mean num tokens: 29.675
  Min num tokens: 11
  Max num tokens: 101
Num scenarios:  28
all-constraints -- Mean num tokens: 28.728571428571428
  Min num tokens: 10
  Max num tokens: 93
