In [1]:
'''
Process human plan data for Condition 3 (all constraints)
'''

import numpy as np
import pandas as pd
import json 
import os
import sys
import random
import math
import matplotlib.pylab as plt 
import seaborn as sns
from matplotlib.font_manager import FontProperties
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.gridspec as gridspec
from transformers import GPT2Tokenizer


In [17]:
# NOTE(review): absolute, machine-specific path. Nothing in the visible cells reads
# `main_dir`; it is kept only in case other cells still reference it.
main_dir = "/Users/kcollins/language_and_structure_of_thoughts/plans"

# Repo-relative locations actually used by this notebook. The earlier
# `main_dir`-based values of `data_pth` / `save_dir` were overwritten immediately
# and never read, so they have been removed.
# NOTE(review): the input file says "without-many-objs" while the output folder
# says "many_objs" -- confirm the output folder name is intended.
data_dir = "../data/generations/"
data_pth = f"{data_dir}raw-generate-plans-constrained-without-many-objs.csv"
save_dir = "../exp_results/constrained_many_objs"

# exist_ok avoids the check-then-create race and makes the cell safely re-runnable
os.makedirs(save_dir, exist_ok=True)


In [None]:
# Read the raw export into memory
df = pd.read_csv(data_pth)

# Unique participant identifiers found in the PROLIFIC_PID column
all_subjs = set(df["PROLIFIC_PID"])

# Quick sanity check on the size of what was loaded
n_subjs, n_rows = len(all_subjs), len(df)
print("Num subjs: ", n_subjs, ", Num rows: ", n_rows)


In [21]:
# Tally which subjects fall under each condition number
subjs_per_condition = {}
for subj_id in all_subjs:
    # the condition is read from the subject's first row
    first_condition = df.loc[df["PROLIFIC_PID"] == subj_id, "condition"].iloc[0]
    subjs_per_condition.setdefault(int(first_condition), []).append(subj_id)

# Report the group sizes in ascending condition order
for cond_num, members in sorted(subjs_per_condition.items()):
    print("Condition Num: ", cond_num, " Count: ", len(members))

Condition Num:  1  Count:  16
Condition Num:  2  Count:  12
Condition Num:  3  Count:  10
Condition Num:  4  Count:  11


In [None]:
# Keep only the free-text plan-generation trials.
# Columns read by this notebook:
# - "PROLIFIC_PID": subj_id
# - "prompt": goal
# - "constraint": constraint text appended to the goal
# - "response": raw plan text
# Other rows that share the 'survey-text' trial type (e.g. comments) are
# excluded by also requiring the "generate plans" task.
is_text_response = df["trial_type"] == 'survey-text'
is_plan_task = df["task"] == "generate plans"
generated_plans_df = df[is_text_response & is_plan_task].reset_index()


In [25]:
# save out plans generated by each subject

def parse_plan(raw_plan):
    """Pull the plan text out of a raw serialized response string.

    The extraction is purely textual (the response is not parsed as JSON):
    keep what follows the last ``"Plan":`` marker, cut at the first ``}``,
    keep what follows the last ``:"`` in that piece, then turn literal
    backslash-n sequences into ``<br />``. Any quotes surrounding the plan
    are left in place.

    Args:
        raw_plan: raw response string.

    Returns:
        The extracted plan string.
    """
    after_key = raw_plan.rpartition('"Plan":')[2]
    before_brace = after_key.partition("}")[0]
    # `before_brace` cannot contain "}", so no further trimming at '"}' is needed
    plan_text = before_brace.rpartition(':"')[2]
    return plan_text.replace("\\n", "<br />")

def save_subj_data(subj_id, data_df, f):
    """Write one subject's goals and parsed plans to an open text file.

    Args:
        subj_id: value matched against the ``PROLIFIC_PID`` column.
        data_df: DataFrame with ``PROLIFIC_PID``, ``prompt``, ``constraint``
            and ``response`` columns.
        f: writable text file object; a ``PID`` header is written first,
            followed by one Goal/Plan entry per matching row.
    """
    subj_rows = data_df[data_df["PROLIFIC_PID"] == subj_id]
    f.write(f'\nPID: {subj_id}\n')
    entries = zip(subj_rows["prompt"], subj_rows["constraint"], subj_rows["response"])
    for prompt, constraint_text, response in entries:
        # prompt[:-1] drops the prompt's final character (presumably trailing
        # punctuation -- confirm) so the constraint can be appended
        f.write(
            f'\n\tGoal: {prompt[:-1]}, {constraint_text}.\n\tPlan: {parse_plan(response)}\n'
        )
    

# Write a single report listing every subject's generated plans.
subj_ids = sorted(all_subjs)  # sorted so the output order is the same on every run
filepth = f"{save_dir}/goals_per_subj.txt"

# `with` guarantees the file is flushed and closed even if writing a
# subject's data raises part-way through
with open(filepth, 'w') as f:
    f.write("Generated Plans per Subject\n")
    for subj_id in subj_ids:
        save_subj_data(subj_id, generated_plans_df, f)

In [26]:
# Build a mapping from full goal text (goal + constraint) to a fixed-size random
# subsample of parsed plans.
random.seed(10)  # fixed seed so the subsample is the same on every run
parsed_plans_per_goal = {}
all_goals = sorted(list(set(generated_plans_df.prompt)))
n_keep = 10  # maximum number of plans kept per goal
for goal in all_goals:
    goal_df = generated_plans_df.loc[generated_plans_df.prompt == goal]
    # the constraint is taken from the goal's first row
    constraint = goal_df.constraint.iloc[0]
    # goal[:-1] drops the prompt's final character before appending the constraint
    full_goal = f"{goal[:-1]}, {constraint}."
    parsed_plans = [parse_plan(raw_plan) for raw_plan in goal_df.response]
    if len(parsed_plans) < n_keep:
        # flag goals that have fewer plans than requested
        print(goal, len(parsed_plans))
    # random.sample raises ValueError when asked for more items than exist, so
    # cap the sample size; under-filled goals keep all of their plans
    parsed_plans_per_goal[full_goal] = random.sample(
        parsed_plans, min(n_keep, len(parsed_plans))
    )

In [27]:
# save out plans per goal (aggregate over subjects)

def save_goal_data(goal, data_df, f):
    """Write every parsed plan recorded for one goal to an open text file.

    Args:
        goal: prompt text matched against the ``prompt`` column.
        data_df: DataFrame with ``prompt``, ``constraint`` and ``response``
            columns.
        f: writable text file object; a Goal header is written first,
            followed by one Plan entry per matching row.

    Prints the goal and its response count when fewer than 10 responses exist.
    """
    matching = data_df[data_df["prompt"] == goal]
    responses = matching["response"]
    # the constraint is taken from the first matching row
    first_constraint = matching["constraint"].iloc[0]
    f.write(f'\nGoal: {goal[:-1]}, {first_constraint}.\n')
    if len(responses) < 10:
        print(goal, len(responses))
    for response in responses:
        f.write(f'\n\tPlan: {parse_plan(response)}\n')
        
# Write the subsampled plans, grouped by goal, to a single report.
filepth = f"{save_dir}/plans_per_goal.txt"
full_goals = sorted(parsed_plans_per_goal.keys())  # stable goal order across runs

# `with` guarantees the file is flushed and closed even if a write raises
with open(filepth, 'w') as f:
    f.write("Generated Plans per Goal\n")
    for goal in full_goals:
        f.write(f'\nGoal: {goal}\n')
        for plan in parsed_plans_per_goal[goal]:
            f.write(
                f'\n\tPlan: {plan}\n'
            )


In [28]:
# The GPT-2 tokenizer is used here only to measure plan length in tokens
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tot_plans = 0
# one token count per subsampled plan, flattened across all goals
n_tokens = [
    len(tokenizer(plan)['input_ids'])
    for plans in parsed_plans_per_goal.values()
    for plan in plans
]

In [29]:
# Summary statistics of plan length in tokens (labels carry their own indentation)
token_stats = (("Mean", np.mean), ("  Min", np.min), ("  Max", np.max))
for label, reducer in token_stats:
    print(f"{label} num tokens: {reducer(n_tokens)}")


Mean num tokens: 50.97142857142857
  Min num tokens: 6
  Max num tokens: 290


In [15]:
# Show every plan whose token count equals the minimum.
# The minimum is loop-invariant, so compute it once instead of once per plan.
min_tokens = np.min(n_tokens)
for goal, plans in parsed_plans_per_goal.items():
    for plan in plans:
        if len(tokenizer(plan)['input_ids']) == min_tokens:
            print(goal, plan)

Paint a fence, without using a paint brush, paint, a pressure washer, primer, a pail, music, lemonade, or cleaning supplies. "Hire a painter."
