In [1]:
'''
Process human plan data for Condition 1 (unconstrained)
'''

# to start, processing for Exp Name: unconstrained planning, pilot, v0

import numpy as np
import pandas as pd
import json 
import os
import sys
import random
import math
import matplotlib.pylab as plt 
import seaborn as sns
from matplotlib.font_manager import FontProperties
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.gridspec as gridspec
from transformers import GPT2Tokenizer

In [2]:
# Input CSV with the raw generations and the folder that receives every
# report / figure written by this notebook.
data_dir = "../data/generations/"
data_pth = f"{data_dir}raw-generate-plans-unconstrained.csv"
save_dir = "../exp_results/unconstrained"

# exist_ok avoids the check-then-create race and makes re-runs a no-op
os.makedirs(save_dir, exist_ok=True)


In [3]:
# Goals per condition, laid out as
# [condition_1_goals, condition_2_goals, ...]; each entry is a list of
# {'goal': <goal text>} dicts.
# NOTE(review): the plotting cell uses these strings as keys into a dict built
# from the recorded `prompt` values, so they presumably have to match the data
# character for character -- including the spelling 'platupyus'. Confirm
# against the CSV before correcting any text here.

goals = [[{'goal': 'Protect the deer in your nearby woods during hunting season.'},
  {'goal': 'Jump over a six foot tall man.'},
  {'goal': 'Bake a cake.'},
  {'goal': 'Escape quicksand.'},
  {'goal': 'Escape from inside a locked custodial closet.'},
  {'goal': "Build a float to dazzle the crowd at the Macy's Day Parade."},
  {'goal': 'Paint a fence.'}],
 [{'goal': 'Help your local town mayor win re-election.'},
  {'goal': 'Take your dog for a walk.'},
  {'goal': 'Make a pair of new shoes.'},
  {'goal': 'Play baseball with your friends.'},
  {'goal': 'Get your sofa onto the roof of your house.'},
  {'goal': "Steal the championship trophy from behind your school's award case."},
  {'goal': 'Cool down in a record-breaking heat wave.'}],
 [{'goal': 'Stop your canoe from falling down the waterfall.'},
  {'goal': 'Keep a baby platupyus entertained.'},
  {'goal': "Order food in a restaurant, where you don't speak the native language."},
  {'goal': 'Remove plaque from the teeth of a lion.'},
  {'goal': "Decorate the world's largest Christmas tree."},
  {'goal': 'Build a bookshelf.'},
  {'goal': 'Cut down a tree.'}],
 [{'goal': 'Find out how many sharks live in a cove off a nearby coast.'},
  {'goal': 'Clean the dirty dishes.'},
  {'goal': 'Fix a flat tire.'},
  {'goal': 'Keep the plants in your garden alive.'},
  {'goal': 'Put out a fire that started near the wires from your TV.'},
  {'goal': 'Create a safe landing for a falling skydiver.'},
  {'goal': 'Make a fort underwater.'}]]

In [None]:
# Read the raw experiment export into a single frame.
df = pd.read_csv(data_pth)

# The distinct participant ids found in the export.
all_subjs = set(df["PROLIFIC_PID"])

print(
    "Num subjs: ", len(all_subjs),
    ", Num rows: ", len(df),
)

# Peek at the first rows as the cell's displayed value.
df.head(3)

In [6]:
# Sanity check: how many participants ended up in each condition.
# Maps condition number -> list of participant ids.
subjs_per_condition = {}
for subj_id in all_subjs:
    subj_df = df.loc[df["PROLIFIC_PID"] == subj_id].reset_index()
    # the condition is read from the participant's first row
    condition_num = int(subj_df["condition"].iloc[0])
    subjs_per_condition.setdefault(condition_num, []).append(subj_id)

for cond_num in sorted(subjs_per_condition):
    print("Condition Num: ", cond_num, " Count: ", len(subjs_per_condition[cond_num]))

Condition Num:  1  Count:  12
Condition Num:  2  Count:  12
Condition Num:  3  Count:  12
Condition Num:  4  Count:  12


In [None]:
# Split the trials into the two kinds analysed below: likert ratings and
# free-text plans.
# Columns the later cells rely on:
# - "PROLIFIC_PID": subj_id
# - "prompt": goal
# - "response": plan or rating (depending on task)
# - "rt": reaction time (in milliseconds)
goal_ratings_df = df.loc[df["trial_type"] == "survey-likert"].reset_index()

# The task filter drops the free-text comment trials, which share the
# 'survey-text' trial type with the plans.
generated_plans_df = df.loc[
    (df["trial_type"] == "survey-text") & (df["task"] == "generate, unconstrained")
].reset_index()


In [11]:
# save out plans generated by each subject

def parse_plan(raw_plan):
    """Pull the plan text out of a raw response string.

    Works directly on the saved string instead of parsing it (hacky, because
    of the way the original data was saved):
      1. keep what follows the last '"Plan":' marker, up to the first '}';
      2. keep what follows the last ':"' inside that piece;
      3. turn every literal backslash-n sequence into an HTML line break.

    Quote characters surrounding the plan are left as they are, and anything
    after a '}' that occurs inside the plan text is cut off.

    Args:
        raw_plan: the response string as stored in the data.

    Returns:
        The extracted plan as a string.
    """
    after_marker = raw_plan.rpartition('"Plan":')[2]
    up_to_brace = after_marker.partition("}")[0]
    after_colon = up_to_brace.rpartition(':"')[2]
    # up_to_brace holds no '}' any more, so this cut never removes anything;
    # it is kept so the steps mirror the saved format one to one
    plan_text = after_colon.partition('"}')[0]
    return plan_text.replace("\\n", "<br />")

def save_subj_data(subj_id, data_df, f):
    """Write one subject's goals and parsed plans to an open text handle.

    Args:
        subj_id: value matched against the PROLIFIC_PID column.
        data_df: frame providing PROLIFIC_PID, prompt and response columns.
        f: writable text handle; a 'PID' header is written first, then one
           Goal/Plan entry per matching row, in row order.
    """
    rows_for_subj = data_df[data_df["PROLIFIC_PID"] == subj_id]
    f.write(f'\nPID: {subj_id}\n')
    goal_plan_pairs = zip(rows_for_subj["prompt"], rows_for_subj["response"])
    for goal, raw_plan in goal_plan_pairs:
        f.write(f'\n\tGoal: {goal}\n\tPlan: {parse_plan(raw_plan)}\n')
    

# Sort the ids so subjects appear in the same order on every run.
subj_ids = sorted(all_subjs)
filepth = f"{save_dir}/goals_per_subj.txt"

# The context manager closes the handle even if writing a subject fails;
# the explicit encoding keeps free-text plans from depending on the
# platform's default codec.
with open(filepth, 'w', encoding="utf-8") as f:
    f.write("Generated Plans per Subject\n")
    for subj_id in subj_ids:
        save_subj_data(subj_id, generated_plans_df, f)

In [12]:
# Build goal -> list of parsed plans, keeping a reproducible random subset of
# at most n_keep plans per goal.
random.seed(10)
parsed_plans_per_goal = {}
all_goals = sorted(set(generated_plans_df.prompt))
n_keep = 10
for full_goal in all_goals:
    goal_df = generated_plans_df.loc[generated_plans_df.prompt == full_goal]
    parsed_plans = [parse_plan(raw_plan) for raw_plan in goal_df.response]
    # flag goals that have fewer plans than we want to keep
    if len(parsed_plans) < n_keep:
        print(full_goal, len(parsed_plans))
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the sample size; goals with >= n_keep plans are sampled as before
    parsed_plans_per_goal[full_goal] = random.sample(
        parsed_plans, min(n_keep, len(parsed_plans))
    )

In [13]:
# save out plans per goal (aggregate over subjects)

def save_goal_data(goal, data_df, f):
    """Write every parsed plan recorded for one goal to an open text handle.

    Args:
        goal: value matched against the prompt column.
        data_df: frame providing prompt and response columns.
        f: writable text handle; a 'Goal' header is written first, then one
           'Plan' entry per matching row.

    Prints the goal and its plan count when fewer than 10 plans are found.
    """
    # extract data only corresponding to specific goal
    goal_df = data_df.loc[data_df.prompt == goal]
    f.write(f'\nGoal: {goal}\n')
    if len(goal_df.response) < 10:
        print(goal, len(goal_df.response))
    for raw_plan in goal_df.response:
        plan = parse_plan(raw_plan)
        f.write(f'\n\tPlan: {plan}\n')
        
filepth = f"{save_dir}/plans_per_goal.txt"
full_goals = sorted(parsed_plans_per_goal.keys())

# The context manager closes the handle even if a write fails part-way;
# the explicit encoding keeps free-text plans from depending on the
# platform's default codec.
with open(filepth, 'w', encoding="utf-8") as f:
    f.write("Generated Plans per Goal\n")
    for goal in full_goals:
        f.write(f'\nGoal: {goal}\n')
        for plan in parsed_plans_per_goal[goal]:
            f.write(f'\n\tPlan: {plan}\n')


In [14]:
# GPT-2 tokenizer, used here only to measure how long each plan is.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# One token count per sampled plan, pooled across all goals.
n_tokens = []
tot_plans = 0
for goal, plans in parsed_plans_per_goal.items():
    for plan in plans:
        n_tokens.append(len(tokenizer(plan)['input_ids']))

print(f"Mean num tokens: {np.mean(n_tokens)}")
print(f"  Min num tokens: {np.min(n_tokens)}")
print(f"  Max num tokens: {np.max(n_tokens)}")

Mean num tokens: 49.53214285714286
  Min num tokens: 5
  Max num tokens: 215


In [None]:
def parse_rating(rating_str):
    """Turn a stored rating string into an int.

    The data is saved as strings, so the rating is cut out of the text:
    whatever sits after the last ':' and before the next '}' is converted
    with int().

    Args:
        rating_str: the response string as stored in the data.

    Returns:
        The rating as an int.
    """
    after_last_colon = rating_str.rpartition(":")[2]
    digits = after_last_colon.partition("}")[0]
    return int(digits)

# Work out which goal each rating belongs to.
# NOTE: hacky, because the rating trials did not save the goal. The ratings
# are assumed to come in the same order as the goals in the generation phase,
# so the position of a rating is mapped back to the goal at that position.
goal_order_per_subj = {}
for subj_id in all_subjs:
    subj_df = generated_plans_df.loc[generated_plans_df["PROLIFIC_PID"] == subj_id]
    # position within the subject's generation trials -> goal text
    idx2goal = dict(enumerate(subj_df["prompt"]))
    goal_order_per_subj[subj_id] = idx2goal

# goal text -> list of integer ratings pooled over subjects
ratings_per_goal = {}

for subj_id in all_subjs:
    subj_df = goal_ratings_df.loc[goal_ratings_df["PROLIFIC_PID"] == subj_id]
    for idx, rating_str in enumerate(subj_df["response"]):
        # goal shown at this position in phase 1
        goal_name = goal_order_per_subj[subj_id][idx]
        # rating string => int
        rating = parse_rating(rating_str)
        ratings_per_goal.setdefault(goal_name, []).append(rating)


In [None]:
# create one page w/ ratings per condition
# (path of the multi-page PDF written inside save_dir)
pdf_pth = f'{save_dir}/ratings_per_goal.pdf'

def get_ratings_count(rating_data, num_options=7):
    """Tally ratings into a frame that is ready to plot.

    Ratings come in 0-based (0 .. num_options - 1) and are reported on the
    1-based scale: a rating value k is counted in the row whose "rating" is
    k + 1.

    Args:
        rating_data: iterable of integer ratings in 0 .. num_options - 1.
        num_options: number of points on the rating scale.

    Returns:
        DataFrame with a "rating" column (1 .. num_options) and an integer
        "counts" column.

    Raises:
        KeyError: if a rating lies outside 0 .. num_options - 1.
    """
    # Tally in a plain array and build the frame once. Incrementing through
    # chained indexing (frame["counts"][k] += 1) writes to a temporary object
    # and is silently lost when pandas copy-on-write is active.
    counts = np.zeros(num_options, dtype=int)
    for rating_val in rating_data:
        # explicit range check: a negative index would otherwise wrap around
        if not 0 <= rating_val < num_options:
            raise KeyError(rating_val)
        counts[rating_val] += 1
    return pd.DataFrame({
        "rating": list(range(1, num_options + 1)),
        "counts": counts,
    })

# Grid size: enough rows to hold the goals of the largest condition, so every
# page fits even if the conditions differ in size.
num_per_condition = max(len(goal_batch) for goal_batch in goals)
cols = 2
leftover = num_per_condition % cols
rows = (num_per_condition // cols) + (1 if leftover != 0 else 0)

# rating scale parameters
num_options = 7
# shared y-limit, based on most participants in any condition
max_y = np.max(list(map(lambda v: len(v), subjs_per_condition.values())))

# One PDF page per condition, one bar chart per goal.
with PdfPages(pdf_pth) as pdf:
    for goal_batch in goals:
        f = plt.figure(figsize=(14, 8), dpi=600)
        gs0 = gridspec.GridSpec(rows, cols, figure=f,
                               wspace=0.1, hspace=1.0)
        for idx, goal_obj in enumerate(goal_batch):
            goal_name = goal_obj["goal"]
            # fill the grid left to right, top to bottom
            current_row, current_col = divmod(idx, cols)
            ax = f.add_subplot(gs0[current_row, current_col])
            rating_data = ratings_per_goal[goal_name]
            rating_counts = get_ratings_count(rating_data, num_options=num_options)
            # x / y must be passed by keyword (recent seaborn rejects them
            # positionally); ax= draws on this subplot instead of relying on
            # whichever axes happens to be current
            sns.barplot(x=rating_counts["rating"], y=rating_counts["counts"], ax=ax)
            ax.set_ylim([0, max_y])
            ax.set_yticks([0, 2, 4, 6])
            ax.set_title(f"{goal_name}", fontsize=12)
            # x label only on the bottom row, y label only on the first column
            if current_row == rows-1:
                ax.set_xlabel("Frequency Rating (1 = most freq)", fontsize=10)
            else:
                ax.set_xlabel("")
            if current_col == 0:
                ax.set_ylabel("Num Subjs", fontsize=10)
            else:
                ax.set_ylabel("")
        # save this exact figure, then release it so the high-dpi figures do
        # not pile up in memory across pages
        pdf.savefig(f)
        plt.close(f)

In [None]:
# key = goal, value = subjs who responded to that goal
subjs_per_goal = {}
for subj_id, data in goal_order_per_subj.items():
    # de-duplicate so a subject is listed at most once per goal
    subj_goals = set(data.values())
    for goal_name in subj_goals:
        subjs_per_goal.setdefault(goal_name, []).append(subj_id)

# spot-check one goal (shown as the cell's output)
subjs_per_goal['Make a fort underwater.']

In [None]:
# analyzing ratings for specific goals 

def analyze_ratings(goal):
    """Print the ratings for one goal, then all ratings of each subject who saw it.

    Reads the notebook-level ratings_per_goal, subjs_per_goal, goal_ratings_df
    and goal_order_per_subj; output goes to stdout only.

    Args:
        goal: goal text, used as the key into ratings_per_goal and
              subjs_per_goal.
    """
    # f-prefix is required here; without it the text "{goal}" is printed verbatim
    print(f"Ratings for {goal}: ", ratings_per_goal[goal])
    for subj_id in subjs_per_goal[goal]:
        print(f"Subj ID: {subj_id}")
        # inspect all goal ratings for subj for consistency
        subj_df = goal_ratings_df.loc[goal_ratings_df.PROLIFIC_PID == subj_id]
        for idx, rating in enumerate(subj_df.response):
            print(f"\tGoal: {goal_order_per_subj[subj_id][idx]}, Rating: {parse_rating(rating)}")

# Goal to inspect; the text is used as a dictionary key inside analyze_ratings.
goal = "Make a fort underwater."
analyze_ratings(goal)