# Analyzing Goodness Ratings from Planning Data

In [9]:
import numpy as np
import pandas as pd
import json 
import os
import sys
import random
import math
import matplotlib.pylab as plt 
import seaborn as sns
import scipy.stats as stats
from matplotlib.font_manager import FontProperties
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.gridspec as gridspec
import itertools

In [10]:
def parse_rating(rating_str):
    """Pull the integer rating out of a stringified response dictionary.

    cognition.run saves all data as strings, so the response arrives as text
    rather than as a dict. The rating is taken to be the text after the last
    ":" and before the first "}" that follows it.

    Returns:
        The rating converted to an int.
    """
    _, _, after_last_colon = rating_str.rpartition(":")
    value_text, _, _ = after_last_colon.partition("}")
    return int(value_text)

# Output directory used by the cells below that write text files and the PDF.
save_dir = "./results_planning/"
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check (`os` comes from the imports cell at the top).
os.makedirs(save_dir, exist_ok=True)

In [None]:
# Load the raw goodness-rating export.
data_pth = "../data/goodness_ratings/human_rated_goodness.csv"
df = pd.read_csv(data_pth)
# unique participant ids found in the export
all_subjs = set(df["PROLIFIC_PID"])

print(f"Num subjs:  {len(all_subjs)} , Num rows:  {len(df)}")

# Load the typicality ratings; only the goal text and its average rating are read.
rating_col = "Avg Rating"
typicality_data = pd.read_csv(
    "../typicality_ratings.csv",
    usecols=["Original Goals", rating_col],
)



In [12]:
# Keep only the rating trials, i.e. rows whose task is "rate_goodness".
rating_df = df[df.task == "rate_goodness"].reset_index()
# gather all goals
all_goals = set(rating_df.prompt)
print("Num goals: ", len(all_goals), " rating df: ", len(rating_df))

# Keep only the first rating for people who saw a duplicate plan (due to
# accidental repeated planner). One vectorized mask replaces dropping rows one
# at a time inside a per-subject loop, which re-filtered and copied the frame
# on every iteration.
repeat_mask = rating_df.duplicated(subset=["PROLIFIC_PID", "plan"], keep="first")
# Rows with a missing participant id are left untouched: they cannot be
# attributed to a single participant, so they are not treated as repeats.
repeat_mask &= rating_df["PROLIFIC_PID"].notna()
# drop() keeps the original index labels of the surviving rows.
rating_df = rating_df.drop(index=rating_df.index[repeat_mask])
rating_df.columns

Num goals:  84  rating df:  11600


Index(['level_0', 'index', 'run_id', 'condition', 'view_history', 'rt',
       'trial_type', 'trial_index', 'time_elapsed', 'internal_node_id',
       'PROLIFIC_PID', 'STUDY_ID', 'SESSION_ID', 'subject_id', 'study_id',
       'session_id', 'response', 'question_order', 'prompt', 'task', 'subj_id',
       'plan', 'goal_type', 'stimulus', 'recorded_at', 'ip', 'user_agent',
       'device', 'browser', 'browser_version', 'platform', 'platform_version',
       'source_code_version'],
      dtype='object')

## Extract mean, median, and all goodness ratings for GPT-3 and Human for each Constraint Level

In [13]:
# Display the distinct values of the goal_type column in the rating rows.
set(rating_df.goal_type)

{'constrained_many', 'constrained_single', 'unconstrained'}

In [14]:
# Order the goals by ascending average rating, then cut the list in half.
# NOTE(review): the first half is named "most typical", so a lower
# "Avg Rating" presumably means a more typical goal -- confirm the scale.
typicality_data = typicality_data.sort_values(by=rating_col)
sorted_goals = typicality_data["Original Goals"].tolist()

midpoint = len(sorted_goals) // 2
most_typical, least_typical = sorted_goals[:midpoint], sorted_goals[midpoint:]

In [15]:
# Summarise goodness ratings per source (GPT-3 / human) and constraint level.
# Produces:
#   rating_df["goodness_rating"] - one numeric rating per response row
#   rating_data[source][constraint] - {'mean', 'median', 'ratings'}
rating_data = {}

sources = ["gpt-3", "human"]
constraints = ['unconstrained', 'constrained_single', 'constrained_many']

# When True, ratings are z-scored within each participant before summarising.
z_score = False

# Each response is a JSON string with a "goodness" entry; 1 is added to it
# (presumably stored 0-based, since the plots report a 1-7 scale -- confirm).
raw_ratings = rating_df["response"].map(
    lambda rating_str: json.loads(rating_str)['goodness'] + 1
)

if z_score:
    # normalize w/in subjs (okay to assume b/c assumed large enough for representative sample)
    scaled_dfs = []
    for subj_id in all_subjs:
        subj_mask = rating_df.PROLIFIC_PID == subj_id
        if not subj_mask.any():
            # participant has no rating rows; nothing to scale
            continue
        scaled_ratings = stats.zscore(raw_ratings[subj_mask].to_numpy())
        # .assign builds a new frame instead of writing a column onto a
        # filtered slice of rating_df (which triggers SettingWithCopyWarning).
        scaled_dfs.append(rating_df[subj_mask].assign(goodness_rating=scaled_ratings))
    # Rows end up grouped by participant with a fresh 0..n-1 index.
    rating_df = pd.concat(scaled_dfs, ignore_index=True)
else:
    rating_df["goodness_rating"] = raw_ratings

for source in sources:
    df_subset = rating_df[rating_df.subj_id == source]
    goal_rating_data = {}
    for constraint in constraints:
        constraint_df = df_subset[df_subset.goal_type == constraint]
        ratings = list(constraint_df["goodness_rating"])
        goal_rating_data[constraint] = {
            'mean': np.mean(ratings),
            'median': np.median(ratings),
            'ratings': ratings,
        }

    rating_data[source] = goal_rating_data




In [16]:
# Display the two halves of the rating-sorted goal list.
most_typical, least_typical

(['Clean the dirty dishes.',
  'Keep the plants in your garden alive.',
  'Fix a flat tire. ',
  'Take your dog for a walk.',
  'Bake a cake.',
  'Cut down a tree. ',
  "Order food in a restaurant, where you don't speak the native language.",
  'Build a bookshelf. ',
  'Paint a fence. ',
  'Play baseball with your friends. ',
  'Help your local town mayor win re-election. ',
  'Put out a fire that started near the wires from your TV.',
  'Find out how many sharks live in a cove off a nearby coast. ',
  'Cool down in a record-breaking heat wave.'],
 ['Create a safe landing for a falling skydiver.',
  'Protect the deer in your nearby woods during hunting season.',
  'Stop your canoe from falling down the waterfall. ',
  'Make a pair of new shoes. ',
  "Build a float to dazzle the crowd at the Macy's Day Parade.",
  "Decorate the world's largest Christmas tree.",
  'Escape from inside a locked custodial closet. ',
  'Get your sofa onto the roof of your house.',
  'Jump over a six foot tal

In [17]:
# Get summary scores for each generated plan.
# Produces:
#   grouped_data[source][constraint] - list of per-plan summary dicts
#   data_per_goal[source][prompt]    - the same dicts, keyed by prompt text
#   num_generations[source]          - number of distinct plans per source
#   stimuli_per_cond[constraint]     - sorted prompts, taken from the first
#                                      source that has any for that constraint
grouped_data = {}
num_generations = {source: 0 for source in sources}
stimuli_per_cond = {constraint: [] for constraint in constraints}
data_per_goal = {source: {} for source in sources}

for source in sources:
    source_rating_data = {constraint: [] for constraint in constraints}
    df_subset = rating_df[rating_df.subj_id == source]
    for constraint in constraints:
        constraint_df = df_subset[df_subset.goal_type == constraint]

        if len(stimuli_per_cond[constraint]) == 0:
            stimuli_per_cond[constraint] = sorted(set(constraint_df.prompt))

        # One sorted groupby pass replaces iterating over set(plan) and
        # re-filtering the frame for every plan. Set iteration order varies
        # between runs, which changed the tie order of the sorted listings
        # built from these lists; groupby yields plans in sorted order.
        for generation, generation_df in constraint_df.groupby("plan", sort=True):
            num_generations[source] += 1
            prompt = generation_df["prompt"].iloc[0]  # same prompt for all rows of a plan, so just take the first
            # Parsed from the raw response, not from the goodness_rating
            # column, so these values are unaffected by the z_score option.
            ratings = [json.loads(rating_str)['goodness'] + 1 for rating_str in generation_df.response]

            obj = {
                'generation': generation,
                'stimuli': prompt,
                'mean': np.mean(ratings),
                'median': np.median(ratings),
                'ratings': ratings,
                "constraint": constraint,
            }

            source_rating_data[constraint].append(obj)
            data_per_goal[source].setdefault(prompt, []).append(obj)
    grouped_data[source] = source_rating_data

In [18]:
sample_goal = "Clean the dirty"
# For every prompt containing sample_goal, collect its plans as
# (mean rating, plan text) pairs sorted by ascending mean, and print the one
# in the middle of that ordering.
sample_data = {constraint: {} for constraint in constraints}
for source in sources:
    for goal, goal_data in data_per_goal[source].items():
        if sample_goal not in goal:
            continue
        constraint = goal_data[0]["constraint"]

        print(f"Constraint: {constraint}, Source: {source} \n{goal}")
        means = sorted(
            ((plan_data["mean"], plan_data["generation"]) for plan_data in goal_data),
            key=lambda pair: pair[0],
        )

        print("\n\tMiddle: ", means[len(means) // 2], "\n")

        sample_data[constraint][source] = means



Constraint: unconstrained, Source: gpt-3 
Goal: Clean the dirty dishes.

	Middle:  (5.0, 'I would first check to see what is the dirtiest. I would then find the appropriate tool for cleaning that item. I would then clean the dishes. I would then put the dishes away.') 

Constraint: constrained_single, Source: gpt-3 
Goal: Clean the dirty dishes, without using soap.

	Middle:  (4.0, 'I would first try to use a sponge to clean them.') 

Constraint: constrained_many, Source: gpt-3 
Goal: Clean the dirty dishes, without using soap, water, a tub, sanitizer, a sponge, detergent, a counter, or a dishwasher.

	Middle:  (2.25, 'I would use a sponge to make a soap substitute. I would make a mixture of water and baking soda. I would take a cup of baking soda and mix it with a cup of water. I would put it in a spray bottle and spray it on the dishes.') 

Constraint: unconstrained, Source: human 
Goal: Clean the dirty dishes.

	Middle:  (5.166666666666667, 'wash all dirty dishes.') 

Constraint: co

In [33]:
# Display the sorted (mean rating, plan text) pairs stored in sample_data for
# the chosen source and constraint level.
source = "gpt-3"
constraint = "unconstrained"
sample_data[constraint][source]

[(3.6666666666666665,
  'I would first make sure I had the right cleaning supplies.'),
 (4.0,
  'I would ask my mom if I could help her with the dishes. I would ask her if she wanted me to dry them or put them away. If she said dry them I would dry them. If she said put them away I would put them away.'),
 (4.0,
  'I would break down the dirty dishes into categories. I would start with the cups and bowls. I would fill the sink with warm soapy water. I would wash each cup and bowl one at a time. I would then rinse each cup and bowl one at a time. I would then put each cup and bowl one at a time into the dishwasher. I would repeat the process until all the dishes are clean.'),
 (4.666666666666667,
  'I would first put the dishes in the dishwasher and run it.'),
 (4.666666666666667,
  'I would start by finding a dish I can wash. Then I would get a sponge and soap and water and I would wash the dish.'),
 (4.666666666666667,
  'I would first put the dirty dishes in a pile and then I would p

In [34]:
# Goal substring matched against prompts by the sampling cell that follows;
# the trailing comment keeps an alternative goal to swap in by hand.
sample_goal = "Get your sofa onto the roof of your house"#"Cool down in a record-breaking heat wave"

In [44]:
# Extract generations for specific representative stimuli: for every prompt
# containing sample_goal, sort its plans by ascending mean rating and print
# the plans around the middle of that ordering (up to 4 on each side).
sample_data = {constraint: {} for constraint in constraints}
for source in sources:
    for goal, goal_data in data_per_goal[source].items():
        if sample_goal in goal:
            constraint = goal_data[0]["constraint"]

            print(f"\nConstraint: {constraint}, Source: {source} \n{goal}")
            # sort generations by mean rating
            means = [(plan_data["mean"], plan_data["generation"]) for plan_data in goal_data]
            means = sorted(means, key=lambda x: x[0])

            middle = int(len(means)/2)
            # Clamp the start at 0: with fewer than 8 plans, middle-4 is
            # negative and the slice would count from the end of the list,
            # returning the wrong (often empty) window.
            samples = means[max(0, middle-4):middle+4]
            for sample in samples:
                print("\t",sample)

            sample_data[constraint][source] = means




Constraint: unconstrained, Source: gpt-3 
Goal: Get your sofa onto the roof of your house.
	 (3.25, 'I would firstly make a plan on how to get my sofa to the roof. Secondly,I will put my sofa on a truck and take it to the top of my house.')
	 (3.6666666666666665, 'I would start by getting a very strong ladder and a very strong friend. I would then use the ladder to climb onto the roof and then I would use the sofa and my friend to get it up there. I would do this by placing the sofa on top of the roof and then using my friend to help me lift the sofa onto the roof.')
	 (3.6666666666666665, 'I would get a ladder and try to get it up there.')
	 (3.75, 'I would first have to measure my front door and the width of my sofa. I would then have to find a way to lift the sofa up to my roof.')
	 (3.75, 'I would first try to determine if it was possible to get the sofa up there. If it was, I would then try to figure out how to lift it up there.')
	 (4.25, 'I would get a strong rope and tie it to

In [None]:
# Display all sorted (mean rating, plan text) pairs for the chosen source and
# constraint, followed by the pairs at positions 4 and 5 of that list.
# NOTE(review): the next cell contains exactly the same code.
source = "human"
constraint = "unconstrained"
sample_data[constraint][source], sample_data[constraint][source][4:6]#[9:11]

In [37]:
# Display all sorted (mean rating, plan text) pairs for the chosen source and
# constraint, followed by the pairs at positions 4 and 5 of that list.
# NOTE(review): the previous cell contains exactly the same code.
source = "human"
constraint = "unconstrained"
sample_data[constraint][source], sample_data[constraint][source][4:6]#[9:11]

([(3.857142857142857,
   'I get a ladder and some persons climb up while others are down and a rope is attached to the sofa to drag it up.'),
  (3.857142857142857,
   'this will not be easy to be honest but the i will build a stairs to the roof of the house from which i will gather men together with myself and then we all pull or push the sofa to the roof.'),
  (4.0,
   "I'm gonna call someone to fix my sofa onto the roof of my house very soon."),
  (4.142857142857143,
   '1) cover the sofa in a protective layer, such as plastic. 2) move sofa to the outside of the house. 3) have someone climb onto the roof using a ladder. 4) using that same ladder, move the sofa, with the help of the person on the roof, onto the roof.'),
  (4.428571428571429,
   'The easiest way would be a crane but it would be costly.'),
  (4.571428571428571,
   'To get a sofa onto the roof of the house, there could be several options to achieve this task. This would depend on tall the house was. This could require di

In [41]:
# Display the sorted (mean rating, plan text) pairs for human plans under the
# "constrained_many" level.
# NOTE(review): the saved output lists plans about cooling down, not the sofa
# goal assigned to sample_goal above -- it looks stale; re-run after the
# sampling cell to refresh it.
sample_data["constrained_many"]["human"]

[(3.142857142857143,
  'I will fill the bathtub with cold water and dump some ice cubes in there. I will also put the fan next to the tub and turn it on. I will jump into the ice cold tub and eat a popsicle while at it. After my tub I will drink cold bottled water for the rest of the day.'),
 (4.0,
  'use a dehumidifier in the home to increase evaporation from your skin to cool down.'),
 (4.142857142857143,
  'I will be inside of my house in the shade. I will install some heat blocking shades in my windows. I will then strip down naked, cover my body in cooling lotion and than lay down on the tile floor.'),
 (4.285714285714286,
  'I would wet shirts and bath towels with warm water and place them into a refrigerator. Eventually they will become cool and I can put the shirts on and use the towels to cool down even more.'),
 (4.571428571428571,
  'You would find a tree with a good shade and sit underneath it and wait for a cool breeze to come by.'),
 (4.714285714285714,
  'To cool down in

In [None]:
# Display the number of distinct plans counted for each source.
num_generations

In [None]:
# Save out the worst generations: for each source, write every plan whose
# `stat` value is below `threshold` to <save_dir>/bad_plans_<source>.txt and
# print the percentage of that source's plans that qualified.
stat = "mean"
threshold = 2
for source in sources:
    filepth = f"{save_dir}/bad_plans_{source}.txt"
    count = 0
    # `with` closes (and flushes) the file even if a write fails; the handle
    # was previously left open.
    with open(filepth, 'w') as f:
        # header is derived from `threshold` so the two cannot drift apart
        f.write(f"Worst Plans (Mean Likert < {threshold})\n")
        for constraint in constraints:
            f.write(f"\nConstraint Type: {constraint} \n")
            for datum in grouped_data[source][constraint]:
                rating = datum[stat]
                if rating < threshold:
                    f.write(f"\n{datum['stimuli']}")
                    f.write(f"\tGeneration: {datum['generation']}")
                    f.write(f"\tMedian Rating: {datum['median']}")
                    # this line was previously also labelled "Median Rating"
                    f.write(f"\tMean Rating: {datum['mean']}")
                    f.write(f"\tAll Ratings: {datum['ratings']}")
                    count += 1
    # avoid ZeroDivisionError when a source has no plans at all
    total = num_generations[source]
    pct_bad = round((count/total)*100, 2) if total else 0.0
    print(f"{source}, {pct_bad}% bad")

In [None]:
# Save out the best generations: for each source, write every plan whose
# `stat` value is above `threshold` to <save_dir>/good_plans_<source>.txt and
# print the percentage of that source's plans that qualified.
stat = "mean"
threshold = 6
for source in sources:
    filepth = f"{save_dir}/good_plans_{source}.txt"
    count = 0
    # `with` closes (and flushes) the file even if a write fails; the handle
    # was previously left open.
    with open(filepth, 'w') as f:
        # header is derived from `threshold` so the two cannot drift apart
        f.write(f"Best Plans (Mean Likert > {threshold})\n")
        for constraint in constraints:
            f.write(f"\nConstraint Type: {constraint} \n")
            for datum in grouped_data[source][constraint]:
                rating = datum[stat]
                if rating > threshold:
                    f.write(f"\n{datum['stimuli']}")
                    f.write(f"\tGeneration: {datum['generation']}")
                    f.write(f"\tMedian Rating: {datum['median']}")
                    # this line was previously also labelled "Median Rating"
                    f.write(f"\tMean Rating: {datum['mean']}")
                    f.write(f"\tAll Ratings: {datum['ratings']}")
                    count += 1
    # avoid ZeroDivisionError when a source has no plans at all
    total = num_generations[source]
    pct_good = round((count/total)*100, 2) if total else 0.0
    print(f"{source}, {pct_good}% good")

In [None]:
# # save out the best and worst generations
# stat = "mean"
# threshold = 2
# for source in sources: 
#     count = 0
#     for constraint in constraints: 
#         print(f"\nConstraint Type: {constraint} \n")
#         for datum in grouped_data[source][constraint]: 
#             rating = datum[stat]
#             if rating < threshold: 
#                 print("\n" + datum["stimuli"])
#                 print("\tGeneration: ", datum["generation"])
#                 print("\tMedian Rating: ", datum["median"])
#                 print("\tMedian Rating: ", datum["mean"])
#                 print("\tAll Ratings: ", datum["ratings"])
#                 count += 1
#     print(f"{source} proportion bad: {(count/num_generations[source])*100}")

In [None]:
# Box plot of every individual rating, grouped by constraint level and source.
ratings = []
for constraint in constraints:
    for source in grouped_data:
        for generation_data in grouped_data[source][constraint]:
            for r in generation_data["ratings"]:
                ratings.append([constraint, source, r])

# Use a dedicated name for the plotting frame: assigning it to `rating_df`
# replaced the cleaned per-response frame, so earlier cells could no longer
# be re-run without reloading the data.
all_ratings_plot_df = pd.DataFrame(ratings, columns = ['Number of Constraints', 'Subject ID', 'Rating'])

fig, ax = plt.subplots(figsize=(8,6))
sns.boxplot(x='Number of Constraints', y='Rating', hue='Subject ID', data=all_ratings_plot_df, ax=ax)
ax.set(ylabel='Goodness Rating Across Goals (1-7)', title='Plan Goodness Rating vs Number of Constraints')
# place the legend outside the axes, to the right; trailing ";" hides the repr
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);
                                           

In [None]:
# Box plot of the per-plan mean rating, grouped by constraint level and source.
ratings = []
for constraint in constraints:
    for source in grouped_data:
        # keep this as a plain loop: a later cell reads the leftover
        # `generation_data` loop variable
        for generation_data in grouped_data[source][constraint]:
            ratings.append([constraint, source, generation_data["mean"]])

# Use a dedicated name for the plotting frame: assigning it to `rating_df`
# replaced the cleaned per-response frame, so earlier cells could no longer
# be re-run without reloading the data.
mean_ratings_plot_df = pd.DataFrame(ratings, columns = ['Number of Constraints', 'Subject ID', 'Rating'])

fig, ax = plt.subplots(figsize=(8,6))
sns.boxplot(x='Number of Constraints', y='Rating', hue='Subject ID', data=mean_ratings_plot_df, ax=ax)
ax.set(ylabel='Goodness Rating Across Goals (1-7)', title='Plan Goodness Rating vs Number of Constraints')
# place the legend outside the axes, to the right; trailing ";" hides the repr
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);
                                           

In [None]:
# Display the sorted prompts recorded for the "unconstrained" level.
stimuli_per_cond["unconstrained"]

In [None]:
# np.array(sorted_goals).reshape([int(n_stimuli/(rows * cols)), rows, cols])

In [None]:
# NOTE(review): `generation_data` is not defined in this cell; it is the loop
# variable left over from the last iteration of an earlier plotting loop, so
# this only works after one of those cells has run.
generation_data["stimuli"]

In [None]:
# Plot goodness broken down by stimuli: one box plot per goal, rows * cols
# goals per page, every page written to a single multi-page PDF.
cols = 2
rows = 2
# 28 stimuli in total - show 4 per page; the count must be a multiple of
# rows * cols or the reshape below raises ValueError.
n_stimuli = len(sorted_goals)

# based on likert scale
max_y = 7 # showing rating on a scale of 1 to 7

pdf_pth = f'{save_dir}/per_stimuli_goodness_plans_all.pdf'

# one (rows, cols) grid of goal names per page
stims = np.array(sorted_goals).reshape([n_stimuli // (rows * cols), rows, cols])

# True: plot one point per plan (its mean); False: plot every individual rating
show_mean = False

with PdfPages(pdf_pth) as pdf:
    for stim_batch in stims:
        f = plt.figure(figsize=(14, 8), dpi=600)
        gs0 = gridspec.GridSpec(rows, cols, figure=f,
                               wspace=0.2, hspace=0.3)
        for i in range(rows):
            for j in range(cols):
                stim_name = stim_batch[i][j]
                # text before the first "." is used as the portion shared by
                # the prompts of all goal types
                stim_fragment = stim_name.split(".")[0]
                ax = f.add_subplot(gs0[i, j])

                ratings = []
                for constraint in constraints:
                    for source in grouped_data:
                        for generation_data in grouped_data[source][constraint]:
                            if stim_fragment in generation_data["stimuli"]:
                                if show_mean:
                                    ratings.append([constraint, source, generation_data["mean"]])
                                else:
                                    for k in generation_data["ratings"]:
                                        ratings.append([constraint, source, k])

                # Dedicated name: assigning this to `rating_df` replaced the
                # cleaned per-response frame that earlier cells depend on.
                stim_plot_df = pd.DataFrame(ratings, columns = ['Number of Constraints', 'Subject ID', 'Rating'])

                sns.boxplot(x='Number of Constraints', y='Rating', hue='Subject ID', data=stim_plot_df,
                                ax=ax)
                ax.set_ylim([0, max_y])
                ax.set_yticks([1,2,3,4,5,6,7])
                ax.set_title(stim_fragment, fontsize=10)

                # Labels now match what is plotted: constraint level on x and
                # the rating on y (they previously put the rating scale on x
                # and "Median Rating" on y).
                ax.set_xlabel("Number of Constraints", fontsize=10)
                y_prefix = "Mean " if show_mean else ""
                ax.set_ylabel(f"{y_prefix}Plan Goodness (1=Worst, 7=Best)", fontsize=10)

        # save this page explicitly, then release the figure so the
        # high-dpi figures do not accumulate in memory
        pdf.savefig(f)
        plt.close(f)

In [None]:
# NOTE(review): `stim_batch` is the loop variable left over from the PDF
# plotting cell above, i.e. the last page of goal names; it only exists after
# that cell has run.
stim_batch

In [None]:
# Display every per-plan summary dict for human plans under the
# "constrained_single" level.
# NOTE(review): this dumps the whole list; consider slicing (e.g. [:5]).
grouped_data["human"]["constrained_single"]