In [None]:
import csv
import math
import os
import pickle
import random
import sys
from random import randint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import statsmodels
from PIL import Image
from scipy import stats
from scipy.stats import ttest_ind, ttest_rel
from sklearn.metrics import classification_report
from statsmodels.stats import multitest
from tabulate import tabulate
from tqdm import tqdm

"""
get_each_time_taken returns the list of times taken for each task j, e.g. [0, 10, 200, 20, ...].
It is a helper for the code cells that parse the study data.
"""


def get_each_time_taken(response, time_string, max_gap_ms=120000):
    """Return the per-task durations derived from consecutive timestamps.

    Args:
        response: Mapping that holds a sequence of timestamps under
            ``time_string``.
        time_string: Key of the timestamp sequence inside ``response``.
        max_gap_ms: Exclusive upper bound on a single duration. Gaps at or
            above it are dropped (the participant presumably left mid-task
            and came back). The default of 120000 matches the previous
            hard-coded two-minute cutoff, assuming timestamps are in
            milliseconds -- TODO confirm the unit against the data export.

    Returns:
        A list with the differences between consecutive timestamps that are
        below ``max_gap_ms``. The first timestamp has no predecessor, so at
        most ``len(timestamps) - 1`` durations are returned; an empty or
        single-element sequence yields an empty list.
    """
    raw_times = response[time_string]
    # Pair every timestamp with its successor to get one gap per task.
    gaps = (later - earlier for earlier, later in zip(raw_times, raw_times[1:]))
    return [gap for gap in gaps if gap < max_gap_ms]


def time_add_to_table(times, col_num):
    """Write (mean, standard error) of ``times`` into ``time_row[col_num]``.

    Every value is divided by 1000 first (presumably ms -> s; confirm the
    unit of the raw timestamps). The second tuple entry is the population
    standard deviation divided by sqrt(n), i.e. the standard error of the
    mean. Both numbers are rounded to 3 decimals. ``time_row`` is the
    module-level table row that is mutated in place.
    """
    seconds = [value / 1000 for value in times]
    average = np.mean(seconds)
    std_error = np.std(seconds) / np.sqrt(len(seconds))
    time_row[col_num] = (round(average, 3), round(std_error, 3))


def ai_rel_add_to_table(ai_rel, col_num):
    """Write (mean, standard error) of ``ai_rel`` into ``ai_reliance_row[col_num]``.

    The standard error is the population standard deviation divided by
    sqrt(n). Both numbers are rounded to 3 decimals. ``ai_reliance_row`` is
    the module-level table row that is mutated in place.
    """
    average = np.mean(ai_rel)
    std_error = np.std(ai_rel) / np.sqrt(len(ai_rel))
    ai_reliance_row[col_num] = (round(average, 3), round(std_error, 3))


def mean_std_add_to_table(accuracies, col_num):
    """Write (mean, standard error) of ``accuracies`` into ``mean_row[col_num]``.

    Despite the function name, the second value is the population standard
    deviation divided by sqrt(n), i.e. the standard error of the mean. Both
    numbers are rounded to 3 decimals. ``mean_row`` is the module-level
    table row that is mutated in place.
    """
    average = np.mean(accuracies)
    std_error = np.std(accuracies) / np.sqrt(len(accuracies))
    mean_row[col_num] = (round(average, 3), round(std_error, 3))


def p_t_add_to_table(accuracies, col_num):
    """Write the independent t-test of ``accuracies`` against the Human+AI accuracies.

    The reference sample is the module-level ``accuracies_hum_ai_1``. The
    result is stored in ``p_row[col_num]`` as (p-value, t-statistic), each
    rounded to 3 decimals.
    """
    statistic, p_value = ttest_ind(accuracies, accuracies_hum_ai_1)
    p_row[col_num] = (round(p_value, 3), round(statistic, 3))

In [None]:
# Load the pickled study data (a mapping with a "tasks" entry and one entry per condition).
# NOTE: pickle.load can execute arbitrary code; only load this file from a trusted source.
with open("../data/bdd_study_data.pkl", "rb") as data_file:
    data = pickle.load(data_file)

In [None]:
# One list of AI accuracies is appended per study condition by the cells below.
# Re-run this cell before re-running those cells, otherwise entries accumulate.
accuracies_ai_all = []

In [None]:
# Task definitions; the per-condition loops look them up by task id.
tasks = data["tasks"]

# Human+AI baseline


In [None]:
# Participant responses for the "human+ai baseline" condition.
responses = data["human+ai baseline"]

In [None]:
from sklearn.metrics import accuracy_score

# Per-participant metrics for the "human+ai baseline" condition. Only
# participants that pass both attention checks contribute to the accuracy,
# reliance and failed-check lists.
accuracies_hum_no_ai = []
accuracies_hum_ai = []
accuracies_ai_with_ai = []
accuracies_ai_no_ai = []
failed_attention_check = []
ai_reliance = []

# time variables
times_hum_no_ai = []
times_hum_with_ai = []

for i in range(len(responses)):
    response = responses[i]
    # find the matching task: the task id is the part of the response id before the first "-"
    id_task = response["id"].split("-")[0]
    task = tasks[id_task]
    user_answers_no_ai_raw = response["testing_answers_noai"]
    user_answers_ai_raw = response["testing_answers_withai"]

    # task
    labels_no_ai = task["testing_withairec_labels"]
    labels_ai = task["testing_withai_label"]
    ai_no_ai = task["testing_withairec_ai_answers_raw"]
    ai_ai = task["testing_withai_ai_answers_raw"]
    att_check_no_ai = task["testing_attentioncheck"]
    att_check_ai = task["testing_withai_attentioncheck"]

    # time
    # NOTE(review): times are collected before the attention-check filter, so
    # the time statistics also include participants that are excluded from the
    # accuracy statistics below -- confirm this is intended. An empty list of
    # clean times raises ZeroDivisionError here.
    clean_times_noai = get_each_time_taken(
        response, "testing_times_noai"
    )  # [example1 time, example2 time, ...]
    clean_times_withai = get_each_time_taken(response, "testing_times_withai")
    times_hum_no_ai.append(
        sum(clean_times_noai) / len(clean_times_noai)
    )  # [avg time per example by person1, by person2, ...]
    times_hum_with_ai.append(sum(clean_times_withai) / len(clean_times_withai))

    # Without AI every answer other than "yes" is encoded as 0.
    user_answers_no_ai = [
        1 if answer == "yes" else 0 for answer in user_answers_no_ai_raw
    ]

    # With AI any answer other than "yes"/"no" counts as relying on the AI and
    # is replaced by the AI's own answer for that example.
    user_answers_ai = []
    reliance_ai = 0
    for j in range(len(user_answers_ai_raw)):
        if user_answers_ai_raw[j] == "yes":
            user_answers_ai.append(1)
        elif user_answers_ai_raw[j] == "no":
            user_answers_ai.append(0)
        else:
            reliance_ai += 1
            user_answers_ai.append(ai_ai[j])

    # check if the attention check is correct; a participant is dropped when
    # every attention-check item of a phase was answered incorrectly
    failed_checks = 0
    for j in range(len(att_check_no_ai)):
        if att_check_no_ai[j] == 1:
            if user_answers_no_ai[j] != labels_no_ai[j]:
                failed_checks += 1
    if failed_checks >= np.sum(att_check_no_ai):
        continue
    failed_checks = 0
    for j in range(len(att_check_ai)):
        if att_check_ai[j] == 1:
            if user_answers_ai[j] != labels_ai[j]:
                failed_checks += 1
    if failed_checks >= np.sum(att_check_ai):
        continue
    # Recorded only for retained participants, like in the other condition cells.
    failed_attention_check.append(failed_checks)

    acc_hum_ai = accuracy_score(labels_ai, user_answers_ai)
    accuracies_hum_no_ai.append(accuracy_score(labels_no_ai, user_answers_no_ai))
    accuracies_hum_ai.append(acc_hum_ai)
    accuracies_ai_with_ai.append(accuracy_score(labels_ai, ai_ai))
    accuracies_ai_no_ai.append(accuracy_score(labels_no_ai, ai_no_ai))
    if acc_hum_ai == 0:
        # Surface suspicious all-wrong participants for manual inspection.
        print(user_answers_ai)
        print(labels_ai)
    ai_reliance.append(reliance_ai / len(user_answers_ai))

# Snapshot the results under condition-specific names; later cells reuse the
# generic list names for other conditions.
accuracies_hum_no_ai_1 = accuracies_hum_no_ai.copy()
accuracies_hum_ai_1 = accuracies_hum_ai.copy()
ai_reliance_1 = ai_reliance.copy()
accuracies_ai_no_ai_1 = accuracies_ai_no_ai.copy()
accuracies_ai_all.append(accuracies_ai_no_ai_1)

print(len(accuracies_hum_ai_1))
print(f"Human only accuracy: {np.mean(accuracies_hum_no_ai_1)}")
print(f"Human + AI accuracy: {np.mean(accuracies_hum_ai_1)}")
print(f" AI (no ai) accuracy: {np.mean(accuracies_ai_no_ai_1)}")
print(f" AI (with AI) accuracy: {np.mean(accuracies_ai_with_ai)}")

print(f"AI reliance: {np.mean(ai_reliance_1)}")
print(f"Failed attention checks: {np.mean(failed_attention_check)}")

In [None]:
# initialize table

num_col = 7
empty_row = [None] * num_col
# 1: human, 2: human+ai, 3: human+ai+teach+rec, 4: human+ai+teach, 5: human+ai+teach baseline, 6:human+ai+rec, 7: ai only

# Each row is [label, col 1, ..., col 7]; the *_add_to_table helpers fill the
# cells in place. The helpers store std / sqrt(n), i.e. the standard error of
# the mean, so the accuracy row is labelled accordingly.
mean_row = ["Accuracy (mean, std err)"] + empty_row
p_row = ["Test with Human+AI (p-value, t-value)"] + empty_row
ai_reliance_row = ["AI reliance"] + empty_row
time_row = ["Time per example"] + empty_row
ai_acc_row = ["Accuracy when using AI"] + empty_row
no_ai_acc_row = ["Accuracy when not using AI"] + empty_row

headers = [
    "Metric",
    "Human",
    "Human+AI",
    "Human+AI+Teach+Rec",
    "Human+AI+Teach",
    "Human+AI+Teach baseline",
    "Human+AI+Rec",
    "AI only",
]

# `table` holds only the data rows; the header row is passed to tabulate separately.
table = [
    mean_row,
    p_row,
    ai_reliance_row,
    time_row,
    ai_acc_row,
    no_ai_acc_row,
]

print(tabulate(table, headers=headers))

In [None]:
# Fill columns 1 (human) and 2 (human+ai) of the summary table.
# Column 1: human without AI (AI reliance does not apply).
mean_std_add_to_table(accuracies_hum_no_ai_1, 1)
ai_reliance_row[1] = "N/A"
time_add_to_table(times_hum_no_ai, 1)

# Column 2: human with AI.
mean_std_add_to_table(accuracies_hum_ai_1, 2)
ai_rel_add_to_table(ai_reliance_1, 2)
time_add_to_table(times_hum_with_ai, 2)

# Paired t-test between the two phases; both lists come from the same loop
# iterations, so entry k of each list belongs to the same participant.
t_stat1, p_val1 = ttest_rel(accuracies_hum_no_ai_1, accuracies_hum_ai_1)
r_t_stat1, r_p_val1 = round(t_stat1, 3), round(p_val1, 3)
# Both columns show the same (p-value, t-value) pair.
for paired_col in (1, 2):
    p_row[paired_col] = (r_p_val1, r_t_stat1)

# Human + AI + Teach (ours)


In [None]:
# Participant responses for the "human+ai+teach(ours)" condition.
responses = data["human+ai+teach(ours)"]

In [None]:
from sklearn.metrics import accuracy_score

# Per-participant metrics for the "human+ai+teach(ours)" condition. In this
# cell the "*_no_ai" names hold the with-recommendation ("withairec") phase
# and the "*_ai" names hold the plain with-AI phase.
accuracies_hum_no_ai = []
accuracies_hum_ai = []
accuracies_ai_with_ai = []
accuracies_ai_no_ai = []
failed_attention_check = []
ai_reliance_3 = []
ai_reliance_4 = []

# time
times_hum_no_ai = []
times_hum_with_ai = []

for i in range(len(responses)):
    response = responses[i]
    # find the matching task: the task id is the part of the response id before the first "-"
    id_task = response["id"].split("-")[0]
    task = tasks[id_task]
    user_answers_no_ai_raw = response["testing_answers_withairec"]
    user_answers_ai_raw = response["testing_answers_withai"]
    # task
    labels_no_ai = task["testing_withairec_labels"]
    labels_ai = task["testing_withai_label"]
    ai_no_ai = task["testing_withairec_ai_answers_raw"]
    ai_ai = task["testing_withai_ai_answers_raw"]
    att_check_no_ai = task["testing_attentioncheck"]
    att_check_ai = task["testing_withai_attentioncheck"]
    user_answers_no_ai = []
    user_answers_ai = []
    reliance_ai_3 = 0
    reliance_ai_4 = 0

    # time
    # NOTE(review): times are collected before the exclusion filters below, so
    # the time statistics also include excluded participants -- confirm this
    # is intended.
    clean_times_noai = get_each_time_taken(
        response, "testing_times_withairec"
    )  # [example1 time, example2 time, ...]
    clean_times_withai = get_each_time_taken(response, "testing_times_withai")
    times_hum_no_ai.append(
        sum(clean_times_noai) / len(clean_times_noai)
    )  # [avg time per example by person1, by person2, ...]
    times_hum_with_ai.append(sum(clean_times_withai) / len(clean_times_withai))

    # Any answer other than "yes"/"no" counts as relying on the AI and is
    # replaced by the AI's own answer for that example.
    for j in range(len(user_answers_no_ai_raw)):
        if user_answers_no_ai_raw[j] == "yes":
            user_answers_no_ai.append(1)
        # Compare the j-th answer: comparing the whole list to "no" was never
        # true, so every "no" used to be counted as AI reliance.
        elif user_answers_no_ai_raw[j] == "no":
            user_answers_no_ai.append(0)
        else:
            reliance_ai_3 += 1
            user_answers_no_ai.append(ai_no_ai[j])

    for j in range(len(user_answers_ai_raw)):
        if user_answers_ai_raw[j] == "yes":
            user_answers_ai.append(1)
        elif user_answers_ai_raw[j] == "no":
            user_answers_ai.append(0)
        else:
            reliance_ai_4 += 1
            user_answers_ai.append(ai_ai[j])

    # check if the attention check is correct; a participant is dropped when
    # every attention-check item of a phase was answered incorrectly.
    # zip() visits every item, including the last one that the previous
    # range(len(...) - 1) skipped, and cannot index past a shorter answer list.
    failed_checks = 0
    for is_check, answer, label in zip(
        att_check_no_ai, user_answers_no_ai, labels_no_ai
    ):
        if is_check == 1 and answer != label:
            failed_checks += 1
    if failed_checks >= np.sum(att_check_no_ai):
        continue
    failed_checks = 0
    for is_check, answer, label in zip(att_check_ai, user_answers_ai, labels_ai):
        if is_check == 1 and answer != label:
            failed_checks += 1
    if failed_checks >= np.sum(att_check_ai):
        continue
    failed_attention_check.append(failed_checks)

    # Compute every score before appending anything so that a failure (for
    # example answer/label length mismatch) cannot leave the lists misaligned.
    try:
        acc_hum_rec = accuracy_score(labels_no_ai, user_answers_no_ai)
        # NOTE(review): participants with exactly 0.5 accuracy in the
        # with-recommendation phase are dropped; the reason is not documented.
        if acc_hum_rec == 0.5:
            continue
        acc_hum_ai = accuracy_score(labels_ai, user_answers_ai)
        acc_ai_with_ai = accuracy_score(labels_ai, ai_ai)
        acc_ai_no_ai = accuracy_score(labels_no_ai, ai_no_ai)
        reliance_with_ai = reliance_ai_4 / len(user_answers_ai)
        reliance_with_rec = reliance_ai_3 / len(user_answers_no_ai)
    except Exception:
        # Exception (not a bare except) keeps KeyboardInterrupt working.
        print("some error")
        continue

    accuracies_hum_no_ai.append(acc_hum_rec)
    accuracies_hum_ai.append(acc_hum_ai)
    accuracies_ai_with_ai.append(acc_ai_with_ai)
    accuracies_ai_no_ai.append(acc_ai_no_ai)
    ai_reliance_4.append(reliance_with_ai)
    ai_reliance_3.append(reliance_with_rec)

    # Surface suspicious all-wrong participants for manual inspection.
    if acc_hum_ai == 0:
        print(user_answers_ai)
        print(labels_ai)
    if acc_hum_rec == 0:
        print(user_answers_no_ai)
        print(labels_no_ai)


# Snapshot the results under condition-specific names; later cells reuse the
# generic list names for other conditions.
accuracies_hum_no_ai_teach_rec = accuracies_hum_no_ai.copy()
accuracies_hum_no_ai_teach = accuracies_hum_ai.copy()
accuracies_ai_no_ai_2 = accuracies_ai_no_ai.copy()
accuracies_ai_with_ai_2 = accuracies_ai_with_ai.copy()
accuracies_ai_all.append(accuracies_ai_with_ai_2)

print(f"Human + AI + Teach + rec accuracy: {np.mean(accuracies_hum_no_ai_teach_rec)}")
print(f"Human + AI + Teach accuracy: {np.mean(accuracies_hum_no_ai_teach)}")
print(f" AI (with rec) accuracy: {np.mean(accuracies_ai_no_ai_2)}")
print(f" AI (no rec) accuracy: {np.mean(accuracies_ai_with_ai_2)}")

print(f"AI reliance: {np.mean(ai_reliance_3)}")
print(f"AI reliance: {np.mean(ai_reliance_4)}")
print(f"Failed attention checks: {np.mean(failed_attention_check)}")
# show the distribution of accuracies

In [None]:
# Fill the table for the two phases of the teach(ours) condition.
# Column 3: human+ai+teach+rec (accuracy, t-test vs Human+AI, reliance, time).
mean_std_add_to_table(accuracies_hum_no_ai_teach_rec, 3)
p_t_add_to_table(accuracies_hum_no_ai_teach_rec, 3)
ai_rel_add_to_table(ai_reliance_3, 3)
time_add_to_table(times_hum_no_ai, 3)

# Column 4: human+ai+teach (same four metrics).
mean_std_add_to_table(accuracies_hum_no_ai_teach, 4)
p_t_add_to_table(accuracies_hum_no_ai_teach, 4)
ai_rel_add_to_table(ai_reliance_4, 4)
time_add_to_table(times_hum_with_ai, 4)

print(tabulate(table, headers=headers))

# Human + AI + Teach Baseline


In [None]:
# Participant responses for the "human+ai+teach baseline" condition.
responses = data["human+ai+teach baseline"]

In [None]:
from sklearn.metrics import accuracy_score

# Per-participant metrics for the "human+ai+teach baseline" condition, which
# only has a with-AI phase.
accuracies_hum_no_ai = []
accuracies_hum_ai = []
accuracies_ai_with_ai = []
accuracies_ai_no_ai = []
failed_attention_check = []
ai_reliance = []
times_hum_with_ai = []
for i in range(len(responses)):
    response = responses[i]
    # find the matching task: the task id is the part of the response id before the first "-"
    id_task = response["id"].split("-")[0]
    task = tasks[id_task]
    user_answers_ai_raw = response["testing_answers_withai"]
    # task
    labels_no_ai = task["testing_withairec_labels"]
    labels_ai = task["testing_withai_label"]
    ai_no_ai = task["testing_withairec_ai_answers_raw"]
    ai_ai = task["testing_withai_ai_answers_raw"]
    att_check_no_ai = task["testing_attentioncheck"]
    att_check_ai = task["testing_withai_attentioncheck"]
    user_answers_no_ai = []
    user_answers_ai = []
    reliance_ai = 0

    # time
    # NOTE(review): times are collected before the attention-check filter, so
    # the time statistics also include excluded participants -- confirm this
    # is intended.
    clean_times_withai = get_each_time_taken(response, "testing_times_withai")
    times_hum_with_ai.append(sum(clean_times_withai) / len(clean_times_withai))

    # Any answer other than "yes"/"no" counts as relying on the AI and is
    # replaced by the AI's own answer for that example.
    for j in range(len(user_answers_ai_raw)):
        if user_answers_ai_raw[j] == "yes":
            user_answers_ai.append(1)
        elif user_answers_ai_raw[j] == "no":
            user_answers_ai.append(0)
        else:
            reliance_ai += 1
            user_answers_ai.append(ai_ai[j])

    # check if the attention check is correct; a participant is dropped when
    # every attention-check item was answered incorrectly.
    # zip() visits every item, including the last one that the previous
    # range(len(...) - 1) skipped, and cannot index past a shorter answer list.
    failed_checks = 0
    for is_check, answer, label in zip(att_check_ai, user_answers_ai, labels_ai):
        if is_check == 1 and answer != label:
            failed_checks += 1

    if failed_checks >= np.sum(att_check_ai):
        continue
    failed_attention_check.append(failed_checks)

    # Compute every score before appending anything so that a failure cannot
    # leave the result lists misaligned.
    try:
        acc_hum_ai = accuracy_score(labels_ai, user_answers_ai)
        acc_ai_only = accuracy_score(labels_ai, ai_ai)
        reliance_share = reliance_ai / len(user_answers_ai)
    except Exception:
        # Exception (not a bare except) keeps KeyboardInterrupt working.
        print("some error")
        continue

    accuracies_hum_ai.append(acc_hum_ai)
    accuracies_ai_with_ai.append(acc_ai_only)
    ai_reliance.append(reliance_share)
    if acc_hum_ai == 0:
        # Surface suspicious all-wrong participants for manual inspection.
        print(user_answers_ai)
        print(labels_ai)

# Snapshot the results under condition-specific names; later cells reuse the
# generic list names for other conditions.
accuracies_hum_ai_base = accuracies_hum_ai.copy()
accuracies_ai_only = accuracies_ai_with_ai.copy()
ai_reliance_5 = ai_reliance.copy()
accuracies_ai_only_3 = accuracies_ai_only.copy()
accuracies_ai_all.append(accuracies_ai_only_3)

print(
    f"Human + AI + Teach Baseline: {np.mean(accuracies_hum_ai_base)}"
)  # 1 treatment, just teach baseline
print(f" AI (no rec) accuracy: {np.mean(accuracies_ai_only_3)}")

print(f"AI reliance: {np.mean(ai_reliance_5)}")
print(f"Failed attention checks: {np.mean(failed_attention_check)}")
print(len(accuracies_hum_ai_base))
print(failed_attention_check)
# show the distribution of accuracies

In [None]:
# 5: human+ai+teach baseline
# Accuracy (mean, standard error) and independent t-test against Human+AI.
mean_std_add_to_table(accuracies_hum_ai_base, 5)
p_t_add_to_table(accuracies_hum_ai_base, 5)

# Share of answers delegated to the AI.
ai_rel_add_to_table(ai_reliance_5, 5)

# Average time per example.
time_add_to_table(times_hum_with_ai, 5)

print(tabulate(table, headers=headers))

# Human + AI + REC


In [None]:
# Participant responses for the "human+ai rec" condition.
responses = data["human+ai rec"]

In [None]:
from sklearn.metrics import accuracy_score

# Per-participant metrics for the "human+ai rec" condition, which only has a
# with-recommendation ("withairec") phase. Answers are graded against
# `labels_no_ai` / `ai_no_ai`, the with-recommendation labels and AI answers.
accuracies_hum_no_ai = []
accuracies_hum_ai = []
accuracies_ai_with_ai = []
accuracies_ai_no_ai = []
failed_attention_check = []
ai_reliance = []
# time
# This list must be reset here: the loop appends to `times_hum_with_ai`, which
# would otherwise still contain the times of the previously analysed condition.
times_hum_with_ai = []

for i in range(len(responses)):
    response = responses[i]
    # find the matching task: the task id is the part of the response id before the first "-"
    id_task = response["id"].split("-")[0]
    task = tasks[id_task]
    user_answers_ai_raw = response["testing_answers_withairec"]
    # task
    labels_no_ai = task["testing_withairec_labels"]
    labels_ai = task["testing_withai_label"]
    ai_no_ai = task["testing_withairec_ai_answers_raw"]
    ai_ai = task["testing_withai_ai_answers_raw"]
    att_check_no_ai = task["testing_attentioncheck"]
    att_check_ai = task["testing_withai_attentioncheck"]
    user_answers_no_ai = []
    user_answers_ai = []
    reliance_ai = 0

    # time
    # NOTE(review): times are collected before the attention-check filter, so
    # the time statistics also include excluded participants -- confirm this
    # is intended.
    clean_times_withai = get_each_time_taken(response, "testing_times_withairec")
    times_hum_with_ai.append(sum(clean_times_withai) / len(clean_times_withai))

    # Any answer other than "yes"/"no" counts as relying on the AI and is
    # replaced by the AI's own answer for that example.
    for j in range(len(user_answers_ai_raw)):
        if user_answers_ai_raw[j] == "yes":
            user_answers_ai.append(1)
        elif user_answers_ai_raw[j] == "no":
            user_answers_ai.append(0)
        else:
            reliance_ai += 1
            user_answers_ai.append(ai_no_ai[j])

    # check if the attention check is correct; a participant is dropped when
    # every attention-check item was answered incorrectly.
    # zip() visits every item, including the last one that the previous
    # range(len(...) - 1) skipped, and cannot index past a shorter answer list.
    # NOTE(review): this uses the "testing_withai_attentioncheck" mask although
    # the answers come from the with-recommendation phase; the teach(ours) cell
    # pairs that phase with "testing_attentioncheck" -- confirm which is right.
    failed_checks = 0
    for is_check, answer, label in zip(att_check_ai, user_answers_ai, labels_no_ai):
        if is_check == 1 and answer != label:
            failed_checks += 1
    if failed_checks >= np.sum(att_check_ai):
        continue
    failed_attention_check.append(failed_checks)

    # Compute every score before appending anything so that a failure cannot
    # leave the result lists misaligned.
    try:
        acc_hum_rec = accuracy_score(labels_no_ai, user_answers_ai)
        acc_ai_rec = accuracy_score(labels_no_ai, ai_no_ai)
        reliance_share = reliance_ai / len(user_answers_ai)
    except Exception:
        # Exception (not a bare except) keeps KeyboardInterrupt working.
        print("some error")
        continue

    accuracies_hum_ai.append(acc_hum_rec)
    accuracies_ai_with_ai.append(acc_ai_rec)
    ai_reliance.append(reliance_share)
    # The all-wrong check uses the same labels the answers are graded against
    # (it previously scored against the with-AI labels).
    if acc_hum_rec == 0:
        print(user_answers_ai)
        print(labels_no_ai)

# Snapshot the results under condition-specific names.
accuracies_hum_ai_rec = accuracies_hum_ai.copy()
accuracies_ai_with_ai4 = accuracies_ai_with_ai.copy()
accuracies_ai_all.append(accuracies_ai_with_ai4)
print(len(accuracies_hum_ai_rec))
print(
    f"Human + AI + REC Basline: {np.mean(accuracies_hum_ai_rec)}"
)  # just rec (no teaching)
print(f" AI (no rec) accuracy: {np.mean(accuracies_ai_with_ai4)}")

print(f"AI reliance: {np.mean(ai_reliance)}")
print(f"Failed attention checks: {np.mean(failed_attention_check)}")
# show the distribution of accuracies

In [None]:
# 6: human+ai+rec
# Accuracy, t-test against Human+AI, AI reliance and time per example.
mean_std_add_to_table(accuracies_hum_ai_rec, 6)
p_t_add_to_table(accuracies_hum_ai_rec, 6)
ai_rel_add_to_table(ai_reliance, 6)
time_add_to_table(times_hum_with_ai, 6)
print(tabulate(table, headers=headers))

In [None]:
# 6: human+ai+rec
# NOTE(review): this cell repeats the previous cell verbatim. It overwrites
# the same table cells with the same values, so it is harmless but redundant.
mean_std_add_to_table(accuracies_hum_ai_rec, 6)
p_t_add_to_table(accuracies_hum_ai_rec, 6)
ai_rel_add_to_table(ai_reliance, 6)
time_add_to_table(times_hum_with_ai, 6)
print(tabulate(table, headers=headers))

# SUMMARY


In [None]:
# Concatenate the per-condition AI accuracy lists into one flat list.
accuracies_ai_all_flat = sum(accuracies_ai_all, [])
# The flat list is also stored back into the loaded data mapping.
data["ai only"] = accuracies_ai_all_flat
# 7: AI only -- accuracy and independent t-test against Human+AI
mean_std_add_to_table(accuracies_ai_all_flat, 7)
p_t_add_to_table(accuracies_ai_all_flat, 7)
print(tabulate(table, headers=headers))

In [None]:
# Show the completed summary table in tabulate's default plain-text format.
print(tabulate(table, headers=headers))

In [None]:
# Emit the same table as LaTeX (booktabs style).
print(tabulate(table, headers=headers, tablefmt="latex_booktabs"))

In [None]:
# Accuracy samples keyed by their variable name. Looking arrays up in this
# mapping replaces the previous eval() on name strings.
arrays_by_name = {
    "accuracies_hum_ai_rec": accuracies_hum_ai_rec,
    "accuracies_ai_all_flat": accuracies_ai_all_flat,
    "accuracies_hum_ai_base": accuracies_hum_ai_base,
    "accuracies_hum_no_ai_teach_rec": accuracies_hum_no_ai_teach_rec,
    "accuracies_hum_no_ai_teach": accuracies_hum_no_ai_teach,
    "accuracies_hum_no_ai_1": accuracies_hum_no_ai_1,
    "accuracies_hum_ai_1": accuracies_hum_ai_1,
}
# List of array names for reference (dicts preserve insertion order)
array_names = list(arrays_by_name)
# Display names, index-aligned with `array_names`.
names_array = [
    "Rec",
    "AI only",
    "Onboard(baseline)",
    "Onboard(ours)+Rec",
    "Onboard(ours)",
    "Human",
    "Human-AI",
]


# Create a list to hold the results
results = []

# Index -1 is the last entry ("Human-AI"), the reference every other
# condition is compared against.
j = -1
# Perform pairwise t-tests
for i in range(len(array_names) - 1):
    p_value = stats.ttest_ind(
        arrays_by_name[array_names[i]], arrays_by_name[array_names[j]]
    ).pvalue
    results.append([names_array[i], names_array[j], p_value])


# Optional extra comparisons as (index, index) pairs into `array_names`.
tests_to_add = []

for test in tests_to_add:
    i = test[0]
    j = test[1]
    p_value = stats.ttest_ind(
        arrays_by_name[array_names[i]], arrays_by_name[array_names[j]]
    ).pvalue
    results.append([names_array[i], names_array[j], p_value])


# Create a pandas DataFrame to organize the results
results_df = pd.DataFrame(results, columns=["Array 1", "Array 2", "p-value"])
p_values = results_df["p-value"].values

# Perform FDR correction using Benjamini-Hochberg method
rejected, adjusted_p_values = multitest.fdrcorrection(
    p_values, alpha=0.05, method="indep"
)

# Add adjusted p-values to the DataFrame
results_df["adjusted_p-value"] = adjusted_p_values
# Display the results
print(results_df)

# qualitative


In [None]:
# Collect the free-text answers to "outake_quest1" for the two conditions.
responses_teach = data["human+ai+teach(ours)"]
responses = data["human+ai baseline"]
outake_teach = [entry["outake_quest1"] for entry in responses_teach]
outake_base = [entry["outake_quest1"] for entry in responses]

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Embed all answers (teach answers first, then baseline) with a batch size of 16.
all_responses = outake_teach + outake_base
embeddings = np.array(model.encode(all_responses, batch_size=16))

# cluster_scores is a 2d array where each row is [1,0] if the answer is in
# outake_teach and [0,1] if it is in outake_base; cluster_labels is 1 for
# teach answers and 0 for baseline answers.
n_teach = len(outake_teach)
cluster_scores = np.zeros((len(all_responses), 2))
cluster_labels = np.zeros(len(all_responses))

cluster_scores[:n_teach, 0] = 1
cluster_labels[:n_teach] = int(1)
cluster_scores[n_teach:, 1] = 1


def get_text_embedding(string):
    """Return the embedding of one string, encoded with the module-level ``model``."""
    encoded_batch = model.encode([string], batch_size=16)
    # A one-element batch was encoded; hand back its only row.
    return encoded_batch[0]

In [None]:
# `logging` is used before the star import below, so it must be imported
# explicitly; otherwise this cell fails with NameError on a fresh kernel.
import logging
import sys

# Make the project's `src` directory importable.
sys.path.append("../src")
logging.getLogger().setLevel(logging.INFO)

from describers.itterative_describe import *

# OPEN AI KEY
# Read the key through a context manager so the file handle is closed, and
# drop surrounding whitespace such as a trailing newline.
with open("../keys.txt", "r") as keyfile:
    key = keyfile.read().strip()

In [None]:
class IterativeRegionDescribeAnalysis(IterativeRegionDescribe):
    """IterativeRegionDescribe variant with its own summarisation prompt.

    The constructor forwards every argument unchanged to the superclass and
    then overrides ``pre_instruction`` and ``post_instruction``.
    ``get_completion`` sends the prompt to the ``gpt-4`` chat model and
    retries until a call succeeds.
    """

    def __init__(
        self,
        descriptions,
        embeddings,
        cluster_scores,
        cluster_labels,
        open_ai_key,
        get_text_embedding_fn,
        n_rounds=5,
        initial_positive_set_size=15,
        initial_negative_set_size=5,
        chat_correct=False,
    ):
        """Initialise the superclass, then install the custom prompt text."""
        super().__init__(
            descriptions,
            embeddings,
            cluster_scores,
            cluster_labels,
            open_ai_key,
            get_text_embedding_fn,
            n_rounds,
            initial_positive_set_size,
            initial_negative_set_size,
            chat_correct,
        )  # Calling the superclass constructor
        # The prompt text is sent to the model verbatim, so it is kept
        # byte-for-byte as it was.
        self.pre_instruction = (
            "I will provide you with a set of descriptions of points that belong to a region and a set of descriptions of point that do not belong to the region."
            + "Your task is to summarize the points inside the region in one or two sentences detailed while making sure the summary contrast to points outside the region. Please compare to outside the region."
            + "Your   summary should be able to allow a person to distinguish between points inside and outside the region while describing the region really well."
            + "The summary should be no more than 100 words, it should be accurate, detailed, concise, distinguishing and precise."
            + "Example: \n"
            + "inside the region: \n two cows and two sheep grazing in a pasture. \n the sheep is standing near a tree. \n outside the region:  the cows are lying on the grass beside the water.\n"
            + "summary: The region consists of descriptions that have have sheep in them outside in nature, it could have cows but must have sheep. \n End of Example \n"
        )
        self.post_instruction = "summary:"

    def get_completion(self, prompt, history=None):
        """Return the chat model's reply, retrying until a call succeeds.

        Args:
            prompt: User message, used only when ``history`` is empty.
            history: Optional list of chat messages. When it is non-empty it
                is sent as-is and ``prompt`` is ignored. ``None`` (the
                default) and an empty list behave the same.

        Returns:
            The ``content`` of the first choice in the API response.
        """
        while True:
            try:
                if not history:
                    messages = [
                        {"role": "system", "content": "You are a helpful assistant."},
                        {"role": "user", "content": prompt},
                    ]
                else:
                    messages = history

                response = openai.ChatCompletion.create(
                    model="gpt-4", messages=messages
                )
                logging.info("Called OPENAI API")

                return response["choices"][0]["message"]["content"]
            except Exception as err:
                # Catching Exception (not a bare except) lets KeyboardInterrupt
                # stop the retry loop; the error is logged because a permanent
                # failure such as an invalid key would otherwise retry silently.
                logging.warning("OpenAI request failed, retrying: %s", err)
                print("pausing")
                time.sleep(0.3)
                continue

In [None]:
# Describe region 1 (the teach answers, which carry cluster label 1) with zero
# refinement rounds; the positive set size is the number of teach answers and
# the negative set size is the number of baseline answers.
itt_desriber = IterativeRegionDescribeAnalysis(
    descriptions=all_responses,
    embeddings=embeddings,
    cluster_scores=cluster_scores,
    cluster_labels=cluster_labels,
    open_ai_key=key,
    get_text_embedding_fn=get_text_embedding,
    n_rounds=0,
    initial_positive_set_size=len(outake_teach),
    initial_negative_set_size=len(outake_base),
)

itt_des = itt_desriber.describe_region(1)
itt_des[0]

In [None]:
# Describe region 0 (the baseline answers, which carry cluster label 0) with
# zero refinement rounds; the set sizes are swapped relative to the teach run.
itt_desriber = IterativeRegionDescribeAnalysis(
    descriptions=all_responses,
    embeddings=embeddings,
    cluster_scores=cluster_scores,
    cluster_labels=cluster_labels,
    open_ai_key=key,
    get_text_embedding_fn=get_text_embedding,
    n_rounds=0,
    initial_positive_set_size=len(outake_base),
    initial_negative_set_size=len(outake_teach),
)

itt_des = itt_desriber.describe_region(0)
itt_des[0]