In [1]:
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
import pickle
import os
import json
import numpy as np
import pandas as pd 
import re 
import ast 
import dateutil.parser as dparser

# Helper functions 

In [3]:
def get_time_to_completion(telemetry_data):
    starts = [event for event in telemetry_data if event["event_type"] == "load_task"]
    ends = [
        event#["timestamp"]
        for event in telemetry_data
        if event["event_type"] == "submit_code" and event["completed_task"] == 1
    ]
    times_tasks_solved = []
    task_indices_seen = set()
    for start in starts:
        if start["task_index"] == -1 or start["task_index"] in task_indices_seen:
            continue
        task_indices_seen.add(start["task_index"])
        
        for end in ends:
            if end["task_index"] == start["task_index"]:
                # check if tim is more than 10mins
                if (end["timestamp"] - start["timestamp"]) / 1000 < 600:
                    times_tasks_solved.append((end["timestamp"] - start["timestamp"]) / 1000)
                    break


    if len(times_tasks_solved) == 0:
        return [], np.nan
    return times_tasks_solved, np.mean(times_tasks_solved)


def get_coding_time(telemetry_data):
    # Get first load task
    start = [event["timestamp"] for event in telemetry_data if event["event_type"] == "load_task"][0]

    # Get last telemetry event
    end = telemetry_data[-1]["timestamp"]

    return (end - start) / 1000

def get_time_verifying_suggestion(telemetry_data):
    # Get suggestions
    suggestions_shown = [event for event in telemetry_data if event["event_type"] == "suggestion_shown"]

    suggestions_reviewed = [
        event for event in telemetry_data if event["event_type"] == "reject" or event["event_type"] == "accept"
    ]

    # Create a hashmap for suggestion reviews.
    reviewed_hashmap = {}
    for event in suggestions_reviewed:
        reviewed_hashmap[event["suggestion_id"]] = event["timestamp"]

    # Create a hashmap for times to completion
    time_spent_verifying = {}
    for event in suggestions_shown:
        if event["suggestion_id"] in reviewed_hashmap:
            time_spent_verifying[event["suggestion_id"]] = (
                reviewed_hashmap[event["suggestion_id"]] - event["timestamp"]
            ) / 1000
        # else:
        #     print("No review found for suggestion: ", event["suggestion_id"])

    return time_spent_verifying


# Pre-processing

In [4]:
cred = credentials.Certificate("../codeinterface-85b5e-firebase-adminsdk-11q7e-837ba92a03.json")
firebase_admin.initialize_app(cred)
db = firestore.client()

In [5]:
docs = db.collection('responses').get()

In [6]:
DICT_TOOL_USAGE = {"1":"Strongly Disagree","2":"Disagree", "3":"Neutral", "4":"Agree", "5":"Strongly Agree"}

In [7]:
df = (pd.DataFrame([x.to_dict() for x in docs])
     .dropna(subset=["telemetry_data", "completed_task_time", "date_performed"])
     .assign(completed_task_time = lambda x: [dparser.parse(y, fuzzy=True) for y in x["completed_task_time"]])
     .assign(date_performed = lambda x: [dparser.parse(y, fuzzy=True) for y in x["date_performed"]])
     .assign(task_duration = lambda x: x.completed_task_time - x.date_performed)
     .assign(model = lambda x: [re.match("[a-zA-Z]*_[a-zA-Z0-9]*", x)[0] if re.match("[a-zA-Z]*_[a-zA-Z0-9]*", str(x)) else "" for x in x["task_id"]])
     .assign(n_tasks_completed = lambda z: [len([x for x in y if x["event_type"] == "submit_code" and x["completed_task"] == 1 and x["task_index"] != -1]) for y in z["telemetry_data"]])
     .assign(n_tasks_attempted = lambda z: [len([x for x in y if x["event_type"] == "load_task"]) for y in z["telemetry_data"]])
     .assign(n_tasks_skipped = lambda z: [len([x for x in y if x["event_type"] == "skip_task"]) for y in z["telemetry_data"]])
     .assign(TLX_frustration = lambda x: x["frustration"].astype(int))
     .assign(TLX_performance = lambda x: x["performance"].astype(int))
     .assign(TLX_temporal_demand = lambda x: x["temporalDemand"].astype(int))
     .assign(TLX_physical_demand = lambda x: x["physicalDemand"].astype(int))
     .assign(TLX_effort = lambda x: x["effort"].astype(int))
     .assign(TLX_mental_demand = lambda x: x["mentalDemand"].astype(int))
     .assign(TLX_total_score = lambda x: x.filter(like="TLX").sum(axis=1) * 5)
     .assign(n_sugg_accepted = lambda z: [len([x for x in y if x["event_type"] == "accept"]) for y in z["telemetry_data"]])
     .assign(n_sugg_shown = lambda z: [len([x for x in y if x["event_type"] == "suggestion_shown" and x["suggestion"] != ""]) for y in z["telemetry_data"]])
     .assign(sugg_accept_rate = lambda x: x.n_sugg_accepted / x.n_sugg_shown)
     .assign(task_completion_durations = lambda x: [get_time_to_completion(y)[0] for y in x['telemetry_data']])
     .assign(mean_task_duration = lambda x: [np.nanmean(y) for y in x.task_completion_durations])
     .assign(coding_time = lambda x: [get_coding_time(y) for y in x['telemetry_data']])
     .assign(time_spent_verifying = lambda x: [get_time_verifying_suggestion(y) for y in x['telemetry_data']]))

  .assign(mean_task_duration = lambda x: [np.nanmean(y) for y in x.task_completion_durations])


## Sanity checks
* Add more after discussing data structure with Hussein 

In [8]:
# This is failing ... what are these durations exactly? Are we comparing the right values?  
assert all(np.array([len(x) for x in df["task_completion_durations"]]) == df["n_tasks_completed"])

AssertionError: 

In [9]:
df.groupby("model")["n_tasks_completed"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
autocomplete_gpt35,16.0,2.75,2.569047,0.0,0.0,2.5,5.0,7.0
autocomplete_llama34,12.0,2.75,1.658312,0.0,2.0,2.5,3.25,6.0
autocomplete_llama7,8.0,2.875,2.587746,0.0,0.75,2.5,5.0,7.0
chat_gpt35,12.0,2.5,2.236068,0.0,1.0,2.0,4.25,7.0
chat_llama34,10.0,2.4,1.837873,0.0,1.25,2.0,2.75,6.0
chat_llama7,11.0,2.0,1.48324,0.0,1.0,2.0,2.5,5.0
nomodel_0,4.0,4.25,1.5,3.0,3.0,4.0,5.25,6.0
nomodel_1,2.0,4.0,1.414214,3.0,3.5,4.0,4.5,5.0
nomodel_2,2.0,2.5,0.707107,2.0,2.25,2.5,2.75,3.0
nomodel_4,2.0,3.0,2.828427,1.0,2.0,3.0,4.0,5.0


# OLD CODE

In [116]:
docs = db.collection('responses').get()
# save docs in a pickle file called old_tasks.pkl
responses = {}
all_responses = {}
done_count = 0
for doc in docs:
    doc_dict = doc.to_dict()
    doc_dict['id'] = doc.id 
    all_responses[doc.id] = doc_dict
    if 'entered_exit_survey' and 'finalcomments'  in doc_dict:
        print(doc_dict['entered_exit_survey'])
        responses[doc.id] = doc_dict

        done_count += 1
        print(doc_dict['finalcomments'])
        print(doc_dict['howaiimproved'])



print(done_count)

Sun Jan 28 2024 20:34:44 GMT-0500 (Eastern Standard Time)

It could be more contextualized to the problem at hand, and it could also integrate correct documentation.
Mon Jan 29 2024 22:46:07 GMT-0800 (Pacific Standard Time)
The feedback aspect was very poor. The output section for the code was very small 2 lines so I had to scroll to see my outputs and the error messages were difficult to parse. It would show me a assertion error but since it was pure text, I had a hard time seeing what was the correct output and what my output was. The question was sometimes hard. For the transforming dataframe part, the example shown was very out of shape and the rows and columns were not in line making it hard to read the dataframe.
I wish I could prompt the AI to what I want to do, for example: If I make a comment in the program starting with AI like this, "#AI What is the syntax for making a string upper case?" Then the AI would answer this prompt with a comment. Another way AI could be improved i

In [None]:
pickle.dump(all_responses, open('responses.pkl', 'wb'))

In [117]:
import dateutil.parser as dparser


def process_log(study_data, type = "none"):
    frustration = int(study_data["frustration"])
    performance = int(study_data["performance"])
    temporal_demand = int(study_data["temporalDemand"])
    physical_demand = int(study_data["physicalDemand"])
    effort = int(study_data["effort"])
    mental_demand = int(study_data["mentalDemand"])
    tlx_score = get_tlx_score(frustration, performance, temporal_demand, physical_demand, effort, mental_demand)
    #print("TLX Score: ", tlx_score)


    completed_time = dparser.parse(study_data["completed_task_time"], fuzzy=True)
    date_performed = dparser.parse(study_data["date_performed"], fuzzy=True)
    study_completion = get_completion_time(date_performed, completed_time)
    #print("Completion Time: ", study_completion)


    tasks_completed = get_tasks_completed(study_data["telemetry_data"])
    #print("Tasks Completed: ", tasks_completed)

    tasks_attempted = get_tasks_attempted(study_data["telemetry_data"])
    #print("Tasks Attempted: ", tasks_attempted)

    time_to_completion, avg_time_to_completion = get_time_to_completion(study_data["telemetry_data"])
    #print("Time To Completion: ", time_to_completion)
    #print("Average Time To Completion: ", avg_time_to_completion)

    tasks_skipped = get_tasks_skipped(study_data["telemetry_data"])
    #print("Tasks Skipped: ", tasks_skipped)

    coding_time = get_coding_time(study_data["telemetry_data"])
    #print("Coding Time: ", coding_time)
    additional_metrics = {}
    if type == "autocomplete":
        accept_rate = get_suggestion_acceptance_rate(study_data["telemetry_data"])
        additional_metrics = {
            "accept_rate": accept_rate,
        }
    
    if type == "chat":
        # count 'assistant_response', 'copy_code' event types in telemetry_data
        assistant_response_count = len([event for event in study_data["telemetry_data"] if event["event_type"] == "assistant_response"])
        copy_code_count = len([event for event in study_data["telemetry_data"] if event["event_type"] == "copy_code"])
        paste_into_editor_count = len([event for event in study_data["telemetry_data"] if event["event_type"] == "paste_into_editor"])
        additional_metrics = {
            "assistant_response_count": assistant_response_count,
            "copy_code_count": copy_code_count,
            "paste_into_editor_count": paste_into_editor_count,
        }
    	
    if type == "chat" or type == "autocomplete":
        # aihelpful 
        aihelpful = int(study_data["aihelpful"])
        additional_metrics["aihelpful"] = aihelpful

    dict_metrics  = {

        "tlx_score": tlx_score,
        "study_completion": study_completion,
        "tasks_completed": tasks_completed,
        "tasks_attempted": tasks_attempted,
        "time_to_completion": time_to_completion,
        "avg_time_to_completion": avg_time_to_completion,
        "tasks_skipped": tasks_skipped,
        "coding_time": coding_time,
        **additional_metrics
    }
    return dict_metrics

def get_tlx_score(frustration, performance, temporal_demand, physical_demand, effort, mental_demand):
    return (frustration + performance + temporal_demand + physical_demand + effort + mental_demand) * 5


def convert_tool_usage_to_str(tool_usage):
    if tool_usage == "1":
        return "Strongly Disagree"
    elif tool_usage == "2":
        return "Disagree"
    elif tool_usage == "3":
        return "Neutral"
    elif tool_usage == "4":
        return "Agree"
    elif tool_usage == "5":
        return "Strongly Agree"
    else:
        raise ValueError("Invalid tool usage")


def get_completion_time(start_time, end_time):
    return end_time - start_time


def get_suggestion_acceptance_rate(telemetry_data):
    num_accept = len([event for event in telemetry_data if event["event_type"] == "accept"])
    # only count suggestion_shown when suggestion is not ""
    num_suggestion_shown = len([event for event in telemetry_data if event["event_type"] == "suggestion_shown" and event["suggestion"] != ""])
    # if num_suggestion_shown == 0:
    #     print("No suggestions shown!!")
    #     return np.nan
    
    return num_accept / num_suggestion_shown


def get_tasks_completed(telemetry_data):
    return len(
        [event for event in telemetry_data if event["event_type"] == "submit_code" and event["completed_task"] == 1 and event["task_index"] != -1]
    )


def get_tasks_attempted(telemetry_data):
    return len([event for event in telemetry_data if event["event_type"] == "load_task"])


def get_time_to_completion(telemetry_data):
    starts = [event for event in telemetry_data if event["event_type"] == "load_task"]
    ends = [
        event#["timestamp"]
        for event in telemetry_data
        if event["event_type"] == "submit_code" and event["completed_task"] == 1
    ]
    times_tasks_solved = []
    task_indices_seen = set()
    for start in starts:
        if start["task_index"] == -1 or start["task_index"] in task_indices_seen:
            continue
        task_indices_seen.add(start["task_index"])
        
        for end in ends:
            if end["task_index"] == start["task_index"]:
                # check if tim is more than 10mins
                if (end["timestamp"] - start["timestamp"]) / 1000 < 600:
                    times_tasks_solved.append((end["timestamp"] - start["timestamp"]) / 1000)
                    break


    if len(times_tasks_solved) == 0:
        return 0, np.nan
    return times_tasks_solved, np.mean(times_tasks_solved)


def get_coding_time(telemetry_data):
    # Get first load task
    start = [event["timestamp"] for event in telemetry_data if event["event_type"] == "load_task"][0]

    # Get last telemetry event
    end = telemetry_data[-1]["timestamp"]

    return (end - start) / 1000


def get_tasks_skipped(telemetry_data):
    return len([event for event in telemetry_data if event["event_type"] == "skip_task"])


def get_tasks_skipped(telemetry_data):
    return len([event for event in telemetry_data if event["event_type"] == "skip_task"])


def get_time_verifying_suggestion(telemetry_data):
    # Get suggestions
    suggestions_shown = [event for event in telemetry_data if event["event_type"] == "suggestion_shown"]

    suggestions_reviewed = [
        event for event in telemetry_data if event["event_type"] == "reject" or event["event_type"] == "accept"
    ]

    # Create a hashmap for suggestion reviews.
    reviewed_hashmap = {}
    for event in suggestions_reviewed:
        reviewed_hashmap[event["suggestion_id"]] = event["timestamp"]

    # Create a hashmap for times to completion
    time_spent_verifying = {}
    for event in suggestions_shown:
        if event["suggestion_id"] in reviewed_hashmap:
            time_spent_verifying[event["suggestion_id"]] = (
                reviewed_hashmap[event["suggestion_id"]] - event["timestamp"]
            ) / 1000
        else:
            print("No review found for suggestion: ", event["suggestion_id"])

    return time_spent_verifying


In [118]:
autoocomplete_gpt35_metrics = {'num_responses': 0, 'tlx_score': [], 'study_completion': [], 'tasks_completed': [], 'tasks_attempted': [], 'time_to_completion': [], 'avg_time_to_completion': [], 'tasks_skipped': [], 'coding_time': [], 'accept_rate': [], 'aihelpful': []}
autocomplete_llama34_metrics = {'num_responses': 0, 'tlx_score': [], 'study_completion': [], 'tasks_completed': [], 'tasks_attempted': [], 'time_to_completion': [], 'avg_time_to_completion': [], 'tasks_skipped': [], 'coding_time': [], 'accept_rate': [], 'aihelpful': []}
autocomplete_llama7_metrics = {'num_responses': 0, 'tlx_score': [], 'study_completion': [], 'tasks_completed': [], 'tasks_attempted': [], 'time_to_completion': [], 'avg_time_to_completion': [], 'tasks_skipped': [], 'coding_time': [], 'accept_rate': [], 'aihelpful': []}
chat_gpt35_metrics = {'num_responses': 0, 'tlx_score': [], 'study_completion': [], 'tasks_completed': [], 'tasks_attempted': [], 'time_to_completion': [], 'avg_time_to_completion': [], 'tasks_skipped': [], 'coding_time': [], 'assistant_response_count': [], 'copy_code_count': [], 'paste_into_editor_count': [],  'aihelpful': []}
chat_llama34_metrics = {'num_responses': 0, 'tlx_score': [], 'study_completion': [], 'tasks_completed': [], 'tasks_attempted': [], 'time_to_completion': [], 'avg_time_to_completion': [], 'tasks_skipped': [], 'coding_time': [], 'assistant_response_count': [], 'copy_code_count': [], 'paste_into_editor_count': [],  'aihelpful': []}
chat_llama7_metrics = {'num_responses': 0, 'tlx_score': [], 'study_completion': [], 'tasks_completed': [], 'tasks_attempted': [], 'time_to_completion': [], 'avg_time_to_completion': [], 'tasks_skipped': [], 'coding_time': [], 'assistant_response_count': [], 'copy_code_count': [], 'paste_into_editor_count': [],  'aihelpful': []}
nomodel_metrics = {'num_responses': 0, 'tlx_score': [], 'study_completion': [], 'tasks_completed': [], 'tasks_attempted': [], 'time_to_completion': [], 'avg_time_to_completion': [], 'tasks_skipped': [], 'coding_time': []}
for resp in responses.values():
    if 'entered_exit_survey' not in resp:
        continue
    resp_id = resp['id']
    print(resp_id)
    log_metrics = process_log(resp)
    if log_metrics['tasks_completed'] < 2:
        continue
    print(f'name {resp["name"]} email {resp["email"]} completed {log_metrics["tasks_completed"]} tasks')
    # get event types
    print(f'event types {set([event["event_type"] for event in resp["telemetry_data"]])}')
    if 'autocomplete_gpt35' in resp_id:
        autoocomplete_gpt35_metrics['num_responses'] += 1
        log_metrics = process_log(resp, type="autocomplete")
        # merge dicts
        for key in log_metrics:
            autoocomplete_gpt35_metrics[key].append(log_metrics[key])
        if log_metrics['accept_rate'] <0.001:
            print(f'never accepted sug for {resp_id} name {resp["name"]} email {resp["email"]}')
            
    elif 'autocomplete_llama34' in resp_id:
        autocomplete_llama34_metrics['num_responses'] += 1
        log_metrics = process_log(resp, type="autocomplete")
        for key in log_metrics:
            autocomplete_llama34_metrics[key].append(log_metrics[key])
        if log_metrics['accept_rate'] <0.001:
            print(f'never accepted sug for {resp_id} name {resp["name"]} email {resp["email"]}')
    elif 'autocomplete_llama7' in resp_id:
        autocomplete_llama7_metrics['num_responses'] += 1
        log_metrics = process_log(resp, type="autocomplete")
        for key in log_metrics:
            autocomplete_llama7_metrics[key].append(log_metrics[key])
        if log_metrics['accept_rate'] <0.001:
            print(f'never accepted sug for {resp_id} name {resp["name"]} email {resp["email"]}')
    elif 'chat_gpt35' in resp_id:
        chat_gpt35_metrics['num_responses'] += 1
        log_metrics = process_log(resp, type="chat")
        for key in log_metrics:
            chat_gpt35_metrics[key].append(log_metrics[key])
        if log_metrics['assistant_response_count'] < 1:
            print(f'never responded for {resp_id} name {resp["name"]} email {resp["email"]}')
            # print all events with type paste_into_editor
            for event in resp["telemetry_data"]:
                if event["event_type"] == "paste_into_editor":
                    print(event)
    elif 'chat_llama34' in resp_id:
        chat_llama34_metrics['num_responses'] += 1
        log_metrics = process_log(resp, type="chat")
        for key in log_metrics:
            chat_llama34_metrics[key].append(log_metrics[key])
        if log_metrics['assistant_response_count'] < 1:
            print(f'never responded for {resp_id} name {resp["name"]} email {resp["email"]}')
            for event in resp["telemetry_data"]:
                if event["event_type"] == "paste_into_editor":
                    print(event)
    elif 'chat_llama7' in resp_id:
        chat_llama7_metrics['num_responses'] += 1
        log_metrics = process_log(resp, type="chat")
        for key in log_metrics:
            chat_llama7_metrics[key].append(log_metrics[key])
        if log_metrics['assistant_response_count'] < 1:
            print(f'never responded for {resp_id} name {resp["name"]} email {resp["email"]}')
            for event in resp["telemetry_data"]:
                if event["event_type"] == "paste_into_editor":
                    print(event)
    elif 'nomodel' in resp_id:
        nomodel_metrics['num_responses'] += 1
        log_metrics = process_log(resp)
        for key in log_metrics:
            nomodel_metrics[key].append(log_metrics[key])
    else:
        print('no model found for response ' + resp_id)


autocomplete_gpt35_0_4-4767663
name Ming Chong Lim email mingchol@andrew.cmu.edu completed 7 tasks
event types {'before_shown', 'load_task', 'submit_code', 'suggestion_shown', 'request_suggestion', 'reject', 'save_code', 'accept', 'run_code'}
autocomplete_gpt35_0_9-263157
name San Shin email samshin0714@gmail.com completed 2 tasks
event types {'before_shown', 'load_task', 'submit_code', 'suggestion_shown', 'skip_task', 'request_suggestion', 'reject', 'save_code', 'code_reset', 'accept', 'run_code'}
autocomplete_gpt35_1_1-5457944
name Shrikara Varna email svarna@andrew.cmu.edu completed 3 tasks
event types {'before_shown', 'load_task', 'submit_code', 'suggestion_shown', 'skip_task', 'reject', 'save_code', 'accept', 'run_code'}
autocomplete_gpt35_1_10-7542461
name Nirajan Koirala email nkoirala@nd.edu completed 3 tasks
event types {'before_shown', 'load_task', 'submit_code', 'suggestion_shown', 'skip_task', 'reject', 'save_code', 'accept', 'run_code'}
autocomplete_gpt35_1_2-7342708
name 

In [119]:

# print metrics for each
print('autoocomplete_gpt35_metrics')
print(autoocomplete_gpt35_metrics)
print('autocomplete_llama34_metrics')
print(autocomplete_llama34_metrics)
print('autocomplete_llama7_metrics')
print(autocomplete_llama7_metrics)
print('chat_gpt35_metrics')
print(chat_gpt35_metrics)
print('chat_llama34_metrics')
print(chat_llama34_metrics)
print('chat_llama7_metrics')
print(chat_llama7_metrics)
print('nomodel_metrics')
print(nomodel_metrics)


autoocomplete_gpt35_metrics
{'num_responses': 9, 'tlx_score': [265, 260, 375, 380, 340, 135, 350, 455, 285], 'study_completion': [datetime.timedelta(seconds=2006), datetime.timedelta(seconds=3162), datetime.timedelta(seconds=2240), datetime.timedelta(seconds=2265), datetime.timedelta(seconds=2284), datetime.timedelta(seconds=2545), datetime.timedelta(seconds=2456), datetime.timedelta(seconds=2316), datetime.timedelta(seconds=1634)], 'tasks_completed': [7, 2, 3, 3, 6, 5, 2, 3, 7], 'tasks_attempted': [8, 6, 6, 6, 8, 7, 5, 6, 8], 'time_to_completion': [[134.873, 226.801, 269.65, 261.069, 146.12, 155.127, 558.661], [511.013, 45.36], [226.867, 208.59], [292.668, 528.917, 356.693], [146.457, 208.015, 273.293, 378.161, 319.032, 331.631], [127.053, 508.808, 386.152, 249.752, 244.011], [175.612], [262.154, 429.33], [88.23, 124.503, 304.871, 186.677, 319.081, 352.976, 65.946]], 'avg_time_to_completion': [250.32871428571428, 278.18649999999997, 217.7285, 392.75933333333336, 276.09816666666666, 30

In [120]:
# get average avg_time_to_completion and tasks_completed
print('avg_time_to_completion')
print('autocomplete_gpt35_metrics')
print(np.nanmean(autoocomplete_gpt35_metrics['avg_time_to_completion']))
print('autocomplete_llama34_metrics')
print(np.nanmean(autocomplete_llama34_metrics['avg_time_to_completion']))
print('autocomplete_llama7_metrics')
print(np.nanmean(autocomplete_llama7_metrics['avg_time_to_completion']))
print('chat_gpt35_metrics')
print(np.nanmean(chat_gpt35_metrics['avg_time_to_completion']))
print('chat_llama34_metrics')
print(np.nanmean(chat_llama34_metrics['avg_time_to_completion']))
print('chat_llama7_metrics')
print(np.nanmean(chat_llama7_metrics['avg_time_to_completion']))
print('nomodel_metrics')
print(np.nanmean(nomodel_metrics['avg_time_to_completion']))

print('tasks_completed')
print('autocomplete_gpt35_metrics')
print(np.nanmean(autoocomplete_gpt35_metrics['tasks_completed']))
print('autocomplete_llama34_metrics')
print(np.nanmean(autocomplete_llama34_metrics['tasks_completed']))
print('autocomplete_llama7_metrics')
print(np.nanmean(autocomplete_llama7_metrics['tasks_completed']))
print('chat_gpt35_metrics')
print(np.nanmean(chat_gpt35_metrics['tasks_completed']))
print('chat_llama34_metrics')
print(np.nanmean(chat_llama34_metrics['tasks_completed']))
print('chat_llama7_metrics')
print(np.nanmean(chat_llama7_metrics['tasks_completed']))
print('nomodel_metrics')
print(np.nanmean(nomodel_metrics['tasks_completed']))
print("tasks_skipped")
print('autocomplete_gpt35_metrics')
print(np.nanmean(autoocomplete_gpt35_metrics['tasks_skipped']))
print('autocomplete_llama34_metrics')
print(np.nanmean(autocomplete_llama34_metrics['tasks_skipped']))
print('autocomplete_llama7_metrics')
print(np.nanmean(autocomplete_llama7_metrics['tasks_skipped']))
print('chat_gpt35_metrics')
print(np.nanmean(chat_gpt35_metrics['tasks_skipped']))
print('chat_llama34_metrics')
print(np.nanmean(chat_llama34_metrics['tasks_skipped']))
print('chat_llama7_metrics')
print(np.nanmean(chat_llama7_metrics['tasks_skipped']))
print('nomodel_metrics')
print(np.nanmean(nomodel_metrics['tasks_skipped']))

print("aihelpful")
print('autocomplete_gpt35_metrics')
print(np.nanmean(autoocomplete_gpt35_metrics['aihelpful']))
print('autocomplete_llama34_metrics')
print(np.nanmean(autocomplete_llama34_metrics['aihelpful']))
print('autocomplete_llama7_metrics')
print(np.nanmean(autocomplete_llama7_metrics['aihelpful']))
print('chat_gpt35_metrics')
print(np.nanmean(chat_gpt35_metrics['aihelpful']))
print('chat_llama34_metrics')
print(np.nanmean(chat_llama34_metrics['aihelpful']))
print('chat_llama7_metrics')
print(np.nanmean(chat_llama7_metrics['aihelpful']))


avg_time_to_completion
autocomplete_gpt35_metrics
271.73899841269844
autocomplete_llama34_metrics
264.5228125
autocomplete_llama7_metrics
214.24385416666667
chat_gpt35_metrics
208.97562857142862
chat_llama34_metrics
316.08382666666665
chat_llama7_metrics
242.8976
nomodel_metrics
296.14387500000004
tasks_completed
autocomplete_gpt35_metrics
4.222222222222222
autocomplete_llama34_metrics
3.0
autocomplete_llama7_metrics
5.0
chat_gpt35_metrics
4.6
chat_llama34_metrics
3.4
chat_llama7_metrics
2.6
nomodel_metrics
3.6666666666666665
tasks_skipped
autocomplete_gpt35_metrics
0.6666666666666666
autocomplete_llama34_metrics
0.625
autocomplete_llama7_metrics
0.5
chat_gpt35_metrics
0.6
chat_llama34_metrics
0.8
chat_llama7_metrics
0.8
nomodel_metrics
0.16666666666666666
aihelpful
autocomplete_gpt35_metrics
2.6666666666666665
autocomplete_llama34_metrics
2.0
autocomplete_llama7_metrics
2.25
chat_gpt35_metrics
5.6
chat_llama34_metrics
5.2
chat_llama7_metrics
5.8
