In [1]:
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
import pickle
import os
import json
import numpy as np


In [2]:
cred = credentials.Certificate("../codeinterface-85b5e-firebase-adminsdk-11q7e-837ba92a03.json")
firebase_admin.initialize_app(cred)
db = firestore.client()

In [14]:
docs = db.collection('responses').get()
# save docs in a pickle file called old_tasks.pkl
responses = {}
done_count = 0
for doc in docs:
    doc_dict = doc.to_dict()
    doc_dict['id'] = doc.id 
    if 'entered_exit_survey' and 'finalcomments'  in doc_dict:
        print(doc_dict['entered_exit_survey'])
        responses[doc.id] = doc_dict

        done_count += 1
        print(doc_dict['finalcomments'])
        print(doc_dict['howaiimproved'])



print(done_count)

Sun Jan 28 2024 20:34:44 GMT-0500 (Eastern Standard Time)

It could be more contextualized to the problem at hand, and it could also integrate correct documentation.
Mon Jan 29 2024 17:36:18 GMT-0500 (Eastern Standard Time)

By ensuring that the suggestions meet the requirement and not change or add new functions that are not required.
Mon Jan 29 2024 13:32:50 GMT-0500 (Eastern Standard Time)

By changing the font color of suggestions provided by AI.
Sun Jan 28 2024 21:40:32 GMT-0600 (Central Standard Time)
The way the AI text was highlighted (maybe because of dark mode?) made it quite hard to read the AI generated text, which made it hard to tell if I wanted to use the suggestion.
Suggestions after 2-seconds of inactivity seems bad. It would often interject an annoying amount of crap when I was just thinking, which I would have to remove. Only using the output when prompting for it was much more helpful.
Mon Jan 29 2024 14:14:05 GMT-0500 (Eastern Standard Time)
What a time and efforts

In [4]:
import dateutil.parser as dparser


def process_log(study_data, type = "none"):
    frustration = int(study_data["frustration"])
    performance = int(study_data["performance"])
    temporal_demand = int(study_data["temporalDemand"])
    physical_demand = int(study_data["physicalDemand"])
    effort = int(study_data["effort"])
    mental_demand = int(study_data["mentalDemand"])
    tlx_score = get_tlx_score(frustration, performance, temporal_demand, physical_demand, effort, mental_demand)
    #print("TLX Score: ", tlx_score)


    completed_time = dparser.parse(study_data["completed_task_time"], fuzzy=True)
    date_performed = dparser.parse(study_data["date_performed"], fuzzy=True)
    study_completion = get_completion_time(date_performed, completed_time)
    #print("Completion Time: ", study_completion)


    tasks_completed = get_tasks_completed(study_data["telemetry_data"])
    #print("Tasks Completed: ", tasks_completed)

    tasks_attempted = get_tasks_attempted(study_data["telemetry_data"])
    #print("Tasks Attempted: ", tasks_attempted)

    time_to_completion, avg_time_to_completion = get_time_to_completion(study_data["telemetry_data"])
    #print("Time To Completion: ", time_to_completion)
    #print("Average Time To Completion: ", avg_time_to_completion)

    tasks_skipped = get_tasks_skipped(study_data["telemetry_data"])
    #print("Tasks Skipped: ", tasks_skipped)

    coding_time = get_coding_time(study_data["telemetry_data"])
    #print("Coding Time: ", coding_time)
    additional_metrics = {}
    if type == "autocomplete":
        accept_rate = get_suggestion_acceptance_rate(study_data["telemetry_data"])
        additional_metrics = {
            "accept_rate": accept_rate,
        }
    
    if type == "chat":
        # count 'assistant_response', 'copy_code' event types in telemetry_data
        assistant_response_count = len([event for event in study_data["telemetry_data"] if event["event_type"] == "assistant_response"])
        copy_code_count = len([event for event in study_data["telemetry_data"] if event["event_type"] == "copy_code"])
        additional_metrics = {
            "assistant_response_count": assistant_response_count,
            "copy_code_count": copy_code_count,
        }
    	

    dict_metrics  = {

        "tlx_score": tlx_score,
        "study_completion": study_completion,
        "tasks_completed": tasks_completed,
        "tasks_attempted": tasks_attempted,
        "time_to_completion": time_to_completion,
        "avg_time_to_completion": avg_time_to_completion,
        "tasks_skipped": tasks_skipped,
        "coding_time": coding_time,
        **additional_metrics
    }
    return dict_metrics

def get_tlx_score(frustration, performance, temporal_demand, physical_demand, effort, mental_demand):
    return (frustration + performance + temporal_demand + physical_demand + effort + mental_demand) * 5


def convert_tool_usage_to_str(tool_usage):
    if tool_usage == "1":
        return "Strongly Disagree"
    elif tool_usage == "2":
        return "Disagree"
    elif tool_usage == "3":
        return "Neutral"
    elif tool_usage == "4":
        return "Agree"
    elif tool_usage == "5":
        return "Strongly Agree"
    else:
        raise ValueError("Invalid tool usage")


def get_completion_time(start_time, end_time):
    return end_time - start_time


def get_suggestion_acceptance_rate(telemetry_data):
    num_accept = len([event for event in telemetry_data if event["event_type"] == "accept"])
    # only count suggestion_shown when suggestion is not ""
    num_suggestion_shown = len([event for event in telemetry_data if event["event_type"] == "suggestion_shown" and event["suggestion"] != ""])
    # if num_suggestion_shown == 0:
    #     print("No suggestions shown!!")
    #     return np.nan
    
    return num_accept / num_suggestion_shown


def get_tasks_completed(telemetry_data):
    return len(
        [event for event in telemetry_data if event["event_type"] == "submit_code" and event["completed_task"] == 1]
    )


def get_tasks_attempted(telemetry_data):
    return len([event for event in telemetry_data if event["event_type"] == "load_task"])


def get_time_to_completion(telemetry_data):
    starts = [event["timestamp"] for event in telemetry_data if event["event_type"] == "load_task"]
    ends = [
        event["timestamp"]
        for event in telemetry_data
        if event["event_type"] == "submit_code" and event["completed_task"] == 1
    ]

    times = [(end - start) / 1000 for start, end in zip(starts, ends)]
    if len(times) == 0:
        return 0, np.nan
    return times, sum(times) / len(times)


def get_coding_time(telemetry_data):
    # Get first load task
    start = [event["timestamp"] for event in telemetry_data if event["event_type"] == "load_task"][0]

    # Get last telemetry event
    end = telemetry_data[-1]["timestamp"]

    return (end - start) / 1000


def get_tasks_skipped(telemetry_data):
    return len([event for event in telemetry_data if event["event_type"] == "skip_task"])


def get_tasks_skipped(telemetry_data):
    return len([event for event in telemetry_data if event["event_type"] == "skip_task"])


def get_time_verifying_suggestion(telemetry_data):
    # Get suggestions
    suggestions_shown = [event for event in telemetry_data if event["event_type"] == "suggestion_shown"]

    suggestions_reviewed = [
        event for event in telemetry_data if event["event_type"] == "reject" or event["event_type"] == "accept"
    ]

    # Create a hashmap for suggestion reviews.
    reviewed_hashmap = {}
    for event in suggestions_reviewed:
        reviewed_hashmap[event["suggestion_id"]] = event["timestamp"]

    # Create a hashmap for times to completion
    time_spent_verifying = {}
    for event in suggestions_shown:
        if event["suggestion_id"] in reviewed_hashmap:
            time_spent_verifying[event["suggestion_id"]] = (
                reviewed_hashmap[event["suggestion_id"]] - event["timestamp"]
            ) / 1000
        else:
            print("No review found for suggestion: ", event["suggestion_id"])

    return time_spent_verifying


In [15]:
autoocomplete_gpt35_metrics = {'num_responses': 0, 'tlx_score': [], 'study_completion': [], 'tasks_completed': [], 'tasks_attempted': [], 'time_to_completion': [], 'avg_time_to_completion': [], 'tasks_skipped': [], 'coding_time': [], 'accept_rate': []}
autocomplete_llama34_metrics = {'num_responses': 0, 'tlx_score': [], 'study_completion': [], 'tasks_completed': [], 'tasks_attempted': [], 'time_to_completion': [], 'avg_time_to_completion': [], 'tasks_skipped': [], 'coding_time': [], 'accept_rate': []}
autocomplete_llama7_metrics = {'num_responses': 0, 'tlx_score': [], 'study_completion': [], 'tasks_completed': [], 'tasks_attempted': [], 'time_to_completion': [], 'avg_time_to_completion': [], 'tasks_skipped': [], 'coding_time': [], 'accept_rate': []}
chat_gpt35_metrics = {'num_responses': 0, 'tlx_score': [], 'study_completion': [], 'tasks_completed': [], 'tasks_attempted': [], 'time_to_completion': [], 'avg_time_to_completion': [], 'tasks_skipped': [], 'coding_time': [], 'assistant_response_count': [], 'copy_code_count': []}
chat_llama34_metrics = {'num_responses': 0, 'tlx_score': [], 'study_completion': [], 'tasks_completed': [], 'tasks_attempted': [], 'time_to_completion': [], 'avg_time_to_completion': [], 'tasks_skipped': [], 'coding_time': [], 'assistant_response_count': [], 'copy_code_count': []}
chat_llama7_metrics = {'num_responses': 0, 'tlx_score': [], 'study_completion': [], 'tasks_completed': [], 'tasks_attempted': [], 'time_to_completion': [], 'avg_time_to_completion': [], 'tasks_skipped': [], 'coding_time': [], 'assistant_response_count': [], 'copy_code_count': []}
nomodel_metrics = {'num_responses': 0, 'tlx_score': [], 'study_completion': [], 'tasks_completed': [], 'tasks_attempted': [], 'time_to_completion': [], 'avg_time_to_completion': [], 'tasks_skipped': [], 'coding_time': []}
for resp in responses.values():
    if 'entered_exit_survey' not in resp:
        continue
    resp_id = resp['id']
    print(resp_id)
    log_metrics = process_log(resp)
    if log_metrics['tasks_completed'] < 2:
        continue
    print(f'name {resp["name"]} email {resp["email"]} completed {log_metrics["tasks_completed"]} tasks')
    # get event types
    print(f'event types {set([event["event_type"] for event in resp["telemetry_data"]])}')
    if 'autocomplete_gpt35' in resp_id:
        autoocomplete_gpt35_metrics['num_responses'] += 1
        log_metrics = process_log(resp, type="autocomplete")
        # merge dicts
        for key in log_metrics:
            autoocomplete_gpt35_metrics[key].append(log_metrics[key])
        if log_metrics['accept_rate'] <0.001:
            print(f'never accepted sug for {resp_id} name {resp["name"]} email {resp["email"]}')
            
    elif 'autocomplete_llama34' in resp_id:
        autocomplete_llama34_metrics['num_responses'] += 1
        log_metrics = process_log(resp, type="autocomplete")
        for key in log_metrics:
            autocomplete_llama34_metrics[key].append(log_metrics[key])
        if log_metrics['accept_rate'] <0.001:
            print(f'never accepted sug for {resp_id} name {resp["name"]} email {resp["email"]}')
    elif 'autocomplete_llama7' in resp_id:
        autocomplete_llama7_metrics['num_responses'] += 1
        log_metrics = process_log(resp, type="autocomplete")
        for key in log_metrics:
            autocomplete_llama7_metrics[key].append(log_metrics[key])
        if log_metrics['accept_rate'] <0.001:
            print(f'never accepted sug for {resp_id} name {resp["name"]} email {resp["email"]}')
    elif 'chat_gpt35' in resp_id:
        chat_gpt35_metrics['num_responses'] += 1
        log_metrics = process_log(resp, type="chat")
        for key in log_metrics:
            chat_gpt35_metrics[key].append(log_metrics[key])
        if log_metrics['assistant_response_count'] < 1:
            print(f'never responded for {resp_id} name {resp["name"]} email {resp["email"]}')
            # print all events with type paste_into_editor
            for event in resp["telemetry_data"]:
                if event["event_type"] == "paste_into_editor":
                    print(event)
    elif 'chat_llama34' in resp_id:
        chat_llama34_metrics['num_responses'] += 1
        log_metrics = process_log(resp, type="chat")
        for key in log_metrics:
            chat_llama34_metrics[key].append(log_metrics[key])
        if log_metrics['assistant_response_count'] < 1:
            print(f'never responded for {resp_id} name {resp["name"]} email {resp["email"]}')
            for event in resp["telemetry_data"]:
                if event["event_type"] == "paste_into_editor":
                    print(event)
    elif 'chat_llama7' in resp_id:
        chat_llama7_metrics['num_responses'] += 1
        log_metrics = process_log(resp, type="chat")
        for key in log_metrics:
            chat_llama7_metrics[key].append(log_metrics[key])
        if log_metrics['assistant_response_count'] < 1:
            print(f'never responded for {resp_id} name {resp["name"]} email {resp["email"]}')
            for event in resp["telemetry_data"]:
                if event["event_type"] == "paste_into_editor":
                    print(event)
    elif 'nomodel' in resp_id:
        nomodel_metrics['num_responses'] += 1
        log_metrics = process_log(resp)
        for key in log_metrics:
            nomodel_metrics[key].append(log_metrics[key])
    else:
        print('no model found for response ' + resp_id)


autocomplete_gpt35_0_4-4767663
name Ming Chong Lim email mingchol@andrew.cmu.edu completed 8 tasks
event types {'before_shown', 'load_task', 'submit_code', 'suggestion_shown', 'request_suggestion', 'reject', 'save_code', 'accept', 'run_code'}
autocomplete_gpt35_1_1-5457944
name Shrikara Varna email svarna@andrew.cmu.edu completed 4 tasks
event types {'before_shown', 'load_task', 'submit_code', 'suggestion_shown', 'skip_task', 'reject', 'save_code', 'accept', 'run_code'}
autocomplete_gpt35_1_10-7542461
name Nirajan Koirala email nkoirala@nd.edu completed 4 tasks
event types {'before_shown', 'load_task', 'submit_code', 'suggestion_shown', 'skip_task', 'reject', 'save_code', 'accept', 'run_code'}
autocomplete_gpt35_1_2-7342708
name Eric Schneider email franz.eric.schneider@gmail.com completed 7 tasks
event types {'before_shown', 'load_task', 'submit_code', 'suggestion_shown', 'request_suggestion', 'reject', 'save_code', 'accept', 'run_code'}
autocomplete_gpt35_1_9-7410549
autocomplete_gpt

In [16]:

# print metrics for each
print('autoocomplete_gpt35_metrics')
print(autoocomplete_gpt35_metrics)
print('autocomplete_llama34_metrics')
print(autocomplete_llama34_metrics)
print('autocomplete_llama7_metrics')
print(autocomplete_llama7_metrics)
print('chat_gpt35_metrics')
print(chat_gpt35_metrics)
print('chat_llama34_metrics')
print(chat_llama34_metrics)
print('chat_llama7_metrics')
print(chat_llama7_metrics)
print('nomodel_metrics')
print(nomodel_metrics)


autoocomplete_gpt35_metrics
{'num_responses': 7, 'tlx_score': [265, 375, 380, 340, 350, 455, 285], 'study_completion': [datetime.timedelta(seconds=2006), datetime.timedelta(seconds=2240), datetime.timedelta(seconds=2265), datetime.timedelta(seconds=2284), datetime.timedelta(seconds=2456), datetime.timedelta(seconds=2316), datetime.timedelta(seconds=1634)], 'tasks_completed': [8, 4, 4, 7, 3, 4, 8], 'tasks_attempted': [8, 6, 6, 8, 5, 6, 8], 'time_to_completion': [[137.493, 134.873, 226.801, 269.65, 261.069, 146.12, 155.127, 558.661], [115.316, 226.867, 1298.0, 848.491], [176.118, 292.668, 528.917, 356.693], [191.283, 146.457, 208.015, 273.293, 378.161, 319.032, 331.631], [249.316, 175.612, 633.999], [74.629, 262.154, 637.68, 1044.376], [53.815, 88.23, 124.503, 304.871, 186.677, 319.081, 352.976, 65.946]], 'avg_time_to_completion': [236.22424999999998, 622.1685, 338.599, 263.98171428571425, 352.9756666666667, 504.70975, 187.01237500000002], 'tasks_skipped': [0, 1, 1, 0, 1, 1, 0], 'coding_

In [17]:
# get average avg_time_to_completion and tasks_completed
print('avg_time_to_completion')
print('autocomplete_gpt35_metrics')
print(np.nanmean(autoocomplete_gpt35_metrics['avg_time_to_completion']))
print('autocomplete_llama34_metrics')
print(np.nanmean(autocomplete_llama34_metrics['avg_time_to_completion']))
print('autocomplete_llama7_metrics')
print(np.nanmean(autocomplete_llama7_metrics['avg_time_to_completion']))
print('chat_gpt35_metrics')
print(np.nanmean(chat_gpt35_metrics['avg_time_to_completion']))
print('chat_llama34_metrics')
print(np.nanmean(chat_llama34_metrics['avg_time_to_completion']))
print('chat_llama7_metrics')
print(np.nanmean(chat_llama7_metrics['avg_time_to_completion']))
print('nomodel_metrics')
print(np.nanmean(nomodel_metrics['avg_time_to_completion']))

print('tasks_completed')
print('autocomplete_gpt35_metrics')
print(np.nanmean(autoocomplete_gpt35_metrics['tasks_completed']))
print('autocomplete_llama34_metrics')
print(np.nanmean(autocomplete_llama34_metrics['tasks_completed']))
print('autocomplete_llama7_metrics')
print(np.nanmean(autocomplete_llama7_metrics['tasks_completed']))
print('chat_gpt35_metrics')
print(np.nanmean(chat_gpt35_metrics['tasks_completed']))
print('chat_llama34_metrics')
print(np.nanmean(chat_llama34_metrics['tasks_completed']))
print('chat_llama7_metrics')
print(np.nanmean(chat_llama7_metrics['tasks_completed']))
print('nomodel_metrics')
print(np.nanmean(nomodel_metrics['tasks_completed']))


avg_time_to_completion
autocomplete_gpt35_metrics
357.9530365646259
autocomplete_llama34_metrics
320.59544841269843
autocomplete_llama7_metrics
323.28175
chat_gpt35_metrics
430.27267499999994
chat_llama34_metrics
397.70338888888887
chat_llama7_metrics
514.9847708333334
nomodel_metrics
374.0788333333333
tasks_completed
autocomplete_gpt35_metrics
5.428571428571429
autocomplete_llama34_metrics
3.8333333333333335
autocomplete_llama7_metrics
5.0
chat_gpt35_metrics
4.75
chat_llama34_metrics
5.333333333333333
chat_llama7_metrics
2.625
nomodel_metrics
5.333333333333333
