In [47]:
from utils import *
from collections import Counter
import ast

# Study data frame used throughout the notebook. `get_dataframe` (and, presumably,
# `plt`/`sns`/`pd`/`np`) come from the star-import of `utils`.
# NOTE(review): the meaning of the positional `True` flag is not visible here -
# confirm against utils.get_dataframe.
df = get_dataframe(True)

# Larger default font for every figure drawn below.
plt.rcParams['font.size'] = 20

In [None]:
# Display the column names available in the study frame.
df.columns

# Appendix D.1

In [None]:
# Mean of `sugg_accept_rate` per model size (standard-error bars, no connecting
# line), restricted to rows whose interface is 'autocomplete'.
autocomplete_only = df.query("interface == 'autocomplete'")
ax = sns.pointplot(x="sugg_accept_rate", y="model_size", data=autocomplete_only, linestyles="", errorbar="se")
ax.set_ylabel("")
ax.set_xlabel(r'Autocomplete: % Suggestion Accepted')
ax.set_xlim(0, 0.2)
# Hide the y-axis tick marks and their labels.
ax.tick_params(left=False, labelleft=False)
plt.savefig("num_sugg_accepted.pdf", format="pdf", bbox_inches="tight")

In [None]:
# Mean of `sugg_accept_rate_requested` per model size (standard-error bars, no
# connecting line), restricted to rows whose interface is 'autocomplete'.
autocomplete_only = df.query("interface == 'autocomplete'")
ax = sns.pointplot(x="sugg_accept_rate_requested", y="model_size", data=autocomplete_only, linestyles="", errorbar="se")
ax.set_ylabel("")
ax.set_xlim(0, 0.5)
ax.set_xlabel(r'Requested: % Suggestion Accepted')
#plt.yticks([0,1,2], ['GPT-3.5', 'CodeLlama-34b', 'CodeLlama-7b'])
# Hide the y-axis tick marks and their labels.
ax.tick_params(left=False, labelleft=False)
plt.savefig("num_sugg_accepted_requested.pdf", format="pdf", bbox_inches="tight")

In [None]:
# Load the chat log; later cells read its 'request', 'programmer_id' and
# 'task_name' columns.
chat_df = pd.read_csv("../data/chat_data.csv")
# Display the frame as the cell output.
chat_df

In [6]:
### chat analysis
# For every logged chat request, record the character length of its last
# message and count the lower-cased, whitespace-separated tokens it contains.

msg_lens = []   # character length of each individual message
all_words = {}  # token -> number of occurrences over all messages

df_chat = chat_df
for raw_request in df_chat['request']:
    # The 'request' cell holds a Python-literal string; the parsed value is
    # indexed as a sequence of dicts that carry a 'content' entry.
    requests = ast.literal_eval(raw_request)
    if len(requests) == 0:
        continue

    msg = requests[-1]['content']
    msg_lens.append(len(msg))

    for clean_word in map(str.lower, msg.split()):
        all_words[clean_word] = all_words.get(clean_word, 0) + 1

In [None]:
import pandas as pd
# Rank-frequency curve of the chat vocabulary: token counts sorted from the
# most to the least frequent, plotted against their rank.
all_words_values = sorted(all_words.values(), reverse=True)
x = list(range(len(all_words_values)))

plt.plot(x, all_words_values)
plt.xticks([])  # no tick marks on the rank axis
plt.ylabel("Frequency")
plt.xlabel("Words appearing in chat msgs")

In [None]:
# Histogram of the character lengths collected in `msg_lens`.
fig, ax = plt.subplots()
ax.hist(msg_lens)
ax.set_xlabel("Length of Chat Message")
ax.set_ylabel("Frequency")

In [9]:
# Count chat rows per (programmer, task).
msg_per = []       # one count per (programmer, task) pair
msg_per_task = {}  # task name -> list of per-programmer counts

for programmer in chat_df['programmer_id'].unique():
    chat_df_subset = chat_df[chat_df['programmer_id'] == programmer]

    # Number of chat rows this programmer logged under each task name.
    msg_per_chat = Counter(chat_df_subset['task_name'])

    for task, n_msgs in msg_per_chat.items():
        msg_per.append(n_msgs)
        msg_per_task.setdefault(task, []).append(n_msgs)
    
    

In [None]:
# Histogram of the per-(programmer, task) message counts in `msg_per`.
fig, ax = plt.subplots()
ax.hist(msg_per)
ax.set_xlabel("Number of messages per task")
ax.set_ylabel("Frequency")

In [11]:
# Pool the per-task message counts into task categories and build a long-format
# frame (one row per count) for plotting.
task_display_order = [
    "sum_product",
    "t_test", "table_transform_named", "table_transform_unnamed1", "table_transform_unnamed2",
    "tokenizer", "calculator", "login_authenticator", "retriever",
    "even_odd_count", "triple_sum_to_zero", "encode_message", "is_bored",
    "is_multiply_prime", "count_nums", "order_by_points", "event_scheduler",
]

# (category, number of consecutive tasks in task_display_order belonging to it)
category_sizes = (("tutorial", 1), ("data_manipulation", 4), ("edit_code", 4), ("puzzles", 8))
category_per_task = [category for category, size in category_sizes for _ in range(size)]
task_mapper = dict(zip(task_display_order, category_per_task))

# category -> every per-programmer message count of the tasks in that category
task_comb_results = {category: [] for category, _ in category_sizes}
for task, counts in msg_per_task.items():
    task_comb_results[task_mapper[task]] += counts

tasks = []
msgs = []
for category, counts in task_comb_results.items():
    tasks.extend([category] * len(counts))
    msgs.extend(counts)

df_task_level = pd.DataFrame({"task_category": tasks, "num_msg_sent": msgs})
# Only three categories are declared here; labels outside this list (i.e.
# "tutorial") are not representable in the resulting categorical column.
df_task_level["task_category"] = pd.Categorical(
    df_task_level["task_category"],
    categories=["puzzles", "data_manipulation", "edit_code"],
)

In [None]:
# Mean number of messages per task category (standard-error bars, no connecting line).
# NOTE(review): 'tutorial' is not among the categories assigned to
# `task_category` in the preceding cell, so those rows presumably no longer
# carry that label when this filter runs - confirm the filter does what is intended.
sns.pointplot(y="task_category", x="num_msg_sent", data=df_task_level.query("task_category != 'tutorial'"), linestyles="", errorbar="se")
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel("Number of messages", fontsize=18)
plt.ylabel("")
# Human-readable names, listed in the order of the categorical's categories
# (puzzles, data_manipulation, edit_code).
plt.yticks([0, 1, 2], ['Algorithmic Problems', 'Data Manipulation', 'Edit/Augment Code'])

# Appendix D.2

In [13]:
# Standardise the outcome columns separately within each `task_id` group and
# store the result in new columns prefixed with "ctl_".
outcome_cols = ["n_tasks_completed", "mean_task_duration"]
ctl_cols = ["ctl_" + col for col in outcome_cols]
for task_id in df["task_id"].unique():
    in_group = df["task_id"] == task_id
    # A fresh scaler per group, so mean/variance are computed on that group only.
    df.loc[in_group, ctl_cols] = StandardScaler().fit_transform(df.loc[in_group, outcome_cols])

In [None]:
# Standardised mean task duration per model, split by interface
# (standard-error bars, no connecting line).
ax = sns.pointplot(x="ctl_mean_task_duration", y="model", data=df, linestyles="", hue="interface", errorbar="se")
ax.set_ylabel("")
ax.set_xlabel(r'$\Delta$ in Avg Task Duration')
ax.set_xlim(-1, 1)
# NOTE(review): the labels are hard-coded and assume the order in which seaborn
# places the `model` values on the axis - confirm against the data.
ax.set_yticks([0, 1, 2, 3, 4, 5, 6, 7])
ax.set_yticklabels(['GPT-3.5-Turbo-Instruct', 'CodeLlama-34b', 'CodeLlama-7b', 'GPT-3.5-Turbo', 'GPT-4o', 'CodeLlama-34b-Instruct', 'CodeLlama-7b-Instruct', 'No LLM'])
# An empty legend hides the hue legend.
ax.legend([], [], frameon=False)

In [None]:
# Standardised number of completed tasks per model, split by interface
# (standard-error bars, no connecting line).
sns.pointplot(x="ctl_n_tasks_completed", y="model", data=df, linestyles="", hue="interface", errorbar="se")
plt.ylabel("")
# The x axis shows ctl_n_tasks_completed; the label previously repeated the
# task-duration caption of the preceding figure.
plt.xlabel(r'$\Delta$ in Number of Tasks Completed')
plt.xlim(-1, 1)
# NOTE(review): the labels are hard-coded and assume the order in which seaborn
# places the `model` values on the axis - confirm against the data.
plt.yticks([0, 1, 2,3,4,5,6,7], ['GPT-3.5-Turbo-Instruct', 'CodeLlama-34b', 'CodeLlama-7b', 'GPT-3.5-Turbo', 'GPT-4o', 'CodeLlama-34b-Instruct', 'CodeLlama-7b-Instruct', 'No LLM' ])
# An empty legend hides the hue legend.
plt.legend([],[], frameon=False)

# Appendix D.3

In [25]:
import ast  # retained from the original cell; not used by the code below

# Expand each row's `task_data` mapping into one row per task, tagged with the
# row's model, interface and model size.
list_task_level_dfs = []
for participant in df.itertuples():
    per_task = participant.task_data
    task_rows = pd.DataFrame(per_task.values())
    task_rows = task_rows.assign(
        model=participant.model,
        interface=participant.interface,
        model_size=participant.model_size,
        task_set=per_task.keys(),
    )
    list_task_level_dfs.append(task_rows)

all_task_rows = pd.concat(list_task_level_dfs, ignore_index=True)
# has_ai is False only for rows whose model is "nomodel".
all_task_rows = all_task_rows.assign(has_ai=lambda frame: frame.model != "nomodel")
# Keep tasks shorter than 30 minutes and drop the 'event_scheduler' task.
df_task_level = all_task_rows.query("time_in_task < 30*60 and name != 'event_scheduler'").reset_index()

In [26]:
# Add a display-ordered task name and a task category to the task-level frame.
task_display_order = [
    "sum_product",
    "t_test", "table_transform_named", "table_transform_unnamed1", "table_transform_unnamed2",
    "tokenizer", "calculator", "login_authenticator", "retriever",
    "even_odd_count", "triple_sum_to_zero", "encode_message", "is_bored",
    "is_multiply_prime", "count_nums", "order_by_points",
]

# Ordered categorical so plots list the tasks in task_display_order.
df_task_level["ordered_name"] = pd.Categorical(
    df_task_level["name"], categories=task_display_order, ordered=True
)

# Category labels aligned position-by-position with task_display_order.
category_per_task = ["tutorial"] + ["data_manipulation"] * 4 + ["edit_code"] * 4 + ["puzzles"] * 7
category_of_task = dict(zip(task_display_order, category_per_task))
df_task_level["task_category"] = df_task_level["name"].map(category_of_task)

In [None]:
# Mean time spent per task (standard-error bars, no connecting line), with
# separate, vertically offset markers for rows with and without an LLM
# (`has_ai`). Tutorial rows are excluded.
plt.figure(figsize=(6,20))
sns.pointplot(y="ordered_name", x="time_in_task", hue="has_ai", dodge=0.25, data=df_task_level.query("task_category != 'tutorial'"), linestyles="", errorbar="se")
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel("Avg task duration (s)", fontsize=18)
plt.ylabel("")
plt.legend(title='LLM-assisted')

# Appendix D.5

In [None]:
# Per-model mean of each TLX_* questionnaire column.
tlx_columns = [
    'TLX_frustration',
    'TLX_performance',
    'TLX_temporal_demand',
    'TLX_physical_demand',
    'TLX_effort',
    'TLX_mental_demand',
]
df_TLX = df[tlx_columns + ['model']]

df_TLX.groupby(by=["model"]).mean()

# Appendix D.6

In [None]:
# Pool every recorded task-completion duration across rows of `df`, print
# summary statistics, and draw a histogram annotated with the mean and median.
task_times = []

for item in df['task_completion_durations']:
    # drop None entries
    item = [x for x in item if x is not None]
    # NOTE(review): this test only matches the *string* 'nan' (and then skips
    # the whole list); a float NaN is not matched and is pooled, where the
    # nan-aware statistics below ignore it - confirm which form the data uses.
    if 'nan' not in item:
        task_times += item


# Calculating statistics (nan-aware variants skip NaN entries)
mean_time = np.nanmean(task_times)
std_time = np.nanstd(task_times)
min_time = np.nanmin(task_times)
max_time = np.nanmax(task_times)
median_time = np.nanmedian(task_times)

# Outputting statistics for clarity
print(f'mean time to complete task: {mean_time}')
print(f'std time to complete task: {std_time}')
print(f'min time to complete task: {min_time}')
print(f'max time to complete task: {max_time}')
print(f'median time to complete task: {median_time}')

# Plotting
fig, ax = plt.subplots(figsize=(10, 6))
# `n` holds the per-bin counts; it is reused below to place the text labels.
n, bins, patches = ax.hist(task_times, bins='auto', color='#007acc', alpha=0.7, rwidth=0.85)
ax.grid(axis='y', alpha=0.75)
ax.set_xlabel('Time to Complete a Task (seconds)')
ax.set_ylabel('Frequency')
#ax.set_title('Histogram of Task Completion Times')

# Overlaying summary statistics: dashed vertical lines at the mean and median,
# each labelled near the top of the tallest bar.
ax.axvline(mean_time, color='r', linestyle='dashed', linewidth=1)
ax.text(mean_time, max(n)*0.97, 'Mean', rotation=0, color='r')

ax.axvline(median_time, color='g', linestyle='dashed', linewidth=1)
ax.text(median_time, max(n)*0.9, 'Median', rotation=0, color='g')




In [None]:

# Mean time to complete a task as a function of its position in the session
# (position 0 is labelled 'tutorial'), one error-bar line per model.
max_index = 6          # number of task positions plotted
FIG_SIZE = [6, 4.2]    # figure size in inches

# Apply the size BEFORE anything is drawn. Previously the default was changed
# only after plotting, so it affected later figures (or a re-run of this cell)
# but not the figure produced here. The default is still updated so that later
# cells relying on it behave as before.
plt.rcParams["figure.figsize"] = FIG_SIZE
plt.figure(figsize=FIG_SIZE)

# model -> list (one entry per task position) of the durations recorded at
# that position; None entries and rows with fewer tasks are skipped.
times_by_model_index = {}
model_names = df['model_name'].unique()
for model in model_names:
    all_data_model = df[df['model_name'] == model]['task_completion_durations'].to_list()
    times_by_model_index[model] = [
        [data[pos] for data in all_data_model if pos < len(data) and data[pos] is not None]
        for pos in range(max_index)
    ]

colors = [
    (0, 0, 0, 0.8),
    (0.2, 0.4, 0.2, 0.8),
    (0.1, 0.4, 0.2, 1),
    (0.2, 0.4, 0.7, 0.8),
    (0.2, 0.4, 0.7, 1),
    (0.6, 0.2, 0.6, 0.8),
    (0.8, 0.2, 0.6, 1),
    (0.8, 0.8, 0.2, 1),
]
markers = ['o', 'x', 's', 'd', 'p', 'P', '<', '>', 'v', '^']

for model_idx, model in enumerate(times_by_model_index):
    per_position = times_by_model_index[model]
    avgs = [np.nanmean(times) for times in per_position]
    # Standard error of the mean: std / sqrt(number of recorded durations).
    stds = [np.nanstd(times) / np.sqrt(len(times)) for times in per_position]
    plt.errorbar(
        range(max_index), avgs, yerr=stds, label=model,
        # Wrap around instead of raising IndexError if there are more models
        # than predefined colours/markers.
        color=colors[model_idx % len(colors)],
        marker=markers[model_idx % len(markers)],
        alpha=0.5,
    )

ax = plt.gca()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
plt.grid()
plt.legend()
plt.ylabel('Time to Complete Task (s) ', fontsize='xx-large')
plt.xlabel('Task Index Solved', fontsize='xx-large')
x_labels = ['tutorial', '1', '2', '3', '4', '5']
plt.xticks(range(max_index), x_labels, fontsize='xx-large')
#plt.savefig('time_to_complete_task_index.pdf', bbox_inches='tight', dpi=300)
plt.show()


In [None]:
# Violin plot of the `zscore_mean_task_duration` column for each
# `model_clean_name`.
plt.figure(figsize=(14,14))
# `linestyles` and `errorbar` are pointplot arguments with no meaning for a
# violin plot; they were dropped because seaborn versions that forward extra
# keywords to matplotlib can reject them.
sns.violinplot(x="zscore_mean_task_duration", y="model_clean_name", data=df)
plt.ylabel("")
plt.xlabel(r'$\Delta$ in Avg Task Duration ($\downarrow$ better)')
# An empty legend hides any automatically generated legend.
plt.legend([],[], frameon=False)


# Edit Distance

In [39]:
import json
import pandas as pd
from tqdm import tqdm
# One row per autocomplete suggestion event (columns used below include
# 'programmer_id', 'task_name', 'timestamp', 'prefix_code', 'suffix_code',
# 'suggestion', 'accepted', 'model').
autocomplete_df = pd.read_csv("../data/autocomplete_data.csv")
# Study-level data; its 'code_history' column holds JSON text that is parsed
# in the next cell.
study_df = pd.read_csv("../data/study_data.csv")

In [None]:
# Attach to every autocomplete event the part of the programmer's code history
# (for the same task) recorded after the event, preceded by a synthetic first
# row holding the code as it stood right after the accept/reject decision.
import io  # local import: wraps the JSON text handed to pd.read_json

autocomplete_df['code_history'] = None
history_col = autocomplete_df.columns.get_loc('code_history')  # integer position for .iat

# programmer_id -> parsed code history. Each JSON blob is parsed once instead
# of once per autocomplete event.
parsed_histories = {}

for i in tqdm(range(len(autocomplete_df))):
    programmer_id = int(autocomplete_df['programmer_id'].iloc[i])
    task_name = autocomplete_df['task_name'].iloc[i]
    timestamp = autocomplete_df['timestamp'].iloc[i]

    if programmer_id not in parsed_histories:
        # NOTE(review): programmer_id is used as a positional row index into
        # study_df - confirm the two files are aligned that way.
        raw_history = study_df.iloc[programmer_id]['code_history']
        # StringIO: passing literal JSON text to read_json is deprecated.
        parsed_histories[programmer_id] = pd.read_json(io.StringIO(raw_history))
    full_history = parsed_histories[programmer_id]

    # .copy() so the dtype cast below writes to an independent frame
    # (no SettingWithCopyWarning, cached frame left untouched).
    code_history = full_history[full_history['task_name'] == task_name].copy()
    code_history['times'] = code_history['times'].astype(int)
    # keep only the snapshots recorded after the suggestion's timestamp
    code_history = code_history[code_history['times'] > timestamp]

    suffix = autocomplete_df['suffix_code'].iloc[i]
    prefix = autocomplete_df['prefix_code'].iloc[i]
    # NaN is the only value that differs from itself: treat missing text as empty
    if suffix != suffix:
        suffix = ""
    if prefix != prefix:
        prefix = ""

    suggestion = autocomplete_df['suggestion'].iloc[i]
    accepted = autocomplete_df['accepted'].iloc[i]
    code_at_shown = prefix + suggestion + suffix
    code_at_acceptance = code_at_shown if accepted else prefix + suffix

    if not accepted:
        # For a rejected suggestion, drop every snapshot up to and including
        # the first one that equals the code with the suggestion displayed.
        shown_mask = (code_history['code'] == code_at_shown).to_numpy()
        if shown_mask.any():
            first_shown = int(shown_mask.argmax())
            code_history = code_history.iloc[first_shown + 1:]

    # Synthetic first row (times == 0) followed by the remaining snapshots.
    new_row = {'code': code_at_acceptance, 'times': 0, 'time_gaps': 0, 'edit_score': None}
    code_history = pd.concat([pd.DataFrame(new_row, index=[0]), code_history], ignore_index=True)
    code_history['edit_score'] = None

    # Store the whole frame inside the single cell (row i, 'code_history').
    autocomplete_df.iat[i, history_col] = code_history

In [41]:
def check_included(row, snapshot_index=None):
    """Report whether an accepted suggestion is still present in a later code snapshot.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'suggestion', 'timestamp', 'accepted' and 'code_history'
        (a DataFrame with 'times' and 'code' columns).
    snapshot_index : int, optional
        Position of the snapshot to inspect among those recorded after
        ``row['timestamp']``; positions past the end are clamped to the last
        snapshot. When omitted, the module-level ``min_indx`` is used, which
        keeps the original calling convention working.

    Returns
    -------
    int
        1 if the suggestion text is a substring of the chosen snapshot's code,
        0 if it is not, and -1 if the suggestion was not accepted or no
        snapshot was recorded after the timestamp.
    """
    if row['accepted'] == 0:
        return -1

    code_history = row['code_history']
    # Only snapshots recorded after the suggestion's timestamp count.
    later_snapshots = code_history[code_history['times'] > row['timestamp']]
    if later_snapshots.empty:
        return -1

    if snapshot_index is None:
        # Legacy path: the caller set the global before calling.
        snapshot_index = min_indx

    position = min(snapshot_index, len(later_snapshots) - 1)
    snapshot_code = later_snapshots.iloc[position]['code']
    return 1 if row['suggestion'] in snapshot_code else 0

# Evaluate check_included on every row at three snapshot positions.
# check_included reads the module-level `min_indx`, so the loop rebinds that
# name before each pass.
# NOTE(review): the column names imply positions 0/1/3 correspond to 15/30/60
# seconds after acceptance - confirm the snapshot spacing.
for column_name, min_indx in (
    ("in_code_after_15s", 0),
    ("in_code_after_30s", 1),
    ("in_code_after_60s", 3),
):
    autocomplete_df[column_name] = autocomplete_df.apply(check_included, axis=1)


In [None]:

# Per-model mean of `in_code_after_15s`, ignoring rows flagged -1 (suggestion
# not accepted, or no later snapshot available).
was_evaluated = autocomplete_df['in_code_after_15s'] != -1
filtered_df = autocomplete_df.loc[was_evaluated]

per_model = filtered_df.groupby('model')['in_code_after_15s']
average_by_model = per_model.mean().reset_index()

# Print the result
print(average_by_model)


In [None]:

# Per-model mean of `in_code_after_30s`, ignoring rows flagged -1 (suggestion
# not accepted, or no later snapshot available).
was_evaluated = autocomplete_df['in_code_after_30s'] != -1
filtered_df = autocomplete_df.loc[was_evaluated]

per_model = filtered_df.groupby('model')['in_code_after_30s']
average_by_model = per_model.mean().reset_index()

# Print the result
print(average_by_model)



In [None]:

# Per-model mean of `in_code_after_60s`, ignoring rows flagged -1 (suggestion
# not accepted, or no later snapshot available).
was_evaluated = autocomplete_df['in_code_after_60s'] != -1
filtered_df = autocomplete_df.loc[was_evaluated]

per_model = filtered_df.groupby('model')['in_code_after_60s']
average_by_model = per_model.mean().reset_index()

# Print the result
print(average_by_model)


In [None]:
import matplotlib.pyplot as plt
import scipy.stats as stats
plt.rcParams['font.size'] = 20

plt.figure(figsize=(7, 7))

# Data
time_after_acceptance = [15, 30, 60]
models = ['CodeLlama7b', 'CodeLlama34b', 'GPT-3.5']

# Hard-coded fractions, one list per time horizon, indexed like `models`.
# NOTE(review): presumably copied from the per-model averages printed in the
# preceding cells - confirm they are still current before reusing the figure.
in_code_after_15s = [0.797468, 0.845238, 0.835777]
in_code_after_30s = [0.398734, 0.428571, 0.480938]
in_code_after_60s = [0.158228, 0.321429, 0.313783]
fractions_by_horizon = (in_code_after_15s, in_code_after_30s, in_code_after_60s)

# Plotting: one line per model, as (position in `models`, legend label) pairs
# in drawing order.
for model_pos, legend_label in ((1, 'CodeLlama34b'), (0, 'CodeLlama7b'), (2, 'GPT-3.5')):
    model_fractions = [fractions[model_pos] for fractions in fractions_by_horizon]
    plt.plot(time_after_acceptance, model_fractions, marker='o', label=legend_label)

# Adding titles and labels
plt.xlabel('Time after Acceptance (s)')
plt.ylabel('Still in Code Fraction')

# Adding a legend
plt.legend(title='Models')

# Display the plot
plt.grid(True)
plt.show()
