In [1]:
import pandas as pd 
pd.set_option('display.max_columns', None)
import json
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import statsmodels.formula.api as smf 
from IPython.display import display, Markdown
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the combined study DataFrame. The export loops further down walk it
# positionally, treating each row as one programmer.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle("../../data/final_df.pkl")

# Second "_"-separated token of the model key (e.g. "chat_gpt35" -> "gpt35").
# "nomodel" has no such token, so it is passed through unchanged.
df["model_size"] = [x.split("_")[1] if x != "nomodel" else "nomodel" for x in df["model"]]

# Human-readable display name for each raw model key.
model_name_mapping = {
    'nomodel': 'No LLM',
    'chat_gpt35': 'GPT-3.5 (chat)',
    'autocomplete_gpt35': 'GPT-3.5',
    'autocomplete_llama34': 'CodeLlama34b',
    'chat_llama7': 'CodeLlama7b (chat)',
    'autocomplete_llama7': 'CodeLlama7b',
    'chat_llama34': 'CodeLlama34b (chat)',
    'chat_gpt4': 'GPT-4o (chat)',
}

df['model_name'] = df['model'].map(model_name_mapping)

# Drop the both_gpt35 condition, then renumber the rows 0..n-1.
# Without the reset, the filtered frame keeps its old index labels (with gaps),
# so an id taken from the index would not match an id taken from the row
# position. drop=True discards the old labels instead of adding a column.
df = df[df["model"] != "both_gpt35"].reset_index(drop=True)

In [None]:
# Sanity check: number of rows per raw model key after the both_gpt35 filter.
df['model'].value_counts()

# Autocomplete csv

In [4]:
# Schema of the autocomplete export, in output column order.
columns = [
    "prefix_code",
    "suffix_code",
    "suggestion",
    "logprobs",
    "accepted",
    "programmer_id",
    "timestamp",
    "model",
    "task_name",
    "requested",
]

# Zero-row frame carrying that schema; it is populated in a later cell.
autocomplete_df = pd.DataFrame(columns=columns)


In [5]:
# Flatten every programmer's suggestion log (df['suggestions_data'], one
# DataFrame per row of df) into one long table with one row per suggestion.
programmer_ids, models = [], []
prefixes, suffixes, suggestions = [], [], []
logprobs, accepts, timestamps = [], [], []
task_names, requested = [], []

for i, (model_name, curr_df) in enumerate(zip(df['model_name'], df['suggestions_data'])):
    n_rows = len(curr_df)
    if n_rows == 0:
        # Nothing logged for this row of df; it contributes no output rows.
        continue

    # Values that are constant for the whole log: the row position in df
    # (used as programmer_id) and the display name of the model.
    programmer_ids.extend([i] * n_rows)
    models.extend([model_name] * n_rows)

    # Per-suggestion fields, copied a whole column at a time instead of one
    # scalar .iloc lookup per cell.
    requested.extend(curr_df['requested'].tolist())
    task_names.extend(curr_df['task_name'].tolist())
    prefixes.extend(curr_df['prefix_code'].tolist())
    suffixes.extend(curr_df['suffix_code'].tolist())
    suggestions.extend(curr_df['suggestion'].tolist())
    logprobs.extend(curr_df['logprobs'].tolist())
    accepts.extend(curr_df['label'].tolist())
    # Raw 'times' values are exported as-is (not rebased to the first event);
    # the division by 1 only coerces them to float, as the original code did.
    timestamps.extend((curr_df['times'] / 1).tolist())

# Build the frame in one step (dict order = column order). This does not rely
# on a pre-existing empty frame, so the cell can be re-run safely.
autocomplete_df = pd.DataFrame({
    'prefix_code': prefixes,
    'suffix_code': suffixes,
    'suggestion': suggestions,
    'logprobs': logprobs,
    'accepted': accepts,
    'programmer_id': programmer_ids,
    'timestamp': timestamps,
    'model': models,
    'task_name': task_names,
    'requested': requested,
})


In [6]:
# Write the flattened autocomplete table to CSV; index=False omits the row index column.
autocomplete_df.to_csv("../../data/autocomplete_data.csv", index=False)

# Chat csv

In [7]:
# Schema of the chat export, in output column order.
columns = [
    "request",
    "response",
    "copy_events",
    "logprobs",
    "programmer_id",
    "timestamp",
    "model",
    "task_name",
]

# Zero-row frame carrying that schema; it is populated in a later cell.
chat_df = pd.DataFrame(columns=columns)


In [8]:
# Flatten every programmer's chat log (df['chat_history_data'], one DataFrame
# per row of df) into one long table with one row per chat message.
programmer_ids, models, task_names = [], [], []
requests, responses, copy_events = [], [], []
logprobs, timestamps = [], []

for i, (model_name, curr_df) in enumerate(zip(df['model_name'], df['chat_history_data'])):
    n_rows = len(curr_df)
    if n_rows == 0:
        # Nothing logged for this row of df; it contributes no output rows.
        continue

    # Values that are constant for the whole log: the row position in df
    # (used as programmer_id) and the display name of the model.
    programmer_ids.extend([i] * n_rows)
    models.extend([model_name] * n_rows)

    # Per-message fields, copied a whole column at a time instead of one
    # scalar .iloc lookup per cell.
    task_names.extend(curr_df['task_name'].tolist())
    requests.extend(curr_df['message'].tolist())
    responses.extend(curr_df['response'].tolist())
    copy_events.extend(curr_df['copy_info'].tolist())
    logprobs.extend(curr_df['logprobs'].tolist())

    # Time elapsed since this programmer's first chat message, scaled by 1/1000.
    # NOTE(review): presumably 'times' is in milliseconds, giving seconds -- confirm.
    initial_timestamp = curr_df['times'].iloc[0]
    timestamps.extend(((curr_df['times'] - initial_timestamp) / 1000).tolist())

# Build the frame in one step (dict order = column order). This does not rely
# on a pre-existing empty frame, so re-running the cell cannot hit a length
# mismatch against a stale chat_df.
chat_df = pd.DataFrame({
    'request': requests,
    'response': responses,
    'copy_events': copy_events,
    'logprobs': logprobs,
    'programmer_id': programmer_ids,
    'timestamp': timestamps,
    'model': models,
    'task_name': task_names,
})

In [None]:
# Display the assembled chat table for a visual check before exporting it.
chat_df

In [10]:
# Write the flattened chat table to CSV; index=False omits the row index column.
chat_df.to_csv("../../data/chat_data.csv", index=False)

# Task csv

In [11]:
# Load every task definition (*.json) from the tasks folder.
import json
import os

folder_path = '../../tasks_study/tasks'
all_tasks = []
# os.listdir returns names in arbitrary order; sorting keeps the task order
# (and therefore the row order of the exported CSV) identical across runs.
for file_name in sorted(os.listdir(folder_path)):
    if file_name.endswith('.json'):
        file_path = os.path.join(folder_path, file_name)
        # Explicit UTF-8 so parsing does not depend on the platform's default encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            task_data = json.load(file)
            all_tasks.append(task_data)

# Kept as a NumPy array (of the parsed JSON objects) as in the original cell.
all_tasks = np.array(all_tasks)



In [None]:
# Fields copied from each task definition, in output column order.
columns = ["name", "task_description", "function_signature", "unit_test", "solution", "type"]

# Build the table in one constructor call from a list of records.
# DataFrame.append was removed in pandas 2.0 (and copied the whole frame on
# every call), so the former row-by-row append loop no longer runs.
# A task that lacks one of the fields still raises KeyError, as before.
tasks_df = pd.DataFrame(
    [{column: task[column] for column in columns} for task in all_tasks],
    columns=columns,
)


In [13]:
# Write the task table to CSV; index=False omits the row index column.
tasks_df.to_csv("../../data/tasks_data.csv", index=False)

# Telemetry csv (sanitized per-participant study data)

In [14]:
# Take a copy of df (so df itself stays intact) and strip the columns that
# are not written to the study-data export.
# NOTE(review): 'howaiimproved' is listed twice below; the repeat is kept as-is.
santized_df = df.copy().drop(
    columns=[
        'level_0',
        'index',
        'finalcomments',
        'exp_condition',
        'date_performed',
        'completed_task',
        'TLX_total_score',
        'howaiimproved',
        'frustration',
        'performance',
        'howaiimproved',
        'worker_id',
        'temporalDemand',
        'telemetry_data',
        'mentalDemand',
        'completed_task_time',
        'entered_exit_survey',
        'effort',
        'physicalDemand',
        'howaihelpful',
        'time_completed',
        'task_index',
        'test',
        'suggestions_data',
        'chat_history_data',
        'n_long_gaps',
        'n_participants',
    ]
)




In [None]:
# Blank out the stored 'code' of every entry in task_data for the row at
# position 172, printing each value afterwards to confirm it is now empty.
# NOTE(review): why this particular row is scrubbed is not shown here -- confirm.
row_task_data = santized_df.iloc[172]['task_data']
for key in row_task_data:
    # The dict is mutated in place, so the change is visible through santized_df.
    row_task_data[key]['code'] = ""
    print(row_task_data[key]['code'])

In [16]:
def _nan_to_none(durations):
    """Return the durations as a list with every missing (NaN) entry replaced by None."""
    return [None if pd.isna(duration) else duration for duration in durations]


def _frame_to_json(history):
    """Serialize one code-history DataFrame to a JSON string."""
    return history.to_json()


# Expose the frame's index as an explicit programmer_id column.
santized_df['programmer_id'] = santized_df.index

# Make the nested columns serializable. task_data is intentionally left
# untouched here (it is not JSON-encoded).
santized_df['task_completion_durations'] = santized_df['task_completion_durations'].apply(_nan_to_none)
santized_df['code_history'] = santized_df['code_history'].apply(_frame_to_json)

In [17]:
# Export the sanitized study table as CSV (index=False omits the row index column)
santized_df.to_csv("../../data/study_data.csv", index=False)
# ...and also as a pickle of the same frame.
santized_df.to_pickle("../../data/study_data.pkl")

In [18]:
# Read the CSV export back in. This replaces the in-memory santized_df with
# the version parsed from disk.
santized_df = pd.read_csv("../../data/study_data.csv")

In [19]:
from io import StringIO

# code_history holds one JSON string per row; parse each back into a DataFrame.
# The string is wrapped in StringIO because passing literal JSON text directly
# to pd.read_json is deprecated (pandas >= 2.1).
santized_df['code_history'] = santized_df['code_history'].apply(lambda x: pd.read_json(StringIO(x)))
# task_data is not converted back here.
#santized_df['task_data'] = santized_df['task_data'].apply(json.loads)
# Each cell is the text form of a Python list; eval turns it back into a list.
# NOTE(review): eval executes whatever is in the CSV -- only use on files this
# notebook produced. ast.literal_eval is a safer choice if every cell is a
# plain literal (verify before switching).
santized_df['task_completion_durations'] = santized_df['task_completion_durations'].map(lambda x: eval(x))

