In [1]:
import pandas as pd

# Directory holding the CSV inputs, relative to this notebook. Change it here
# (once) if the data lives somewhere else.
DATA_DIR = '../data'

# Structure label per conversation (merged on `conversation_id` below).
df_structs = pd.read_csv(f'{DATA_DIR}/conversational_structures.csv')
# Task submissions of the experiment and control groups.
experiment_submissions = pd.read_csv(f'{DATA_DIR}/experiment_submissions.csv')
control_submissions = pd.read_csv(f'{DATA_DIR}/control_submissions.csv')
# Experience questionnaire answers (joined on `user_id` below).
df_exp = pd.read_csv(f'{DATA_DIR}/experience_questionnaire_experiment.csv')


## Table 4: Conversational structures and task performance

In [2]:
# Label every experiment submission with the structure of its conversation.
merged_df = df_structs.merge(experiment_submissions, how='inner', on='conversation_id')
merged_df["user_id"] = merged_df["user_id"].astype(int)

# Attach the questionnaire answers per user, then keep only the first row
# for each conversation_id.
exp_df = (
    df_exp
    .merge(merged_df, how='inner', on='user_id')
    .drop_duplicates('conversation_id')
)

# Correctness per structure, computed over all rows of merged_df.
correctness_by_structure = merged_df.groupby('structure')['percentage_correct']
means = correctness_by_structure.mean()
counts = correctness_by_structure.count()
result_df = pd.DataFrame({
    'Scenario': means.index,
    'Mean Percentage Correct': means.values,
    'Value Count': counts.values,
})

# Mean `exp_compared_peers` per structure, computed over exp_df (only rows
# that matched a questionnaire entry). The count is taken on
# `percentage_correct`, i.e. it counts submissions, not questionnaire answers.
questionnaire_by_structure = exp_df.groupby('structure')
exp = questionnaire_by_structure['exp_compared_peers'].mean()
counts = questionnaire_by_structure['percentage_correct'].count()
result_df2 = pd.DataFrame({
    'Scenario': exp.index,
    'Mean Exp Compared Peers': exp.values,
    'Value Count': counts.values,
})

# Both frames carry a 'Value Count' column, so the merge suffixes them with
# _x (all submissions) and _y (questionnaire-matched submissions).
scenario_df = result_df.merge(result_df2, left_on='Scenario', right_on='Scenario', how='inner')
scenario_df

Unnamed: 0,Scenario,Mean Percentage Correct,Value Count_x,Mean Exp Compared Peers,Value Count_y
0,S1,0.442424,22,1.166667,18
1,S10,0.357407,9,2.125,8
2,S11,0.658333,4,1.5,4
3,S12,0.566667,2,2.5,2
4,S13,0.471212,22,1.333333,21
5,S2a,0.455128,13,1.846154,13
6,S2b,0.805556,3,1.0,3
7,S2c,0.8,1,2.0,1
8,S3,0.804762,7,2.0,7
9,S4,0.216667,9,1.125,8


## Table 3: Task performance between experiment and control

In [3]:
# Mean correctness and number of submissions per task:
# control group first, then experiment group.
for submissions in (control_submissions, experiment_submissions):
    print(submissions.groupby("task")["percentage_correct"].agg(['mean', 'count']))


          mean  count
task                 
1     0.373684     19
2     0.100000      5
3     0.406250      8
4     0.850000      4
          mean  count
task                 
1     0.517778     45
2     0.263889     24
3     0.463542     16
4     0.683333     18


## Table 5: Task performance by prompt purposes

In [4]:
# Prompt-purpose columns of `experiment_submissions`; the cells below compare
# rows where a column equals 1 against rows where it equals 0.
p = [
    'Code generation',
    'Code refinement',
    'Concept comprehension',
    'Code comprehension',
    'Docs querying',
    'Bug identification',
    'Testing',
]




# Mean correctness and submission count per task for the experiment group.
# NOTE(review): not referenced by any of the cells visible here.
mean_by_task = (
    experiment_submissions
    .groupby("task")["percentage_correct"]
    .agg(['mean', 'count'])
)


def get_experience(user_ids, experience_df=None):
    """Return the mean ``exp_compared_peers`` value of the given users.

    Parameters
    ----------
    user_ids : iterable
        User identifiers; each one is coerced with ``int()`` before matching.
    experience_df : pandas.DataFrame, optional
        Frame with ``user_id`` and ``exp_compared_peers`` columns. Defaults to
        the notebook-level ``df_exp``.

    Returns
    -------
    float
        Mean of ``exp_compared_peers`` over the rows whose ``user_id`` is in
        ``user_ids``; NaN when no row matches.
    """
    if experience_df is None:
        experience_df = df_exp
    wanted_ids = [int(user_id) for user_id in user_ids]
    # Vectorised membership test instead of a Python-level loop over iterrows().
    is_selected = experience_df["user_id"].isin(wanted_ids)
    return experience_df.loc[is_selected, "exp_compared_peers"].mean()

def get_experience_by_task(data):
    """Return the mean experience of the users in ``data`` for tasks 1 to 4.

    ``data`` needs ``task`` and ``user_id`` columns. The result is always a
    list of four values (one per task, in task order), each rounded to two
    decimals.
    """
    def mean_experience(task):
        # Distinct users that submitted something for this task.
        users = data[data["task"] == task]["user_id"].unique()
        return round(get_experience(users), 2)

    return [mean_experience(task) for task in (1, 2, 3, 4)]

def get_line_by_purpose(purp):
    """Build the all-tasks summary row (task ``"*"``) for one prompt purpose.

    Compares the experiment submissions where column ``purp`` equals 1 with
    those where it equals 0 and returns a one-row DataFrame holding, for each
    side, the mean correctness in percent, the number of submissions and the
    mean experience of the users involved, plus the difference formatted as
    ``"<diff> (<relative diff>%)"``.
    """
    def summarise(flag):
        # Mean correctness (percent), row count and mean user experience for
        # the submissions whose `purp` column equals `flag`.
        rows = experiment_submissions[experiment_submissions[purp] == flag]
        mean_percent = round(rows["percentage_correct"].mean() * 100, 2)
        experience = round(get_experience(list(rows["user_id"].unique())), 2)
        return mean_percent, len(rows), experience

    m_with, len_with, exp = summarise(1)
    m_without, len_without, exp_without = summarise(0)

    # NOTE(review): the difference is taken from the already-rounded means,
    # whereas the per-task rows built in the loop below subtract first and
    # round afterwards.
    diff = round(m_with - m_without, 2)
    diff_perc = round(diff / m_without * 100, 1)

    row = {
        'purpose': purp,
        "task": "*",
        'mean': m_with,
        'count': len_with,
        "exp": exp,
        'mean-without': m_without,
        'count-without': len_without,
        "exp-without": exp_without,
        'diff': f"{diff} ({diff_perc}%)",
    }
    return pd.DataFrame([row])


# Column order of every section of the final table.
table_columns = ['purpose', "task", 'mean', 'count', 'exp', 'mean-without', 'count-without', 'exp-without', 'diff']

final_table = []
for purp in p:
    # All-tasks summary row for this purpose.
    summary_row = get_line_by_purpose(purp)

    with_rows = experiment_submissions[experiment_submissions[purp] == 1]
    without_rows = experiment_submissions[experiment_submissions[purp] == 0]

    # Per-task mean/count for submissions with the purpose; the "without"
    # statistics are aligned onto the same task index when assigned.
    per_task = with_rows.groupby("task")["percentage_correct"].agg(['mean', 'count'])
    without_stats = without_rows.groupby("task")["percentage_correct"].agg(['mean', 'count'])

    per_task["purpose"] = purp
    per_task["mean"] = per_task["mean"].mul(100)
    per_task["mean-without"] = without_stats["mean"].mul(100)
    per_task["count-without"] = without_stats["count"]

    # Differences are computed on the unrounded percentages; the displayed
    # means are rounded only afterwards.
    deltas = (per_task["mean"] - per_task["mean-without"]).round(2).tolist()
    delta_percents = (deltas / per_task["mean-without"] * 100).round(1).tolist()

    per_task["mean"] = per_task["mean"].round(2)
    per_task["mean-without"] = per_task["mean-without"].round(2)
    per_task["diff"] = [f"{delta} ({percent}%)" for delta, percent in zip(deltas, delta_percents)]

    # NOTE(review): get_experience_by_task always returns four values
    # (tasks 1-4), so these assignments assume per_task has exactly those
    # four task rows.
    per_task["exp"] = get_experience_by_task(with_rows)
    per_task["exp-without"] = get_experience_by_task(without_rows)

    per_task = per_task.reset_index()[table_columns]
    final_table.append(pd.concat([summary_row, per_task]))

final = pd.concat(final_table)
# with open("tables/correctness_by_purpose_table.tex", "w") as f:
#     f.write(final.to_latex(index=False))

final

Unnamed: 0,purpose,task,mean,count,exp,mean-without,count-without,exp-without,diff
0,Code generation,*,50.42,83,1.5,37.5,20,1.07,12.92 (34.5%)
0,Code generation,1,53.85,39,1.54,38.33,6,1.0,15.51 (40.5%)
1,Code generation,2,28.33,15,2.0,23.15,9,1.0,5.19 (22.4%)
2,Code generation,3,46.43,14,1.31,45.83,2,1.5,0.6 (1.3%)
3,Code generation,4,67.33,15,1.4,73.33,3,1.0,-6.0 (-8.2%)
0,Code refinement,*,54.26,27,1.72,45.66,76,1.41,8.6 (18.8%)
0,Code refinement,1,72.73,11,1.7,45.0,34,1.39,27.73 (61.6%)
1,Code refinement,2,25.0,5,2.0,26.75,19,1.56,-1.75 (-6.5%)
2,Code refinement,3,25.0,4,1.75,53.47,12,1.18,-28.47 (-53.2%)
3,Code refinement,4,62.86,7,1.71,71.82,11,1.1,-8.96 (-12.5%)
