In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

In [16]:
# Per-question survey metadata; later cells read its Question_ID, Fact_ID,
# Method_1..Method_3 and '<method>_Success' columns.
survey_data = pd.read_csv('../survey/survey_data.csv')

# Number of questions analysed: question ids are iterated as range(n) below.
n = 5
# Metric ids parsed from the rating column names (a later cell names the
# variables built from metric 1 "fluency" and from metric 2 "plausibility").
metrics = [1, 2]

# Raw survey export; every analysis below skips the first two rows of each column.
# NOTE(review): presumably those two rows are export header rows — confirm.
survey_responses = pd.read_csv('FINAL_EXP3.csv')

In [None]:
# Display the survey metadata table for inspection.
survey_data

In [18]:
# Values of the Method_1..Method_3 columns in the first row of survey_data.
# Iterating this Series yields those values, which the cells below use as
# dictionary keys for per-method results.
method_names = survey_data.iloc[0][['Method_1', 'Method_2', 'Method_3']]

In [19]:
def get_text_results(df, n_questions=5):
    """Collect the free-text answers from a survey export.

    Columns whose name contains ``'_open_'`` are parsed by splitting on
    ``'_'``: field 2 is the question id and field 3 is the method name.
    The first two rows of every column are skipped after dropping missing
    values, matching how the rating columns are read elsewhere in this
    notebook.

    Every answer in a column is kept. Previously only the first answer was
    stored and the remaining ones were silently discarded.

    Relies on the notebook-level ``method_names`` for the method keys.

    Parameters
    ----------
    df : pandas.DataFrame
        Survey responses, one column per question/method.
    n_questions : int, default 5
        Question ids greater than or equal to this value are ignored.

    Returns
    -------
    dict
        ``{question_id: {method_name: [comment, ...]}}``. A question id is
        only present when at least one of its columns has a comment.

    Side effects
    ------------
    Prints the total number of collected comments.
    """
    all_ratings = {}
    n_ratings = 0

    for question in df.columns:
        if '_open_' not in question:
            continue

        split = question.split('_')
        qid, method = split[2], split[3]

        if int(qid) >= n_questions:
            continue

        ratings = df[question].dropna().iloc[2:].to_list()
        if not ratings:
            continue

        if qid not in all_ratings:
            all_ratings[qid] = {name: [] for name in method_names}

        # Keep every respondent's comment, not just the first one.
        all_ratings[qid][method].extend(ratings)
        n_ratings += len(ratings)

    print(n_ratings)

    return all_ratings

In [None]:
# Collect the free-text answers from the survey export ...
comments = get_text_results(survey_responses)
          
# ... and write them to disk as indented JSON.
with open('expert_comments.json', 'w', encoding='utf-8') as f:
    json.dump(comments, f, indent=4)

In [21]:
def get_results(df, success_only=True, max_ratings=5, n_questions=5):
    """Collect the numeric ratings from a survey export.

    Columns whose name contains ``'_mc_'`` are parsed by splitting on
    ``'_'``: field 2 is the question id, field 3 the method name and
    field 4 the integer metric id. The first two rows of every column are
    skipped after dropping missing values.

    Relies on the notebook-level ``metrics``, ``method_names`` and (when
    ``success_only`` is true) ``survey_data``.

    Parameters
    ----------
    df : pandas.DataFrame
        Survey responses, one column per question/method/metric.
    success_only : bool, default True
        If true, skip a question/method pair when the matching
        ``'<method>_Success'`` entry in ``survey_data`` is falsy.
    max_ratings : int or None, default 5
        Maximum number of ratings kept per question/method/metric.
        ``None`` keeps all of them. The original code tried to apply this
        cap with ``curr_metric = curr_metric[:5]``, which only rebound a
        local name and left the stored list untouched.
    n_questions : int, default 5
        Question ids greater than or equal to this value are ignored
        (their entries in ``rows`` stay empty).

    Returns
    -------
    (all_ratings, rows)
        ``rows[qid][method][metric]`` is the list of integer ratings.
        ``all_ratings[method][metric]`` is the list of per-question means,
        one entry for every question that has at least one rating.
    """
    questions = [x for x in df.columns if '_mc_' in x]
    # Preserve first-appearance order instead of using a set, so the order of
    # `rows` (and of anything exported from it) is the same on every run.
    qids = list(dict.fromkeys(x.split('_')[2] for x in questions))

    rows = {qid: {k: {m: [] for m in metrics} for k in method_names} for qid in qids}
    for question in questions:
        split = question.split('_')
        qid, method, metric = split[2], split[3], int(split[4])

        if int(qid) >= n_questions:
            continue

        if success_only:
            question_info = survey_data[survey_data['Question_ID'] == int(qid)].iloc[0]
            if not question_info[f'{method}_Success']:
                continue

        curr_metric = rows[qid][method][metric]
        ratings = df[question].dropna().iloc[2:].to_list()
        curr_metric.extend(int(rating) for rating in ratings)

        # Truncate in place so the cap is visible through `rows`.
        # (`del lst[None:]` would clear the list, hence the explicit check.)
        if max_ratings is not None:
            del curr_metric[max_ratings:]

    all_ratings = {k: {m: [] for m in metrics} for k in method_names}
    for per_method in rows.values():
        for method, per_metric in per_method.items():
            for metric in metrics:
                if per_metric[metric]:
                    all_ratings[method][metric].append(np.mean(per_metric[metric]))

    return all_ratings, rows

In [24]:
# Ratings for every counterfactual (success_only=False).
all_ratings, rows = get_results(survey_responses, False)

# means[qid][method][metric]  -> {'mean', 'std'} of the raw ratings for one question
#                                ({} when the question has no ratings).
# averaged[method][metric]    -> per-question means, collapsed below into
#                                {'average', 'std'} (left as [] when there is no data).
averaged = {name: {metric: [] for metric in metrics} for name in method_names}
means = {}
for question_number in range(n):
    qid = str(question_number)
    means[qid] = {name: {metric: {} for metric in metrics} for name in method_names}

    for method in method_names:
        for metric in metrics:
            scores = rows[qid][method][metric]
            if not scores:
                continue

            question_stats = {'mean': np.mean(scores), 'std': np.std(scores)}
            means[qid][method][metric] = question_stats
            averaged[method][metric].append(question_stats['mean'])

for method in method_names:
    for metric in metrics:
        question_means = averaged[method][metric]
        if question_means:
            averaged[method][metric] = {
                'average': np.mean(question_means),
                'std': np.std(question_means),
            }

In [None]:
# Display the per-method, per-metric summary ({'average', 'std'} over questions).
averaged

**ALL**

| Generator |   Fluency       |   Plausibility  |
|-----------|-----------------|-----------------|
| PPLM | 2.27 (0.49)     | 1.83 (0.26)     |
| Polyjuice      | 3.45 (0.91)     | **2.45** (0.66) |
| RELITC    | **3.90** (0.60) | 2.12 (0.26)     |

**SUCCESSFUL CE ONLY**

| Generator |   Fluency       |   Plausibility  |
|-----------|-----------------|-----------------|
| PPLM | 3.0 (0.0)     | 1.75 (0.0)     |
| Polyjuice      | 2.0 (0.0)     | **2.33** (0.0) |
| RELITC    | **3.62** (0.27) | 2.10 (0.27)     |

In [None]:
# Grouped bar chart: one bar per batch for every x position, plus one "Mean" bar.
# NOTE(review): `all_all_ratings` is not defined in any cell visible here, so this
# cell raises NameError on a fresh kernel unless it is created elsewhere — confirm.
# For each batch, the mean / std of every entry in that batch's ratings mapping.
all_means = [np.array([np.mean(ratings[x]) for x in ratings]) for ratings in all_all_ratings]
all_stds = [np.array([np.std(ratings[x]) for x in ratings]) for ratings in all_all_ratings]

# means_1 = np.array([np.mean(all_ratings[x]) for x in all_ratings])
# stds_1 = np.array([np.std(all_ratings[x]) for x in all_ratings])

# means_2 = np.array([np.mean(all_ratings_2[x]) for x in all_ratings_2])
# stds_2 = np.array([np.std(all_ratings_2[x]) for x in all_ratings_2])

# means_3 = np.array([np.mean(all_ratings_3[x]) for x in all_ratings_3])
# stds_3 = np.array([np.std(all_ratings_3[x]) for x in all_ratings_3])

# means_4 = np.array([np.mean(all_ratings_4[x]) for x in all_ratings_4])
# stds_4 = np.array([np.std(all_ratings_4[x]) for x in all_ratings_4])

# Element-wise average across batches. `stds` is the average of the per-batch
# standard deviations, not the standard deviation across batches.
# This rebinds `means`, which an earlier cell binds to a nested dict.
means = np.mean(all_means, axis=0)
stds = np.mean(all_stds, axis=0)

xs = np.arange(len(method_names))

plt.grid(axis='y', linestyle="--", alpha=0.5, zorder=1)

# Bar width, also used as the horizontal step between bars of one group.
offset = 1 / (len(all_means) + 2)

for i, (mean, std) in enumerate(zip(all_means, all_stds)):
    plt.bar(xs+(offset*i), mean, yerr=(std), width=offset, label=f'Batch {i+1}', capsize=5)

# plt.bar(xs, means_1, yerr=(stds_1), width=offset, label='1st Batch', capsize=5)
# plt.bar(xs+offset, means_2, yerr=(stds_2), width=offset, label='2nd Batch', capsize=5)
# plt.bar(xs+2*offset, means_3, yerr=(stds_3), width=offset, label='3rd Batch', capsize=5)
# plt.bar(xs+3*offset, means_4, yerr=(stds_4), width=offset, label='4th Batch', capsize=5)
# NOTE(review): len(means) is the number of entries per batch, not the number of
# batches, so the "Mean" bar and the tick positions below do not follow the batch
# count — confirm the placement is intended.
plt.bar(xs+offset*(len(means)+3), means, yerr=(stds), width=offset, label='Mean', capsize=5)

plt.xticks(xs + offset * (len(means) / 2 + 1), method_names)
# NOTE(review): the title and file name are hard-coded to "four"/"fourth", while
# the number of batch bars drawn is len(all_means).
plt.title('Four batches of ratings')
plt.legend()
# Save before show(): the figure is written to disk first, then displayed.
plt.savefig('fourth_batch.png')
plt.show()

In [None]:
# Element-wise average of the per-batch means (same value as `means` in the plotting cell).
np.mean(all_means, axis=0)

In [None]:
# Per-entry means of the sixth batch (index 5).
# NOTE(review): raises IndexError unless all_means holds at least six batches — confirm.
all_means[5]

In [None]:
# Print the overall mean rating per method.
# NOTE(review): get_results returns all_ratings[method] as a dict keyed by metric,
# so np.mean over it presumably fails; this cell looks written for an earlier flat
# {method: [ratings]} structure — confirm or remove.
for method in all_ratings:
    print(method, np.mean(all_ratings[method]))

In [None]:
# Display `means`. The name is bound to a nested dict by the aggregation cell and
# rebound to an array by the plotting cell, so the output depends on execution order.
means

In [None]:
# NOTE(review): `means_1` is only assigned in commented-out code in the cells visible
# here, so this raises NameError on a fresh kernel — confirm or remove.
means_1

In [49]:
# All survey batches, in order.
# NOTE(review): survey_responses_2 .. survey_responses_5 are not loaded in any cell
# visible here; they must exist in the kernel for this cell to run — confirm.
all_responses = [survey_responses, survey_responses_2, survey_responses_3, survey_responses_4, survey_responses_5]

In [50]:
# For each batch, the names of its rating columns (those containing '_mc_').
all_questions = [[x for x in responses.columns if '_mc_' in x] for responses in all_responses]

In [51]:
# Rating columns of the first two batches, selected with the same '_mc_' filter as
# `all_questions`.
questions_1 = [x for x in survey_responses.columns if '_mc_' in x]
questions_2 = [x for x in survey_responses_2.columns if '_mc_' in x]

In [52]:
# One list of integer ratings per respondent, grouped by batch. The first two rows
# of each export are skipped and respondents without any rating are left out.
all_ratings_per_person = [
    [
        respondent.dropna().astype(int).to_list()
        for _, respondent in batch.iloc[2:][rating_columns].iterrows()
        if not respondent.dropna().empty
    ]
    for batch, rating_columns in zip(all_responses, all_questions)
]

In [53]:
# Integer ratings of each respondent in the first batch (first two rows skipped,
# respondents without any rating left out).
ratings_per_person_1 = [
    respondent.dropna().astype(int).to_list()
    for _, respondent in survey_responses.iloc[2:][questions_1].iterrows()
    if not respondent.dropna().empty
]

In [54]:
# Integer ratings of each respondent in the second batch (first two rows skipped,
# respondents without any rating left out).
ratings_per_person_2 = [
    respondent.dropna().astype(int).to_list()
    for _, respondent in survey_responses_2.iloc[2:][questions_2].iterrows()
    if not respondent.dropna().empty
]

In [55]:
# Mean and standard deviation of every respondent's ratings, grouped by batch.
means_pp = [list(map(np.mean, batch)) for batch in all_ratings_per_person]
stds_pp = [list(map(np.std, batch)) for batch in all_ratings_per_person]

In [56]:
# Per-respondent mean and standard deviation for the first two batches.
means_pp_1 = list(map(np.mean, ratings_per_person_1))
means_pp_2 = list(map(np.mean, ratings_per_person_2))
stds_pp_1 = list(map(np.std, ratings_per_person_1))
stds_pp_2 = list(map(np.std, ratings_per_person_2))

In [None]:
# Scatter plot of every respondent's mean rating (coloured per batch) and the
# standard deviation of their ratings (black), one x position per batch.
plt.grid(axis='y', linestyle="--", alpha=0.5, zorder=1)

n_batches = len(means_pp)

for i in range(n_batches):
    num_responses = len(means_pp[i])
    plt.scatter([i+1]*num_responses, means_pp[i])
    # Label only the first std series: labelling each one repeats "Std" in the
    # legend once per batch. Labels starting with '_' are skipped by plt.legend().
    plt.scatter([i+1]*num_responses, stds_pp[i], color='black',
                label='Std' if i == 0 else '_nolegend_')

ticks = [f'Batch {i}' for i in range(1, n_batches + 1)]
xs = np.arange(len(ticks))
plt.xticks(xs+1, ticks)

plt.legend()
plt.ylim(-0.2, 5.2)
# One unit of margin on each side, whatever the number of batches
# (equals the previous hard-coded (0, 6) for five batches).
plt.xlim(0, n_batches + 1)
plt.title('Ratings per respondent (mean, std)')
plt.savefig('respondents_ratings.png')

In [None]:
# Average per-respondent standard deviation for the first and second batch.
np.mean(stds_pp_1), np.mean(stds_pp_2)

In [None]:
# Average per-respondent mean rating for the first and second batch.
np.mean(means_pp_1), np.mean(means_pp_2)

In [None]:
# Lookup table from survey question id to Fact_ID.
id_to_fomc = survey_data[['Question_ID', 'Fact_ID']].drop_duplicates()

# One entry per question present in `rows`, in the dict's own order.
indexes = rows.keys()
fomc_indexes = [
    int(id_to_fomc.loc[id_to_fomc['Question_ID'] == int(qid), 'Fact_ID'].iloc[0])
    for qid in indexes
]

# Mean rating per question for each generator; metric 1 feeds the *_fluency lists
# and metric 2 the *_plausibility lists.
polyjuice_fluency, pplm_fluency, relitc_fluency = (
    [np.mean(rows[qid][generator][1]) for qid in indexes]
    for generator in ('Polyjuice', 'PPLM', 'RELITC')
)

polyjuice_plausibility, pplm_plausibility, relitc_plausibility = (
    [np.mean(rows[qid][generator][2]) for qid in indexes]
    for generator in ('Polyjuice', 'PPLM', 'RELITC')
)

In [19]:
# Assemble the per-question means into one table: question id, Fact_ID, then one
# column per generator and metric.
df = pd.DataFrame(dict(
    q_id=indexes,
    id=fomc_indexes,
    polyjuice_fluency=polyjuice_fluency,
    pplm_fluency=pplm_fluency,
    relitc_fluency=relitc_fluency,
    polyjuice_plausibility=polyjuice_plausibility,
    pplm_plausibility=pplm_plausibility,
    relitc_plausibility=relitc_plausibility,
))

In [20]:
# Write the per-question means to disk (to_csv also writes the DataFrame index by default).
df.to_csv('expert_mean_results.csv')