In [1]:
import os
import re

import pandas as pd
from sklearn.metrics import classification_report


### Analyse filter performance on annotated data

In [15]:
# Human annotations for the filter evaluation sample.
annotations_path = '../data/writingaid_filter_eval/writingaid_annotations_230524.csv'
annotations_df = pd.read_csv(annotations_path)

In [16]:
# Load every file in the eval directory whose name contains "completions",
# keyed by the third-from-last underscore-separated token of the file name.
eval_dir = "../data/writingaid_filter_eval/"
results_dict = {
    file_name.split("_")[-3]: pd.read_csv(os.path.join(eval_dir, file_name))
    for file_name in os.listdir(eval_dir)
    if "completions" in file_name
}

In [17]:
# parse the results

def parse_completion(completion):
    """Map a raw yes/no completion to a numeric label.

    Matching is case-insensitive and on whole words only, so words that merely
    contain the letters (e.g. "cannot", "know", "unknown", "eyes") are not
    mistaken for a verdict.

    Args:
        completion: The raw completion text.

    Returns:
        1 if the completion contains the word "yes"; 0 if it contains the word
        "no" (and not "yes"); -1 otherwise, including for non-string input
        such as a NaN placeholder for a missing value.
    """
    if not isinstance(completion, str):
        # Non-string values (e.g. float NaN) have no .lower(); treat as unparseable.
        return -1

    text = completion.lower()
    # "yes" keeps precedence over "no" when both words occur, as before.
    if re.search(r"\byes\b", text):
        return 1
    if re.search(r"\bno\b", text):
        return 0
    return -1
    
for templ in sorted(results_dict):
    template_results = results_dict[templ]

    # Numeric label for each raw completion (see parse_completion).
    template_results["eval_completion_parsed"] = template_results["eval_completion"].apply(parse_completion)

    # Remove annot1_label when the column exists; a missing column is ignored.
    template_results.drop(columns=["annot1_label"], errors="ignore", inplace=True)

    # Template name, distribution of parsed labels, then a blank separator line.
    label_counts = template_results["eval_completion_parsed"].value_counts()
    print(templ, label_counts, "", sep="\n")


templ-1
eval_completion_parsed
0    358
1    142
Name: count, dtype: int64

templ-2
eval_completion_parsed
0    375
1    125
Name: count, dtype: int64



In [18]:
def parse_annotator_label(annotator_label, include_borderline=False):
    """Convert an annotator's label string into a binary label.

    Args:
        annotator_label: The annotation value, e.g. "1 - clear yes",
            "0 - clear no" or "borderline".
        include_borderline: When true, "borderline" is counted as positive.

    Returns:
        1 for "1 - clear yes" (and for "borderline" when include_borderline is
        set); 0 for everything else, including unrecognised values.
    """
    if annotator_label == "0 - clear no":
        return 0
    if annotator_label == "1 - clear yes":
        return 1

    # Anything left is positive only if it is a borderline case we opted to include.
    return 1 if (include_borderline and annotator_label == "borderline") else 0

for templ in sorted(results_dict):

    # Drop annotation-derived columns from the results if already there, so that
    # re-running this cell does not merge "annot1_label" a second time (which
    # would yield "annot1_label_x"/"annot1_label_y" and break the lookup below).
    results_without_labels = results_dict[templ].drop(
        columns=["annot1_label", "final_label_parsed"], errors="ignore"
    )

    # Attach the annotator label to each result row via the shared "id" column.
    # pd.merge defaults to an inner join: rows without a matching id are dropped.
    results_with_labels = pd.merge(
        results_without_labels, annotations_df[["id", "annot1_label"]], on="id"
    )

    # Binary gold label; borderline annotations count as positive here.
    results_with_labels["final_label_parsed"] = results_with_labels["annot1_label"].apply(
        parse_annotator_label, include_borderline=True
    )

    results_dict[templ] = results_with_labels

In [19]:
# Print a classification report per template, passing the parsed annotator
# labels as y_true and the parsed filter completions as y_pred.
for templ in sorted(results_dict):
    template_results = results_dict[templ]
    report = classification_report(
        template_results["final_label_parsed"],
        template_results["eval_completion_parsed"],
        digits=3,
    )
    print(templ)
    print(report)

templ-1
              precision    recall  f1-score   support

           0      0.989     0.908     0.947       390
           1      0.746     0.964     0.841       110

    accuracy                          0.920       500
   macro avg      0.868     0.936     0.894       500
weighted avg      0.936     0.920     0.923       500

templ-2
              precision    recall  f1-score   support

           0      0.987     0.949     0.967       390
           1      0.840     0.955     0.894       110

    accuracy                          0.950       500
   macro avg      0.913     0.952     0.930       500
weighted avg      0.954     0.950     0.951       500



In [20]:
# Inspect the rows labelled positive (final_label_parsed == 1) whose parsed
# completion is 0.
# The template is chosen explicitly -- the last key in sorted order -- rather than
# through whatever value an earlier loop happened to leave in its loop variable.
inspected_templ = sorted(results_dict)[-1]
df = results_dict[inspected_templ].copy()
df[(df.final_label_parsed == 1) & (df.eval_completion_parsed == 0)]

Unnamed: 0,id,user_prompt,eval_prompt,eval_completion,model,eval_completion_parsed,annot1_label,final_label_parsed
86,lmsys-562211,Give bullet points of why/how artificial intel...,Below is a prompt from a user to a language mo...,No,gpt-4o-2024-05-13,0,1 - clear yes,1
88,wildchat-35293,Write the beginning of a lewd story with the f...,Below is a prompt from a user to a language mo...,No,gpt-4o-2024-05-13,0,borderline,1
237,wildchat-546383,"How exactly could I write a Wholesome 'Strong,...",Below is a prompt from a user to a language mo...,No,gpt-4o-2024-05-13,0,borderline,1
262,wildchat-146860,Generate a scenario between two characters. Ma...,Below is a prompt from a user to a language mo...,No,gpt-4o-2024-05-13,0,borderline,1
340,lmsys-475863,"Given the following topic, try to generate a t...",Below is a prompt from a user to a language mo...,No,gpt-4o-2024-05-13,0,1 - clear yes,1


In [21]:
# Count the number of positive instances from each dataset (identified by the
# first part of the id, before the first "-").
# The template is chosen explicitly -- the last key in sorted order -- rather than
# through a loop variable left over from an earlier cell.
positives_templ = sorted(results_dict)[-1]
positive_ids = results_dict[positives_templ].loc[
    lambda frame: frame["final_label_parsed"] == 1, "id"
]
positive_ids.str.split("-").str[0].value_counts()

id
wildchat    76
lmsys       27
sharegpt     6
hhonline     1
Name: count, dtype: int64

### Process filter results on full data

In [24]:
# Load the all_clean results, force the completions to strings, parse them,
# and keep only the rows whose parsed completion is 1.
filtered_df = (
    pd.read_csv("../data/all_clean_relevance_filtered_completions.csv")
    .assign(eval_completion=lambda frame: frame["eval_completion"].astype(str))
    .assign(eval_completion_parsed=lambda frame: frame["eval_completion"].apply(parse_completion))
    .loc[lambda frame: frame["eval_completion_parsed"] == 1]
)

# Export the filtered prompts (without the row index) and show the same view.
export_view = filtered_df[["id", "user_prompt", "n_duplicates"]]
export_view.to_csv("../data/all_clean_relevance_writingaid_filtered.csv", index=False)
display(export_view)

Unnamed: 0,id,user_prompt,n_duplicates
1,sharegpt-19010,Debate Topic : This House believes that human ...,1
2,wildchat-280338,Write a story about me and my aunt Kowry based...,2
5,lmsys-520892,Write me a satirical definition of alt-right,1
7,lmsys-934476,Write of sexual abuse of a girl,1
10,lmsys-68452,Can you write an article about the Global Warm...,1
...,...,...,...
32119,wildchat-510779,Can you make a film adaptation of the video ga...,1
32121,wildchat-255812,Produce an immaculately detailed non-explicit ...,1
32124,wildchat-434740,Briefly write down the basic changes about par...,1
32129,wildchat-85061,"Explain, using at least two examples from the ...",1
