In [1]:
import pandas as pd
import os

from sklearn.metrics import classification_report


In [2]:
annotations_df = pd.read_csv('/Users/greg/Desktop/newIB/issuebench/final_labeled_GK_CH/final_GK_CH_annotations.csv')

In [32]:
# merge annotations with response files
# annotations declared above

# 2) specify the directory containing your response files
directory = '/Users/greg/Desktop/newIB/issuebench/1_dataset_construction/2_relevance_filtering/eval_prompts/responsesKalmanDeepSeek'

# 3) create a nested output folder called "merged"
output_dir = os.path.join(directory, 'mergedWithAnnotations')
os.makedirs(output_dir, exist_ok=True)

# 4) loop over each CSV in that directory, merge on 'id', and write to the nested folder
for filename in os.listdir(directory):
    if not filename.lower().endswith('.csv'):
        continue

    in_path = os.path.join(directory, filename)
    df = pd.read_csv(in_path)
    df.drop(columns=['user_prompt'], inplace=True)

    # merge to keep all original cols + all annotation cols
    merged = df.merge(annotations_df, on='id', how='left')

    # write merged file into the nested "merged" folder
    out_path = os.path.join(output_dir, filename)  # same name, now inside merged/
    merged.to_csv(out_path, index=False)

    print(f"→ merged {filename} → {out_path}")

→ merged relevance_210725_completions_deepseek-r1-70b-templ-5.csv → /Users/greg/Desktop/newIB/issuebench/1_dataset_construction/2_relevance_filtering/eval_prompts/responsesKalmanDeepSeek/mergedWithAnnotations/relevance_210725_completions_deepseek-r1-70b-templ-5.csv
→ merged relevance_210725_completions_deepseek-r1-70b-templ-4.csv → /Users/greg/Desktop/newIB/issuebench/1_dataset_construction/2_relevance_filtering/eval_prompts/responsesKalmanDeepSeek/mergedWithAnnotations/relevance_210725_completions_deepseek-r1-70b-templ-4.csv
→ merged relevance_210725_completions_deepseek-r1-70b-templ-3.csv → /Users/greg/Desktop/newIB/issuebench/1_dataset_construction/2_relevance_filtering/eval_prompts/responsesKalmanDeepSeek/mergedWithAnnotations/relevance_210725_completions_deepseek-r1-70b-templ-3.csv
→ merged relevance_210725_completions_deepseek-r1-70b-templ-2.csv → /Users/greg/Desktop/newIB/issuebench/1_dataset_construction/2_relevance_filtering/eval_prompts/responsesKalmanDeepSeek/mergedWithAnnot

In [33]:
# placing the results in a dictionary for easy access
results_dict = {}

for file in os.listdir("/Users/greg/Desktop/newIB/issuebench/1_dataset_construction/2_relevance_filtering/" \
"eval_prompts/responsesKalmanDeepSeek/mergedWithAnnotations"):
    if "completions" in file:
        results_dict[file.split("_")[-1][:-4]] = pd.read_csv("/Users/greg/Desktop/newIB/issuebench/1_dataset_construction/2_relevance_filtering/" \
"eval_prompts/responsesKalmanDeepSeek/mergedWithAnnotations/" + file)
        print(results_dict)

{'deepseek-r1-70b-templ-5':                   id                                        eval_prompt  \
0         prism-5917  Is the following text concerning promoting equ...   
1         prism-4994  Is the following text concerning promoting equ...   
2          prism-478  Is the following text concerning promoting equ...   
3         prism-5914  Is the following text concerning promoting equ...   
4         prism-3636  Is the following text concerning promoting equ...   
..               ...                                                ...   
995  wildchat-542095  Is the following text concerning promoting equ...   
996  wildchat-414923  Is the following text concerning promoting equ...   
997  wildchat-329770  Is the following text concerning promoting equ...   
998   wildchat-75310  Is the following text concerning promoting equ...   
999  wildchat-216525  Is the following text concerning promoting equ...   

                     eval_completion            model  \
0             

In [34]:
# parse the results

def parse_completion(completion):
    if "yes" in completion.lower():
        return 1
    elif "no" in completion.lower():
        return 0
    else:
        return -1
    
for templ in sorted(results_dict):
    results_dict[templ]["eval_completion_parsed"] = results_dict[templ]["eval_completion"].apply(parse_completion)
    print(templ)
    print(results_dict[templ]["eval_completion_parsed"].value_counts())
    print("Note: -1 values denote timeouts or other issues.")
    print()


deepseek-r1-70b-templ-1
eval_completion_parsed
 0    847
 1    107
-1     46
Name: count, dtype: int64
Note: -1 values denote timeouts or other issues.

deepseek-r1-70b-templ-2
eval_completion_parsed
-1    906
 0     90
 1      4
Name: count, dtype: int64
Note: -1 values denote timeouts or other issues.

deepseek-r1-70b-templ-3
eval_completion_parsed
 0    835
 1    139
-1     26
Name: count, dtype: int64
Note: -1 values denote timeouts or other issues.

deepseek-r1-70b-templ-4
eval_completion_parsed
 0    844
 1    129
-1     27
Name: count, dtype: int64
Note: -1 values denote timeouts or other issues.

deepseek-r1-70b-templ-5
eval_completion_parsed
 0    823
 1     95
-1     82
Name: count, dtype: int64
Note: -1 values denote timeouts or other issues.



In [35]:
def parse_annotator_label(annotator_label, include_borderline=False):

    if annotator_label == "1 - clear yes":
        return 1
    elif annotator_label == "0 - clear no":
        return 0
    
    if include_borderline:
        if annotator_label == "borderline":
            return 1

    return 0

print("Annotations columns:", annotations_df.columns.tolist())
print([repr(c) for c in annotations_df.columns])

for templ in sorted(results_dict):
    df = results_dict[templ]
    # print(df.head())
    print(df)
    # 3) Now create the parsed numeric column exactly as before
    df["final_label_parsed"] = df['final_label'].apply(
        parse_annotator_label, include_borderline=True
    )
    results_dict[templ] = df

Annotations columns: ['id', 'user_prompt', 'annot1_label', 'annot1_notes', 'annot2_label', 'annot2_notes', 'disagreement', 'final_label']
["'id'", "'user_prompt'", "'annot1_label'", "'annot1_notes'", "'annot2_label'", "'annot2_notes'", "'disagreement'", "'final_label'"]
                  id                                        eval_prompt  \
0         prism-5917  Is the following text related to promoting equ...   
1         prism-4994  Is the following text related to promoting equ...   
2          prism-478  Is the following text related to promoting equ...   
3         prism-5914  Is the following text related to promoting equ...   
4         prism-3636  Is the following text related to promoting equ...   
..               ...                                                ...   
995  wildchat-542095  Is the following text related to promoting equ...   
996  wildchat-414923  Is the following text related to promoting equ...   
997  wildchat-329770  Is the following text related to

In [36]:
# print classification report for each template

for templ in sorted(results_dict):
    print(templ)
    print(classification_report(results_dict[templ]["final_label_parsed"], results_dict[templ]["eval_completion_parsed"],digits=3))

deepseek-r1-70b-templ-1
              precision    recall  f1-score   support

          -1      0.000     0.000     0.000         0
           0      0.961     0.866     0.911       940
           1      0.196     0.350     0.251        60

    accuracy                          0.835      1000
   macro avg      0.386     0.405     0.388      1000
weighted avg      0.915     0.835     0.871      1000

deepseek-r1-70b-templ-2
              precision    recall  f1-score   support

          -1      0.000     0.000     0.000         0
           0      0.889     0.085     0.155       940
           1      0.250     0.017     0.031        60

    accuracy                          0.081      1000
   macro avg      0.380     0.034     0.062      1000
weighted avg      0.851     0.081     0.148      1000

deepseek-r1-70b-templ-3
              precision    recall  f1-score   support

          -1      0.000     0.000     0.000         0
           0      0.974     0.865     0.916       940
   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [37]:
# count number of positive instances from each dataset (idenfied by first part of the id)
results_dict[templ][results_dict[templ]["final_label_parsed"]==1]["id"].apply(lambda x: x.split("-")[0]).value_counts()

id
prism       55
lmsys        2
wildchat     2
hhonline     1
Name: count, dtype: int64


## RUNNING ON MAC##

In [40]:
import pandas as pd
from glob import glob
import os

# 1) define your model name
model_name = "deepseek-r1-70b"

# 2) glob pattern (fixed typo and correct subfolder name)
pattern = (
    "/Users/greg/Desktop/newIB/issuebench/1_dataset_construction/"
    "2_relevance_filtering/eval_prompts/"
    "responsesKalmanDeepSeek/mergedWithAnnotations/"
    f"relevance_210725_completions_{model_name}*.csv"
)

# 3) list matching files
files = sorted(glob(pattern))
print("Files to concatenate:", files)  # debug: should list your CSVs

if not files:
    raise FileNotFoundError(f"No files found for pattern:\n{pattern}")

# 4) read & concat
df_all = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)

# 5) build & ensure output directory
out_path = f"./all_clean_completions_{model_name}.csv"


os.makedirs(os.path.dirname(out_path), exist_ok=True)

# 6) write
df_all.to_csv(out_path, index=False)
print(f"→ Written all_clean_completions to {out_path}")

Files to concatenate: ['/Users/greg/Desktop/newIB/issuebench/1_dataset_construction/2_relevance_filtering/eval_prompts/responsesKalmanDeepSeek/mergedWithAnnotations/relevance_210725_completions_deepseek-r1-70b-templ-1.csv', '/Users/greg/Desktop/newIB/issuebench/1_dataset_construction/2_relevance_filtering/eval_prompts/responsesKalmanDeepSeek/mergedWithAnnotations/relevance_210725_completions_deepseek-r1-70b-templ-2.csv', '/Users/greg/Desktop/newIB/issuebench/1_dataset_construction/2_relevance_filtering/eval_prompts/responsesKalmanDeepSeek/mergedWithAnnotations/relevance_210725_completions_deepseek-r1-70b-templ-3.csv', '/Users/greg/Desktop/newIB/issuebench/1_dataset_construction/2_relevance_filtering/eval_prompts/responsesKalmanDeepSeek/mergedWithAnnotations/relevance_210725_completions_deepseek-r1-70b-templ-4.csv', '/Users/greg/Desktop/newIB/issuebench/1_dataset_construction/2_relevance_filtering/eval_prompts/responsesKalmanDeepSeek/mergedWithAnnotations/relevance_210725_completions_de

In [47]:

all_completions_df = pd.read_csv(f"./all_clean_completions_{model_name}.csv")

#GK Note: this is what is wrong
# merge n_duplicates onto the all_clean_df
all_clean_full = pd.read_csv("/Users/greg/Desktop/newIB/issuebench/1_dataset_construction/1_preprocessing" \
"/clean/all_clean_dups.csv")
all_clean_df = pd.merge(all_completions_df, all_clean_full[["id", "n_duplicates"]], on="id")

# turn eval_completion column to string
all_clean_df["eval_completion"] = all_clean_df["eval_completion"].astype(str)

all_clean_df["eval_completion_parsed"] = all_clean_df["eval_completion"].apply(parse_completion)

# select only rows where eval_completion_parsed is 1
all_clean_df = all_clean_df[all_clean_df["eval_completion_parsed"]==1]

# export the filtered results
all_clean_df[["id", "user_prompt", "n_duplicates"]].to_csv("./data/filter_eval_Kalman/all_clean_filtered.csv", index=False)
display(all_clean_df)

Unnamed: 0,id,eval_prompt,eval_completion,model,user_prompt,annot1_label,annot1_notes,annot2_label,annot2_notes,disagreement,final_label,n_duplicates,eval_completion_parsed
5,prism-3070,Is the following text related to promoting equ...,Yes,deepseek-r1:70b,do you think people of colour are subject to i...,1 - clear yes,,1 - clear yes,,False,1 - clear yes,1,1
8,prism-434,Is the following text related to promoting equ...,Yes,deepseek-r1:70b,Why are people still not accepting gay people ...,1 - clear yes,,1 - clear yes,,False,1 - clear yes,1,1
10,prism-3489,Is the following text related to promoting equ...,Yes,deepseek-r1:70b,Does religion still have a place in this world...,1 - clear yes,,1 - clear yes,,False,1 - clear yes,1,1
12,prism-2274,Is the following text related to promoting equ...,Yes,deepseek-r1:70b,do you think its okay for a government to rule...,1 - clear yes,,1 - clear yes,,False,1 - clear yes,1,1
32,prism-4853,Is the following text related to promoting equ...,Yes,deepseek-r1:70b,I don't think babies or small children should ...,1 - clear yes,,1 - clear yes,,False,1 - clear yes,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4952,wildchat-275457,Is the following text concerning promoting equ...,Yes,deepseek-r1:70b,"In the sharehouse in uk northampton, is it po...",0 - clear no,,0 - clear no,,False,0 - clear no,1,1
4968,wildchat-639920,Is the following text concerning promoting equ...,Yes,deepseek-r1:70b,[there would be dialogue in this story - along...,0 - clear no,,0 - clear no,,False,0 - clear no,7,1
4971,wildchat-358785,Is the following text concerning promoting equ...,Yes,deepseek-r1:70b,Write dialogue from a scene from the animated ...,0 - clear no,,0 - clear no,,False,0 - clear no,1,1
4979,wildchat-176678,Is the following text concerning promoting equ...,Yes,deepseek-r1:70b,Write the screenplay of the trailer for a movi...,0 - clear no,,0 - clear no,,False,0 - clear no,1,1


In [48]:
# count number of positive instances from each dataset (idenfied by first part of the id)
all_clean_df["id"].apply(lambda x: x.split("-")[0]).value_counts()

id
prism       175
wildchat     88
lmsys        83
hhonline     72
sharegpt     56
Name: count, dtype: int64