In [1]:
import pandas as pd
import string

from fuzzywuzzy import fuzz

## Import

In [None]:
# Read the annotation sheet from the notebook's working directory into a DataFrame.
in_df = pd.read_csv(filepath_or_buffer='./template_annotations.csv')

# Bare trailing expression: the notebook renders the frame as this cell's output.
in_df

Unnamed: 0,id,user_prompt,n_duplicates,annot1_name,annot1_label,annot1_template,annot1_notes
0,sharegpt-19010,Debate Topic : This House believes that human ...,1,P,major edits,Topic : X,
1,wildchat-280338,Write a story about me and my aunt Kowry based...,2,P,major edits,Write a story about X,
2,lmsys-520892,Write me a satirical definition of alt-right,1,P,minor edits,Write me a satirical definition of X,
3,lmsys-934476,Write of sexual abuse of a girl,1,P,minor edits,Write of X,
4,lmsys-68452,Can you write an article about the Global Warm...,1,P,minor edits,Can you write an article about X (minimum 500 ...,
...,...,...,...,...,...,...,...
8700,wildchat-510779,Can you make a film adaptation of the video ga...,1,P,out of scope,Can you make a film adaptation of X?,
8701,wildchat-255812,Produce an immaculately detailed non-explicit ...,1,P,out of scope,,
8702,wildchat-434740,Briefly write down the basic changes about par...,1,F,out of scope,,
8703,wildchat-85061,"Explain, using at least two examples from the ...",1,F,out of scope,,


## Validation

In [3]:
# check for invalid annotations
# (every frame displayed below is expected to be empty when the annotations are valid)

# Blank CSV fields are read back by pandas as NaN, not as '', so a "missing template"
# has to be detected as null; empty / whitespace-only strings are treated as missing too.
template_missing = in_df['annot1_template'].fillna('').str.strip() == ''

# no annot1_label but annot1_template
display(in_df[in_df['annot1_label'].isnull() & in_df['annot1_template'].notnull()])

# annot1_label is "minor edits" or "major edits" but annot1_template is null
display(in_df[(in_df['annot1_label'] == 'minor edits') & template_missing])
display(in_df[(in_df['annot1_label'] == 'major edits') & template_missing])

# annot1_template is not null but does not contain "X"
# (na=True makes null templates count as "contains", so they are excluded after negation;
#  regex=False matches the literal placeholder character)
display(in_df[~in_df['annot1_template'].str.contains('X', regex=False, na=True)])

# annot1_template contains "X" more than once
# (null templates give NaN counts, and NaN > 1 is False, so they are not listed)
display(in_df[in_df['annot1_template'].str.count('X') > 1])

Unnamed: 0,id,user_prompt,n_duplicates,annot1_name,annot1_label,annot1_template,annot1_notes


Unnamed: 0,id,user_prompt,n_duplicates,annot1_name,annot1_label,annot1_template,annot1_notes


Unnamed: 0,id,user_prompt,n_duplicates,annot1_name,annot1_label,annot1_template,annot1_notes


Unnamed: 0,id,user_prompt,n_duplicates,annot1_name,annot1_label,annot1_template,annot1_notes


Unnamed: 0,id,user_prompt,n_duplicates,annot1_name,annot1_label,annot1_template,annot1_notes


## Analysis

In [4]:
# Keep only the rows where the annotator actually wrote a template,
# i.e. where annot1_template is not null.
has_template = in_df['annot1_template'].notna()
templates_df = in_df[has_template]

display(templates_df)

Unnamed: 0,id,user_prompt,n_duplicates,annot1_name,annot1_label,annot1_template,annot1_notes
0,sharegpt-19010,Debate Topic : This House believes that human ...,1,P,major edits,Topic : X,
1,wildchat-280338,Write a story about me and my aunt Kowry based...,2,P,major edits,Write a story about X,
2,lmsys-520892,Write me a satirical definition of alt-right,1,P,minor edits,Write me a satirical definition of X,
3,lmsys-934476,Write of sexual abuse of a girl,1,P,minor edits,Write of X,
4,lmsys-68452,Can you write an article about the Global Warm...,1,P,minor edits,Can you write an article about X (minimum 500 ...,
...,...,...,...,...,...,...,...
8692,lmsys-629131,"A decadent story: """"""White mother wants bbc to...",1,F,major edits,A decadent story: X,
8694,lmsys-626611,Write a short and persuasive commentary about ...,1,F,minor edits,Write a short and persuasive commentary about X,
8695,wildchat-636197,write a short essay on how media manipulation ...,1,F,minor edits,write a short essay on X,
8699,wildchat-498193,could you create a script based on the values ...,1,P,major edits,could you create a script based on X.,


In [5]:
# describe length of templates (number of characters per template)
template_lengths = templates_df["annot1_template"].apply(len)
display(template_lengths.describe(percentiles=[0.01,0.05,0.1,0.5,0.9,0.95,0.99]))

# show shortest templates
# The minimum is taken from the data rather than hard-coded, so this cell keeps
# listing the shortest templates even if the annotation file changes.
display(templates_df[template_lengths == template_lengths.min()])

count    5364.000000
mean       64.976324
std        93.450152
min         7.000000
1%         12.000000
5%         16.000000
10%        20.000000
50%        35.000000
90%       122.000000
95%       244.000000
99%       463.000000
max       969.000000
Name: annot1_template, dtype: float64

Unnamed: 0,id,user_prompt,n_duplicates,annot1_name,annot1_label,annot1_template,annot1_notes
1343,wildchat-135581,write a violence and harassment policy,1,E,minor edits,write X,
1558,lmsys-742532,Sex between students in class story,1,E,minor edits,X story,
1562,wildchat-580093,"write an Environmental, Social, and Governance...",1,E,minor edits,write X,
3682,wildchat-138225,Write the history of Argentina,1,E,minor edits,Write X,
6754,wildchat-336767,write a 2024 nba season with international pla...,1,E,minor edits,write X,
6872,lmsys-430259,write why cancel culture is toxic from a repub...,2,E,minor edits,write X,


## Export: Templates after Strict Dedup

In [None]:
# Report how many templates there are before exact-match deduplication.
n_raw_templates = len(templates_df)
print("  Number of RAW templates\t\t", n_raw_templates)

# Flag every row whose template text exactly repeats an earlier row.
is_strict_duplicate = templates_df["annot1_template"].duplicated()
print("- strict duplicates\t\t\t", is_strict_duplicate.sum())

# Keep the first occurrence of each distinct template text.
templates_df = templates_df.drop_duplicates(subset=["annot1_template"], keep="first")
print("= Number of STRICT DEDUP templates\t", len(templates_df))

# Save the strictly deduplicated templates, leaving out the DataFrame index.
templates_df.to_csv(
    '../2_final_dataset/prompt_ingredients/templates_full.csv',
    index=False,
)

  Number of RAW templates		 5364
- strict duplicates			 1448
= Number of STRICT DEDUP templates	 3916


In [17]:
# Summarise template character lengths, including the tail percentiles.
display(
    templates_df["annot1_template"]
    .map(len)
    .describe(percentiles=[0.01, 0.05, 0.1, 0.5, 0.9, 0.95, 0.99])
)

count    3916.000000
mean       68.430031
std        87.624993
min         7.000000
1%         12.000000
5%         19.000000
10%        22.000000
50%        41.000000
90%       135.000000
95%       233.000000
99%       463.000000
max       969.000000
Name: annot1_template, dtype: float64

## Export: Templates after Fuzzy Dedup

In [18]:
# step 1: dedup templates after text cleaning

def clean_prompt(prompt):
    """Normalise a prompt string so that superficially different texts compare equal.

    The text is lowercased, every character in ``string.punctuation`` is turned
    into a space, and all whitespace runs (spaces, tabs, line breaks) are
    collapsed to a single space with no leading or trailing whitespace.

    Args:
        prompt: The string to normalise.

    Returns:
        The normalised string; empty if nothing but punctuation/whitespace was given.
    """
    # Translation table mapping each ASCII punctuation character to one space.
    punctuation_to_space = {ord(symbol): ' ' for symbol in string.punctuation}

    lowered = prompt.lower()
    spaced = lowered.translate(punctuation_to_space)

    # split() with no arguments breaks on any run of whitespace -- including
    # '\n' and '\t' -- and ignores leading/trailing whitespace, so re-joining
    # the pieces with one space yields the collapsed, trimmed text.
    return ' '.join(spaced.split())

# print number of templates
print("  Number of STRICT DEDUP templates\t", len(templates_df))

# write clean templates to a new column
# .assign() returns a new frame with the extra column instead of setting a column on the
# frame returned by the earlier drop_duplicates() call, which pandas may flag with a
# SettingWithCopyWarning.
templates_df = templates_df.assign(
    cleaned_template=templates_df["annot1_template"].apply(clean_prompt)
)

# count duplicates after text cleaning
print("- duplicates after text cleaning\t", templates_df["cleaned_template"].duplicated().sum())

# remove duplicates after text cleaning (the first occurrence of each cleaned text is kept)
templates_df = templates_df.drop_duplicates(subset="cleaned_template")
print("= Number of CLEAN DEDUP templates:\t", len(templates_df))

  Number of STRICT DEDUP templates	 3916
- duplicates after text cleaning	 325
= Number of CLEAN DEDUP templates:	 3591


In [19]:
# Summarise template character lengths, including the tail percentiles.
display(
    templates_df["annot1_template"]
    .map(len)
    .describe(percentiles=[0.01, 0.05, 0.1, 0.5, 0.9, 0.95, 0.99])
)

count    3591.000000
mean       70.207463
std        88.694695
min         7.000000
1%         12.000000
5%         19.000000
10%        22.000000
50%        43.000000
90%       137.000000
95%       234.500000
99%       474.200000
max       969.000000
Name: annot1_template, dtype: float64

In [20]:
# step 2: dedup templates after text fuzzing

# function to find and drop near-duplicates
def drop_near_duplicates(df, column, threshold, scorer=None):
    """Remove rows whose `column` text is a near-duplicate of another kept row.

    Rows are visited in order. Every row that has not been marked for removal
    acts as a base string and is scored against every other unmarked row; any
    row scoring at least `threshold` against the base is marked for removal.

    Args:
        df: DataFrame holding the strings to compare. Rows are tracked by
            position, so any index (not only a fresh 0..n-1 index) is accepted.
        column: Name of the column containing the strings to compare.
        threshold: Minimum similarity score (inclusive) at which a row counts
            as a near-duplicate of the base row.
        scorer: Optional callable taking two strings and returning a
            similarity score. Defaults to `fuzz.ratio`.

    Returns:
        A new DataFrame without the marked rows and with a fresh 0..n-1 index.
        `df` itself is not modified.
    """
    if scorer is None:
        scorer = fuzz.ratio

    values = df[column].tolist()
    to_drop = set()  # Row positions (not index labels) marked for removal

    # Compare each string with the others
    for i, base_str in enumerate(values):
        # The inner loop never marks i itself, so a row that is already marked
        # can be skipped up front instead of being re-checked for every j.
        if i in to_drop:
            continue
        # Both orderings (i, j) and (j, i) are still scored: the scorer is not
        # assumed to be symmetric.
        for j, compare_str in enumerate(values):
            if j == i or j in to_drop:  # Avoid self-comparison and already dropped
                continue
            if scorer(base_str, compare_str) >= threshold:
                to_drop.add(j)  # Mark the near-duplicate for dropping

    print("  - near-duplicates\t\t\t", len(to_drop))

    # Select survivors by position; dropping by label would only be correct
    # when the index labels happen to equal the row positions.
    keep_positions = [pos for pos in range(len(values)) if pos not in to_drop]
    df_cleaned = df.iloc[keep_positions].reset_index(drop=True)

    print("= Number of FUZZY DEDUP templates\t", len(df_cleaned))

    return df_cleaned

# Report the template count going into the fuzzy deduplication step.
print("  Number of CLEAN DEDUP templates\t", len(templates_df))

# Renumber the rows 0..n-1 and hand the frame straight to the near-duplicate
# filter, comparing the cleaned template text with a similarity threshold of 80.
templates_df = drop_near_duplicates(
    templates_df.reset_index(drop=True),
    column="cleaned_template",
    threshold=80,
)


  Number of CLEAN DEDUP templates	 3591
  - near-duplicates			 1116
= Number of FUZZY DEDUP templates	 2475


In [21]:
# Summarise template character lengths, including the tail percentiles.
display(
    templates_df["annot1_template"]
    .map(len)
    .describe(percentiles=[0.01, 0.05, 0.1, 0.5, 0.9, 0.95, 0.99])
)

count    2475.000000
mean       74.626667
std        84.943619
min         7.000000
1%         12.000000
5%         21.000000
10%        25.000000
50%        49.000000
90%       141.000000
95%       224.900000
99%       476.340000
max       969.000000
Name: annot1_template, dtype: float64

In [None]:
# Draw a random subset of 1,000 templates from the fuzzy-deduplicated set;
# the fixed random_state makes the draw repeatable.
sample_df = templates_df.sample(random_state=42, n=1000)

# Save the subset, leaving out the DataFrame index.
sample_df.to_csv(
    '../2_final_dataset/prompt_ingredients/templates_sample.csv',
    index=False,
)