In [20]:
import pandas as pd
from matplotlib import pyplot as plt

In [4]:
# Load the annotated attack-results CSV for this run into `df`.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv('/data2/borito1907/sandcastles/outputs/InternLMOracle_GPT4o_unwatermarked_SentenceMutator_n-steps=500_attack_results_annotated.csv')

In [6]:
# Helper function to separate attacks based on step_num reset
def separate_attacks(df, length=10000):
    """Split a concatenated attack log into one DataFrame per attack.

    A new attack starts at every row whose ``step_num`` is strictly smaller
    than the ``step_num`` of the row directly above it (the step counter was
    reset). Rows are compared by position, not by index label, so the frame
    may carry any index (filtered, non-contiguous, duplicated labels).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``step_num`` column, rows in recorded order.
    length : int, optional
        Unused; kept only so existing callers that pass it keep working.

    Returns
    -------
    list of pandas.DataFrame
        One independent copy per attack, in order of appearance, keeping the
        original index labels and column dtypes. Empty list for an empty frame.
    """
    if len(df) == 0:
        return []

    step = df['step_num']
    # True where the counter dropped relative to the previous row. The first
    # row compares against NaN/NA, which is treated as "no reset".
    is_reset = (step < step.shift()).fillna(False).to_numpy(dtype=bool)
    # Running count of resets gives every row the ordinal of its attack.
    attack_ids = is_reset.cumsum()

    # Copy each group so callers can add columns without touching `df`.
    return [chunk.copy() for _, chunk in df.groupby(attack_ids, sort=False)]

# Split the loaded log into per-attack frames and report how many were found.
attacks = separate_attacks(df)

print(f"Length of attacks: {len(attacks)}")

Length of attacks: 90


In [11]:
# Pick the first attack for manual inspection.
# NOTE(review): the saved output under this cell is a `quality_preserved` Series, which this
# assignment alone would not display — the cell looks edited after it was last run; confirm.
attack = attacks[0]

0       True
1       True
2       True
3       True
4       True
       ...  
497    False
498    False
499     True
500     True
501    False
Name: quality_preserved, Length: 502, dtype: bool

In [15]:
def count_successful_mutations(df):
    """Return the sum of the ``quality_preserved`` column of ``df``.

    For a boolean column this equals the number of rows flagged True.
    """
    preserved_flags = df['quality_preserved']
    return preserved_flags.sum()

# Print count_successful_mutations(...) for every attack, one value per line.
# Note: `attack` stays bound to the last element of `attacks` after this loop.
for attack in attacks:
    print(count_successful_mutations(attack))

359
354
463
194
183
304
54
196
268
91
248
131
140
171
120
121
242
136
186
129
114
131
97
239
149
48
125
125
82
223
241
165
132
155
309
273
274
200
304
159
254
84
145
253
218
111
94
234
254
286
369
88
77
116
139
274
203
80
66
91
447
493
502
502
312
419
435
500
144
259
475
208
176
266
378
65
65
274
40
289
99
162
148
168
109
210
36
31
161
91


In [16]:
# Load the distinguisher results CSV into `analysis_df`.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
analysis_df = pd.read_csv('/data2/borito1907/sandcastles/distinguisher/results/long_InternLMOracle_GPT4o_unwatermarked_SentenceMutator_news_SimpleDistinguisher.csv')

In [17]:
# Split `analysis_df` into one DataFrame per attack: a new attack starts at
# every row whose 'Num' is strictly smaller than the 'Num' of the row above it.
# This rebinds `attacks`, replacing any list built earlier in the notebook.
#
# Rows are compared by position (shift), not via `analysis_df.loc[idx - 1]`,
# so the cell also works when `analysis_df` has a non-contiguous index
# (for example after it has been filtered).
_num = analysis_df['Num']
_is_reset = (_num < _num.shift()).fillna(False).to_numpy(dtype=bool)
# Running count of resets gives every row the ordinal of its attack.
_attack_ids = _is_reset.cumsum()

# Copy each group so later cells can add columns without touching `analysis_df`.
attacks = [chunk.copy() for _, chunk in analysis_df.groupby(_attack_ids, sort=False)]

len(attacks)


30

In [22]:
# Print the number of rows in each attack frame, one value per line.
# Note: `attack` stays bound to the last element of `attacks` after this loop.
for attack in attacks:
    print(len(attack))

894
894
986
624
838
624
870
288
288
518
416
416
352
352
532
124
124
126
70
70
196
296
322
296
218
68
68
58
58
182


In [42]:
# Keep only the attacks with more than 400 rows.
# NOTE(review): 400 is an unexplained threshold — consider a named constant.
long_attacks = [attack for attack in attacks if len(attack) > 400]

In [43]:
# Stack all long attacks into a single frame (row order follows `long_attacks`).
combined_long_attacks = pd.concat(long_attacks)

In [12]:
def return_correct_flipped_correct(df, prefix=""):
    """Score the choice columns against ``Origin`` and return mean accuracies.

    Side effect: three float columns are written into ``df`` in place:

    * ``{prefix}correct``         -- 1.0 where ``Origin == {prefix}choice``
    * ``{prefix}flipped_correct`` -- 1.0 where ``Origin == {prefix}flipped_choice``
    * ``{prefix}avg_correct``     -- row-wise mean of the two columns above

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Origin``, ``{prefix}choice`` and ``{prefix}flipped_choice``.
    prefix : str, optional
        Prepended to the names of the columns read and written.

    Returns
    -------
    tuple
        Column means of ``(correct, flipped_correct, avg_correct)``.
    """
    correct_col = f'{prefix}correct'
    flipped_col = f'{prefix}flipped_correct'
    avg_col = f'{prefix}avg_correct'

    matches_choice = df['Origin'] == df[f'{prefix}choice']
    df[correct_col] = matches_choice.astype(float)

    matches_flipped = df['Origin'] == df[f'{prefix}flipped_choice']
    df[flipped_col] = matches_flipped.astype(float)

    pair_sum = df[correct_col] + df[flipped_col]
    df[avg_col] = pair_sum / 2

    return (
        df[correct_col].mean(),
        df[flipped_col].mean(),
        df[avg_col].mean(),
    )



In [None]:
# Collect one record of mean accuracies per long attack.
# The call also writes the per-row score columns into each attack frame.
data = []

for attack in long_attacks:
    correct, flipped_correct, avg_correct = return_correct_flipped_correct(attack)
    record = dict(
        correct=correct,
        flipped_correct=flipped_correct,
        avg_correct=avg_correct,
    )
    data.append(record)
    

In [45]:
# One row per entry of `data`; columns are the dict keys of those entries.
data_df = pd.DataFrame(data)

# Spread of `avg_correct` across rows (pandas' default sample standard deviation).
data_df['avg_correct'].std()

0.12884039827550106

In [46]:
data_df

Unnamed: 0,correct,flipped_correct,avg_correct
0,0.855705,1.0,0.927852
1,1.0,1.0,1.0
2,1.0,0.73428,0.86714
3,0.963141,0.956731,0.959936
4,0.865155,1.0,0.932578
5,0.958333,1.0,0.979167
6,0.621839,1.0,0.81092
7,0.519305,0.76834,0.643822
8,0.639423,0.766827,0.703125
9,0.774038,0.555288,0.664663


In [53]:
# Keep only rows with Num >= 50, then score the remaining rows.
# NOTE(review): 50 is an unexplained cutoff — consider a named constant.
# `.copy()` makes the filtered frame independent: the scoring function assigns
# new columns in place, which on a bare boolean-mask slice triggers
# SettingWithCopyWarning.
# The filtered frame keeps its original (now non-contiguous) index labels.
analysis_df = analysis_df[analysis_df['Num'] >= 50].copy()
return_correct_flipped_correct(analysis_df)

(0.8019607843137255, 0.9366013071895425, 0.869281045751634)

In [18]:
# Distribute the attacks round-robin into NUM_GROUPS buckets: attack i lands in
# bucket i % NUM_GROUPS, keeping the original relative order inside each bucket.
# NOTE(review): presumably the attacks cycle through 10 entropy levels in file
# order — confirm against how the results file was produced.
#
# Stride slicing is used instead of a `for i, df in enumerate(attacks)` loop so
# the notebook-level `df` (the attack-results frame) is not overwritten.
NUM_GROUPS = 10
df_sublists = [attacks[offset::NUM_GROUPS] for offset in range(NUM_GROUPS)]

# One concatenated frame per bucket (pd.concat raises if a bucket is empty).
entropy_dfs = [pd.concat(bucket) for bucket in df_sublists]


In [19]:
# Print the (correct, flipped_correct, avg_correct) means for each bucket.
# The call also adds the per-row score columns to each `entropy_df` in place.
for entropy_df in entropy_dfs:
    print(return_correct_flipped_correct(entropy_df))

(0.8147410358565738, 0.9355909694555112, 0.8751660026560425)
(0.9371108343711083, 0.798879202988792, 0.8679950186799502)
(0.9006024096385542, 0.6987951807228916, 0.7996987951807228)
(0.9308176100628931, 0.9064465408805031, 0.9186320754716981)
(0.8753148614609572, 0.9798488664987406, 0.9275818639798489)
(0.9681372549019608, 1.0, 0.9840686274509803)
(0.6902071563088512, 1.0, 0.8451035781544256)
(1.0, 1.0, 1.0)
(0.9206730769230769, 0.8269230769230769, 0.8737980769230769)
(0.6766233766233766, 0.8441558441558441, 0.7603896103896104)
