In [1]:
import os
import pandas as pd
from attack.utils import load_all_csvs
import numpy as np

def print_attack_stats(df):
    """
    Split ``df`` into separate attacks using ``separate_attacks(df)`` and print
    summary statistics of the number of rows per attack.

    Printed statistics:
      - Number of attacks.
      - Mean, median, min, max, and sample standard deviation (ddof=1) of the
        number of rows per attack.

    If no attacks are found, only the header, the attack count, and a short
    notice are printed (the statistics are undefined for an empty list).

    Parameters:
        df (pd.DataFrame): The input DataFrame to process.

    Returns:
        None. All results are printed to stdout.
    """
    # Split the DataFrame into a list of attack DataFrames
    attacks = separate_attacks(df)

    # Number of rows in each attack
    lengths = [len(attack) for attack in attacks]

    print("Summary statistics of DataFrame lengths in 'attacks':")
    print(f"\nNumber of attacks: {len(attacks)}")

    # np.min / np.max raise on an empty sequence, so stop before computing them.
    if not lengths:
        print("No attacks found; nothing to summarize.")
        return

    mean_length = np.mean(lengths)
    median_length = np.median(lengths)
    min_length = np.min(lengths)
    max_length = np.max(lengths)
    # The sample standard deviation needs at least two observations; with one
    # attack report NaN directly instead of triggering a NumPy RuntimeWarning.
    std_length = np.std(lengths, ddof=1) if len(lengths) > 1 else float("nan")

    print(f"Mean:   {mean_length}")
    print(f"Median: {median_length}")
    print(f"Min:    {min_length}")
    print(f"Max:    {max_length}")
    print(f"Std:    {std_length}")

# Watermark schemes to process. The full set is
# ["Adaptive", "KGW", "SIR", "GPT4o_unwatermarked"]; only "SIR" is enabled here.
watermark_types = [
    "SIR",
]

# Mutator names passed to load_all_csvs, one trace set per mutator.
mutators = [
    "DocumentMutator",
    "Document1StepMutator",
    "Document2StepMutator",
    "SentenceMutator",
    "SpanMutator",
    "WordMutator",
    "EntropyWordMutator",
]

# Module-level accumulator; starts out empty.
results = list()

def fix_encoding(text):
    """
    Try to undo mojibake in a string (UTF-8 bytes that were decoded with a
    single-byte codec).

    The string is re-encoded with Latin-1 first and, if that fails, with
    Windows-1252; the resulting bytes are decoded as UTF-8. The first
    successful round trip is returned. If neither works, or if ``text`` is not
    a string, the input is returned unchanged.
    """
    if not isinstance(text, str):
        return text  # Non-string values pass through untouched

    # Candidate single-byte codecs, tried in order.
    for codec in ('latin1', 'cp1252'):
        try:
            return text.encode(codec).decode('utf-8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            continue

    # Every attempt failed: keep the original text
    return text

import pandas as pd

def sanity_check_quality_preservation(df):
    """
    Check that whenever the previous row has `quality_preserved` equal to True,
    the `mutated_text` of the previous row matches the `current_text` of the
    current row.

    The check works on temporary Series only; `df` itself is never modified,
    so existing columns (including any named 'sanity_check' or 'prev_*') are
    left untouched.

    Parameters:
        df (pd.DataFrame): The DataFrame to check. It must contain the columns
                           'quality_preserved', 'mutated_text', and 'current_text'.

    Returns:
        None: Prints the smallest index label of the rows where the check
              fails. If no rows fail, nothing is printed.
    """
    # Align each row with the values of the row directly above it.
    # The first row has no predecessor, so its shifted values are NaN.
    prev_quality_preserved = df['quality_preserved'].shift(1)
    prev_mutated_text = df['mutated_text'].shift(1)

    # A row fails when the previous mutation was accepted but its text was
    # not carried over as this row's starting text.
    failed = (prev_quality_preserved == True) & (prev_mutated_text != df['current_text'])

    # Smallest index label among failing rows (NaN when there are none)
    first_failure_index = df.index[failed.to_numpy()].min()

    if not pd.isna(first_failure_index):
        print(f"Sanity check first failed at row index: {first_failure_index}")

def assign_unique_group_ids(df):
    """
    Label rows with a running group id, modifying ``df`` in place.

    Adds two columns:
      - 'new_group': 1 on rows where 'step_num' equals -1, otherwise 0.
      - 'group_id': cumulative sum of 'new_group', so the id increases by one
        at every row whose 'step_num' is -1.

    Returns:
        pd.DataFrame: the same ``df`` object that was passed in.
    """
    starts_group = df['step_num'].eq(-1)
    df['new_group'] = starts_group.astype(int)
    df['group_id'] = df['new_group'].cumsum()
    return df

def separate_attacks(df):
    """
    Split ``df`` into consecutive chunks, starting a new chunk whenever
    'step_num' is strictly smaller than in the row directly above.

    Rows are compared by position, so the result does not depend on the
    DataFrame's index labels (a filtered or concatenated frame with a
    non-default or duplicated index is handled correctly). Consecutive rows
    with equal 'step_num' stay in the same chunk.

    Parameters:
        df (pd.DataFrame): Frame with a 'step_num' column, rows in trace order.

    Returns:
        list[pd.DataFrame]: One independent copy per attack, in original row
        order, keeping the original index labels, columns and dtypes. An empty
        input yields an empty list.
    """
    if len(df) == 0:
        return []

    steps = df['step_num'].to_numpy()

    # Positions where step_num drops relative to the previous row; each one
    # is the first row of a new attack.
    reset_positions = np.flatnonzero(steps[1:] < steps[:-1]) + 1

    # Chunk boundaries: start of frame, every reset, end of frame.
    bounds = [0, *reset_positions.tolist(), len(df)]

    return [
        df.iloc[start:stop].copy()
        for start, stop in zip(bounds[:-1], bounds[1:])
    ]

In [None]:
def compare_dfs(df1, df2, verbose=False):
    """
    Count, for every text in ``df2``, how many attack-start rows of ``df1``
    carry the same text (after stripping surrounding whitespace).

    Side effect: a 'selected_text' column is added to (or overwritten in)
    ``df1``. It holds 'mutated_text' where that is not null and
    'current_text' otherwise.

    Parameters:
        df1 (pd.DataFrame): Trace frame with the columns 'step_num',
            'mutated_text' and 'current_text'. Only rows with
            ``step_num == -1`` take part in the matching.
        df2 (pd.DataFrame): Reference frame with a 'text' column.
        verbose (bool): When True, also print the index, prompt (if ``df2``
            has a 'prompt' column, else "N/A") and text of the first
            unmatched ``df2`` row.

    Returns:
        pd.DataFrame: Columns 'text' (the original ``df2['text']``) and
        'match_count' (number of matching rows in ``df1``), indexed like
        ``df2``.
    """
    # Step 1: prefer mutated_text, fall back to current_text where it is null.
    # Done column-wise so it also works on an empty frame (a row-wise
    # apply(axis=1) returns a DataFrame there and breaks the assignment).
    mutated = df1['mutated_text']
    df1['selected_text'] = mutated.where(mutated.notna(), df1['current_text'])

    # Step 2: texts of the attack-start rows (step_num == -1), whitespace-stripped
    start_texts = df1.loc[df1['step_num'] == -1, 'selected_text'].str.strip()

    # Step 3: count each distinct start text once, then look every df2 text up
    # in that table instead of rescanning df1 for every df2 row.
    occurrences = start_texts.value_counts().to_dict()
    match_counts = df2['text'].str.strip().map(lambda text: occurrences.get(text, 0))

    # Step 4: Create a result DataFrame
    result_df = pd.DataFrame({'text': df2['text'], 'match_count': match_counts})

    # Indices of rows in df2 that have no counterpart in df1
    no_match_indices = result_df[result_df['match_count'] == 0].index.tolist()
    print(f"Indices of rows in df2 with no match: {no_match_indices}")

    if verbose and no_match_indices:
        first_no_match_index = no_match_indices[0]
        first_no_match_prompt = (
            df2.loc[first_no_match_index, 'prompt'] if 'prompt' in df2.columns else "N/A"
        )
        first_no_match_text = df2.loc[first_no_match_index, 'text']

        print(f"First unmatched row details - Index: {first_no_match_index}")
        print(f"Prompt: {first_no_match_prompt}")
        print(f"Text: {first_no_match_text}")

    return result_df

def check_step_num_validity(df):
    """
    Warn if the 'step_num' column does not form a valid sequence.

    A sequence is valid when its first value is -1 and every later value is
    either -1 or exactly one more than the value in the row above it.

    Nothing is checked (and nothing is printed) when ``df`` has no 'step_num'
    column or has no rows.

    Parameters:
        df (pd.DataFrame): Frame whose 'step_num' column is validated.

    Returns:
        None. A warning line is printed when the sequence is invalid.
    """
    if 'step_num' not in df.columns:
        return

    step_nums = df['step_num'].to_numpy()

    # An empty frame has no first element to inspect and nothing to validate.
    if step_nums.size == 0:
        return

    # Compare every row (from the second on) with its predecessor in one
    # vectorised pass: it must be -1 or previous + 1.
    valid = step_nums[0] == -1 and bool(
        np.all((step_nums[1:] == -1) | (step_nums[1:] == step_nums[:-1] + 1))
    )

    if not valid:
        print("Warning: step_num sequence is not valid. It should either be -1 or increment by 1.")


def interpret_results(result_df, expected_total=90):
    """
    Report deviations in the 'match_count' column of ``result_df``.

    The expected outcome is that every row has ``match_count == 1`` and that
    the counts sum to ``expected_total``. Nothing is printed when that holds;
    otherwise three diagnostic lines are printed.

    Parameters:
        result_df (pd.DataFrame): Frame with a numeric 'match_count' column.
        expected_total (int): Expected sum of 'match_count'. Defaults to 90.

    Returns:
        None. Diagnostics are printed to stdout.
    """
    match_count = result_df['match_count']

    count_not_one = (match_count != 1).sum()
    count_greater_than_one = (match_count > 1).sum()
    total_sum = match_count.sum()

    # Only print if values deviate from expected ones
    if count_not_one != 0 or count_greater_than_one != 0 or total_sum != expected_total:
        print(f"Number of values where match_count is not 1: {count_not_one}")
        print(f"Number of rows where match_count is greater than 1: {count_greater_than_one}")
        print(f"Total sum of match_count column: {total_sum}")


In [8]:
# Number of rows with step_num == -1 that each loaded trace set is expected to have.
EXPECTED_ATTACK_STARTS = 90

for watermark_type in watermark_types:
    # The reference texts depend only on the watermark type, so read them once
    # per type instead of once per (type, mutator) pair.
    entropy_df = pd.read_csv(f'/data2/borito1907/sandcastles/data/texts/entropy_control_{watermark_type}.csv')

    for mutator in mutators:
        print(f"Processing watermark_type: {watermark_type}, mutator: {mutator}")
        # Load data with fallback to non-annotated directory
        df = load_all_csvs("./attack/traces/annotated", watermark_type, mutator)
        if df.empty:
            df = load_all_csvs("./attack/traces", watermark_type, mutator)

        # Repair mojibake cell by cell. DataFrame.applymap is deprecated since
        # pandas 2.1 in favour of DataFrame.map; use whichever this pandas has.
        apply_elementwise = df.map if hasattr(df, "map") else df.applymap
        df = apply_elementwise(fix_encoding)

        # Optional check, currently disabled:
        # sanity_check_quality_preservation(df)

        # Report only when the number of attack starts deviates from the expectation
        if 'step_num' in df.columns:
            step_num_neg1_count = (df['step_num'] == -1).sum()
            if step_num_neg1_count != EXPECTED_ATTACK_STARTS:
                print(f"Number of rows with step_num == -1: {step_num_neg1_count}")

        check_step_num_validity(df)

        # Match attack-start texts against the reference texts and report anomalies
        result_df = compare_dfs(df, entropy_df)

        interpret_results(result_df)

Processing watermark_type: SIR, mutator: DocumentMutator


Indices of rows in df2 with no match: []
Processing watermark_type: SIR, mutator: Document1StepMutator
Number of rows with step_num == -1: 82
Indices of rows in df2 with no match: [12, 13, 20, 23, 27, 61, 74, 83]
First unmatched row details - Index: 12
Prompt: Write a 500-word story about Evan, an American tourist, who falls for Emilie, a barista, during a spring festival in Paris:
Text: Paris, the City of Love, is renowned for its charming atmosphere, iconic landmarks, and enchanting beauty. For Evan, a young American tourist, the city's magic is about to take on a whole new meaning. It's the first day of the spring festival, and the air is alive with music, laughter, and the sweet scent of blooming flowers. As Evan wanders through the crowded streets, he stumbles upon a quaint little café, where the aroma of freshly brewed coffee beckons him inside. Behind the counter stands Emilie, a beautiful barista with piercing green eyes and a bright smile. She greets Evan with a warm "Bonjour!