Convert the prefix results of each run into a single row of numeric values for evaluation

In [19]:
!pip install nltk



In [20]:
import os
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import shutil

# Download the 'punkt' resource; the calculate_bleu cells below tokenize with
# nltk.word_tokenize.
# NOTE(review): presumably word_tokenize needs 'punkt' at runtime — confirm for
# the installed NLTK version.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /Users/deals/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:
# Function to calculate BLEU score
def calculate_bleu(reference, hypothesis):
    """Return the smoothed sentence-level BLEU score of ``hypothesis`` against ``reference``.

    Both strings are tokenized with ``nltk.word_tokenize`` and scored by
    ``sentence_bleu`` using ``SmoothingFunction().method1``. A ``TypeError`` or
    ``ValueError`` raised along the way is printed and reported as a score of 0.

    Note: a later cell of this notebook defines ``calculate_bleu`` again (with
    whitespace normalisation); once that cell has run it replaces this version.
    """
    try:
        ref_tokens = nltk.word_tokenize(reference)
        hyp_tokens = nltk.word_tokenize(hypothesis)
        # sentence_bleu expects a list of references, hence the extra list.
        return sentence_bleu(
            [ref_tokens],
            hyp_tokens,
            smoothing_function=SmoothingFunction().method1,
        )
    except (TypeError, ValueError) as err:
        print(f"Error calculating BLEU score: {err}")
        return 0


In [22]:
def extract_model_name(file_name):
    """Derive the model name from an underscore-delimited file name.

    The name is built from the two fields that immediately precede the last
    underscore-separated field, re-joined with an underscore. ``None`` is
    returned when the file name has fewer than three fields.

    Note: the zlib/ppl analysis cell later in this notebook defines another
    ``extract_model_name`` that picks a different field; running that cell
    replaces this definition.
    """
    fields = file_name.split('_')
    if len(fields) < 3:
        # Too few fields to locate a model name.
        return None
    # Adjust this slice if the file naming convention changes.
    return '_'.join(fields[-3:-1])

In [23]:
# Cell 3: Define the Function to Process Each CSV File
def process_csv_file(file_path, output_file):
    """Score one result CSV for memorisation and append a one-row summary.

    The input CSV is read with pandas and is expected to provide ``prompt``,
    ``suffix`` and ``sample`` columns. For every row, the first 100 characters
    of ``suffix`` (the reference) are compared, via ``calculate_bleu``, with
    the 100 characters of ``sample`` that follow the first ``len(prompt)``
    characters (the hypothesis).

    One row is appended to ``output_file`` with these columns:

    * ``subfolder``: name of the directory containing ``file_path``
    * ``filename``: base name of ``file_path``
    * ``model_type``: last underscore-separated field of the file name,
      without ``.csv``
    * ``model``: result of ``extract_model_name`` (empty when it returns None)
    * ``fuzzy``: number of rows with 0.75 < BLEU < 1
    * ``exact``: number of rows with BLEU >= 1
    * ``prompt_repetition``: number of rows whose reference occurs in the prompt

    A header row with those names is written first whenever ``output_file``
    does not exist yet or is empty; the combine and reorder cells further down
    this notebook look up ``subfolder``, ``filename``, ``exact`` and ``fuzzy``
    by name. Rows are written with the ``csv`` module so that commas in file or
    folder names are quoted instead of shifting columns.

    Args:
        file_path: path of the result CSV to score.
        output_file: path of the summary CSV to append to.

    Returns:
        None. A row that cannot be processed is reported with ``print`` and
        left out of all three counts.
    """
    import csv  # local import: keeps this cell independent of later cells' imports

    df = pd.read_csv(file_path)
    folder_name = os.path.basename(os.path.dirname(file_path))
    file_name = os.path.basename(file_path)
    # Parse the type from the file name only, so underscores in directory names
    # can never leak into it.
    model_type = file_name.split('_')[-1].replace('.csv', '')
    model = extract_model_name(file_name)

    bleu_scores = []
    rep = []

    for index, row in df.iterrows():
        # Both lists receive exactly one entry per row, so they always stay aligned.
        score, repeated = None, None
        try:
            # Ensure that the row values are treated as strings.
            prompt = str(row['prompt'])
            suffix = str(row['suffix'])
            sample = str(row['sample'])

            # The sample is sliced by the character length of the prompt.
            p_len = len(prompt)
            reference = suffix[:100]
            hypothesis = sample[p_len:p_len + 100]

            bleu = calculate_bleu(reference, hypothesis)
            if bleu > 1:
                # Scores above 1 are unexpected; print them for inspection.
                print(f"{reference} -- {hypothesis} -- {bleu}")

            score = bleu
            repeated = 1 if reference in prompt else 0
        except Exception as e:
            # Best effort: report the row and keep going with the rest of the file.
            print(f"Error processing row {index}: {e}")
        bleu_scores.append(score)
        rep.append(repeated)

    # Count in plain Python so a file whose rows all failed (all None) cannot
    # trip over a comparison between None and a float.
    scored = [s for s in bleu_scores if s is not None]
    fuzzy_memorization = sum(1 for s in scored if 0.75 < s < 1)
    exact_memorization = sum(1 for s in scored if s >= 1)
    prompt_repetition = sum(1 for r in rep if r == 1)

    header_needed = not os.path.exists(output_file) or os.path.getsize(output_file) == 0
    with open(output_file, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, lineterminator='\n')
        if header_needed:
            writer.writerow(['subfolder', 'filename', 'model_type', 'model',
                             'fuzzy', 'exact', 'prompt_repetition'])
        writer.writerow([folder_name, file_name, model_type, model,
                         fuzzy_memorization, exact_memorization, prompt_repetition])


In [24]:
# Cell 4: Define the Main Function to Process All CSV Files in a Directory
def folderprocess(input_folder, output_file, examined_root='examined'):
    """Process every ``.csv`` file in ``input_folder`` and archive it afterwards.

    Each file is passed to ``process_csv_file`` (which appends a summary row to
    ``output_file``) and then moved to ``<examined_root>/<input folder name>/``.
    Only the top level of ``input_folder`` is scanned.

    Args:
        input_folder: directory holding the CSV files to process.
        output_file: summary file handed through to ``process_csv_file``.
        examined_root: directory under which processed files are archived;
            defaults to ``'examined'`` relative to the working directory.

    Returns:
        None.
    """
    os.makedirs(examined_root, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sorting makes the order of
    # the rows appended to output_file reproducible from run to run.
    for file_name in sorted(os.listdir(input_folder)):
        if not file_name.endswith('.csv'):
            continue

        file_path = os.path.join(input_folder, file_name)
        folder_name = os.path.basename(os.path.dirname(file_path))
        examined_subfolder = os.path.join(examined_root, folder_name)
        os.makedirs(examined_subfolder, exist_ok=True)

        process_csv_file(file_path, output_file)
        # Moved only after processing returned, so a file whose processing
        # raised stays in input_folder.
        shutil.move(file_path, examined_subfolder)


In [26]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Ensure that nltk resources are available.
# This repeats the nltk.download('punkt') call made in the imports cell at the
# top of the notebook.
nltk.download('punkt')

def preprocess_text(text):
    """Collapse ``text`` onto a single line.

    The text is split on line-feed characters, every line is stripped of
    surrounding whitespace, lines that end up empty are dropped, and the
    remaining lines are joined with single spaces. Whitespace inside a line is
    left untouched.
    """
    kept_lines = []
    for raw_line in text.strip().split('\n'):
        cleaned = raw_line.strip()
        if cleaned:
            kept_lines.append(cleaned)
    return ' '.join(kept_lines)

def calculate_bleu(reference, hypothesis):
    """Return the smoothed sentence-level BLEU score after whitespace clean-up.

    Both texts are first normalised with ``preprocess_text`` (empty lines and
    surrounding whitespace removed), then tokenized with ``nltk.word_tokenize``
    and scored by ``sentence_bleu`` using ``SmoothingFunction().method1``.
    A ``TypeError`` or ``ValueError`` raised along the way is printed and
    reported as a score of 0.

    Defining this function replaces the ``calculate_bleu`` from the earlier
    cell, which scores the raw texts without normalisation.
    """
    try:
        # Normalise both texts before tokenizing either of them.
        clean_reference = preprocess_text(reference)
        clean_hypothesis = preprocess_text(hypothesis)

        ref_tokens = nltk.word_tokenize(clean_reference)
        hyp_tokens = nltk.word_tokenize(clean_hypothesis)

        # sentence_bleu expects a list of references, hence the extra list.
        return sentence_bleu(
            [ref_tokens],
            hyp_tokens,
            smoothing_function=SmoothingFunction().method1,
        )
    except (TypeError, ValueError) as err:
        print(f"Error calculating BLEU score: {err}")
        return 0


[nltk_data] Downloading package punkt to /Users/deals/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Folder whose top-level .csv files folderprocess scores and then moves under
# the 'examined' directory.
input_folder = 'data1907'
output_file = 'memorisationfile.csv' # summary file opened in append mode: every processed CSV adds one row

# Re-running this cell appends further rows to output_file for any CSV files
# still (or again) present in input_folder.
folderprocess(input_folder, output_file)

Extra analysis done for zlib/ppl (not in core paper)

In [None]:
import csv
import os
import glob
import statistics
import zlib

def extract_model_name(file_name):
    """
    Return the second underscore-separated field of the file name as the model name.

    Raises IndexError when the file name contains no underscore. Defining this
    function replaces the extract_model_name from the earlier cell, which joins
    the two fields before the last one instead.
    """
    # Only the second field is needed, so splitting stops after two separators.
    leading_fields = file_name.split('_', 2)
    return leading_fields[1]

def calculate_pre_zlib(text):
    """
    Return the size in bytes of the text after UTF-8 encoding and zlib compression.

    process_csv collects this length for each input text as its "pre-zlib" value.
    """
    utf8_payload = bytes(text, encoding='utf-8')
    return len(zlib.compress(utf8_payload))

def process_csv(input_file):
    """
    Summarise the perplexity and zlib columns of a single CSV file.

    Every row must provide numeric PPL_S, PPL_Lower, Zlib and PPL_XL values; if
    a non-empty 'prompt' value is present, its compressed size (via
    calculate_pre_zlib) is collected as well. Rows whose numeric fields are
    missing or cannot be parsed are reported and skipped.

    Returns a 15-element tuple laid out like the header written by
    summarize_csvs:

        (folder, file, model_type, model,
         avg PPL_S, std PPL_S, avg PPL_Lower, std PPL_Lower,
         avg Zlib, std Zlib, avg PPL_XL, std PPL_XL,
         avg pre-zlib, std pre-zlib, number of samples)

    Averages and standard deviations are 0 for an empty list, and the standard
    deviation is 0 for a single value. If the file cannot be processed at all,
    the error is printed and the folder and file names are returned followed by
    13 zeros, so the row keeps the same width and key columns as a success row.
    """
    folder_name = os.path.basename(os.path.dirname(input_file))
    file_name = os.path.basename(input_file)

    def failure_row():
        # Same width as a successful result, with Folder/File still filled in.
        return (folder_name, file_name) + (0,) * 13

    def calculate_statistics(values):
        # Mean and sample standard deviation, with 0 as the fallback.
        if not values:
            return 0, 0
        average = sum(values) / len(values)
        std_dev = statistics.stdev(values) if len(values) > 1 else 0
        return average, std_dev

    ppl_s_values = []
    ppl_lower_values = []
    zlib_values = []
    ppl_xl_values = []
    pre_zlib_values = []  # compressed sizes of the input texts
    num_samples = 0

    try:
        model_type = file_name.split('_')[-1].replace('.csv', '')
        model = extract_model_name(file_name)

        with open(input_file, 'r', newline='', encoding='utf-8') as infile:
            for row in csv.DictReader(infile):
                try:
                    # Parse all four values before storing any of them, so the
                    # lists never get out of step on a bad row.
                    ppl_s = float(row['PPL_S'])
                    ppl_lower = float(row['PPL_Lower'])
                    zlib_score = float(row['Zlib'])  # not named `zlib`, to avoid shadowing the module
                    ppl_xl = float(row['PPL_XL'])
                except (TypeError, ValueError):
                    # ValueError: unparsable text. TypeError: a short row, for
                    # which DictReader fills the missing fields with None.
                    print(f"Skipping row with invalid data: {row}")
                    continue

                ppl_s_values.append(ppl_s)
                ppl_lower_values.append(ppl_lower)
                zlib_values.append(zlib_score)
                ppl_xl_values.append(ppl_xl)

                text = row.get('prompt', '')  # Assuming 'prompt' is the column for input text
                if text:
                    pre_zlib_values.append(calculate_pre_zlib(text))

                num_samples += 1

        avg_ppl_s, std_dev_ppl_s = calculate_statistics(ppl_s_values)
        avg_ppl_lower, std_dev_ppl_lower = calculate_statistics(ppl_lower_values)
        avg_zlib, std_dev_zlib = calculate_statistics(zlib_values)
        avg_ppl_xl, std_dev_ppl_xl = calculate_statistics(ppl_xl_values)
        avg_pre_zlib, std_dev_pre_zlib = calculate_statistics(pre_zlib_values)

        return (folder_name, file_name, model_type, model, avg_ppl_s, std_dev_ppl_s, avg_ppl_lower,
                std_dev_ppl_lower, avg_zlib, std_dev_zlib, avg_ppl_xl, std_dev_ppl_xl,
                avg_pre_zlib, std_dev_pre_zlib, num_samples)

    except UnicodeDecodeError:
        print(f"Skipping file with null bytes: {input_file}")
        return failure_row()
    except FileNotFoundError:
        print(f"Error: File '{input_file}' not found.")
        return failure_row()
    except Exception as e:
        print(f"Error processing file '{input_file}': {e}")
        return failure_row()

def summarize_csvs(folder_path, output_file):
    """
    Summarize all CSV files in the specified folder and its subfolders.

    Every file matched by '<folder_path>/**/*.csv' is passed to process_csv and
    the returned tuple is appended to output_file as one CSV row. The header
    row is written only when output_file is empty, so repeated calls keep
    appending to the same summary. Files are handled in sorted path order so
    the rows come out in the same order on every run.
    """
    # glob makes no promise about the order of its results; sort for reproducibility.
    csv_files = sorted(glob.glob(os.path.join(folder_path, '**', '*.csv'), recursive=True))

    # Open output file in append mode
    with open(output_file, 'a', newline='', encoding='utf-8') as summary_file:
        writer = csv.writer(summary_file)

        # Check if the output file is empty to write header
        if os.stat(output_file).st_size == 0:
            writer.writerow(['Folder', 'File', 'Model Type', 'Model', 'Avg PPL_S', 'Std Dev PPL_S', 'Avg PPL_Lower',
                             'Std Dev PPL_Lower', 'Avg Zlib', 'Std Dev Zlib', 'Avg PPL_XL', 'Std Dev PPL_XL', 'Avg Pre-Zlib', 'Std Dev Pre-Zlib', 'Number of Samples'])

        for csv_file in csv_files:
            writer.writerow(process_csv(csv_file))

    print(f"Summary of all CSV files saved to {output_file}")

# Example usage: summarise every CSV found under 'examined', the directory
# into which folderprocess moves the files it has processed.
folder_path = 'examined'
# Note: this rebinds output_file, which the memorisation cell above set to
# 'memorisationfile.csv'.
output_file = 'plex_results.csv'
summarize_csvs(folder_path, output_file)


Combine the results into a single CSV

In [None]:
import pandas as pd

# File paths for the CSV files: the memorisation summary, the perplexity/zlib
# summary, and the merged output.
csv_file1 = 'memorisationfile.csv'
csv_file2 = 'plex_results.csv'
combined_csv_file = 'combined.csv'

# Load the CSV files into pandas DataFrames.
# This cell assumes csv_file1 starts with a header row that contains
# 'subfolder' and 'filename' columns, and csv_file2 with one that contains
# 'Folder' and 'File'.
df1 = pd.read_csv(csv_file1)
df2 = pd.read_csv(csv_file2)

# Align the key column names of the memorisation summary with df2.
df1 = df1.rename(columns={'subfolder': 'Folder', 'filename': 'File'})


# Merge the DataFrames on the "Folder" and "File" columns; the outer join keeps
# rows that appear in only one of the two files.
combined_df = pd.merge(df1, df2, on=['Folder', 'File'], how='outer')

# Save the combined DataFrame to a new CSV file
combined_df.to_csv(combined_csv_file, index=False)

print(f"Combined CSV file saved to {combined_csv_file}")


In [None]:

# Load the combined CSV written by the previous cell into a DataFrame
# (relies on `pd` and `combined_csv_file` defined there).
csv_file = combined_csv_file
df = pd.read_csv(csv_file)

# Specify the columns to move to the end of the table
# NOTE(review): selecting with a list that names absent columns is expected to
# raise KeyError — confirm combined.csv really has 'exact' and 'fuzzy' columns.
columns_to_move = ['exact', 'fuzzy']

# Get the current column order
current_columns = df.columns.tolist()

# Remove the columns to move from the current order
remaining_columns = [col for col in current_columns if col not in columns_to_move]

# Append the columns to move to the end of the remaining columns
new_order = remaining_columns + columns_to_move

# Reorder the DataFrame columns
df = df[new_order]

# Save the DataFrame with the new column order; this overwrites
# combined_csv_file in place rather than creating a separate file.
df.to_csv(combined_csv_file, index=False)

print("Columns reordered and saved to new CSV file.")