### Table 1
This notebook extracts the results for Table 1. If you are reproducing our results, replace the value of `username` in the first code cell with your own Hugging Face username.

In [1]:
import pandas as pd
import numpy as np
import os

# Hugging Face username used to build the data path below; change it when
# reproducing the results.
username = 'JasperDekoninck'

# Root folder for that user; its entries are listed and handed to detect()
# further down in the notebook.
base_path = f'../data/baselines/{username}'

def bootstrap_tpr(scores, scores_false, n_bootstrap=1000, fpr=0.01, seed=None):
    """Bootstrap p-value for "the TPR at a fixed FPR is larger than that FPR".

    In every bootstrap round both samples are resampled with replacement, a
    threshold is taken at the ``1 - fpr`` quantile of the resampled
    ``scores_false`` and the fraction of resampled ``scores`` strictly above
    that threshold (the TPR) is recorded.

    Args:
        scores: 1-D array-like of scores that are tested.
        scores_false: 1-D array-like of reference scores used to calibrate
            the threshold.
        n_bootstrap: number of bootstrap rounds.
        fpr: target false-positive rate; it sets both the threshold quantile
            and the level the TPR is compared against.
        seed: optional seed (or ``numpy.random.Generator``) that makes the
            resampling reproducible; ``None`` draws fresh entropy.

    Returns:
        ``1 - (fraction of rounds with TPR > fpr)``; small values mean the TPR
        exceeded ``fpr`` in almost every round.

    Raises:
        ValueError: if either sample is empty.
    """
    scores = np.asarray(scores)
    scores_false = np.asarray(scores_false)
    n_true, n_false = len(scores), len(scores_false)
    if n_true == 0 or n_false == 0:
        raise ValueError('scores and scores_false must both be non-empty')

    rng = np.random.default_rng(seed)
    # Position of the (1 - fpr) quantile in the sorted reference sample. The
    # clamp only matters for fpr == 0, where the raw index would be n_false.
    threshold_index = min(int(n_false * (1 - fpr)), n_false - 1)

    values = np.empty(n_bootstrap)
    for i in range(n_bootstrap):
        scores_true = scores[rng.integers(0, n_true, size=n_true)]
        scores_false_here = scores_false[rng.integers(0, n_false, size=n_false)]
        threshold = np.sort(scores_false_here)[threshold_index]
        values[i] = (scores_true > threshold).mean()
    p = 1 - np.mean(values > fpr)
    return p

def sample_level_methods(df):
    """Collect the per-sample score column of each baseline method.

    Args:
        df: DataFrame with the columns ``topkmin``, ``perplexity_output``,
            ``perplexity_ref`` and ``lowercase``.

    Returns:
        dict mapping the method name (``'shi'``, ``'mireshgallah'``,
        ``'yeom'``, ``'carlini'``) to its score Series. The perplexity-based
        and ``lowercase`` scores are negated.
    """
    return {
        'shi': df['topkmin'],
        'mireshgallah': (-df['perplexity_output']) / df['perplexity_ref'],
        'yeom': -df['perplexity_output'],
        'carlini': -df['lowercase'],
    }

def compute_tpr(scores, scores_false, fpr=0.01):
    """True-positive rate of ``scores`` at a threshold calibrated on ``scores_false``.

    The threshold is the element at position ``int(len * (1 - fpr))`` of the
    sorted ``scores_false``; the TPR is the fraction of ``scores`` that are
    strictly larger than it.

    Args:
        scores: 1-D array-like of scores that are tested.
        scores_false: 1-D array-like of reference scores used to pick the
            threshold.
        fpr: target false-positive rate.

    Returns:
        Fraction of ``scores`` above the threshold.

    Raises:
        ValueError: if ``scores_false`` is empty.
    """
    scores = np.asarray(scores)
    false_scores = np.sort(np.asarray(scores_false))
    n_false = len(false_scores)
    if n_false == 0:
        raise ValueError('scores_false must be non-empty')
    # For fpr == 0 the raw index equals n_false and would run off the end of
    # the array, so fall back to the largest reference score.
    threshold_index = min(int(n_false * (1 - fpr)), n_false - 1)
    threshold = false_scores[threshold_index]
    tpr = (scores > threshold).mean()
    return tpr

def detect(folder, alpha=0.05):
    """Count, per baseline method, how many file pairs in ``folder`` are flagged.

    Every ``.csv`` file whose name contains ``'False'`` is paired with the file
    obtained by replacing ``'False'`` with ``'True'`` in its name. For each
    method the scores of the ``'False'`` file are tested with
    ``bootstrap_tpr`` against a threshold calibrated on the scores of the
    ``'True'`` file; a pair counts as a detection when the p-value is below
    ``alpha``.
    NOTE(review): what the 'False'/'True' tag in the file names encodes is not
    visible here -- confirm against the script that writes these files.

    Args:
        folder: directory holding the paired csv files.
        alpha: significance level applied to the bootstrap p-value.

    Returns:
        dict mapping method name to its number of detections.
    """
    detection = {
        'shi': 0,
        'mireshgallah': 0,
        'yeom': 0,
        'carlini': 0
    }
    # sorted() makes the processing order independent of the file system
    for file in sorted(os.listdir(folder)):
        if not file.endswith('.csv') or 'False' not in file:
            continue
        df = pd.read_csv(os.path.join(folder, file))
        df_true = pd.read_csv(os.path.join(folder, file.replace('False', 'True')))
        methods = sample_level_methods(df)
        methods_true = sample_level_methods(df_true)
        for method, scores in methods.items():
            scores_true = methods_true[method]
            p = bootstrap_tpr(np.array(scores), np.array(scores_true))
            if p < alpha:
                detection[method] += 1
    return detection

In [2]:
# Run the baseline detectors on every model folder. Keys are the folder names
# with the 'contamination-models-' prefix removed.
detections = dict()
for folder in sorted(os.listdir(base_path)):
    folder_path = os.path.join(base_path, folder)
    # os.listdir also returns stray files (e.g. .DS_Store); detect() needs a directory
    if not os.path.isdir(folder_path):
        continue
    detections[folder.replace('contamination-models-', '')] = detect(folder_path)

In [3]:
# Load the per-benchmark '*_synthetic.csv' tables and keep only the rows whose
# model name contains 'contamination-models-'.
# A dedicated name is used for the tables folder so that `base_path` (the
# baselines folder read by the detection cell) is not overwritten.
tables_path = '../tables'
all_dfs = []
for benchmark in ['gsm8k', 'mmlu', 'arc', 'hellaswag']:
    df = pd.read_csv(os.path.join(tables_path, f'{benchmark}_synthetic.csv'))
    # .copy() so the column assignment below does not write into a slice of the csv frame
    df = df[df['model'].apply(lambda x: 'contamination-models-' in x)].copy()
    # keep only the last path component, without the common prefix
    df['model'] = df['model'].apply(lambda x: x.split('/')[-1].replace('contamination-models-', ''))
    all_dfs.append(df)
# fresh 0..n-1 index, consistent with how the rephrase tables are concatenated
df_synthetic = pd.concat(all_dfs).reset_index(drop=True)

In [4]:
# Load the per-benchmark '*_rephrase.csv' tables and keep only the rows whose
# model name contains 'contamination-models-'.
# A dedicated name is used for the tables folder so that `base_path` (the
# baselines folder read by the detection cell) is not overwritten.
tables_path = '../tables'
all_dfs = []
for benchmark in ['gsm8k', 'mmlu', 'arc', 'hellaswag']:
    df = pd.read_csv(os.path.join(tables_path, f'{benchmark}_rephrase.csv'))
    # .copy() so the column assignment below does not write into a slice of the csv frame
    df = df[df['model'].apply(lambda x: 'contamination-models-' in x)].copy()
    # keep only the last path component, without the common prefix
    df['model'] = df['model'].apply(lambda x: x.split('/')[-1].replace('contamination-models-', ''))
    all_dfs.append(df)
df_rephrase = pd.concat(all_dfs).reset_index(drop=True)

In [5]:
# 'contaminated': score_model is above both the 'no_cont' column and 0.3.
# 'detected': p_value is below 0.05.
df_synthetic['contaminated'] = (
    (df_synthetic['score_model'] > df_synthetic['no_cont'])
    & (df_synthetic['score_model'] > 0.3)
)
df_synthetic['detected'] = df_synthetic['p_value'].lt(0.05)

df_rephrase['contaminated'] = (
    (df_rephrase['score_model'] > df_rephrase['no_cont'])
    & (df_rephrase['score_model'] > 0.3)
)
df_rephrase['detected'] = df_rephrase['p_value'].lt(0.05)

# True for the rows whose model name does NOT contain 'rephrase'
rephrased_models = np.array(['rephrase' not in name for name in df_rephrase['model']], dtype=bool)
# 'syntax_contaminated': contaminated rows restricted to those models
df_rephrase['syntax_contaminated'] = df_rephrase['contaminated'] & rephrased_models


In [6]:
# Number of rows of the synthetic tables flagged as contaminated.
print(int(df_synthetic['contaminated'].sum()))

61


In [7]:
# Share of the contaminated synthetic rows that are also flagged as detected.
synthetic_contaminated = df_synthetic['contaminated']
synthetic_hits = df_synthetic['detected'] & synthetic_contaminated
print(np.count_nonzero(synthetic_hits) / np.count_nonzero(synthetic_contaminated))

0.9836065573770492


In [8]:
# Share of the syntax-contaminated rephrase rows that are also flagged as detected.
rephrase_syntax_contaminated = df_rephrase['syntax_contaminated']
rephrase_hits = df_rephrase['detected'] & rephrase_syntax_contaminated
print(np.count_nonzero(rephrase_hits) / np.count_nonzero(rephrase_syntax_contaminated))

0.8913043478260869


In [9]:
# One row per model, one column per baseline method (detection counts).
df_methods = pd.DataFrame(detections).T

# remove the models that are not contaminated
contaminated_models = df_rephrase.loc[df_rephrase['contaminated'], 'model'].unique()
df_methods = df_methods.loc[contaminated_models]

# NOTE(review): the numerator counts unique models while the denominator counts
# rows of df_rephrase; they agree only if every contaminated model appears once -- confirm.
n_contaminated = np.count_nonzero(df_rephrase['contaminated'])
for method in df_methods.columns:
    n_detected = np.count_nonzero(df_methods[method])
    print(method, n_detected, n_detected / n_contaminated)

shi 43 0.7049180327868853
mireshgallah 42 0.6885245901639344
yeom 41 0.6721311475409836
carlini 40 0.6557377049180327


In [10]:
# One row per model, one column per baseline method (detection counts).
df_methods = pd.DataFrame(detections).T

# remove the models that are not syntax-contaminated
syntax_contaminated_models = df_rephrase.loc[df_rephrase['syntax_contaminated'], 'model'].unique()
df_methods = df_methods.loc[syntax_contaminated_models]

# NOTE(review): the numerator counts unique models while the denominator counts
# rows of df_rephrase; they agree only if every such model appears once -- confirm.
n_syntax_contaminated = np.count_nonzero(df_rephrase['syntax_contaminated'])
for method in df_methods.columns:
    n_detected = np.count_nonzero(df_methods[method])
    print(method, n_detected, n_detected / n_syntax_contaminated)

shi 39 0.8478260869565217
mireshgallah 35 0.7608695652173914
yeom 36 0.782608695652174
carlini 35 0.7608695652173914


In [11]:
# Let's extract the shi results.
path = '../code-contamination-detection/code-contamination-output'
# recursive search for the shi results: every file called 'log.txt'
import glob

# sorted() keeps the order of `results` independent of the file system
all_files = sorted(glob.glob(path + '/**/log.txt', recursive=True))
results = []

for file in all_files:
    # the model name is the directory that directly contains log.txt;
    # os.path is used instead of splitting on '/' so this also works on Windows
    model_name = os.path.basename(os.path.dirname(file)).replace('contamination-models-', '')
    with open(file, 'r') as f:
        # only the first line mentioning 'result' is used, so stream the file
        for line in f:
            if 'result' in line:
                # the score is the last space-separated token of that line
                score = float(line.split(' ')[-1].strip())
                results.append({'score_shi': score, 'model': model_name})
                break


In [12]:
# One row per parsed log file, with the columns 'score_shi' and 'model'.
shi_df = pd.DataFrame(data=results)

In [13]:
# merge shi_df with the rephrase results
df_shi = df_rephrase.merge(shi_df, on='model', how='left', suffixes=('', '_shi'))

In [14]:
# Share of the syntax-contaminated rows whose score_shi is above 0.85.
shi_syntax_mask = df_shi['syntax_contaminated']
shi_syntax_hits = (df_shi['score_shi'] > 0.85) & shi_syntax_mask
print(np.count_nonzero(shi_syntax_hits) / np.count_nonzero(shi_syntax_mask))

0.21739130434782608


In [15]:
# Share of the contaminated rows whose score_shi is above 0.85.
shi_contaminated_mask = df_shi['contaminated']
shi_contaminated_hits = (df_shi['score_shi'] > 0.85) & shi_contaminated_mask
print(np.count_nonzero(shi_contaminated_hits) / np.count_nonzero(shi_contaminated_mask))

0.16393442622950818
