<a href="https://colab.research.google.com/github/ephipie/human-ai-parallel-detection/blob/main/LLM_Detection_01_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets -q

In [None]:
import os

import numpy as np
import pandas as pd
from datasets import load_dataset
# Seed NumPy's global random state so the np.random.choice sampling done in
# sample_groups gives the same selection on every fresh run.
np.random.seed(42)

In [None]:
# Upper bound on the number of distinct serial numbers kept per domain;
# passed as `n` to sample_groups.
NUM_SAMPLES_PER_SOURCE_TYPE = 100

In [None]:
# Load the "browndw/human-ai-parallel-corpus" dataset with datasets.load_dataset.
orig_dataset = load_dataset("browndw/human-ai-parallel-corpus")

In [None]:
# Take a peek at the dataset structure; only its 'train' split is used below.
orig_dataset

In [None]:
# Materialize the 'train' split as a pandas DataFrame for the steps below.
orig_df = pd.DataFrame(orig_dataset['train'])

In [None]:
def display_full(df):
    """Render ``df`` with pandas' display truncation switched off.

    The row, column, column-width and total-width display limits are set to
    ``None`` only for the duration of the call; ``pd.option_context``
    restores the previous settings on exit.

    Args:
        df: Object to hand to the notebook's ``display`` function.
    """
    unlimited_options = (
        'display.max_rows',
        'display.max_columns',
        'display.max_colwidth',
        'display.width',
    )
    # option_context expects a flat (name, value, name, value, ...) sequence.
    overrides = [arg for name in unlimited_options for arg in (name, None)]
    with pd.option_context(*overrides):
        display(df)

# Preview the first two rows of orig_df.
orig_df.head(2)

In [None]:
def parse_id_column(df, id_col='doc_id'):
    """Split an identifier column into ``domain``, ``serial_num`` and ``model``.

    Identifiers are parsed as ``<domain>_<serial_num>@<model>``: the text
    before the first ``_`` becomes ``domain``, the text between that ``_``
    and the first ``@`` becomes ``serial_num``, and everything after the
    first ``@`` becomes ``model``.

    The input frame is left untouched; the new columns are added to a copy,
    so the caller's original data stays available under its own name.

    Args:
        df: DataFrame containing the string column ``id_col``.
        id_col: Name of the identifier column to parse.

    Returns:
        A copy of ``df`` with ``domain``, ``serial_num`` and ``model`` added.
    """
    parsed = df.copy()
    # n=1 splits on the first delimiter only, so any later '_' or '@'
    # characters stay inside the trailing part (e.g. a model name with '_').
    parsed[['domain', 'serial_num']] = parsed[id_col].str.split('_', n=1, expand=True)
    parsed[['serial_num', 'model']] = parsed['serial_num'].str.split('@', n=1, expand=True)
    return parsed

# Derive the domain / serial_num / model columns from doc_id and preview them.
df = parse_id_column(orig_df)
df.head()

In [None]:
# Number of rows for every (domain, model) combination, largest groups first
source_model_counts = (
    df.groupby(['domain', 'model'])
    .size()
    .reset_index(name='count')
)
print(source_model_counts.sort_values(by='count', ascending=False))

In [None]:
def sample_groups(df, n):
    """Sample up to ``n`` serial numbers per domain and keep all their rows.

    For every distinct value of ``df['domain']``, at most ``n`` distinct
    ``serial_num`` values are chosen without replacement using NumPy's global
    random state (seed it beforehand for reproducible results). Every row
    that carries a chosen serial number is kept, so rows sharing a serial
    number within a domain always stay together.

    Args:
        df: DataFrame with at least ``domain`` and ``serial_num`` columns.
        n: Maximum number of distinct serial numbers to keep per domain.

    Returns:
        A new DataFrame holding the selected rows, ordered by domain and then
        by sampled serial number, with the original index labels. When no
        rows are selected, an empty DataFrame with the columns of ``df``.
    """
    selected_parts = []
    for source_type in df['domain'].unique():
        source_df = df[df['domain'] == source_type]
        serial_nums = source_df['serial_num'].unique()
        if len(serial_nums) <= n:
            # Not more candidates than requested: keep every serial number.
            sampled_serial_nums = serial_nums
        else:
            sampled_serial_nums = np.random.choice(serial_nums, size=n, replace=False)

        # Collect the pieces and concatenate once at the end: growing a
        # DataFrame with pd.concat inside the loop re-copies every
        # previously collected row on each iteration.
        selected_parts.extend(
            source_df[source_df['serial_num'] == serial_num]
            for serial_num in sampled_serial_nums
        )

    if not selected_parts:
        # Keep the column layout so downstream groupby/pivot calls still work.
        return pd.DataFrame(columns=df.columns)
    return pd.concat(selected_parts)

# Draw the per-domain sample, then report how many rows each domain kept.
sample_df = sample_groups(df, NUM_SAMPLES_PER_SOURCE_TYPE)
print(
    sample_df
    .groupby("domain")
    .size()
    .reset_index(name='count')
)

In [None]:
def transpose_df(df):
    """Reshape long-format rows into one row per (serial_num, domain).

    Each distinct ``model`` value becomes its own column, filled with the
    first ``text`` value found for that (serial_num, domain, model)
    combination.

    Args:
        df: DataFrame with ``serial_num``, ``domain``, ``model`` and ``text``
            columns.

    Returns:
        A DataFrame with ``serial_num`` and ``domain`` as regular columns
        followed by one column per model.
    """
    wide = pd.pivot_table(
        df,
        values='text',
        index=['serial_num', 'domain'],
        columns='model',
        aggfunc='first',
    )
    # Turn the (serial_num, domain) index levels back into ordinary columns.
    return wide.reset_index()

# Pivot the sampled rows to one row per (serial_num, domain) and print the
# resulting number of rows.
sample_df_T = transpose_df(sample_df)
print(len(sample_df_T))

In [None]:
# Preview the pivoted table (one column per model value).
sample_df_T.head()

In [None]:
# Give the two model columns short names, then keep only the columns we need.
short_names = {
    'Meta-Llama-3-70B-Instruct': 'llama',
    'gpt-4o-2024-08-06': 'gpt',
}
kept_columns = ['serial_num', 'domain', 'chunk_1', 'chunk_2', 'gpt', 'llama']

working_df = sample_df_T.rename(columns=short_names)[kept_columns]

# List the resulting column names, one per line
print("\nAfter renaming\n")
for column_name in working_df.columns:
    print(column_name)

In [None]:
# Mount Google Drive (Colab only) and write the working table there as parquet.
from google.colab import drive
drive.mount('/content/drive')

output_folder = '/content/drive/MyDrive/shared_data'

# exist_ok=True creates the folder when it is missing and does nothing when
# it already exists, so no separate os.path.exists check is needed.
os.makedirs(output_folder, exist_ok=True)

working_df.to_parquet(f'{output_folder}/llm_detection_data.parquet')