# Data Processing for Dataset 3

## 1. Loading the Dataset

In [None]:
import pandas as pd

# Load the CSV. The path is kept in one variable so the error message below
# always names the file that was actually read.
data_path = "../data.csv"
df3 = pd.read_csv(data_path)

# Sanity check: everything downstream derives the label from 'source'
if "source" not in df3.columns:
    raise KeyError(f"Column 'source' not found in {data_path}")

# Unique values (NaN excluded so that sorted() only compares strings)
unique_sources = df3["source"].dropna().unique()
print(f"Unique 'source' values ({len(unique_sources)}):")
print(sorted(unique_sources))

# (Optional) Counts per value
print("\nValue counts (including NaN):")
print(df3["source"].value_counts(dropna=False))

Unique 'source' values (63):
['Bloom-7B', 'Claude-Instant-v1', 'Claude-v1', 'Cohere-Command', 'Dolphin-2.5-Mixtral-8x7B', 'Dolphin-Mixtral-8x7B', 'Falcon-180B', 'Flan-T5-Base', 'Flan-T5-Large', 'Flan-T5-Small', 'Flan-T5-XL', 'Flan-T5-XXL', 'GLM-130B', 'GPT-3.5', 'GPT-4', 'GPT-J', 'GPT-NeoX', 'Gemini-Pro', 'Goliath-120B', 'Human', 'LLaMA-13B', 'LLaMA-2-70B', 'LLaMA-2-7B', 'LLaMA-30B', 'LLaMA-65B', 'LLaMA-7B', 'LZLV-70B', 'Mistral-7B', 'Mistral-7B-OpenOrca', 'Mixtral-8x7B', 'MythoMax-L2-13B', 'Neural-Chat-7B', 'Noromaid-20B', 'Nous-Capybara-34B', 'Nous-Capybara-7B', 'Nous-Hermes-LLaMA-2-13B', 'Nous-Hermes-LLaMA-2-70B', 'OPT-1.3B', 'OPT-125M', 'OPT-13B', 'OPT-2.7B', 'OPT-30B', 'OPT-350M', 'OPT-6.7B', 'OpenChat-3.5', 'OpenHermes-2-Mistral-7B', 'OpenHermes-2.5-Mistral-7B', 'PaLM-2', 'Psyfighter-13B', 'Psyfighter-2-13B', 'RWKV-5-World-3B', 'StripedHyena-Nous-7B', 'T0-11B', 'T0-3B', 'Text-Ada-001', 'Text-Babbage-001', 'Text-Curie-001', 'Text-Davinci-001', 'Text-Davinci-002', 'Text-Davinci-003

## 2. Rename Columns and Map Labels to 1 and 0

In [3]:
# Rename 'source' -> 'generated' and binarise it: Human -> 0, everything else -> 1.
#
# The guard makes this cell safe to re-run. Without it, a second execution
# finds no 'source' column (rename is then a silent no-op), stringifies the
# already-binary labels to "0"/"1", and relabels every row as 1.
if "source" in df3.columns:
    df3 = df3.rename(columns={"source": "generated"})

    # Comparison is whitespace- and case-insensitive.
    # NOTE(review): astype(str) turns a missing source into the string "nan",
    # so such rows are labelled 1 (generated) — confirm that is intended.
    df3["generated"] = (
        df3["generated"].astype(str).str.strip().str.casefold().ne("human")
    ).astype(int)

# (Optional) sanity check
print(df3["generated"].value_counts(dropna=False))

generated
1    441230
0    347692
Name: count, dtype: int64


In [51]:
# Preview the first rows after the rename / label mapping.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours, so the stored output may not come from a top-to-bottom run.
df3.head()

Unnamed: 0,text,generated,prompt_id,text_length,word_count
0,"Federal law supersedes state law, and cannabis...",1,0,967,157
1,Miles feels restless after working all day. He...,1,0,5068,778
2,So first of I am danish. That means that I fol...,1,0,1602,267
3,In this paper we present a novel rule-based ap...,1,0,5469,848
4,"Most social progressives, love democracy, and ...",1,0,2379,380


## 3. Filter Rows

Keep only the rows whose `word_count` is between 100 and 400, inclusive.

In [4]:
def count_rows_in_range(df, col="word_count", low=100, high=400):
    """Return how many rows of ``df`` have ``low <= df[col] <= high``.

    Both bounds are inclusive. Values that fail either comparison
    (including NaN) are not counted.
    """
    values = df[col]
    within_bounds = values.between(low, high)  # inclusive on both ends by default
    return within_bounds.sum()

In [5]:
# Report how many rows the filter in the next cell will keep, before applying it
count = count_rows_in_range(df3, col="word_count", low=100, high=400)
print(f"Rows with word count between 100 and 400 inclusive: {count}")

Rows with word count between 100 and 400 inclusive: 270087


In [6]:
# Keep rows whose word_count lies in [100, 400]; between() is inclusive on both ends
df3 = df3[df3["word_count"].between(100, 400)]

In [7]:
# Row count here should equal the count printed before filtering
df3.shape

(270087, 5)

## 4. Sampling 30,000 Rows Per Label

In [8]:
target_n = 30_000

# Fail fast if any label present in the data cannot supply target_n rows
# (sampling without replacement would otherwise raise inside sample()).
counts = df3["generated"].value_counts()
too_small = counts[counts < target_n]
if not too_small.empty:
    raise ValueError(f"These labels have fewer than {target_n} rows:\n{too_small}")

# Sample exactly target_n rows per label. Each label is drawn with its own
# fresh seed of 42 and the groups are concatenated in sorted label order.
# Iterating over the groups explicitly avoids calling groupby.apply on a frame
# that still contains the grouping column, which pandas has deprecated.
per_label_samples = [
    label_rows.sample(n=target_n, random_state=42)
    for _, label_rows in df3.groupby("generated")
]
df3 = (
    pd.concat(per_label_samples)
      .sample(frac=1.0, random_state=42)  # shuffle combined result
      .reset_index(drop=True)
)

# quick confirmation
print(df3["generated"].value_counts())
print("Total rows:", len(df3))

generated
0    30000
1    30000
Name: count, dtype: int64
Total rows: 60000


  .apply(lambda g: g.sample(n=target_n, random_state=42))


## 5. Pre-Process Data & Feature Engineering

In [10]:
from joblib import Parallel, delayed
import multiprocessing
# CPU count reported by multiprocessing; used as n_jobs for the joblib calls below
num_cores = multiprocessing.cpu_count()
num_cores

32

In [12]:
from utils import preprocess_text
# Run the project-local preprocess_text helper on every text, spread over
# num_cores joblib workers, and store the results (in row order) in 'tokens'.
df3['tokens'] = Parallel(n_jobs=num_cores)(
    delayed(preprocess_text)(text) for text in df3['text']
)

### Deleting Overlaps 

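Some samples in this dataset might also appear in the training data (loaded below from `df1_cleaned_processed.csv`). To avoid leakage, any row whose preprocessed `tokens` match a row in that file is removed.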

In [14]:
# Load the previously processed dataset used as the reference for overlap
# detection (presumably the training data mentioned above — confirm).
df = pd.read_csv("./df1_cleaned_processed.csv")

In [15]:
# Intersect the two 'tokens' columns as sets. The printed number is therefore
# the count of distinct shared token strings, not the number of affected rows.
overlap = set(df['tokens']) & set(df3['tokens'])
print(f" Number of overlapping rows: {len(overlap)}")

 Number of overlapping rows: 658


In [16]:

# Drop every df3 row whose 'tokens' value also occurs in the reference frame;
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of the old one.
df3 = df3[~df3['tokens'].isin(df['tokens'])].copy()

In [17]:
# Re-check after removal: the set intersection should now be empty
overlap = set(df['tokens']) & set(df3['tokens'])
print(f" Number of overlapping rows: {len(overlap)}")

 Number of overlapping rows: 0


### Feature Engineering

In [21]:
import textstat, re, numpy as np
import nltk
from collections import Counter
# Average sentence length
def avg_sent_len(text):
    """Return the mean number of word tokens per sentence in ``text``.

    Sentences and words are split with NLTK's ``sent_tokenize`` and
    ``word_tokenize``. Returns 0 when no sentence is found.
    """
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return 0
    tokens_per_sentence = [len(nltk.word_tokenize(sentence)) for sentence in sentences]
    return np.mean(tokens_per_sentence)

# Hapax ratio
def hapax_ratio(tokens):
    """Return the share of words in ``tokens`` that occur exactly once.

    ``tokens`` is a whitespace-separated string. The result is the number of
    distinct words with a count of one divided by the total number of words.
    Returns 0 when the string contains no words, which includes the empty
    string and whitespace-only strings.
    """
    words = tokens.split()
    # Guard on the word list rather than on the raw string: "   " is truthy
    # but splits to nothing, which used to cause a ZeroDivisionError.
    if not words:
        return 0
    counts = Counter(words)
    hapax = sum(1 for c in counts.values() if c == 1)
    return hapax / len(words)

# Flesch-Kincaid
def flesch_grade(text):
    """Return the Flesch-Kincaid grade of ``text``, or 0 if it cannot be computed.

    The score comes from ``textstat.flesch_kincaid_grade``. Any ordinary
    exception raised by that call is treated as "no score" and mapped to 0.
    """
    try:
        return textstat.flesch_kincaid_grade(text)
    except Exception:   # short texts may error
        # Best-effort on purpose, but only for regular errors: a bare
        # ``except:`` would also swallow KeyboardInterrupt / SystemExit and
        # make a long parallel run impossible to cancel.
        return 0

In [22]:
# Mean sentence length, computed from the raw text across num_cores workers
df3['avg_sent_len'] = Parallel(n_jobs=num_cores)(
    delayed(avg_sent_len)(raw_text) for raw_text in df3['text']
)

# Hapax ratio, computed from the preprocessed token strings
df3['hapax_ratio'] = Parallel(n_jobs=num_cores)(
    delayed(hapax_ratio)(token_string) for token_string in df3['tokens']
)

# Flesch-Kincaid grade, computed from the raw text
df3['flesch_grade'] = Parallel(n_jobs=num_cores)(
    delayed(flesch_grade)(raw_text) for raw_text in df3['text']
)

In [23]:
def _type_token_ratio(tokens):
    """Return distinct words / total words for a token string, or 0 if it has no words."""
    words = str(tokens).split()
    if len(words) > 0:
        return len(set(words)) / len(words)
    return 0


# Type-token ratio of the preprocessed tokens
df3['ttr'] = df3['tokens'].apply(_type_token_ratio)

In [24]:
# Preview the frame with the new tokens / avg_sent_len / hapax_ratio / flesch_grade / ttr columns
df3.head()

Unnamed: 0,text,generated,prompt_id,text_length,word_count,tokens,avg_sent_len,hapax_ratio,flesch_grade,ttr
0,With parents both graduating from teacher coll...,0,0,876,145,parent graduating teacher college teaching alw...,19.875,0.8,10.440129,0.8875
1,"Once, a man and a woman died at the same time ...",1,0,1774,324,man woman died time sent hell together satan s...,15.44,0.539474,5.525511,0.684211
2,"Once upon a time, there was a woman named Emil...",1,0,641,118,upon time woman named emily loved bake decided...,19.0,0.649123,7.084286,0.789474
3,"They're called ""contrails,"" which is short for...",0,0,1220,197,called contrail short condensation trail creat...,21.636364,0.513761,9.68052,0.697248
4,I personally think that for once in a lifetime...,0,2,1901,357,personally think lifetime america say america ...,53.857143,0.487013,18.770053,0.655844


## 6. Save the Dataset

In [25]:
# Write the feature-engineered frame to CSV without the pandas index.
# NOTE(review): "60k" in the filename is the sample size before the overlap
# rows were removed, so the saved frame has fewer rows — confirm the name is intended.
df3.to_csv("processed_df3_60k_fe.csv", index=False)
