# EDA 2

## Loading the Dataset

In [130]:
import pandas as pd

# Location of the raw corpus, relative to the notebook's working directory
DATA_PATH = './AI_Human.csv'
df = pd.read_csv(DATA_PATH)

# Column names, dtypes, non-null counts and memory footprint
df.info()

# Preview of the first five records
df.head(5)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 487235 entries, 0 to 487234
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   text       487235 non-null  object 
 1   generated  487235 non-null  float64
dtypes: float64(1), object(1)
memory usage: 7.4+ MB


Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0.0
1,Transportation is a large necessity in most co...,0.0
2,"""America's love affair with it's vehicles seem...",0.0
3,How often do you ride in a car? Do you drive a...,0.0
4,Cars are a wonderful thing. They are perhaps o...,0.0


In [131]:
# Full text of the row labelled 0, to see what a raw essay looks like
df.loc[0, 'text']

'Cars. Cars have been around since they became famous in the 1900s, when Henry Ford created and built the first ModelT. Cars have played a major role in our every day lives since then. But now, people are starting to question if limiting car usage would be a good thing. To me, limiting the use of cars might be a good thing to do.\n\nIn like matter of this, article, "In German Suburb, Life Goes On Without Cars," by Elizabeth Rosenthal states, how automobiles are the linchpin of suburbs, where middle class families from either Shanghai or Chicago tend to make their homes. Experts say how this is a huge impediment to current efforts to reduce greenhouse gas emissions from tailpipe. Passenger cars are responsible for 12 percent of greenhouse gas emissions in Europe...and up to 50 percent in some carintensive areas in the United States. Cars are the main reason for the greenhouse gas emissions because of a lot of people driving them around all the time getting where they need to go. Article

## Check Null Values & Noise

In [132]:
# Per-column count of missing entries
df.isna().sum()


text         0
generated    0
dtype: int64

## Data Cleaning and Processing

In [133]:
# Flag every row whose text occurs more than once (all copies, not just the repeats)
duplicate_mask = df['text'].duplicated(keep=False)
df.loc[duplicate_mask]

Unnamed: 0,text,generated


In [134]:
def remove_tags(text):
    """Normalise line breaks and apostrophes in one essay.

    Each newline character becomes a single space and every apostrophe
    (single quote) is deleted, so "it's" turns into "its".

    Args:
        text: The raw essay string.

    Returns:
        The cleaned string.
    """
    without_newlines = text.replace('\n', ' ')
    return without_newlines.replace('\'', '')


# Clean every essay element-wise (newlines -> spaces, apostrophes dropped)
df['text'] = df['text'].map(remove_tags)

# https://www.kaggle.com/code/saurabhkailaskuche/ai-generated-vs-human

In [135]:
# Word count per essay (whitespace-delimited); str() guards against non-string cells
df['text_length'] = df['text'].apply(lambda x: len(str(x).split()))

# Rows that are too short to be real essays (template placeholders, bare prompts, ...).
# The cutoff is 15 words -- not 0 -- so the report below states the threshold actually used.
empty_rows = df[df['text_length'] <= 15]

# Display the number of such rows and show them for manual inspection
print(f"Number of rows with text length <= 15: {len(empty_rows)}")
empty_rows

Number of rows with text length 0: 32


Unnamed: 0,text,generated,text_length
2380,Code] [Email Address] [Phone Number],1.0,5
2381,] [Email] [Phone Number],1.0,4
2384,],1.0,1
2385,] [Email] [Phone Number],1.0,4
2388,] [Email Address] [Phone Number],1.0,5
28737,Facial action coding,1.0,3
29318,Community service is an integral part of ever...,1.0,14
29331,Community service.,1.0,2
29337,Community service refers to the activities an...,1.0,11
29374,"Write an essay on the topic ""A Cowboy Who Rod...",1.0,12


In [136]:
# Keep only essays longer than 15 words, i.e. exactly the complement of the rows
# inspected above with `text_length <= 15` (the previous `>= 15` kept 15-word rows
# that the inspection cell had flagged). On this dataset no essay has exactly
# 15 words, so the resulting row count is unchanged.
# `.copy()` makes the result an independent frame so later column assignments
# (e.g. df['tokens'] = ...) do not raise SettingWithCopyWarning.
df = df[df['text_length'] > 15].copy()

In [137]:
from utils import preprocess_text

## Process the Data in Parallel using Joblib

In [138]:
from joblib import Parallel, delayed
import multiprocessing
# CPU count reported by multiprocessing; passed to joblib as n_jobs in the next cell
num_cores = multiprocessing.cpu_count()
num_cores

32

In [139]:
# Lazily describe one preprocessing task per essay, then fan them out over all cores.
# joblib returns results in input order, so the list lines up with df's rows.
token_jobs = (delayed(preprocess_text)(essay) for essay in df['text'])
df['tokens'] = Parallel(n_jobs=num_cores)(token_jobs)

In [140]:
# Preview the first rows to confirm the new 'tokens' column was populated
df.head()

Unnamed: 0,text,generated,text_length,tokens
0,Cars. Cars have been around since they became ...,0.0,584,car car around since became famous 1900s henry...
1,Transportation is a large necessity in most co...,0.0,462,transportation large necessity country worldwi...
2,"""Americas love affair with its vehicles seems ...",0.0,744,america love affair vehicle seems cooling say ...
3,How often do you ride in a car? Do you drive a...,0.0,686,often ride car drive one motor vehicle work st...
4,Cars are a wonderful thing. They are perhaps o...,0.0,871,car wonderful thing perhaps one world greatest...


In [141]:
# Turn each space-separated token string into a list, then emit one row per token
token_lists = df['tokens'].str.split()
all_tokens = token_lists.explode()

# Vocabulary size = number of distinct tokens over the whole corpus
unique_token_count = all_tokens.nunique()
print("Number of unique tokens:", unique_token_count)



Number of unique tokens: 250972


In [142]:
# (rows, columns) after the short-text filter and the added text_length/tokens columns
df.shape

(487203, 4)

## Find & Filter Similar Rows (added later)

In [143]:
# ---------- Config ----------
# NOTE(review): this was previously described as a "90% overlapping tokens" requirement,
# but the value applied is 0.85 (85%) -- confirm which cutoff is intended.
THRESH_OVERLAP = 0.85        # minimum overlap coefficient for a verified near-duplicate pair
NUM_PERM = 64                # MinHash permutations (64 is a good accuracy/memory tradeoff)
# Passed to MinHashLSH as its Jaccard threshold for candidate retrieval; the final
# decision is made on the exact overlap coefficient, not on Jaccard.
LSH_JACCARD_THRESHOLD = 0.80 # LSH threshold (slightly lower to avoid missing borderline pairs)

# ---------- Imports ----------
from datasketch import MinHash, MinHashLSH
from joblib import Parallel, delayed
import math

# ---------- Helpers ----------
def tokens_to_set(tokens_str: str):
    """Return the distinct tokens of a whitespace-separated token string.

    Args:
        tokens_str: Tokens joined by whitespace, as stored in the 'tokens' column.

    Returns:
        A set holding each token once; empty for an empty or all-whitespace string.
    """
    words = tokens_str.split()
    return set(words)

def minhash_from_set(s: set, num_perm=NUM_PERM) -> MinHash:
    """Build a MinHash signature from a set of string tokens.

    Args:
        s: Distinct tokens of one document.
        num_perm: Number of permutations for the signature (defaults to NUM_PERM).

    Returns:
        A MinHash updated with the UTF-8 bytes of every token in ``s``.
    """
    sketch = MinHash(num_perm=num_perm)
    for token in s:
        sketch.update(token.encode('utf-8'))
    return sketch

def overlap_coeff(a: set, b: set) -> float:
    """Overlap coefficient of two sets: |A ∩ B| / min(|A|, |B|).

    Args:
        a: First token set.
        b: Second token set.

    Returns:
        A float in [0, 1]; 0.0 when either set is empty.
    """
    if a and b:
        shared = len(a.intersection(b))
        smaller = min(len(a), len(b))
        return shared / float(smaller)
    return 0.0

# ---------- Main (Incremental LSH to limit memory) ----------
# We iterate rows once; for each row:
# 1) build its MinHash
# 2) query LSH for candidates among prior rows
# 3) verify with exact overlap
# 4) insert into LSH
# This avoids querying every pair twice and keeps RAM in check.
#
# All indices stored below are POSITIONS in df (0..len(df)-1), not index labels,
# because the LSH keys come from enumerate().
#
# Caveat: candidates are retrieved by estimated Jaccard similarity while pairs are
# accepted on the overlap coefficient. Overlap >= Jaccard, so a short text fully
# contained in a much longer one can have a high overlap yet never be retrieved.

tokens_series = df['tokens']      # <- your DataFrame column
# Materialise the column once: plain-list indexing inside the hot loop is far cheaper
# than a Series.iloc[j] scalar lookup per candidate, and returns the same strings.
tokens_list = tokens_series.tolist()
lsh = MinHashLSH(threshold=LSH_JACCARD_THRESHOLD, num_perm=NUM_PERM)

similar_pairs = []  # will store tuples: (i, j, overlap)
# If you expect many matches, consider writing to disk in chunks instead of keeping all in RAM.

for i, tok_str in enumerate(tokens_list):
    A = tokens_to_set(tok_str)

    # Build MinHash for this row
    mh = minhash_from_set(A)

    # Get candidates among previously inserted rows
    candidates = lsh.query(mh)

    # Verify with exact overlap, report each pair once (j < i because only previous inserted)
    for j in candidates:
        B = tokens_to_set(tokens_list[j])
        oc = overlap_coeff(A, B)
        if oc >= THRESH_OVERLAP:
            similar_pairs.append((j, i, oc))  # store (older_index, current_index, score)

    # Insert current row into index *after* querying to avoid self/duplicate matches
    lsh.insert(i, mh)

# Convert to a DataFrame if you want
import pandas as pd
pairs_df = pd.DataFrame(similar_pairs, columns=['idx_a', 'idx_b', 'overlap_coeff'])
print("Found candidate near-duplicates:", len(pairs_df))
pairs_df.head()


Found candidate near-duplicates: 1568155


Unnamed: 0,idx_a,idx_b,overlap_coeff
0,1385,1389,0.906404
1,1385,1391,0.866995
2,1389,1391,0.914692
3,1395,1397,0.930693
4,1395,1398,0.955224


In [144]:
import numpy as np
import pandas as pd

# --- Inputs assumed ---
# df: the working DataFrame, with a 'generated' column holding the 0/1 label
# pairs_df: near-duplicate edges with columns ['idx_a','idx_b','overlap_coeff']

# Fail fast if either input is missing what this cell needs
assert 'generated' in df.columns
required_edge_cols = {'idx_a', 'idx_b'}
assert required_edge_cols.issubset(pairs_df.columns)

# Labels as a plain array so they can be looked up by row position
labels_arr = df['generated'].astype(int).to_numpy()
N = len(df)

# Edge endpoints as int64 arrays
a, b = (pairs_df[col].to_numpy(dtype=np.int64) for col in ('idx_a', 'idx_b'))

# --- Union-Find (Disjoint Set) over N rows (fast & memory-light) ---
class DSU:
    """Disjoint-set forest over the integers 0..n-1.

    ``parent[i]`` points towards the representative of i's set and ``size[r]``
    holds the number of members of the set whose representative is r.
    """

    def __init__(self, n):
        self.parent = np.arange(n, dtype=np.int64)
        self.size = np.ones(n, dtype=np.int64)

    def find(self, x):
        """Return the representative of x's set, shortening the path on the way up."""
        parent = self.parent
        while True:
            above = parent[x]
            if above == x:
                return x
            # Re-point x at its grandparent, then continue from there
            parent[x] = parent[above]
            x = parent[x]

    def union(self, x, y):
        """Merge the sets containing x and y; the larger set's root stays the root."""
        keep, absorb = self.find(x), self.find(y)
        if keep != absorb:
            if self.size[keep] < self.size[absorb]:
                keep, absorb = absorb, keep
            self.parent[absorb] = keep
            self.size[keep] += self.size[absorb]

# Merge every near-duplicate pair; connected components become the similarity groups
dsu = DSU(N)
for i, j in zip(a, b):
    dsu.union(i, j)

# Only nodes that appear in at least one pair
involved = np.unique(np.concatenate([a, b]))

# Root for each involved node -> group id 0..G-1
roots = np.array([dsu.find(i) for i in involved], dtype=np.int64)
uniq_roots, group_ids = np.unique(roots, return_inverse=True)

# Build nodes_df: each member with its group and label
nodes_df = pd.DataFrame({
    'row_idx': involved,           # positional row index in df
    'group_id': group_ids,         # 0..num_groups-1
    'label':   labels_arr[involved]
})

# Per-group label composition.
# reindex guarantees both label columns exist even when one class never appears
# among the similar rows (or when there are no pairs at all); without it the
# 'n_human' + 'n_ai' sum below raises KeyError.
label_counts = (nodes_df
                .groupby(['group_id','label'])
                .size()
                .unstack(fill_value=0)
                .reindex(columns=[0, 1], fill_value=0)
                .rename(columns={0:'n_human', 1:'n_ai'}))

label_counts['size'] = label_counts['n_human'] + label_counts['n_ai']
label_counts = label_counts.sort_values('size', ascending=False)

# Keep only true "similarity groups" (size >= 2)
label_counts = label_counts[label_counts['size'] >= 2]
num_groups = label_counts.shape[0]

print(f"Total similar groups (connected components, size ≥ 2): {num_groups}")

# Pure vs mixed groups: a group is pure when one of the two label counts is zero
pure_groups = (label_counts[['n_human','n_ai']].min(axis=1) == 0).sum()
mixed_groups = num_groups - pure_groups
print(f"Pure groups: {pure_groups}   Mixed groups: {mixed_groups}")

# Size distribution (first few sizes)
print("\nGroup size distribution (size -> #groups):")
print(label_counts['size'].value_counts().sort_index().head(20))

# Top 10 largest groups with composition
summary = label_counts.reset_index()
print("\nTop 10 largest groups with label composition:")
print(summary.head(10))

# Members of the largest group (positional row indices in df)
largest_gid = summary.iloc[0]['group_id'] if len(summary) else None
if largest_gid is not None:
    members_largest = nodes_df.loc[nodes_df['group_id'] == largest_gid, 'row_idx'].tolist()
    print(f"\nLargest group id: {largest_gid}  size: {len(members_largest)}")
    # Example: inspect their labels quickly
    print(pd.Series(labels_arr[members_largest]).value_counts().rename(index={0:'human',1:'ai'}))
    # You can also peek at the texts:
    # df.iloc[members_largest][['generated','text']].head()


Total similar groups (connected components, size ≥ 2): 52354
Pure groups: 52317   Mixed groups: 37

Group size distribution (size -> #groups):
size
2     3013
3     1764
4     3660
5     6356
6     8320
7     8512
8     6340
9     4027
10    2049
11    1155
12     982
13    1029
14    1006
15     925
16     755
17     536
18     397
19     292
20     187
21     169
Name: count, dtype: int64

Top 10 largest groups with label composition:
label  group_id  n_human  n_ai  size
0          2149       74     0    74
1           512       32     0    32
2           239       31     0    31
3           114       31     0    31
4          1144       31     0    31
5           295       31     0    31
6           203       31     0    31
7           385       31     0    31
8           708       30     0    30
9          1028       30     0    30

Largest group id: 2149  size: 74
human    74
Name: count, dtype: int64


In [145]:
import numpy as np

# Both inputs must be present before any counting happens
required_cols = {'idx_a', 'idx_b'}
assert required_cols.issubset(pairs_df.columns)
assert 'generated' in df.columns

# Labels addressed by row position
labels_arr = df['generated'].astype(int).to_numpy()

# Every row position that is an endpoint of at least one similar pair
# (np.unique flattens the two-column endpoint matrix before de-duplicating)
endpoint_matrix = pairs_df[['idx_a', 'idx_b']].to_numpy()
involved = np.unique(endpoint_matrix)

# Class counts among those rows
involved_labels = labels_arr[involved]
n_label0 = (involved_labels == 0).sum()
n_label1 = (involved_labels == 1).sum()
n_involved = involved.size

print(f"Similar rows total: {n_involved}")
print(f"Label 0 (human)   : {n_label0}")
print(f"Label 1 (AI)      : {n_label1}")
print(f"Label 0 %         : {n_label0 / n_involved:.3%}")
print(f"Label 1 %         : {n_label1 / n_involved:.3%}")


Similar rows total: 403589
Label 0 (human)   : 262659
Label 1 (AI)      : 140930
Label 0 %         : 65.081%
Label 1 %         : 34.919%


In [146]:
# Representative of each similarity group = its first listed member
members_by_group = nodes_df.groupby('group_id')['row_idx']
keep_idx = members_by_group.first().to_numpy()

# Rows to keep: everything that is in no group, plus one representative per group
all_involved = set(nodes_df['row_idx'])
ungrouped_rows = set(range(len(df))).difference(all_involved)
rows_to_keep = set(keep_idx).union(ungrouped_rows)

# Select by position in ascending order and renumber the index from 0
ordered_positions = sorted(rows_to_keep)
df_dedup = df.iloc[ordered_positions].reset_index(drop=True)

print(f"Original rows: {len(df)}")
print(f"After removing near-duplicates: {len(df_dedup)}")


Original rows: 487203
After removing near-duplicates: 135968


In [147]:
# Rebind `df` to the de-duplicated frame and remove the temporary name.
# NOTE(review): this cell cannot be re-run on its own -- after the `del`, a second
# execution raises NameError on df_dedup, and the pre-dedup frame is no longer reachable via `df`.
df = df_dedup
del df_dedup
df.shape

(135968, 4)

In [148]:
# Write the de-duplicated frame (all current columns, no index column) to a CSV
# file in the working directory
df.to_csv("df_drop_duplicates.csv", index=False)