# Import Libraries & Dataset

In [None]:
# Import libraries
import re

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud

In [None]:
# Load dataset
def load_parquet_files(file_paths):
    """Read each Parquet file and stack the results into one DataFrame.

    Parameters
    ----------
    file_paths : iterable of str or path-like
        Locations of the Parquet files; they are read in the order given.

    Returns
    -------
    pandas.DataFrame
        The rows of all files concatenated in input order, renumbered with
        a fresh 0..n-1 index (the per-file indexes are discarded).
    """
    shards = []
    for path in file_paths:
        shards.append(pd.read_parquet(path, engine = "pyarrow"))
    return pd.concat(shards, ignore_index = True)

# The seven shards of the training split, listed in shard order.
parquet_files = [
    "data/train-00000-of-00007-bc5952582e004d67.parquet",
    "data/train-00001-of-00007-71c80017bc45f30d.parquet",
    "data/train-00002-of-00007-ee2d43f396e78fbc.parquet",
    "data/train-00003-of-00007-529931154b42b51d.parquet",
    "data/train-00004-of-00007-b269dc49374a2c0b.parquet",
    "data/train-00005-of-00007-3dce5e05ddbad789.parquet",
    "data/train-00006-of-00007-3d8a471ba0cf1c8d.parquet",
]

# Read every shard into a single frame and preview its first rows.
df_full = load_parquet_files(parquet_files)
df_full.head()

In [None]:
# Full dataset's shape: (rows, columns) of the frame built from all shards
df_full.shape

In [None]:
# Work on a stratified subsample: keep `sample_fraction` of the rows while
# preserving the proportion of each 'source' label. The fixed random_state
# makes the draw reproducible; the index is renumbered afterwards.
sample_fraction = 0.1
df = train_test_split(
    df_full,
    train_size = sample_fraction,
    stratify = df_full['source'],
    random_state = 1,
)[0].reset_index(drop = True)
df.shape

# Exploratory Data Analysis (EDA)

In [None]:
# Class distribution: number of sampled rows per 'source' label
fig, ax = plt.subplots(figsize = (13, 6))
sns.countplot(data = df, x = 'source', ax = ax)
ax.set_title("Target Class Distribution")
plt.show()

In [None]:
# Text length analysis: whitespace-separated word count per document,
# stored in 'text_length' and plotted as a histogram split by 'source'
df['text_length'] = df['text'].map(lambda doc: len(doc.split()))

fig, ax = plt.subplots(figsize = (13, 6))
sns.histplot(data = df, x = 'text_length', hue = 'source', bins = 50, ax = ax)
ax.set_title("Text Length Distribution")
plt.show()

In [None]:
# Text length per 'source' as a box plot (outliers included)
fig, ax = plt.subplots(figsize = (13, 6))
sns.boxplot(data = df, x = 'text_length', y = 'source', ax = ax)
ax.set_title("Average Text Length Distribution")
plt.show()

In [None]:
# Same box plot with outlier points hidden (showfliers = False), so the
# boxes themselves are easier to compare
fig, ax = plt.subplots(figsize = (13, 6))
sns.boxplot(data = df, x = 'text_length', y = 'source', showfliers = False, ax = ax)
ax.set_title("Average Text Length Distribution (remove outliers)")
plt.show()

In [None]:
# Text length distribution: summary statistics of 'text_length' per 'source'.
# The grouping key is not re-selected as a data column: it is already the
# row index of the result, and describe() summarises only numeric columns
# by default, so listing 'source' contributed nothing to the output.
# The double brackets keep the two-level (column, statistic) header.
df.groupby("source")[['text_length']].describe()

In [None]:
# Word cloud per label (built from at most the first 1000 rows of each label)
labels = df['source'].unique()  # computed once, not on every loop iteration

# One subplot row per label, so the loop works for any number of classes
# instead of assuming exactly two. squeeze = False always returns a 2-D array
# (even for a single row); ravel() flattens it so `axs` is 1-D.
# The height scales with the row count: two labels give a 100 x 15 figure.
n_rows = max(len(labels), 1)
fig, axs = plt.subplots(n_rows, 1, figsize = (100, 7.5 * n_rows), squeeze = False)
axs = axs.ravel()

for ax, label in zip(axs, labels):
    # astype(str) guards the join against non-string cells.
    txt = ' '.join(df.loc[df['source'] == label, 'text'].head(1000).astype(str))
    wordcloud = WordCloud().generate(txt)

    ax.imshow(wordcloud, interpolation = 'bilinear')
    ax.axis("off")
    ax.set_title(f'"{label}" Word Cloud')

plt.show()

# Text Preprocessing

In [None]:
# Text sample: raw text of the row labelled 0
print(df.loc[0, 'text'])

In [None]:
# Basic text preprocessing
def preprocess_text(text):
    """Normalise a raw document into lowercase alphanumeric words.

    Steps: lowercase; delete every character that is not an ASCII letter,
    a digit or whitespace; collapse each run of whitespace (spaces, tabs,
    newlines) into a single space; trim both ends.

    Parameters
    ----------
    text : str
        Raw document. Non-string values (e.g. None or NaN from a missing
        cell) are treated as empty text.

    Returns
    -------
    str
        The cleaned text, or '' when nothing usable remains.
    """
    if not isinstance(text, str):
        # A missing cell has no text; return '' rather than failing on .lower().
        return ''
    text = text.lower()
    # After lowercasing, a-z covers every ASCII letter. Removed characters
    # are deleted, not replaced with a space: "don't" -> "dont".
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Newlines count as whitespace, so they are turned into a single space
    # here. Deleting them outright would glue the last word of one line to
    # the first word of the next ("end\nstart" -> "endstart").
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Clean every document, keep only rows whose cleaned text is longer than
# 5 characters, and renumber the index after filtering.
df['clean_text'] = df['text'].map(preprocess_text)
df = df.loc[df['clean_text'].str.len().gt(5)].reset_index(drop = True)

In [None]:
# Keep only rows whose 'clean_text' is present (not NaN) and not blank
# once surrounding whitespace is stripped.
df = df[df['clean_text'].notna() & df['clean_text'].str.strip().ne('')]

In [None]:
# TF-IDF feature extraction: unigrams and bigrams, English stop words
# removed, vocabulary capped at 5000 terms.
# NOTE(review): the vectorizer is fit on every row of `df`; if a train/test
# split happens in a later cell, test documents influence the vocabulary and
# IDF weights — confirm against the rest of the notebook.
vectorizer = TfidfVectorizer(max_features = 5000, stop_words = 'english', ngram_range = (1, 2))
X = vectorizer.fit_transform(df['clean_text'])

# Encode the target: human -> 0, ai -> 1.
y = df['source'].map({'human': 0, 'ai': 1})
# Series.map turns every label missing from the dict into NaN (and y into a
# float column) without complaint. Fail here with the offending labels
# instead of passing a corrupted target downstream.
if y.isna().any():
    unexpected = sorted(df.loc[y.isna(), 'source'].astype(str).unique())
    raise ValueError(f"Unmapped 'source' labels: {unexpected}")

In [None]:
# Handling class imbalance using SMOTE (disabled; kept for reference only)
# smote = SMOTE(random_state = 1)
# X_resampled, y_resampled = smote.fit_resample(X, y)

In [None]:
# Handle class imbalance by random oversampling: only the minority class is
# resampled (sampling_strategy = 'minority'); random_state makes the draw
# reproducible. RandomOverSampler is imported with the other libraries at the
# top of the notebook.
# NOTE(review): resampling is applied to the whole of X/y; if a train/test
# split happens in a later cell, copies of the same row can end up on both
# sides of it — confirm against the rest of the notebook.
ros = RandomOverSampler(sampling_strategy = 'minority', random_state = 1)
X_resampled, y_resampled = ros.fit_resample(X, y)