# Data Processing for Dataset 1 - AI Vs Human Text 

## 1. Load the Dataset

In [1]:
import pandas as pd

# Load the cleaned, feature-engineered Dataset 1 from a path relative to the notebook.
# The first CSV column is an unnamed running counter (it shows up as "Unnamed: 0"
# when read naively), which looks like a previously saved row index. Reading it
# as the index keeps that spurious column out of the frame.
df = pd.read_csv('./df1_cleaned_processed.csv', index_col=0)

# Display the first few rows of the dataset
df.head()


Unnamed: 0,text,generated,text_length,tokens,token_length,ttr,avg_sent_len,hapax_ratio,flesch_grade
0,Cars are a wonderful thing. They are perhaps o...,0.0,871,car wonderful thing perhaps one world greatest...,470,0.497872,19.3,0.317021,7.904784
1,"cars are everywhere these days, and they are c...",0.0,633,car everywhere day commonplace among u getting...,380,0.544737,19.388889,0.357895,8.844641
2,"One uses a car to go to thee store, pick someo...",0.0,392,one us car go thee store pick someone even go ...,251,0.561753,16.814815,0.390438,7.509796
3,The electoral college ii iomething that hai be...,0.0,493,electoral college ii iomething hai loved hated...,262,0.538168,21.88,0.366412,9.325487
4,"Dear saaae senaaor, My leaaer is in regards ao...",0.0,667,dear saaae senaaor leaaer regard ao changing a...,467,0.423983,18.5,0.265525,7.967523


## 2. TF-IDF - Logistic Regression w/ Engineered Features

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [3]:
# ---- schema the model relies on ----
# "tokens" holds the text to vectorize; the remaining entries are numeric features.
FEATURE_COLS = ["tokens", "ttr", "hapax_ratio", "flesch_grade"]
TARGET_COL = "generated"

# Fail fast, naming every absent column, before any training starts.
required_cols = [*FEATURE_COLS, TARGET_COL]
missing = [col for col in required_cols if col not in df.columns]
assert not missing, f"Training df is missing columns: {missing}"

### 2.1 Train-test split

In [None]:
# Label vector and feature frame, selected through the shared column constants.
y = df[TARGET_COL]
X = df[FEATURE_COLS]

# 80/20 hold-out split. Stratifying on the label keeps the class ratio the same
# in both parts; the fixed seed makes the split repeatable.
split_kwargs = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

### 2.2 TF-IDF Vectorizer

In [5]:
# TF-IDF over the `tokens` column, whose values are whitespace-separated tokens.
text_transformer = TfidfVectorizer(
    tokenizer=str.split,   # text is already tokenized: just split on whitespace
    token_pattern=None,    # ignored when a tokenizer is given; None silences sklearn's
                           # "token_pattern will not be used" UserWarning on fit
    ngram_range=(1, 2),    # unigram and bigram
    min_df=2,              # drop terms that appear in fewer than 2 documents
    max_df=0.95,           # drop terms that appear in more than 95% of documents
    max_features=10000,    # cap the vocabulary size
    sublinear_tf=True,     # use 1 + log(tf) instead of raw term frequency
)

### 2.3 Feature Transformation

- Vectorize the tokens

- Normalize the other features

In [6]:
# Every feature column other than the text column is numeric and gets standardized.
# Deriving the list from FEATURE_COLS keeps this step in sync with the columns
# selected for X instead of repeating the names by hand.
numeric_cols = [col for col in FEATURE_COLS if col != "tokens"]
numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        # "tokens" is passed as a bare string (not a list) so the vectorizer
        # receives a 1-D sequence of documents rather than a 2-D frame.
        ("text", text_transformer, "tokens"),
        ("num", numeric_transformer, numeric_cols),
    ]
)

### 2.4 Define Pipeline & Train the Model

In [7]:
# Chain preprocessing and the classifier so both are fitted on the training
# split only and applied identically at prediction time.
log_reg = LogisticRegression(max_iter=1000)
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", log_reg),
    ]
)
clf.fit(X_train, y_train)

# Score the held-out 20% split.
y_pred = clf.predict(X_test)
holdout_report = classification_report(y_test, y_pred)
print(holdout_report)




              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99     10508
         1.0       1.00      0.99      0.99     10778

    accuracy                           0.99     21286
   macro avg       0.99      0.99      0.99     21286
weighted avg       0.99      0.99      0.99     21286



### 2.5 Evaluation with Dataset 3

In [12]:
# Read the CSV used for the cross-dataset evaluation below
# (path is relative to the notebook).
df2_path = "./processed_df3_60k_fe.csv"
df2 = pd.read_csv(df2_path)

In [13]:
# The fitted pipeline needs exactly the training feature columns; stop early
# and list any that df2 lacks.
missing3 = [col for col in FEATURE_COLS if col not in df2.columns]
assert not missing3, f"df2 is missing columns: {missing3}"

# Predict with the pipeline fitted on Dataset 1 (no re-fitting here).
X_df3 = df2[FEATURE_COLS]
y_df3_pred = clf.predict(X_df3)

# Ground-truth labels are optional in df2.
has_labels = TARGET_COL in df2.columns
y_df3 = df2[TARGET_COL] if has_labels else None

if y_df3 is None:
    # No labels available: show a sample of the raw predictions instead.
    print("Predictions only (no ground-truth labels in df2):")
    print(y_df3_pred[:20])
else:
    print(classification_report(y_df3, y_df3_pred))



              precision    recall  f1-score   support

           0       0.65      0.35      0.45     29660
           1       0.55      0.81      0.66     29682

    accuracy                           0.58     59342
   macro avg       0.60      0.58      0.55     59342
weighted avg       0.60      0.58      0.55     59342



## 3. TF-IDF - Logistic Regression

### 3.1 Training

In [None]:
# Define Vectorizer
tfidf = TfidfVectorizer(
    tokenizer=str.split,       # pre-tokenized text
    token_pattern=None,        # ignored when a tokenizer is given; None silences
                               # sklearn's "token_pattern will not be used" UserWarning
    ngram_range=(1, 2),        # unigrams + bigrams
    min_df=2,                  # ignore words in fewer than 2 docs
    max_df=0.95,               # ignore very frequent words
    max_features=10000,        # limit feature space
    sublinear_tf=True          # dampen high term frequencies
)

# Train-Test Split
# Same test_size / stratify / random_state as the split in section 2.1, applied
# to the text column only.
# NOTE: this rebinds y_train and y_test from section 2.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['tokens'], df['generated'], test_size=0.2, stratify=df['generated'], random_state=42
)

# Fit Transform & Training
# The vocabulary and IDF weights are learned from the training split only;
# the test split is transformed with that fitted vocabulary.
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

# NOTE: `clf` is rebound here from the section-2 Pipeline to a bare
# LogisticRegression that expects TF-IDF matrices, so the section-2 evaluation
# cells must not be re-run after this cell without re-fitting the pipeline.
clf = LogisticRegression(max_iter=1000, class_weight='balanced')
clf.fit(X_train_tfidf, y_train)

# Report
y_pred = clf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99     10508
         1.0       1.00      0.99      0.99     10778

    accuracy                           0.99     21286
   macro avg       0.99      0.99      0.99     21286
weighted avg       0.99      0.99      0.99     21286



### 3.2 Examining Features

In [15]:
# Pair each vocabulary term with its logistic-regression weight. For a binary
# model, coef_[0] holds one weight per TF-IDF feature.
feature_names = tfidf.get_feature_names_out()
coefs = clf.coef_[0]
weighted_terms = list(zip(coefs, feature_names))

# Largest weights push predictions toward the positive class (reported as AI);
# smallest (most negative) weights push toward the other class (reported as human).
top_pos = sorted(weighted_terms, reverse=True)[:20]
top_neg = sorted(weighted_terms)[:20]

for heading, ranked_terms in (
    ("Top AI-indicative terms:", top_pos),
    ("\nTop Human-indicative terms:", top_neg),
):
    print(heading)
    for weight, term in ranked_terms:
        print(f"{term}: {weight:.3f}")

Top AI-indicative terms:
important: 6.790
essay: 6.614
additionally: 6.600
super: 6.478
potential: 5.991
conclusion: 5.861
hey: 5.798
however: 4.726
provide: 4.544
essential: 4.186
significant: 4.110
unique: 3.844
firstly: 3.826
impact: 3.820
sincerely name: 3.819
cool: 3.816
often: 3.767
ensure: 3.645
plus: 3.642
totally: 3.583

Top Human-indicative terms:
would: -8.131
going: -6.288
go: -6.150
student: -5.900
although: -5.540
people: -5.341
school: -5.155
paragraph: -5.079
car: -4.968
percent: -4.807
get: -4.778
person: -4.777
human: -4.718
said: -4.387
many: -4.250
driving: -4.203
kid: -4.183
reason: -4.134
venus: -4.091
probably: -4.051


### 3.3 Evaluation with Dataset 3

In [16]:
# Cross-dataset check for the TF-IDF-only model: reuse the vocabulary fitted
# on Dataset 1 (transform only, no re-fit) and score against df2's labels.
y_df3_true = df2['generated']
X_df3 = tfidf.transform(df2['tokens'])
y_df3_pred = clf.predict(X_df3)

df3_report = classification_report(y_df3_true, y_df3_pred)
print(df3_report)

              precision    recall  f1-score   support

           0       0.63      0.44      0.52     29660
           1       0.57      0.74      0.64     29682

    accuracy                           0.59     59342
   macro avg       0.60      0.59      0.58     59342
weighted avg       0.60      0.59      0.58     59342

