In [2]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords
import nltk

In [3]:
df = pd.read_csv("/kaggle/input/human-vs-ai-generated-essays/balanced_ai_human_prompts.csv")

In [4]:
display(df.sample(10))

Unnamed: 0,text,generated
1628,Formulate a critical analysis of renewable ene...,1
1122,"When the US Constitution was first made, the i...",0
2350,Provide a step-by-step guide on climate change...,1
1327,Living in a polluted world were you cannot see...,0
2445,Develop a persuasive argument about financial ...,1
1050,Every vote makes a difference. Every single vo...,0
1919,Generate a detailed summary of financial techn...,1
1523,Compile a list of key insights about climate c...,1
2581,Compose an in-depth exploration of internet of...,1
768,Voting has always been something special for a...,0


In [5]:
print("Jumlah baris dan kolom:", df.shape)

Jumlah baris dan kolom: (2750, 2)


In [6]:
print("\nDistribusi Label:")
print(df['generated'].value_counts())


Distribusi Label:
generated
1    1375
0    1375
Name: count, dtype: int64


In [6]:
df['number_of_words']=df['text'].apply(lambda x: len(x.split()))
df['number_of_char']=df['text'].apply(lambda x:len(x))
df.sample(10)

Unnamed: 0,text,generated,number_of_words,number_of_char
2017,Provide a step-by-step guide on gene editing t...,1,15,95
1284,"From saving money to having less pollution, li...",0,407,2398
904,As we get older we want to travel to more plac...,0,670,3563
2256,Write a comprehensive essay explaining history...,1,14,94
507,Decreasing car usage would serve as a benefit ...,0,472,2643
1194,Many Americans today use cars to commute from ...,0,417,2589
1539,Compile a list of key insights about virtual r...,1,18,105
602,"Dear me. Senator, Today I am asking for your f...",0,493,2732
1615,Compile a list of key insights about virtual r...,1,17,113
65,"""The Toyota Corolla, a name synonymous with re...",1,285,1968


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2750 entries, 0 to 2749
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   text             2750 non-null   object
 1   generated        2750 non-null   int64 
 2   number_of_words  2750 non-null   int64 
 3   number_of_char   2750 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 86.1+ KB


In [8]:
stop_words = set(stopwords.words("english"))

def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # hapus URL
    text = re.sub(r"\d+", "", text)  # hapus angka
    text = text.translate(str.maketrans("", "", string.punctuation))  # hapus tanda baca
    text = " ".join([word for word in text.split() if word not in stop_words])  # hapus stopwords
    return text

df["clean_text"] = df["text"].apply(clean_text)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"], df["generated"], test_size=0.2, random_state=42, stratify=df["generated"]
)

In [10]:
model = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=5000, ngram_range=(1,2))),
    ("clf", LogisticRegression(max_iter=300))
])

In [11]:
model.fit(X_train, y_train)

In [12]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

In [13]:
print("\n=== Evaluation Results ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


=== Evaluation Results ===
Accuracy: 0.9981818181818182
F1 Score: 0.9981785063752276
ROC-AUC: 0.9991140495867769

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       275
           1       1.00      1.00      1.00       275

    accuracy                           1.00       550
   macro avg       1.00      1.00      1.00       550
weighted avg       1.00      1.00      1.00       550



In [19]:
def predict_ai_percentage(text):
    cleaned = clean_text(text)
    prob = model.predict_proba([cleaned])[0][1]
    return round(prob * 100, 2)

# 9️⃣ Uji Coba Prediksi
user_text = input("Masukkan teks yang ingin diuji: ")

print(f"\nTeks: {user_text}")
print(f"Kemungkinan teks ini dibuat oleh AI: {predict_ai_percentage(user_text)}%")

Masukkan teks yang ingin diuji:  hello my name is fauzan ramadhan putra



Teks: hello my name is fauzan ramadhan putra
Kemungkinan teks ini dibuat oleh AI: 75.88%
