<a href="https://www.kaggle.com/code/rahulchauhan016/ai-vs-human-scientific-text-detection?scriptVersionId=295990116" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# ✅ Step 1 — Load Librarie

In [33]:
import pandas as pd
import numpy as np
import re
import joblib

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report


# ✅ Step 2 — Load Dataset

In [34]:
path = "/kaggle/input/ai-vs-human-scientific-text-dataset/AI-Generated Scientific Text Dataset (AIGTxt)/AIGTxt.xlsx"

df = pd.read_excel(path)

df = df[['Human-Generated', 'ChatGPT-Generated', 'Mixed Text', 'Domain']]


# ✅ Step  3 — Keep Needed Columns

In [35]:
df = df[['Human-Generated', 'ChatGPT-Generated', 'Mixed Text', 'Domain']]


# ✅ Step 4 — Reshape Dataset

In [36]:
human_df = df[['Human-Generated', 'Domain']].copy()
human_df.columns = ['text', 'Domain']
human_df['label'] = 'Human'

ai_df = df[['ChatGPT-Generated', 'Domain']].copy()
ai_df.columns = ['text', 'Domain']
ai_df['label'] = 'AI'

mixed_df = df[['Mixed Text', 'Domain']].copy()
mixed_df.columns = ['text', 'Domain']
mixed_df['label'] = 'Mixed'

final_df = pd.concat([human_df, ai_df, mixed_df], ignore_index=True)


# ✅ Step 5 — Text Cleaning

In [37]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'[^a-zA-Z ]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

final_df['clean_text'] = final_df['text'].apply(clean_text)


# ✅ Step 6 —  TF-IDF Vectorizer

In [38]:
vectorizer = TfidfVectorizer(
    max_features=30000,
    ngram_range=(1,2),
    stop_words='english'
)

X = vectorizer.fit_transform(final_df['clean_text'])
y = final_df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

model = LinearSVC()
model.fit(X_train, y_train)

preds = model.predict(X_test)

print(classification_report(y_test, preds))


              precision    recall  f1-score   support

          AI       0.47      0.51      0.49       728
       Human       0.58      0.61      0.60       729
       Mixed       0.03      0.02      0.02       708

    accuracy                           0.38      2165
   macro avg       0.36      0.38      0.37      2165
weighted avg       0.36      0.38      0.37      2165



# ✅ Step 7 — Convert Text to Features

In [39]:
X = vectorizer.fit_transform(final_df['clean_text'])
y = final_df['label']

print("Feature shape:", X.shape)


Feature shape: (10821, 30000)


# ✅ Step 8 — Train/Test Split

In [40]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)



Train size: (8656, 30000)
Test size: (2165, 30000)


# ✅ Step 9 — Train Model

In [41]:
from sklearn.svm import LinearSVC

model = LinearSVC()
model.fit(X_train, y_train)



# ✅ Step 10 — Evaluate Model

In [42]:
preds = model.predict(X_test)
print(classification_report(y_test, preds))


              precision    recall  f1-score   support

          AI       0.47      0.51      0.49       728
       Human       0.58      0.61      0.60       729
       Mixed       0.03      0.02      0.02       708

    accuracy                           0.38      2165
   macro avg       0.36      0.38      0.37      2165
weighted avg       0.36      0.38      0.37      2165



# ✅ Step 11 — Test with Real Text

In [43]:
def predict_text(text):
    text = clean_text(text)
    vec = vectorizer.transform([text])
    return model.predict(vec)[0]

predict_text("This research explores neural network optimization methods.")


'Mixed'

# ✅ Step 12 — Install joblib 

In [None]:
!pip install joblib


# ✅ Step 13 — Save Model Files

In [None]:
import joblib

# joblib.dump(model, "aig_detector_model.pkl")
 # joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("Model saved successfully!")


# ✅ Step 14 — Load Model 

In [None]:
model = joblib.load("aig_detector_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")


# ✅ Step 15 — Prediction Function

In [None]:
from sklearn.calibration import CalibratedClassifierCV

calibrated_model = CalibratedClassifierCV(model)
calibrated_model.fit(X_train, y_train)


In [None]:
def predict_text(text):
    text = clean_text(text)
    vec = vectorizer.transform([text])

    pred = calibrated_model.predict(vec)[0]
    probs = calibrated_model.predict_proba(vec)[0]

    confidence = max(probs)

    return pred, confidence


In [None]:
predict_text("This study explores deep learning in medical imaging.")


# ✅ Step 16 — Full Pipeline(AI vs Human Scientific Text Detector)

In [None]:

# Install package
!pip install joblib -q

# ---------- Imports ----------
import pandas as pd
import numpy as np
import re
import joblib

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# ---------- Load Dataset ----------
path = "/kaggle/input/ai-vs-human-scientific-text-dataset/AI-Generated Scientific Text Dataset (AIGTxt)/AIGTxt.xlsx"

df = pd.read_excel(path)

df = df[['Human-Generated', 'ChatGPT-Generated', 'Mixed Text', 'Domain']]

# ---------- Convert Dataset ----------
human_df = df[['Human-Generated', 'Domain']].copy()
human_df.columns = ['text', 'Domain']
human_df['label'] = 'Human'

ai_df = df[['ChatGPT-Generated', 'Domain']].copy()
ai_df.columns = ['text', 'Domain']
ai_df['label'] = 'AI'

mixed_df = df[['Mixed Text', 'Domain']].copy()
mixed_df.columns = ['text', 'Domain']
mixed_df['label'] = 'Mixed'

final_df = pd.concat([human_df, ai_df, mixed_df], ignore_index=True)

# ---------- Text Cleaning ----------
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'[^a-zA-Z ]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

final_df['clean_text'] = final_df['text'].apply(clean_text)

# ---------- Feature Creation ----------
vectorizer = TfidfVectorizer(
    max_features=30000,
    ngram_range=(1,2),
    stop_words='english'
)

X = vectorizer.fit_transform(final_df['clean_text'])
y = final_df['label']

# ---------- Train/Test Split ----------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

# ---------- Train Model ----------
model = LinearSVC()
model.fit(X_train, y_train)

# ---------- Evaluation ----------
preds = model.predict(X_test)
print("\nModel Performance:\n")
print(classification_report(y_test, preds))

# ---------- Save Model ----------
joblib.dump(model, "aig_detector_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

print("\nModel saved successfully!")

# ---------- Interactive Prediction ----------
def predict_text(text):
    cleaned = clean_text(text)
    vec = vectorizer.transform([cleaned])
    pred = model.predict(vec)[0]
    return pred

print("\nPaste paragraph below:")
user_text = input()

print("\nPrediction:", predict_text(user_text))


# ✅ Step 17 — Paragraph-Level Detector

In [None]:


def split_paragraphs(text):
    # Split on empty lines
    paras = [p.strip() for p in text.split("\n") if p.strip()]
    return paras

def detect_paragraphs(text):
    paragraphs = split_paragraphs(text)

    results = []

    for para in paragraphs:
        cleaned = clean_text(para)
        vec = vectorizer.transform([cleaned])
        pred = model.predict(vec)[0]

        results.append((para, pred))

    return results


# ---- User Input ----
print("Paste multi-paragraph text (press Enter twice between paragraphs):\n")
doc = input()

outputs = detect_paragraphs(doc)

print("\nParagraph Predictions:\n")

for i, (para, label) in enumerate(outputs, 1):
    print(f"Paragraph {i}: {label}")
    print("-" * 60)


# ✅ Step  18 — Highlight Paragraph Predictions

In [None]:

def split_paragraphs(text):
    return [p.strip() for p in text.split("\n") if p.strip()]

def detect_paragraphs(text):
    paragraphs = split_paragraphs(text)

    results = []
    ai_count = 0

    for para in paragraphs:
        cleaned = clean_text(para)
        vec = vectorizer.transform([cleaned])
        pred = model.predict(vec)[0]

        if pred == "AI":
            ai_count += 1

        results.append((para, pred))

    return results, ai_count, len(paragraphs)


print("Paste multi-paragraph text:\n")
doc = input()

results, ai_count, total = detect_paragraphs(doc)

print("\n--- Paragraph Results ---\n")

for i, (para, label) in enumerate(results, 1):
    print(f"\nParagraph {i} [{label}]")
    print("-" * 50)
    print(para)


# ✅ Step 19 — Document AI Percentage Score

In [None]:

ai_percent = (ai_count / total) * 100 if total > 0 else 0

print("\nDocument AI Usage Score:")
print(f"AI Paragraphs: {ai_count}/{total}")
print(f"AI Percentage: {ai_percent:.2f}%")

if ai_percent > 60:
    print("⚠ Document likely AI-generated")
elif ai_percent > 30:
    print("⚠ Mixed authorship suspected")
else:
    print("✅ Mostly human-written")


# ✅ Step 20 — Upload PDF and Analyse

In [None]:

!pip install pdfplumber -q

import pdfplumber

pdf_path = input("Enter PDF file path: ")

full_text = ""

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if text:
            full_text += text + "\n"

results, ai_count, total = detect_paragraphs(full_text)

print("\nPDF Analysis Complete\n")

ai_percent = (ai_count / total) * 100 if total > 0 else 0

print(f"Paragraphs analyzed: {total}")
print(f"AI percentage: {ai_percent:.2f}%")
