In [1]:
import random
import torch
import logging
from nltk.corpus import wordnet
from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoTokenizer, T5Tokenizer

# Configure Logging
logger = logging.getLogger(__name__)

class Perturbation:
    """Class for perturbing text using different techniques to assess AI-generated text."""
    
    def __init__(self, device="cpu"):
        self.device = device

        # Load T5 model for paraphrasing
        self.t5_model = AutoModelForSeq2SeqLM.from_pretrained("t5-small").to(self.device)
        self.t5_tokenizer = T5Tokenizer.from_pretrained("t5-small", model_max_length=512)

        # Load GPT-J model for AI-based text perturbation
        self.gptj_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
        self.gptj_model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B").to(self.device)


        logger.info("Perturbation class initialized with default GPT completion.")

    def shuffle_text(self, text):
        """Randomly shuffles words in the text."""
        words = text.split()
        random.shuffle(words)
        perturbed_text = " ".join(words)
        logger.debug(f"Shuffle Perturbation: {text} → {perturbed_text}")
        return perturbed_text

    def remove_word(self, text):
        """Randomly removes a word from the text."""
        words = text.split()
        if len(words) > 1:
            index = random.randint(0, len(words) - 1)
            removed_word = words.pop(index)
            perturbed_text = " ".join(words)
            logger.debug(f"Remove Word Perturbation: Removed '{removed_word}' → {perturbed_text}")
        else:
            perturbed_text = text
        return perturbed_text

    def replace_with_mask(self, text):
        """Randomly replaces a word with a mask token."""
        words = text.split()
        if len(words) > 1:
            index = random.randint(0, len(words) - 1)
            replaced_word = words[index]
            words[index] = "<mask>"
            perturbed_text = " ".join(words)
            logger.debug(f"Replace Word with Mask: Replaced '{replaced_word}' with '<mask>' → {perturbed_text}")
        else:
            perturbed_text = text
        return perturbed_text

    def synonym_substitution(self, text):
        """Replaces words with their synonyms using WordNet."""
        words = text.split()
        new_words = []
        for word in words:
            synonyms = wordnet.synsets(word)
            if synonyms:
                new_word = synonyms[0].lemmas()[0].name()
                new_words.append(new_word)
                logger.debug(f"Synonym Substitution: Replaced '{word}' with '{new_word}'")
            else:
                new_words.append(word)
        return " ".join(new_words)

    def paraphrase_text(self, text):
        """Uses T5 to generate a paraphrased version of the text."""
        input_text = f"paraphrase: {text} </s>"
        encoding = self.t5_tokenizer.encode_plus(input_text, return_tensors="pt").to(self.device)

        with torch.no_grad():
            outputs = self.t5_model.generate(**encoding, max_length=512, do_sample=True, top_p=0.95)

        perturbed_text = self.t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
        logger.debug(f"Paraphrase Perturbation: {text} → {perturbed_text}")
        return perturbed_text

    def insert_random_sentence(self, text):
        """Inserts a random unrelated sentence into the text."""
        random_sentences = [
            "The moon orbits the Earth approximately every 27.3 days.",
            "Neural networks have been widely used in deep learning applications.",
            "Shakespeare wrote some of the most famous plays in history."
        ]
        random_sentence = random.choice(random_sentences)
        words = text.split()
        insert_position = random.randint(0, len(words))
        words.insert(insert_position, random_sentence)
        perturbed_text = " ".join(words)
        logger.debug(f"Insert Random Sentence: Inserted '{random_sentence}' → {perturbed_text}")
        return perturbed_text

    def gpt_completion_perturbation(self, text):
        """Uses GPT-J to generate AI-based perturbations."""
        input_ids = self.gptj_tokenizer.encode(text, return_tensors="pt").to(self.device)
        with torch.no_grad():
            output = self.gptj_model.generate(input_ids, max_length=len(input_ids[0]) + 5, do_sample=True, top_p=0.92)

        perturbed_text = self.gptj_tokenizer.decode(output[0], skip_special_tokens=True)
        logger.debug(f"GPT Completion Perturbation: {text} → {perturbed_text}")
        return perturbed_text

    def perturb_text(self, text, method="gpt_completion_random"):
        """
        Applies the selected perturbation method.

        :param text: Input text to be perturbed.
        :param method: Name of the perturbation method to apply.
                       Defaults to "gpt_completion". Use "random" to choose a random method.
        :return: Perturbed text.
        """
        perturbation_methods = {
            "shuffle": self.shuffle_text,
            "remove_word": self.remove_word,
            "replace_mask": self.replace_with_mask,
            "synonym_substitution": self.synonym_substitution,
            "paraphrase": self.paraphrase_text,
            "insert_sentence": self.insert_random_sentence,
            "gpt_completion": self.gpt_completion_perturbation
        }

        if method == "random":
            method = random.choice(list(perturbation_methods.keys()))
        if method in perturbation_methods:
            logger.info(f"Applying Perturbation: {method} on text: {text}")
            return perturbation_methods[method](text)
        if method == "gpt_completion_random":
            method = "gpt_completion" if random.random() < 0.7 else "insert_sentence"
            logger.info(f"Applying Perturbation: {method} on text: {text}")
            return perturbation_methods[method](text)
        else:
            logger.warning(f"Invalid perturbation method: {method}. Defaulting to GPT Completion.")
            return self.gpt_completion_perturbation(text)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dataclasses import dataclass
import torch
import numpy as np
import re
import math
import logging
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, T5Tokenizer
from transformers import AutoModelForSeq2SeqLM
from sklearn.preprocessing import StandardScaler
from scipy.special import erf


# 🔹 Configure Logging
logging.basicConfig(
    filename="gpt_perplexity.log",
    filemode="w",
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.DEBUG
)

logger = logging.getLogger(__name__)


@dataclass
class ClassificationResult:
    """Dataclass to store classification results."""
    score: float
    diff: float
    std_dev: float
    confidence: float
    label: str

    def __str__(self):
        """Custom string representation."""
        return f"{self.confidence:.2f}% confidence that the text is {self.label}."


class GPTConfig:
    """Configuration class for GPTPerplexity model settings."""
    
    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.gpt_model_id = "gpt2" 
        self.t5_model_id = "t5-small"

        # Perplexity settings
        self.max_length = 1024
        self.stride = 51
        self.threshold = 0.4 # AI Score threshold 

        # Masking settings
        self.mask_span = 2
        self.mask_ratio = 0.3
        self.mask_samples = 50

        # T5 Generation settings
        self.t5_top_p = 0.96
        self.t5_num_return_sequences = 1


class GPTPerplexity:
    """GPT-based Perplexity and AI-Text Detection Model."""

    def __init__(self, config: GPTConfig):
        self.config = config
        self.device = config.device

        # Load models
        self.gpt_model = GPT2LMHeadModel.from_pretrained(config.gpt_model_id).to(self.device)
        self.tokenizer = GPT2TokenizerFast.from_pretrained(config.gpt_model_id)

        self.t5_model = AutoModelForSeq2SeqLM.from_pretrained(config.t5_model_id).to(self.device).half()
        self.t5_tokenizer = T5Tokenizer.from_pretrained(config.t5_model_id, model_max_length=512)


        # Initialize Perturbation class
        self.perturbation = Perturbation(device=self.device)

        logger.info("GPTPerplexity Model Initialized.")

    def get_log_likelihood(self, sentence):
        """Calculates the log-likelihood of a sentence using GPT2."""
        encodings = self.tokenizer(sentence, return_tensors="pt")
        seq_len = encodings.input_ids.size(1)
        nlls = []

        prev_end_loc = 0
        for begin_loc in range(0, seq_len, self.config.stride):
            end_loc = min(begin_loc + self.config.max_length, seq_len)
            trg_len = end_loc - prev_end_loc
            input_ids = encodings.input_ids[:, begin_loc:end_loc].to(self.device)
            target_ids = input_ids.clone()
            target_ids[:, :-trg_len] = -100

            with torch.no_grad():
                outputs = self.gpt_model(input_ids, labels=target_ids)
                nlls.append(outputs.loss * trg_len)

            prev_end_loc = end_loc
            if end_loc == seq_len:
                break

        likelihood = (-1 * torch.stack(nlls).sum() / end_loc).cpu().item()
        logger.debug(f"Computed log-likelihood: {likelihood}")
        return likelihood

    def compute_ai_score(self, sentence):
        """Computes an AI-score using log-likelihood differences between original and perturbed texts."""
        original_sentence = sentence.strip()
        logger.debug(f"Processing sentence: {original_sentence}")

        real_log_likelihood = self.get_log_likelihood(original_sentence)
        logger.debug(f"Real log-likelihood: {real_log_likelihood}")

        # Generate perturbed versions of the sentence
        perturbed_sentences = []
        for _ in range(self.config.mask_samples):
            perturbed_sentence = self.perturbation.perturb_text(original_sentence)
            logger.debug(f"Perturbation Applied: {original_sentence} → {perturbed_sentence}")
            perturbed_log_likelihood = self.get_log_likelihood(perturbed_sentence)
            perturbed_sentences.append(perturbed_log_likelihood)
            logger.debug(f"Perturbed log-likelihood: {perturbed_log_likelihood}")
        logger.debug(f"Perturbed sentances: {perturbed_sentences}")

        # perturbed_sentences = [self.get_log_likelihood(self.perturbation.perturb_text(original_sentence)) 
        #                        for _ in range(self.config.mask_samples)]        
        # logger.debug(f"Perturbed sentances: {perturbed_sentences}")

        if not perturbed_sentences:
            logger.error("Error perturbing sentences. No valid perturbations.")
            return -1

        # **Use scikit-learn to standardize data**
        scaler = StandardScaler()
        all_scores = np.array([real_log_likelihood] + perturbed_sentences).reshape(-1, 1)
        standardized_scores = scaler.fit_transform(all_scores).flatten()

        logger.debug(f"Standardized Scores: {standardized_scores}")

        standardized_real_score = standardized_scores[0]
        standardized_perturbed_mean = np.mean(standardized_scores[1:])
        standardized_perturbed_std = np.std(standardized_scores[1:])

        if standardized_perturbed_std == 0:
            logger.warning("Standard deviation is zero. Adding small jitter.")
            standardized_perturbed_std = 1e-8 + np.random.uniform(0, 1e-5)  

        ai_score = (standardized_real_score - standardized_perturbed_mean) / standardized_perturbed_std

        logger.info(f"AI Score: {ai_score}, Difference: {standardized_real_score - standardized_perturbed_mean}, Std Dev: {standardized_perturbed_std}")

        return float(ai_score), float(standardized_real_score - standardized_perturbed_mean), float(standardized_perturbed_std)

    def classify_text(self, sentence) -> ClassificationResult:
        """Classifies text as AI-generated or Human-written and returns a structured result."""
        sentence = re.sub(r"\[[0-9]+\]", "", sentence)  
        score, diff, std_dev = self.compute_ai_score(sentence)

        if score == -1 or math.isnan(score):
            logger.error("Error: Computed an invalid AI score.")
            return ClassificationResult(score=-1, diff=0, std_dev=0, confidence=0, label="Error")

        confidence = self.normal_cdf(abs(self.config.threshold - score)) * 100
        label = "AI-generated" if score > self.config.threshold else "Human-written"
        
        logger.info(f"Final Classification: {confidence:.2f}% confidence that the text is {label}.")

        return ClassificationResult(score=score, diff=diff, std_dev=std_dev, confidence=confidence, label=label)

    @staticmethod
    def normal_cdf(x):
        """Approximate the CDF of a normal distribution using erf."""
        return 0.5 * (1 + erf(x / math.sqrt(2)))



In [3]:
import sqlite3

def setup_database():
    """Creates the SQLite database and results table if it doesn't exist."""
    conn = sqlite3.connect("classification_results.db")
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS results (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            filename TEXT,
            text TEXT,
            score REAL,
            diff REAL,
            std_dev REAL,
            confidence REAL,
            label TEXT
        )
    """)
    conn.commit()
    conn.close()

def classify_file(model, filename):
    """
    Reads a file line by line, classifies each line as 'AI-generated' or 'Human-written',
    and stores the results in an SQLite database.

    :param model: The GPTPerplexity model for classification.
    :param filename: Name of the text file to classify.
    """
    print(f"\n🔍 Processing file: {filename}\n" + "-"*40)

    with open(filename, "r", encoding="utf-8") as file:
        lines = [line.strip() for line in file if line.strip()]  # Remove empty lines

    if not lines:
        print(f"⚠️ Warning: {filename} is empty or contains only whitespace.")
        return

    conn = sqlite3.connect("classification_results.db")
    cursor = conn.cursor()

    for line in lines:
        classification_result = model.classify_text(line)

        # Insert into the database
        cursor.execute("""
            INSERT INTO results (filename, text, score, diff, std_dev, confidence, label)
            VALUES (?, ?, ?, ?, ?, ?, ?)
        """, (filename, line, classification_result.score, classification_result.diff,
              classification_result.std_dev, classification_result.confidence, classification_result.label))

        print(f"🔹 Text: {line}\n➡️ {str(classification_result)}\n")

    conn.commit()
    conn.close()


In [4]:
def generate_markdown_report():
    """Generates a markdown report from the classification results stored in the database."""
    conn = sqlite3.connect("classification_results.db")
    cursor = conn.cursor()

    # Fetch results
    cursor.execute("SELECT filename, text, score, confidence, label FROM results")
    rows = cursor.fetchall()
    conn.close()

    # Create Markdown content
    markdown_content = "# AI vs. Human Text Classification Report\n\n"
    markdown_content += "| Filename | Text | Score | Confidence | Label |\n"
    markdown_content += "|----------|------|-------|------------|-------|\n"

    for row in rows:
        filename, text, score, confidence, label = row
        markdown_content += f"| {filename} | {text[:50]}... | {score:.2f} | {confidence:.2f}% | {label} |\n"

    # Write to markdown file
    with open("classification_report.md", "w", encoding="utf-8") as md_file:
        md_file.write(markdown_content)

    print("\n📄 Report generated: classification_report.md")


In [5]:
# Step 1: Setup the database
setup_database()

# Step 2: Process files and store results in SQLite
config = GPTConfig()
model = GPTPerplexity(config)

classify_file(model, "ai.txt")
classify_file(model, "human.txt")

# Step 3: Generate markdown report from the stored results
generate_markdown_report()

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565



🔍 Processing file: ai.txt
----------------------------------------


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain rel

🔹 Text: "In the ever-evolving landscape of artificial intelligence, language models continue to demonstrate unprecedented capabilities in generating human-like text."
➡️ 53.91% confidence that the text is Human-written.



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

🔹 Text: "The significance of sustainable energy solutions cannot be overstated in the modern era of climate change and environmental awareness."
➡️ 55.08% confidence that the text is Human-written.



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

🔹 Text: "The Renaissance was a pivotal period in human history, characterized by remarkable advancements in art, science, and philosophy."
➡️ 64.04% confidence that the text is AI-generated.



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

🔹 Text: "Machine learning algorithms leverage vast datasets to optimize predictive performance in a variety of real-world applications."
➡️ 63.59% confidence that the text is Human-written.



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

🔹 Text: "Throughout history, civilizations have relied on innovation to drive progress and enhance societal development."
➡️ 50.02% confidence that the text is AI-generated.



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

🔹 Text: "The impact of artificial intelligence on the global workforce is a topic of considerable debate among experts in the field."
➡️ 53.23% confidence that the text is AI-generated.



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

🔹 Text: "While natural language processing has significantly improved over the past decade, challenges in context retention and sentiment analysis remain."
➡️ 64.61% confidence that the text is Human-written.



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

🔹 Text: "Technological advancements have revolutionized the way humans interact with digital ecosystems, fostering unprecedented levels of connectivity."
➡️ 63.61% confidence that the text is AI-generated.



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

🔹 Text: "The intricate relationship between data privacy and cybersecurity continues to shape global policies in the digital age."
➡️ 61.34% confidence that the text is Human-written.



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

🔹 Text: "Future developments in artificial intelligence are expected to further blur the distinction between human and machine-generated content."
➡️ 52.81% confidence that the text is AI-generated.


🔍 Processing file: human.txt
----------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

🔹 Text: Honestly, I had no idea AI could write this well until I saw ChatGPT in action.
➡️ 60.40% confidence that the text is Human-written.



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

🔹 Text: I still remember that summer when we stayed up all night talking—somehow, those moments stick with you forever.
➡️ 54.49% confidence that the text is AI-generated.



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

🔹 Text: I tried making sourdough bread last weekend, and let’s just say it was more of a rock than a loaf.
➡️ 68.94% confidence that the text is Human-written.



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

🔹 Text: Look, I know AI is cool and all, but I still don’t trust a machine to write my wedding vows.
➡️ 67.63% confidence that the text is Human-written.



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

🔹 Text: You ever get that weird feeling that you left the stove on, even though you know you didn’t?
➡️ 59.63% confidence that the text is Human-written.



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

🔹 Text: The coffee at that new place on 5th Street is honestly overrated—too bitter and way overpriced.
➡️ 58.47% confidence that the text is Human-written.



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

🔹 Text: I can’t explain why, but I really love the sound of rain hitting the roof at night.
➡️ 58.91% confidence that the text is Human-written.



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

🔹 Text: We spent the whole day hiking, only to realize we took the wrong trail back—thankfully, we had snacks!
➡️ 62.58% confidence that the text is Human-written.



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

🔹 Text: I swear, my cat understands English but just chooses to ignore me unless I say ‘treats.’
➡️ 93.17% confidence that the text is Human-written.



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

🔹 Text: Grandma always had the best stories about growing up in the countryside, and I wish I’d written them down.
➡️ 87.52% confidence that the text is Human-written.


📄 Report generated: classification_report.md
