In [1]:
import sqlite3
from typing import List, Tuple
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

# Pick the first CUDA GPU when torch can see one; otherwise stay on the CPU
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda:0"
print(f"Using device: {DEVICE}")

# Tokenizer and classifier weights are loaded from the same checkpoint name
_CHECKPOINT = "roberta-base-openai-detector"
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(_CHECKPOINT)

# The pipeline is given an integer device index: 0 when DEVICE is "cuda:0", -1 otherwise
_pipeline_device = -1 if DEVICE != "cuda:0" else 0
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=_pipeline_device)

# SQLite file shared by every function below that reads or writes results
DB_NAME = "classification_results.db"

def initialize_database():
    """
    Initializes the SQLite database and creates the classifications table if it doesn't exist.

    Connects to the file named by DB_NAME and issues CREATE TABLE IF NOT EXISTS for a
    table with the columns id, filename, line_text, label and confidence, then commits.
    Because of IF NOT EXISTS, calling this again on an existing database keeps the
    table and its rows.
    """
    conn = sqlite3.connect(DB_NAME)
    try:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS classifications (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                filename TEXT,
                line_text TEXT,
                label TEXT,
                confidence REAL
            )
        """)
        conn.commit()
    finally:
        # Release the database handle even when the statement or commit raises
        conn.close()

def classify_text(sentences: List[str]) -> List[Tuple[str, float]]:
    """
    Classifies a list of sentences as 'Human' or 'AI' with confidence scores.

    Each input is scored by the module-level text-classification pipeline. A result
    whose label is "Real" is reported as 'Human'; any other label is reported as 'AI'.
    The score is passed through unchanged.

    :param sentences: List of text inputs to classify.
    :return: List of tuples containing the predicted label ('Human' or 'AI') and confidence score,
             in the same order as the inputs. An empty input yields an empty list.
    """
    if not sentences:
        # Nothing to score, so skip the model call entirely
        return []

    # truncation=True is forwarded to the tokenizer so that an over-long line is
    # clipped to the model's maximum length instead of making the forward pass fail
    results = pipe(sentences, truncation=True)
    return [("Human" if res["label"] == "Real" else "AI", res["score"]) for res in results]

def classify_file(filename: str) -> List[Tuple[str, float]]:
    """
    Classifies every non-blank line of a text file and records the outcome.

    The file is read as UTF-8, each line is stripped, and blank lines are dropped.
    The remaining lines are classified in one call, written to the SQLite database,
    and then printed one per line together with their classification.

    :param filename: Name of the text file to classify.
    :return: List of tuples containing classification results; empty when the file
             has no non-blank lines.
    """
    kept_lines = []
    with open(filename, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                kept_lines.append(stripped)

    # Guard clause: nothing to classify, nothing to store
    if len(kept_lines) == 0:
        print(f"Warning: {filename} is empty or contains only whitespace.")
        return []

    predictions = classify_text(kept_lines)

    # Persist first, then echo to stdout
    save_to_database(filename, kept_lines, predictions)

    paired = list(zip(kept_lines, predictions))
    for text, prediction in paired:
        print(f"{prediction} : {text}")

    return [prediction for _, prediction in paired]

def save_to_database(filename: str, lines: List[str], classifications: List[Tuple[str, float]]):
    """
    Saves classification results to an SQLite database.

    One row is inserted into the classifications table per (line, classification)
    pair. Pairing is positional via zip(), so it stops at the shorter of the two lists.

    :param filename: Name of the source file.
    :param lines: List of text lines from the file.
    :param classifications: List of classification results (label, confidence).
    """
    # Build the rows before opening the connection so that malformed input
    # fails without a database handle ever being opened
    rows = [(filename, line, label, confidence) for line, (label, confidence) in zip(lines, classifications)]

    conn = sqlite3.connect(DB_NAME)
    try:
        conn.executemany(
            "INSERT INTO classifications (filename, line_text, label, confidence) VALUES (?, ?, ?, ?)",
            rows,
        )
        conn.commit()
    finally:
        # Close the handle even when the insert or commit raises
        conn.close()
    print(f"Results from {filename} saved to database.")

def generate_markdown_report(output_filename: str = "classification_report.md"):
    """
    Generates a Markdown report summarizing classification results.

    Reads every row of the classifications table in the database named by DB_NAME and
    writes a report with two tables: a summary (count and percentage per label) and a
    detailed listing of each stored line. Rows labelled "Human" are counted as human;
    all remaining rows are counted as AI. The output file is overwritten.

    :param output_filename: Name of the output Markdown file.
    """
    conn = sqlite3.connect(DB_NAME)
    try:
        cursor = conn.cursor()

        # Fetch classification summary
        cursor.execute("SELECT label, COUNT(*) FROM classifications GROUP BY label")
        summary_data = cursor.fetchall()

        # Fetch detailed data; the secondary sort on id keeps the lines of one file
        # in insertion order instead of whatever order the engine happens to return
        cursor.execute(
            "SELECT filename, line_text, label, confidence FROM classifications ORDER BY filename, id"
        )
        detailed_results = cursor.fetchall()
    finally:
        # Close the handle even when a query raises
        conn.close()

    total_count = sum(count for _, count in summary_data)
    human_count = sum(count for label, count in summary_data if label == "Human")
    ai_count = total_count - human_count

    # Guard against division by zero when the table is empty
    human_percent = (human_count / total_count * 100) if total_count else 0
    ai_percent = (ai_count / total_count * 100) if total_count else 0

    # Generate Markdown content
    report_header = f"""# Classification Report

## Summary
| Label  | Count | Percentage |
|--------|-------|------------|
| Human  | {human_count}   | {human_percent:.2f}% |
| AI     | {ai_count}   | {ai_percent:.2f}% |
| **Total** | {total_count} | 100.00% |

## Detailed Results
| Filename | Text | Label | Confidence |
|----------|------|-------|------------|
"""

    # A literal "|" would end the table cell early, so escape it in both free-text columns
    table_rows = [
        "| {} | {} | {} | {:.2f} |\n".format(
            filename.replace("|", "\\|"),
            line_text.replace("|", "\\|"),
            label,
            confidence,
        )
        for filename, line_text, label, confidence in detailed_results
    ]

    # Save to file; join once rather than growing the string row by row
    with open(output_filename, "w", encoding="utf-8") as md_file:
        md_file.write(report_header + "".join(table_rows))

    print(f"Markdown report saved as {output_filename}")

# Make sure the results table exists before anything is written to it
initialize_database()

# Classify each sample file in turn; every call also stores its results
for sample_path in ("ai.txt", "human.txt"):
    classify_file(sample_path)

# Summarize everything stored in the database as a Markdown report
generate_markdown_report()


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda:0


Some weights of the model checkpoint at roberta-base-openai-detector were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


Results from ai.txt saved to database.
('AI', 0.5262990593910217) : "In the ever-evolving landscape of artificial intelligence, language models continue to demonstrate unprecedented capabilities in generating human-like text."
('AI', 0.6308667063713074) : "The significance of sustainable energy solutions cannot be overstated in the modern era of climate change and environmental awareness."
('AI', 0.8574431538581848) : "The Renaissance was a pivotal period in human history, characterized by remarkable advancements in art, science, and philosophy."
('AI', 0.7112785577774048) : "Machine learning algorithms leverage vast datasets to optimize predictive performance in a variety of real-world applications."
('Human', 0.6411296129226685) : "Throughout history, civilizations have relied on innovation to drive progress and enhance societal development."
('AI', 0.8603183627128601) : "The impact of artificial intelligence on the global workforce is a topic of considerable debate among experts in 