In [4]:
from datasets import load_dataset
import pandas as pd

# Load the dataset from Hugging Face
dataset = load_dataset("NicolaiSivesind/human-vs-machine", 'research_abstracts_labeled')

# Convert the test set to a Pandas DataFrame
df_test = pd.DataFrame(dataset['test'])  # Use the 'test' split

print(f"✅ Loaded {len(df_test)} test samples.")
print(f"👀 The first 5 samples in the test set:\n{df_test.head()}")

✅ Loaded 3000 test samples.
👀 The first 5 samples in the test set:
                                               title  label  \
0  Face Synthesis from Visual Attributes via Sket...      0   
1  Face Synthesis from Visual Attributes via Sket...      1   
2  Conformal symmetry breaking and degeneracy of ...      0   
3  Conformal symmetry breaking and degeneracy of ...      1   
4  Sensitivity integrals and related inequalities...      0   

                                                text  word_count  
0  Automatic synthesis of faces from visual attri...         174  
1  This paper presents an innovative method for f...         146  
2  We show that though conformal symmetry can be ...         341  
3  This research investigates the phenomenon of c...         312  
4  This paper exhibits the closed-loop design con...         164  


In [None]:
import sqlite3
from typing import List, Tuple
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

# Set device for model
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("roberta-base-openai-detector")
model = AutoModelForSequenceClassification.from_pretrained("roberta-base-openai-detector")
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0 if DEVICE == "cuda:0" else -1)

# Initialize SQLite database
DB_NAME = "classification_results.db"

def initialize_database():
    """
    Initializes the SQLite database and creates the classifications table if it doesn't exist.
    """
    conn = sqlite3.connect(DB_NAME)
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS classifications (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            source TEXT,
            line_text TEXT,
            label TEXT,
            confidence REAL
        )
    """)
    conn.commit()
    conn.close()

def classify_text(sentences: List[str]) -> List[Tuple[str, float]]:
    """
    Classifies a list of sentences as 'Human' or 'AI' with confidence scores.

    :param sentences: List of text inputs to classify.
    :return: List of tuples containing the predicted label ('Human' or 'AI') and confidence score.
    """
    results = pipe(sentences)
    return [("Human" if res["label"] == "Real" else "AI", res["score"]) for res in results]

def classify_dataset():
    """
    Loads the NicolaiSivesind/human-vs-machine dataset, classifies the text samples, and stores the results.
    """
    texts = dataset["text"]
    sources = dataset["source"]  # Source: AI or Human

    # Classify text samples
    classifications = classify_text(texts)

    # Store results in SQLite
    save_to_database(sources, texts, classifications)

    # Print sample results
    for source, text, classification in zip(sources, texts, classifications[:10]):  # Show first 10 results
        print(f"Expected: {source}, Predicted: {classification} : {text[:100]}...")

def save_to_database(sources: List[str], lines: List[str], classifications: List[Tuple[str, float]]):
    """
    Saves classification results to an SQLite database.

    :param sources: List of sources (human or AI).
    :param lines: List of text samples.
    :param classifications: List of classification results (label, confidence).
    """
    conn = sqlite3.connect(DB_NAME)
    cursor = conn.cursor()

    data = [(source, line, label, confidence) for source, line, (label, confidence) in zip(sources, lines, classifications)]
    cursor.executemany("INSERT INTO classifications (source, line_text, label, confidence) VALUES (?, ?, ?, ?)", data)

    conn.commit()
    conn.close()
    print("Dataset classification results saved to database.")

# Initialize database
initialize_database()

# Run classification on the dataset
classify_dataset()


Using device: cuda:0


Some weights of the model checkpoint at roberta-base-openai-detector were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


KeyError: 'text'