In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch


In [3]:
def load_text_model(model_name="distilbert-base-uncased", num_labels=2):
    """Load a tokenizer and a sequence-classification model from one checkpoint.

    Args:
        model_name: Checkpoint identifier passed to both ``from_pretrained``
            calls.
        num_labels: Number of output classes for the classification head.
            Defaults to 2, the binary setup used elsewhere in this notebook.

    Returns:
        A ``(tokenizer, model)`` tuple.

    Note:
        With the default base checkpoint the classifier weights are newly
        initialised (the library warns about this on load), so the model has
        to be fine-tuned before its predictions carry any meaning.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=num_labels
    )
    return tokenizer, model

In [4]:
def predict_text(texts, tokenizer, model, device=None):
    """Return the class-1 probability for each input text.

    Args:
        texts: Text input forwarded unchanged to ``tokenizer``. An empty
            list/tuple short-circuits to an empty result.
        tokenizer: Callable invoked with ``padding=True``, ``truncation=True``
            and ``return_tensors="pt"``; must return a mapping of tensors.
        model: Module called with the tokenizer output as keyword arguments;
            its result must expose ``.logits``.
        device: Device to run on. When ``None``, "cuda" is used if available,
            otherwise "cpu". The model is moved to this device.

    Returns:
        A 1-D NumPy array holding the softmax probability of class index 1
        for every input row.
    """
    # Nothing to score: avoid handing an empty batch to the tokenizer.
    if isinstance(texts, (list, tuple)) and len(texts) == 0:
        return torch.empty(0).numpy()

    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference must run in eval mode: a model that was just trained is still
    # in training mode, where dropout makes the probabilities non-deterministic.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
            probs = torch.softmax(logits, dim=-1)[:, 1]
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    return probs.cpu().numpy()

In [5]:
# src/data_utils.py
import random
from datasets import Dataset

def make_simple_text_dataset():
    """Build the tiny six-example toy dataset used for the baseline.

    Each sample pairs a sentence with an integer label: sentences worded as
    human-written carry label 0, sentences worded as generated / AI /
    machine-made carry label 1.

    Returns:
        A ``Dataset`` with a ``"text"`` column and a ``"label"`` column.
    """
    samples = (
        ("This is a human written news short story about crops.", 0),
        ("Generated text The government gave 1000 to all citizens.", 1),
        ("Human message about community meeting tomorrow.", 0),
        ("AI generated news saying the village leader asked for money.", 1),
        ("Local market will be closed on Monday due to festival.", 0),
        ("Machine made article saying the school was closed by officials.", 1),
    )
    columns = {
        "text": [sentence for sentence, _ in samples],
        "label": [tag for _, tag in samples],
    }
    return Dataset.from_dict(columns)


In [6]:
from transformers import TrainingArguments, Trainer

def tokenize_batch(batch, tokenizer, max_length=128):
    """Tokenize the ``"text"`` entry of a batch to a fixed length.

    Args:
        batch: Mapping with a ``"text"`` key holding the text(s) to tokenize.
        tokenizer: Callable invoked with the text plus
            ``padding="max_length"``, ``truncation=True`` and ``max_length``.
        max_length: Length every sequence is padded/truncated to.
            Defaults to 128.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

def train_text_baseline(output_dir="out"):
    """Fine-tune the default text classifier on the toy dataset.

    Loads the tokenizer/model via ``load_text_model``, tokenizes the dataset
    from ``make_simple_text_dataset`` with ``tokenize_batch``, and runs a
    ``Trainer`` for two epochs with a per-device batch size of 2.

    Args:
        output_dir: Directory handed to ``TrainingArguments``.

    Returns:
        A ``(tokenizer, model)`` tuple containing the fine-tuned model.
        Checkpoint saving is disabled (``save_strategy="no"``), so this return
        value is the only handle on the trained weights.
    """
    tokenizer, model = load_text_model()

    ds = make_simple_text_dataset()
    # Reuse the shared tokenization helper rather than duplicating its arguments.
    ds = ds.map(lambda batch: tokenize_batch(batch, tokenizer), batched=True)
    # Rename to the "labels" column selected below for the torch-formatted set.
    ds = ds.rename_column("label", "labels")
    ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=2,
        num_train_epochs=2,
        logging_steps=5,
        save_strategy="no",  # nothing is written to disk; see Returns
        report_to=[],        # no external experiment trackers
    )
    trainer = Trainer(model=model, args=args, train_dataset=ds)
    trainer.train()
    print("Training complete")
    return tokenizer, model

# Entry point: run the baseline training when this code executes as the main
# module (this includes running the cell in a notebook kernel).
if __name__ == "__main__":
    train_text_baseline()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 6/6 [00:00<00:00, 517.27 examples/s]


Step,Training Loss
5,0.6916


Training complete
