### 📦 Install / Imports & helpers

In [None]:
import os
# GPU visibility settings, configured in their own cell before torch is
# imported in the next cell.
# NOTE(review): device index 3 is absent from the visible list — confirm this is intentional.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "0,1,2,4,5,6,7",
    }
)

In [None]:
import os, datetime as dt, json, random, numpy as np, pandas as pd, torch
from datasets import Dataset
from transformers import (
    RobertaTokenizerFast, RobertaForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding,
    EarlyStoppingCallback, TrainerCallback,
)
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# One seed shared by every RNG this notebook touches: Python, NumPy and PyTorch.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7e7654d418d0>

In [None]:
from google.colab import drive
# Mount Google Drive; the raw CSV loaded later lives under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


### 🧹 Load & clean raw data (dedup)

In [None]:
RAW_PATH = "/content/drive/MyDrive/딥러닝 프로젝트/daigt_labeled_dataset.csv"
assert os.path.exists(RAW_PATH), f"{RAW_PATH} not found!"

# Only the two columns used by the rest of the notebook are read.
df_loaded = pd.read_csv(RAW_PATH, usecols=["text", "label"])

# Rows without text or label cannot be used: a NaN label makes astype(int)
# raise, and a NaN text would survive dedup and reach the tokenizer as None.
df_raw = df_loaded.dropna(subset=["text", "label"]).copy()
n_missing = len(df_loaded) - len(df_raw)
if n_missing:
    print(f"Dropped {n_missing:,} rows with missing text/label.")

# Normalised text (lower-cased, outer whitespace stripped) is the dedup key.
df_raw["text_norm"] = df_raw["text"].str.lower().str.strip()
df_raw["label"] = df_raw["label"].astype(int)

# Count removed rows from the actual frame sizes rather than nunique(),
# which ignores NaN and therefore miscounts when values are missing.
before = len(df_raw)
df_raw = df_raw.drop_duplicates(subset="text_norm")
after = len(df_raw)
print(f"Removed {before - after:,} exact duplicate rows.")

Removed 0 exact duplicate rows.


### ✂️ Split or load cached splits

In [None]:
# Splits are cached as parquet so repeated runs reuse the same partition.
CACHE_DIR = "splits_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
paths = {name: f"{CACHE_DIR}/{name}.parquet" for name in ["train", "val", "test"]}

if all(os.path.exists(p) for p in paths.values()):
    print("📂 Cached splits found – loading.")
    train_df = pd.read_parquet(paths["train"])
    val_df   = pd.read_parquet(paths["val"])
    test_df  = pd.read_parquet(paths["test"])
    # The cache is keyed only by file name, so it can silently go stale when
    # the raw data changes; surface a size mismatch instead of hiding it.
    if len(train_df) + len(val_df) + len(test_df) != len(df_raw):
        print("⚠️  Cached splits do not match the current df_raw size – "
              "delete 'splits_cache/' to regenerate them.")
else:
    print("⚙️  Creating new splits.")
    # 80 % train / 20 % held out, grouped on the normalised text so identical
    # texts can never land on both sides of a split.
    gss1 = GroupShuffleSplit(train_size=0.8, random_state=SEED, n_splits=1)
    train_idx, temp_idx = next(gss1.split(df_raw, groups=df_raw["text_norm"]))
    train_df = df_raw.iloc[train_idx]
    temp_df  = df_raw.iloc[temp_idx]

    # The held-out part is halved into validation and test.
    gss2 = GroupShuffleSplit(train_size=0.5, random_state=SEED, n_splits=1)
    val_idx, test_idx = next(gss2.split(temp_df, groups=temp_df["text_norm"]))
    val_df  = temp_df.iloc[val_idx]
    test_df = temp_df.iloc[test_idx]

    train_df.to_parquet(paths["train"])
    val_df.to_parquet(paths["val"])
    test_df.to_parquet(paths["test"])
    print("💾 Splits saved to 'splits_cache/'.")

# Leakage check across every pair of splits (not only train vs. val): the
# test split is the one whose contamination would invalidate the final score.
split_texts = {
    "train": set(train_df["text_norm"]),
    "val": set(val_df["text_norm"]),
    "test": set(test_df["text_norm"]),
}
overlap = split_texts["train"] & split_texts["val"]
train_test_overlap = split_texts["train"] & split_texts["test"]
val_test_overlap = split_texts["val"] & split_texts["test"]
print("train ∩ val duplicates:", len(overlap))
print("train ∩ test duplicates:", len(train_test_overlap))
print("val ∩ test duplicates:", len(val_test_overlap))
assert not (overlap or train_test_overlap or val_test_overlap), \
    "Data leakage: identical normalised texts appear in more than one split."


⚙️  Creating new splits.
💾 Splits saved to 'splits_cache/'.
train ∩ val duplicates: 0


### 🔠 Tokenize & build HF Datasets

In [None]:
from datasets import Dataset
from transformers import RobertaTokenizerFast
import numpy as np

# Tokenizer used in this cell to measure how long the texts are in tokens.
tok = RobertaTokenizerFast.from_pretrained("roberta-base")


def add_len(batch):
    """Add a `tok_len` column holding the number of input ids per text.

    Special tokens are included in the count. `batch` is a dict of columns
    as passed by `Dataset.map(..., batched=True)`; it is returned with the
    extra column attached.
    """
    encoded = tok(batch["text"], add_special_tokens=True)
    batch["tok_len"] = list(map(len, encoded["input_ids"]))
    return batch


# Measure every (deduplicated) text, not just one split.
ds = Dataset.from_pandas(df_raw[["text"]], preserve_index=False)
ds = ds.map(add_len, batched=True, batch_size=1024, num_proc=8)
lengths = ds["tok_len"]

# Summary of the length distribution, used to choose a truncation strategy.
pct = np.percentile(lengths, [50, 90, 95, 99])
print("median / p90 / p95 / p99 =", pct)
print("max =", max(lengths))


Map (num_proc=8):   0%|          | 0/4842 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (687 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1330 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1455 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (551 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (662 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for t

median / p90 / p95 / p99 = [ 311.    670.9   825.95 1278.59]
max = 5121


  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [None]:
from transformers import RobertaTokenizerFast
from datasets import Dataset
from transformers import DataCollatorWithPadding

# Length budget for head+tail truncation:
#   MAX_LEN – inputs with more ids than this get truncated
#   HEAD    – size of the leading slice kept from an over-long input
#   TAIL    – size of the trailing slice kept from an over-long input
MAX_LEN, HEAD, TAIL = 512, 256, 254

# Tokenizer shared by the tokenization function, the collator and the Trainer.
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

def head_tail_tokenize(batch, encoder=None, max_len=None, head=None, tail=None):
    """Tokenize a batch of texts, keeping the head and tail of over-long ones.

    Texts whose encoding fits in `max_len` ids are left untouched. Longer
    ones are reduced to `<s>` + first `head` content tokens + last `tail`
    content tokens + `</s>`, i.e. exactly `head + tail + 2` ids.

    Args:
        batch: dict of columns with a "text" list, as passed by
            `Dataset.map(..., batched=True)`.
        encoder: object with an `encode(text, add_special_tokens, truncation)`
            method; defaults to the notebook-level `tokenizer`.
        max_len: maximum number of ids per example; defaults to `MAX_LEN`.
        head: content tokens kept from the start; defaults to `HEAD`.
        tail: content tokens kept from the end; defaults to `TAIL`.

    Returns:
        dict with "input_ids" and "attention_mask" lists (unpadded).

    Raises:
        ValueError: if `head`/`tail` are negative or do not fit in `max_len`
            together with the two special tokens.
    """
    encoder = tokenizer if encoder is None else encoder
    max_len = MAX_LEN if max_len is None else max_len
    head = HEAD if head is None else head
    tail = TAIL if tail is None else tail
    if head < 0 or tail < 0 or head + tail + 2 > max_len:
        raise ValueError(
            f"head ({head}) + tail ({tail}) + 2 special tokens must fit in max_len ({max_len})"
        )

    encodings = {"input_ids": [], "attention_mask": []}
    for text in batch["text"]:
        ids = encoder.encode(text, add_special_tokens=True, truncation=False)
        if len(ids) > max_len:
            # ids[0] is <s> and ids[-1] is </s>. The head slice takes <s> plus
            # `head` tokens; the tail slice must take `tail` tokens PLUS </s>,
            # hence tail + 1 (slicing with -tail alone loses one content token).
            ids = ids[: head + 1] + ids[-(tail + 1):]
        encodings["input_ids"].append(ids)
        encodings["attention_mask"].append([1] * len(ids))
    return encodings

NUM_PROC = 20  # worker processes used by Dataset.map


def build_tokenized_split(split_df):
    """Turn one pandas split into a tokenized HF Dataset.

    Only the "text" and "label" columns are kept. The pandas index is dropped
    (`preserve_index=False`) so the shuffled row index does not leak into the
    dataset as an extra `__index_level_0__` column. The "text" column is
    removed after tokenization, leaving label, input_ids and attention_mask.
    """
    split_ds = Dataset.from_pandas(split_df[["text", "label"]], preserve_index=False)
    return split_ds.map(
        head_tail_tokenize,
        batched=True,
        remove_columns=["text"],
        num_proc=NUM_PROC,
    )


train_ds = build_tokenized_split(train_df)
val_ds = build_tokenized_split(val_df)
test_ds = build_tokenized_split(test_df)

# Examples are stored unpadded; the collator pads each batch when it is built.
data_collator = DataCollatorWithPadding(tokenizer)

Map (num_proc=20):   0%|          | 0/3873 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (687 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (546 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (841 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (538 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (542 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi

Map (num_proc=20):   0%|          | 0/484 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (553 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (516 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (670 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (524 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1373 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for th

Map (num_proc=20):   0%|          | 0/485 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1006 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (630 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (685 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1429 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (644 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for t

### 🏗️ Build model (RoBERTa)

In [None]:
# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# roberta-base with a 2-label classification head; hidden and attention
# dropout are both overridden to 0.2 through the config keyword arguments.
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2,
)
model = model.to(device)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### ⚙️ TrainingArguments

In [None]:
training_args = TrainingArguments(
    # --- run identity / outputs ---
    run_name="roberta-ai-vs-human-v2",
    output_dir="./roberta-output",
    logging_dir="./logs",
    report_to=["tensorboard"],

    # --- batch sizing ---
    per_device_train_batch_size=8,      # small per-device batch to reduce GPU load
    gradient_accumulation_steps=8,      # 8 x 8 = effective batch of 64 per device
    per_device_eval_batch_size=16,

    # --- optimisation schedule ---
    num_train_epochs=3,                 # raised for more stable convergence
    learning_rate=1e-5,                 # RoBERTa is stable at a low learning rate
    lr_scheduler_type="cosine",         # smooth decay
    warmup_ratio=0.1,                   # learning-rate warmup
    weight_decay=0.01,
    label_smoothing_factor=0.05,        # kept low: too much smoothing risks underfitting
    fp16=torch.cuda.is_available(),     # mixed precision only when CUDA is present

    # --- evaluation / checkpointing (once per epoch, best model by F1) ---
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,                 # leave some headroom for checkpoints
    load_best_model_at_end=True,
    metric_for_best_model="f1",

    # --- logging ---
    logging_steps=10,
    logging_first_step=True,

    # --- distributed ---
    ddp_find_unused_parameters=False,
)


### 🚂 Trainer & train

In [None]:
class LogCallback(TrainerCallback):
    """Print a compact, timestamped one-line summary for each Trainer log event."""

    # Keys pulled out of the Trainer's `logs` dict; absent ones are skipped.
    WATCHED_KEYS = ("loss", "eval_loss", "eval_accuracy", "eval_f1")

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Format the watched metrics and the current learning rate.

        The learning rate is read from the optimizer's first param group when
        an optimizer is supplied, otherwise from `logs["learning_rate"]`.
        Nothing is printed when `logs` is empty or None.
        """
        if not logs:
            return

        now = dt.datetime.now().strftime("%H:%M:%S")

        # The "optimizer" key may be present but None; `.get(..., {})` would
        # then return None and `.param_groups` would raise AttributeError.
        optimizer = kwargs.get("optimizer")
        if optimizer is not None and optimizer.param_groups:
            lr = optimizer.param_groups[0]["lr"]
        else:
            lr = logs.get("learning_rate")

        parts = [
            f"{key}: {logs[key]:.4f}"
            for key in self.WATCHED_KEYS
            if logs.get(key) is not None
        ]
        if lr is not None:
            # Scientific notation: with `.4f` a rate such as 1e-5 always
            # printed as 0.0000, which made the column useless.
            parts.append(f"lr: {lr:.2e}")

        msg = " | ".join(parts)
        print(f"[{now}] step {state.global_step} | {msg}")

def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 from an evaluation result.

    `pred` unpacks into `(logits, labels)`; the predicted class is the argmax
    over the last logits axis. Returns a dict keyed by metric name.
    """
    logits, labels = pred
    preds = logits.argmax(-1)
    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {name: scorer(labels, preds) for name, scorer in scorers.items()}

import inspect

# Newer transformers releases deprecate Trainer's `tokenizer=` argument in
# favour of `processing_class=`. Pick whichever keyword the installed version
# accepts so the cell runs without the deprecation warning and keeps working
# once the old name is removed.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kw = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[
        # Stop after 2 consecutive evaluations without improvement of the
        # metric selected in training_args.
        EarlyStoppingCallback(early_stopping_patience=2),
        LogCallback(),
    ],
    **{_tokenizer_kw: tokenizer},
)
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1198,0.222117,0.966942,0.942857,1.0,0.970588
2,0.1184,0.142344,0.991736,0.985075,1.0,0.992481
3,0.1181,0.179684,0.979339,0.963504,1.0,0.981413


[03:55:48] step 1 | loss: 0.6620 | lr: 0.0000
[03:56:01] step 10 | loss: 0.6498 | lr: 0.0000
[03:56:16] step 20 | loss: 0.5780 | lr: 0.0000
[03:56:32] step 30 | loss: 0.3115 | lr: 0.0000
[03:56:48] step 40 | loss: 0.1606 | lr: 0.0000
[03:57:03] step 50 | loss: 0.1235 | lr: 0.0000
[03:57:19] step 60 | loss: 0.1198 | lr: 0.0000
[03:57:24] step 61 | eval_loss: 0.2221 | eval_accuracy: 0.9669 | eval_f1: 0.9706 | lr: 0.0000
[03:58:18] step 70 | loss: 0.1186 | lr: 0.0000
[03:58:34] step 80 | loss: 0.1185 | lr: 0.0000
[03:58:50] step 90 | loss: 0.1185 | lr: 0.0000
[03:59:06] step 100 | loss: 0.1229 | lr: 0.0000
[03:59:21] step 110 | loss: 0.1182 | lr: 0.0000
[03:59:36] step 120 | loss: 0.1184 | lr: 0.0000
[03:59:42] step 122 | eval_loss: 0.1423 | eval_accuracy: 0.9917 | eval_f1: 0.9925 | lr: 0.0000
[04:01:01] step 130 | loss: 0.1147 | lr: 0.0000
[04:01:17] step 140 | loss: 0.1182 | lr: 0.0000
[04:01:33] step 150 | loss: 0.1180 | lr: 0.0000
[04:01:48] step 160 | loss: 0.1182 | lr: 0.0000
[04:02

TrainOutput(global_step=183, training_loss=0.18556462578434763, metrics={'train_runtime': 446.1501, 'train_samples_per_second': 26.043, 'train_steps_per_second': 0.41, 'total_flos': 2990212949731320.0, 'train_loss': 0.18556462578434763, 'epoch': 3.0})

### 🧪 Evaluate on test set

In [None]:
# Score the held-out test split with the trainer's current model.
test_metrics = trainer.evaluate(test_ds)
print("📊 Test metrics:", test_metrics)

[04:03:54] step 183 | eval_loss: 0.1354 | eval_accuracy: 0.9918 | eval_f1: 0.9925 | lr: 0.0000
📊 Test metrics: {'eval_loss': 0.1354115605354309, 'eval_accuracy': 0.9917525773195877, 'eval_precision': 0.9850187265917603, 'eval_recall': 1.0, 'eval_f1': 0.9924528301886792, 'eval_runtime': 3.5468, 'eval_samples_per_second': 136.745, 'eval_steps_per_second': 8.74, 'epoch': 3.0}


### 💾 Save model/tokenizer

In [None]:
# Write the model and then the tokenizer into the same directory.
SAVE_PATH = "./roberta-ai-vs-human"
for save_to in (trainer.save_model, tokenizer.save_pretrained):
    save_to(SAVE_PATH)
print(f"✅ Model & tokenizer saved to '{SAVE_PATH}'.")


✅ Model & tokenizer saved to './roberta-ai-vs-human'.
