In [3]:
!pip install -U transformers datasets evaluate



In [4]:
# ============================================================
# DistilBERT fine‑tune – AG News (binary: Business vs Other)
# This version bypasses HF cache issues by using pandas CSVs
# ============================================================

# 1) Install libraries
!pip install -q --upgrade transformers datasets evaluate
!pip install -q accelerate sentencepiece pandas

# 2) Imports and setup
import pandas as pd, numpy as np, os
from datasets import Dataset
from transformers import (DistilBertTokenizerFast,
                          DistilBertForSequenceClassification,
                          Trainer, TrainingArguments)
import evaluate
os.environ["WANDB_DISABLED"] = "true"

# 3) Download CSVs with pandas
train_df = pd.read_csv(
    "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv",
    header=None, names=["label", "title", "text"])
test_df = pd.read_csv(
    "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/test.csv",
    header=None, names=["label", "title", "text"])

# 4) Combine title and text, create binary labels (Business=1, else 0)
for df in (train_df, test_df):
    df["text"] = df["title"] + " " + df["text"]
    df["labels"] = df["label"].apply(lambda x: 1 if x == 2 else 0)
    df.drop(columns=["label", "title"], inplace=True)

# 5) Convert to Hugging Face Datasets
train_ds = Dataset.from_pandas(train_df)
test_ds  = Dataset.from_pandas(test_df)

# 6) Tokenisation
tok = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tok_fn(batch):
    return tok(batch["text"], truncation=True,
               padding="max_length", max_length=128)

train_tok = train_ds.map(tok_fn, batched=True)
test_tok  = test_ds.map(tok_fn,  batched=True)

train_tok.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
test_tok.set_format("torch",  columns=["input_ids", "attention_mask", "labels"])

# 7) Model
model = DistilBertForSequenceClassification.from_pretrained(
            "distilbert-base-uncased", num_labels=2)

# 8) Metric
acc = evaluate.load("accuracy")
def metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return acc.compute(predictions=preds, references=labels)

# 9) Training arguments
#args = TrainingArguments(
    #output_dir="./ag_binary_out",
    #evaluation_strategy="epoch",
    #save_strategy="epoch",
    #num_train_epochs=1,
    #per_device_train_batch_size=16,
    #per_device_eval_batch_size=32,
    #learning_rate=5e-5,
    #weight_decay=0.01,
    #logging_steps=200,
    #load_best_model_at_end=True,
    #metric_for_best_model="accuracy")

# 9) Training arguments (compatible with older Transformers versions)
args = TrainingArguments(
    output_dir="./ag_binary_out",
    do_train=True,
    do_eval=True,                 # evaluate during training
    eval_steps=500,               # evaluate every 500 steps
    logging_steps=200,            # log training info every 200 steps
    save_steps=500,               # save model every 500 steps
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=5e-5,
    weight_decay=0.01,
)


# 10) Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok.shuffle(seed=42).select(range(30000)),  # 30k subset
    eval_dataset=test_tok.select(range(5000)),                      # 5k subset
    tokenizer=tok,
    compute_metrics=metrics)


# 11) Train
trainer.train()

# 12) Evaluate
res = trainer.evaluate()
print(f"\nFinal test accuracy: {res['eval_accuracy']:.4f}")

# 13) Save
trainer.save_model("distilbert_agnews_business_binary")
tok.save_pretrained("distilbert_agnews_business_binary")


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss
200,0.1124
400,0.0783
600,0.0576
800,0.0577
1000,0.0674
1200,0.0446
1400,0.0461
1600,0.039
1800,0.0547



Final test accuracy: 0.9860


('distilbert_agnews_business_binary/tokenizer_config.json',
 'distilbert_agnews_business_binary/special_tokens_map.json',
 'distilbert_agnews_business_binary/vocab.txt',
 'distilbert_agnews_business_binary/added_tokens.json',
 'distilbert_agnews_business_binary/tokenizer.json')