In [2]:
!pip install torch==2.3.0 transformers==4.28.0 datasets evaluate

Collecting torch==2.3.0
  Downloading torch-2.3.0-cp311-cp311-manylinux1_x86_64.whl.metadata (26 kB)
Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl.metadata (109 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.0/110.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn

In [3]:
import torch
import transformers
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import evaluate
import numpy as np

In [4]:
def preprocess_function(examples, tokenizer, max_length=128):
    """Tokenize the "text" field of a batch to a fixed length.

    Args:
        examples: Mapping with a "text" entry holding the input(s) to tokenize.
        tokenizer: Callable invoked as ``tokenizer(texts, **options)``.
        max_length: Length passed to the tokenizer; inputs are truncated to it
            and padded with ``padding="max_length"``.

    Returns:
        Whatever the tokenizer call returns.
    """
    texts = examples["text"]
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
    }
    return tokenizer(texts, **encode_options)

In [5]:
def main(seed=42, max_length=128):
  """Fine-tune DistilBERT on a small AG News slice and print demo predictions.

  Pipeline: load the tokenizer and model, take the first 2000 rows of the
  AG News train split and hold out 10% for evaluation, tokenize both parts,
  train for two epochs while reporting accuracy after each epoch, save the
  model and tokenizer to ./distilbert_topics, then print the predicted topic
  for a handful of sample headlines.

  Args:
    seed: Seed used for the global RNGs (via transformers.set_seed), the
      train/test split and the Trainer, so a fresh run repeats the same split
      and starting weights.
    max_length: Token length used both for training-time padding/truncation
      and for inference-time truncation, so the two cannot drift apart.
  """
  model_name = "distilbert-base-uncased"
  output_dir = "./distilbert_topics"
  # Label id -> topic name for the four AG News classes.
  topics = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}

  # Seed before building the model: the classification head is not part of
  # the pretrained checkpoint and is randomly initialised.
  transformers.set_seed(seed)

  tokenizer = DistilBertTokenizer.from_pretrained(model_name)
  # Store the label names in the model config so the saved checkpoint is
  # self-describing.
  model = DistilBertForSequenceClassification.from_pretrained(
      model_name,
      num_labels=len(topics),
      id2label=topics,
      label2id={name: idx for idx, name in topics.items()},
  )

  dataset = load_dataset("ag_news", split="train[:2000]")
  # A seeded split keeps the held-out examples identical across runs.
  dataset = dataset.train_test_split(test_size=0.1, seed=seed)
  train_dataset = dataset["train"]
  test_dataset = dataset["test"]

  def tokenize(batch):
    """Tokenize one batch with the shared tokenizer and max_length."""
    return preprocess_function(batch, tokenizer, max_length)

  tokenized_train = train_dataset.map(
      tokenize,
      batched=True,
      remove_columns=["text"]
  )
  tokenized_test = test_dataset.map(
      tokenize,
      batched=True,
      remove_columns=["text"]
  )

  accuracy = evaluate.load("accuracy")

  def compute_metrics(eval_pred):
    """Turn (logits, labels) from the Trainer into an accuracy dict."""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

  training_args = TrainingArguments(
      output_dir=output_dir,
      # Keyword name matches the transformers version pinned in the install cell.
      evaluation_strategy="epoch",
      learning_rate=2e-5,
      per_device_train_batch_size=8,
      per_device_eval_batch_size=8,
      num_train_epochs=2,
      save_total_limit=2,
      # Mixed precision only when a CUDA device is present.
      fp16=torch.cuda.is_available(),
      logging_steps=100,
      report_to="none",
      seed=seed
  )

  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=tokenized_train,
      eval_dataset=tokenized_test,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics
  )

  trainer.train()
  trainer.save_model(output_dir)
  tokenizer.save_pretrained(output_dir)

  # Inference step
  def predict_topic(text, model, tokenizer):
    """Return the topic name predicted for a single text."""
    inputs = tokenizer(text, return_tensors="pt", max_length=max_length,
                       truncation=True, padding=True)
    model.eval()
    with torch.no_grad():
      # Move the encoded inputs to whichever device the model lives on.
      outputs = model(**inputs.to(model.device))

    prediction = torch.argmax(outputs.logits, dim=1).item()
    return topics[prediction]

  sample_texts = [
        "UN holds emergency meeting to address rising tensions in the Middle East.",
        "Tech giant announces $2 billion investment in renewable energy projects.",
        "Local high school team wins state championship in a thrilling overtime match.",
        "Scientists develop new battery technology that charges in seconds."
    ]

  for text in sample_texts:
    topic = predict_topic(text, model, tokenizer)
    print(f"Text: {text}\nPredicted Topic: {topic}\n")

# Run the full pipeline when this cell is executed; the guard keeps main()
# from running if this code is imported as a module instead.
if __name__ == "__main__":
  main()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.we

README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss,Accuracy
1,0.4495,0.416278,0.865
2,0.2896,0.389691,0.875


Text: UN holds emergency meeting to address rising tensions in the Middle East.
Predicted Topic: World

Text: Tech giant announces $2 billion investment in renewable energy projects.
Predicted Topic: Business

Text: Local high school team wins state championship in a thrilling overtime match.
Predicted Topic: Sports

Text: Scientists develop new battery technology that charges in seconds.
Predicted Topic: Sci/Tech

