In [None]:
!pip install transformers datasets scikit-learn torch gradio




In [4]:
from datasets import load_dataset
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
import torch
import gradio as gr
from google.colab import drive
import os

In [None]:

# Fetch the AG News corpus and the uncased BERT tokenizer
dataset = load_dataset("ag_news")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize(batch):
    """Encode a batch of examples as fixed-length BERT inputs.

    Each entry of ``batch['text']`` is truncated or padded to exactly
    64 tokens, so every encoded example has the same length.
    """
    return tokenizer(
        batch['text'],
        max_length=64,
        truncation=True,
        padding='max_length',
    )


# Tokenize every split in batches, then rename the target column to `labels`
dataset = dataset.map(tokenize, batched=True).rename_column("label", "labels")

# Return torch tensors and expose only the columns used for training
dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# Inspect one encoded training example
first_example = dataset['train'][0]
print(first_example)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


{'labels': tensor(2), 'input_ids': tensor([  101,  2813,  2358,  1012,  6468, 15020,  2067,  2046,  1996,  2304,
         1006, 26665,  1007, 26665,  1011,  2460,  1011, 19041,  1010,  2813,
         2395,  1005,  1055,  1040, 11101,  2989,  1032,  2316,  1997, 11087,
         1011, 22330,  8713,  2015,  1010,  2024,  3773,  2665,  2153,  1012,
          102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}


**Load BERT for Sequence Classification**

In [None]:


# Four output classes; the classification head on top of the pretrained
# encoder is newly initialised and has to be trained before use.
num_labels = 4
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Trainer & Training Arguments**

In [None]:

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: evaluation output exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average='weighted'),
    }

training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir='./logs',
    # Optimisation settings
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Log every 50 steps and do not report to any external tracker
    logging_steps=50,
    report_to="none",
)

# Wire the model, hyperparameters, data splits and metric function together.
# The AG News `test` split doubles as the evaluation set.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword is
# deprecated in Trainer.__init__ and is removed in Transformers v5.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


**Train the Model**

In [None]:
# Run fine-tuning. The call is left as the cell's last expression so the
# returned TrainOutput (step count, training loss, runtime metrics) is displayed.
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.171,0.177342,0.944868,0.944891


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.171,0.177342,0.944868,0.944891
2,0.1024,0.196593,0.947632,0.947703


TrainOutput(global_step=15000, training_loss=0.1800240215619405, metrics={'train_runtime': 2921.7745, 'train_samples_per_second': 82.142, 'train_steps_per_second': 5.134, 'total_flos': 7893473402880000.0, 'train_loss': 0.1800240215619405, 'epoch': 2.0})

**Save the Model**

In [None]:
# Mount Google Drive before writing to it. Without the mount, /content/drive is
# just a directory on the temporary Colab VM, so the "saved" model would be lost
# when the session ends (and a later mount onto the non-empty path would fail).
drive.mount("/content/drive")

# Store the fine-tuned weights and the tokenizer files in the same directory so
# the whole thing can be reloaded with from_pretrained().
save_dir = "/content/drive/MyDrive/Colab Notebooks/my_bert_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('/content/drive/MyDrive/Colab Notebooks/my_bert_model/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/my_bert_model/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/my_bert_model/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/my_bert_model/added_tokens.json')

In [2]:
# Directory holding the fine-tuned checkpoint
MODEL_PATH = "/content/drive/MyDrive/Colab Notebooks/my_bert_model"

# Prefer the GPU when one is available
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Restore tokenizer and classifier, move the model to the chosen device and
# switch it to evaluation mode
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)
model = BertForSequenceClassification.from_pretrained(MODEL_PATH)
model = model.to(device)
model.eval()

# Class id -> topic name, listed in class-id order (0, 1, 2, 3)
labels_map = dict(enumerate(["World", "Sports", "Business", "Sci/Tech"]))

def predict_news(text):
    """Classify a single news headline.

    Args:
        text: the headline to classify.

    Returns:
        The topic name from ``labels_map`` for the highest-scoring class.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    encoded = encoded.to(device)

    # Inference only: no gradients are needed
    with torch.no_grad():
        logits = model(**encoded).logits

    class_id = logits.argmax(-1).item()
    return labels_map[class_id]

# Optional smoke test on a few hand-written headlines
test_sentences = [
    "Apple unveils new AI-powered iPhone",
    "Pakistan wins thrilling cricket match",
    "Stock markets fall due to inflation",
    "NASA discovers new exoplanet",
]

for headline in test_sentences:
    topic = predict_news(headline)
    print(f"Headline: {headline} --> Predicted Topic: {topic}")

# Build the web UI: one text box in, one label out
headline_box = gr.Textbox(label="News Headline")
topic_label = gr.Label(label="Predicted Topic")

interface = gr.Interface(
    fn=predict_news,
    inputs=headline_box,
    outputs=topic_label,
    title="News Topic Classifier (BERT)",
    description="Classifies news headlines into 4 categories: World, Sports, Business, Sci/Tech",
)

# Start the app; share=True additionally requests a public URL
interface.launch(share=True)


Using device: cuda
Headline: Apple unveils new AI-powered iPhone --> Predicted Topic: Sci/Tech
Headline: Pakistan wins thrilling cricket match --> Predicted Topic: World
Headline: Stock markets fall due to inflation --> Predicted Topic: Business
Headline: NASA discovers new exoplanet --> Predicted Topic: Sci/Tech
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://eb15c5714debed600b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [3]:
# Write the exact version of every package installed in this environment to
# requirements.txt (in the notebook's current working directory).
!pip freeze > requirements.txt
