In [None]:
from datasets import load_dataset

# Download the dataset (cached locally in ~/.cache/huggingface/)
ds = load_dataset("fancyzhx/ag_news")

# Save as CSV files
ds["train"].to_csv("ag_news_train.csv", index=False)
ds["test"].to_csv("ag_news_test.csv", index=False)

print("Download complete! Files saved as ag_news_train.csv and ag_news_test.csv")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/120 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Download complete! Files saved as ag_news_train.csv and ag_news_test.csv


In [None]:
import pandas as pd

train_df = pd.read_csv("/content/ag_news_train.csv")
test_df = pd.read_csv("/content/ag_news_test.csv")

print("Training data shape:", train_df.shape)
print("Testing data shape:", test_df.shape)

display(train_df.head())
display(test_df.head())

Training data shape: (120000, 2)
Testing data shape: (7600, 2)


Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black (Reute...,2
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2
3,Iraq Halts Oil Exports from Main Southern Pipe...,2
4,"Oil prices soar to all-time record, posing new...",2


Unnamed: 0,text,label
0,Fears for T N pension after talks Unions repre...,2
1,The Race is On: Second Private Team Sets Launc...,3
2,Ky. Company Wins Grant to Study Peptides (AP) ...,3
3,Prediction Unit Helps Forecast Wildfires (AP) ...,3
4,Calif. Aims to Limit Farm-Related Smog (AP) AP...,3


In [None]:
from transformers import BertTokenizer
from datasets import Dataset

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Convert pandas DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenize the datasets
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# Rename the 'label' column to 'labels' to match the expected input for the model
tokenized_train_dataset = tokenized_train_dataset.rename_column("label", "labels")
tokenized_test_dataset = tokenized_test_dataset.rename_column("label", "labels")

# Set the format for PyTorch
tokenized_train_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
tokenized_test_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

print("Tokenization complete!")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Define the number of labels
num_labels = train_df['label'].nunique()

# Load the pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    num_train_epochs=3,              # Total number of training epochs
    per_device_train_batch_size=16,  # Batch size per device during training
    per_device_eval_batch_size=64,   # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=1000,
    #evaluation_strategy="epoch",     # Evaluate every epoch
    #save_strategy="epoch",           # Save checkpoint every epoch
    #load_best_model_at_end=True,     # Load the best model at the end of training
)

print("BERT model and training arguments configured!")

In [5]:
%pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m847.0 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [None]:
import numpy as np
from evaluate import load
from transformers import Trainer # Import Trainer here as well

# Define the metrics to be used
accuracy_metric = load("accuracy")
f1_metric = load("f1")

# Define a function to compute both accuracy and f1
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted") # Use weighted average for multi-class
    return {"accuracy": accuracy["accuracy"], "f1": f1["f1"]}

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=tokenized_train_dataset.select(range(1000)), # training dataset (using a subset)
    eval_dataset=tokenized_test_dataset,   # evaluation dataset
    compute_metrics=compute_metrics      # the function to compute metrics
)

print("Trainer initialized!")

# Start training
print("Starting training...")
trainer.train()
print("Training complete!")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Trainer initialized!
Starting training...


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

In [None]:
# Evaluate the model
evaluation_results = trainer.evaluate()

print("Evaluation Results:")
print(evaluation_results)

In [None]:
# Save the trained model
model_path = "./fine_tuned_bert_ag_news"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

print(f"Model saved to {model_path}")

Model saved to ./fine_tuned_bert_ag_news


# Task
Fine-tune a transformer model (e.g., BERT) to classify news headlines into topic categories using the dataset from "/content/ag_news_train.csv". Tokenize and preprocess the dataset, fine-tune the bert-base-uncased model using Hugging Face Transformers on a subset of the data, evaluate the model using accuracy and F1-score, and deploy the model using Gradio.

## Install gradio

### Subtask:
Install the Gradio library.


**Reasoning**:
Install the gradio library using pip.



In [None]:
%pip install gradio

## Create a prediction function

### Subtask:
Define a function that takes text input, preprocesses it, makes a prediction using the trained model, and returns the predicted label.


**Reasoning**:
Define a function to predict the news category using the trained model and tokenizer.



In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load the trained tokenizer and model
model_path = "./fine_tuned_bert_ag_news"
try:
    tokenizer = BertTokenizer.from_pretrained(model_path)
    model = BertForSequenceClassification.from_pretrained(model_path)
    print("Model and tokenizer loaded successfully!")
except Exception as e:
    print(f"Error loading model or tokenizer: {e}")
    tokenizer = None
    model = None


# Define a function to predict the news category
def predict_news_category(text):
    if tokenizer is None or model is None:
        return "Error: Model or tokenizer not loaded."

    try:
        # Tokenize the input text
        inputs = tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=tokenizer.model_max_length)
        print(f"Tokenized input: {inputs}")

        # Make prediction
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            predicted_label_id = torch.argmax(logits, dim=1).item()
            print(f"Prediction logits: {logits}")
            print(f"Predicted label ID: {predicted_label_id}")


        # Map label ID back to category name (assuming 0-3 mapping to categories)
        # Based on the dataset description and common practice for AG News, the labels are:
        # 0: World, 1: Sports, 2: Business, 3: Sci/Tech
        label_map = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
        predicted_category = label_map.get(predicted_label_id, "Unknown Category")
        print(f"Predicted category: {predicted_category}")


        return predicted_category
    except Exception as e:
        print(f"Error during prediction: {e}")
        return f"Error during prediction: {e}"

print("Prediction function defined!")

## Build the gradio interface

### Subtask:
Create a Gradio interface using the prediction function and launch it.


**Reasoning**:
Create a Gradio interface using the prediction function and launch it.



In [None]:
import gradio as gr

# Create the Gradio interface
iface = gr.Interface(
    fn=predict_news_category,
    inputs="text",
    outputs="text",
    title="AG News Category Classifier"
)

# Launch the interface
iface.launch()