In [1]:
#import necessary libraries
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import torch

# Load AG News dataset
ag_news = load_dataset("ag_news")

#Load the huffpost dataset
import json
data = []

with open('News_Category_Dataset.json', 'r', encoding='utf-8') as f:
    for line in f:
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError:
            continue   # skip bad lines

huffpost = pd.DataFrame(data)

# split the training and testing set of the ag_new
train_dataset_raw = ag_news['train']
test_dataset_raw = ag_news['test']

# Convert to pandas for easier preprocessing
train_df = pd.DataFrame(train_dataset_raw)
test_df = pd.DataFrame(test_dataset_raw)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [2]:
#filter HuffPost dataset for selected categories
selected_categories=['ENTERTAINMENT', 'CRIME', 'HEALTHY LIVING', 'POLITICS']

#keep only rows where 'category' is in the selected categories
huffpost=huffpost[huffpost['category'].isin(selected_categories)]
huffpost

Unnamed: 0,short_description,headline,date,link,authors,category
0,She left her husband. He killed their children...,There Were 2 Mass Shootings In Texas Last Week...,2018-05-26,https://www.huffingtonpost.com/entry/texas-ama...,Melissa Jeltsen,CRIME
1,Of course it has a song.,Will Smith Joins Diplo And Nicky Jam For The 2...,2018-05-26,https://www.huffingtonpost.com/entry/will-smit...,Andy McDonald,ENTERTAINMENT
2,The actor and his longtime girlfriend Anna Ebe...,Hugh Grant Marries For The First Time At Age 57,2018-05-26,https://www.huffingtonpost.com/entry/hugh-gran...,Ron Dicker,ENTERTAINMENT
3,The actor gives Dems an ass-kicking for not fi...,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,2018-05-26,https://www.huffingtonpost.com/entry/jim-carre...,Ron Dicker,ENTERTAINMENT
4,"The ""Dietland"" actress said using the bags is ...",Julianna Margulies Uses Donald Trump Poop Bags...,2018-05-26,https://www.huffingtonpost.com/entry/julianna-...,Ron Dicker,ENTERTAINMENT
...,...,...,...,...,...,...
66480,The solution to this political problem is star...,Why Obama Should Nominate Barack Obama For The...,2016-02-16,https://www.huffingtonpost.com/entry/obama-sho...,Ryan Grim,POLITICS
66482,"Ed Sheeran, Meghan Trainor and One Direction a...",'Mean Tweets' Goes From 0 To 100 Real Quick Wi...,2016-02-16,https://www.huffingtonpost.com/entry/drake-dem...,Bill Bradley,ENTERTAINMENT
66483,McAllister was a contestant on Season 14 of th...,Former 'Bachelor' Contestant Lex McAllister De...,2016-02-16,https://www.huffingtonpost.com/entry/lex-mcall...,Stephanie Marcus,ENTERTAINMENT
66485,Bowie would have been proud.,Lady Gaga's Tribute To David Bowie At The Gram...,2016-02-16,https://www.huffingtonpost.com/entry/lady-gaga...,Maxwell Strachan,ENTERTAINMENT


In [3]:
#Show the number of values in each category
print(huffpost['category'].value_counts())

category
POLITICS          22427
ENTERTAINMENT      7946
HEALTHY LIVING     2550
CRIME              1295
Name: count, dtype: int64


In [4]:
#drop the unnecessary columns in the huffpost dataset
huffpost.drop(columns=['date', 'link', 'authors'], inplace=True)

In [5]:
#rename the column title
huffpost=huffpost.rename(columns={'short_description': 'description', 'headline':'title', 'category':'label'})
huffpost

Unnamed: 0,description,title,label
0,She left her husband. He killed their children...,There Were 2 Mass Shootings In Texas Last Week...,CRIME
1,Of course it has a song.,Will Smith Joins Diplo And Nicky Jam For The 2...,ENTERTAINMENT
2,The actor and his longtime girlfriend Anna Ebe...,Hugh Grant Marries For The First Time At Age 57,ENTERTAINMENT
3,The actor gives Dems an ass-kicking for not fi...,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,ENTERTAINMENT
4,"The ""Dietland"" actress said using the bags is ...",Julianna Margulies Uses Donald Trump Poop Bags...,ENTERTAINMENT
...,...,...,...
66480,The solution to this political problem is star...,Why Obama Should Nominate Barack Obama For The...,POLITICS
66482,"Ed Sheeran, Meghan Trainor and One Direction a...",'Mean Tweets' Goes From 0 To 100 Real Quick Wi...,ENTERTAINMENT
66483,McAllister was a contestant on Season 14 of th...,Former 'Bachelor' Contestant Lex McAllister De...,ENTERTAINMENT
66485,Bowie would have been proud.,Lady Gaga's Tribute To David Bowie At The Gram...,ENTERTAINMENT


In [6]:
#Map the columns to different numeric values
label2id = {
    'HEALTHY LIVING': 4,
    'CRIME': 5,
    'POLITICS': 6,
    'ENTERTAINMENT': 7
}

huffpost['label'] = huffpost['label'].map(label2id)


In [7]:
#Showcasing unique label values
huffpost['label'].unique()


array([5, 7, 6, 4])

In [8]:
huffpost

Unnamed: 0,description,title,label
0,She left her husband. He killed their children...,There Were 2 Mass Shootings In Texas Last Week...,5
1,Of course it has a song.,Will Smith Joins Diplo And Nicky Jam For The 2...,7
2,The actor and his longtime girlfriend Anna Ebe...,Hugh Grant Marries For The First Time At Age 57,7
3,The actor gives Dems an ass-kicking for not fi...,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,7
4,"The ""Dietland"" actress said using the bags is ...",Julianna Margulies Uses Donald Trump Poop Bags...,7
...,...,...,...
66480,The solution to this political problem is star...,Why Obama Should Nominate Barack Obama For The...,6
66482,"Ed Sheeran, Meghan Trainor and One Direction a...",'Mean Tweets' Goes From 0 To 100 Real Quick Wi...,7
66483,McAllister was a contestant on Season 14 of th...,Former 'Bachelor' Contestant Lex McAllister De...,7
66485,Bowie would have been proud.,Lady Gaga's Tribute To David Bowie At The Gram...,7


In [9]:
#Showcasing unique values in Ag News dataset
train_df['label'].unique()


array([2, 3, 1, 0])

In [10]:
#Concatenating title and description
huffpost['text']=huffpost['title']+huffpost['description']

In [11]:
#Concatenating huffpost and AG News dataset
train_df = pd.concat([huffpost[['text', 'label']], train_df[['text', 'label']]], ignore_index=True)

train_df

Unnamed: 0,text,label
0,There Were 2 Mass Shootings In Texas Last Week...,5
1,Will Smith Joins Diplo And Nicky Jam For The 2...,7
2,Hugh Grant Marries For The First Time At Age 5...,7
3,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,7
4,Julianna Margulies Uses Donald Trump Poop Bags...,7
...,...,...
154213,Pakistan's Musharraf Says Won't Quit as Army C...,0
154214,Renteria signing a top-shelf deal Red Sox gene...,1
154215,Saban not going to Dolphins yet The Miami Dolp...,1
154216,Today's NFL games PITTSBURGH at NY GIANTS Time...,1


In [12]:
#Unique label of the dataset
train_df['label'].unique()

array([5, 7, 6, 4, 2, 3, 1, 0])

In [13]:
#Cleaning the dataset
import re

def clean_text(text):
    text = str(text).lower()                     # lowercase
    text = re.sub(r'<.*?>', '', text)            # remove HTML tags
    text = re.sub(r'http\S+', '', text)          # remove URLs
    text = re.sub(r'\s+', ' ', text).strip()    # normalize whitespace
    return text
train_df['text'] = train_df['text'].apply(clean_text)

In [14]:
train_df

Unnamed: 0,text,label
0,there were 2 mass shootings in texas last week...,5
1,will smith joins diplo and nicky jam for the 2...,7
2,hugh grant marries for the first time at age 5...,7
3,jim carrey blasts 'castrato' adam schiff and d...,7
4,julianna margulies uses donald trump poop bags...,7
...,...,...
154213,pakistan's musharraf says won't quit as army c...,0
154214,renteria signing a top-shelf deal red sox gene...,1
154215,saban not going to dolphins yet the miami dolp...,1
154216,today's nfl games pittsburgh at ny giants time...,1


In [15]:
train_df['label'].value_counts()


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,30000
1,30000
3,30000
2,30000
6,22427
7,7946
4,2550
5,1295


In [16]:
# Balance the training dataset by oversampling minority classes
from sklearn.utils import resample

# Set a target count for each class
target_count = 10000

balanced_dfs = []

# Iterate over each unique label
for label in train_df['label'].unique():
    df_label = train_df[train_df['label'] == label]  # Extract rows of this label

    if len(df_label) < target_count:
        # Oversample with replacement to reach the target count
        df_resampled = resample(df_label, replace=True, n_samples=target_count, random_state=42)
    else:
        # Keep original if already enough samples
        df_resampled = df_label

    balanced_dfs.append(df_resampled)

# Combine all resampled data into a single balanced DataFrame
combined_balanced = pd.concat(balanced_dfs, ignore_index=True)

In [17]:
combined_balanced['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,30000
1,30000
3,30000
2,30000
6,22427
4,10000
5,10000
7,10000


In [18]:
#Split dataset into training and testing sets
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    combined_balanced,
    test_size=0.15,  # 15% for validation
    stratify=combined_balanced['label'],  # keeps label proportions
    random_state=42
)


In [19]:
from datasets import Dataset
from transformers import AutoTokenizer

#choose the pre-trained model
model_ckpt = "distilbert-base-uncased"

#initialize the tokenizer corresponding to the pre-trained model
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

#convert pandas DataFrames to Hugging Face Datasets for compatibility with Trainer API
train_dataset = Dataset.from_pandas(train_df)  # Training dataset
val_dataset   = Dataset.from_pandas(val_df)    # Validation dataset


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [20]:
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=256)

# Apply tokenization
train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset   = val_dataset.map(tokenize, batched=True)

# Remove unnecessary columns
train_dataset = train_dataset.remove_columns([col for col in train_dataset.column_names if col not in ["input_ids", "attention_mask", "label"]])
val_dataset   = val_dataset.remove_columns([col for col in val_dataset.column_names if col not in ["input_ids", "attention_mask", "label"]])


Map:   0%|          | 0/146562 [00:00<?, ? examples/s]

Map:   0%|          | 0/25865 [00:00<?, ? examples/s]

In [69]:
# Define the number of output classes for classification
num_labels = 8

# Set device to GPU if available, otherwise CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the pre-trained model for sequence classification and adapt it to our number of labels
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels
).to(device)  # Move model to the selected device (GPU/CPU)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [75]:
from transformers import TrainingArguments

# Define the hyperparameters and training configurations
training_args = TrainingArguments(
    output_dir="./results",# Directory to save model checkpoints and logs
    eval_strategy="epoch", # Evaluate the model at the end of each epoch
    save_strategy="epoch", # Save the model checkpoint at the end of each epoch
    save_total_limit=2, # Keep only the last 2 checkpoints to save space

    #learning-rate=5e-5
    learning_rate=3e-5, # Learning rate for optimizer
    per_device_train_batch_size=16, # Batch size for training
    per_device_eval_batch_size=16, # Batch size for evaluation
    num_train_epochs=3, # Number of training epochs

    #weight_decay=0.05
    weight_decay=0.01, # L2 regularization to prevent overfitting
    load_best_model_at_end=True, # Automatically load the best model after training
    metric_for_best_model="accuracy", # Metric to determine the best model
    fp16=True, # Use mixed precision for faster training on GPU
    gradient_accumulation_steps=2, # Accumulate gradients to simulate a larger batch size
    logging_strategy="epoch", # Log metrics at the end of each epoch
    seed=42 # Set seed for reproducibility
)


In [76]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(p):
    preds = p.predictions.argmax(axis=-1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

trainer = Trainer(
    model=model, # The pre-trained/fine-tuned model
    args=training_args, # Training arguments defined earlier
    train_dataset=train_dataset, # Training dataset
    eval_dataset=val_dataset,# Validation dataset
    tokenizer=tokenizer, # Tokenizer used for preprocessing
    compute_metrics=compute_metrics  # Custom metrics function
)

  trainer = Trainer(


In [77]:
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 1


[34m[1mwandb[0m: You chose 'Create a W&B account'
[34m[1mwandb[0m: Create an account here: https://wandb.ai/authorize?signup=true&ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbajracharyadixita321[0m ([33mbajracharyadixita321-islington-college[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2345,0.14347,0.954794,0.954812,0.955331,0.954794
2,0.1004,0.129038,0.962292,0.962201,0.962346,0.962292
3,0.0562,0.13883,0.964763,0.964711,0.964753,0.964763


TrainOutput(global_step=19344, training_loss=0.13036766557086096, metrics={'train_runtime': 7806.4584, 'train_samples_per_second': 79.291, 'train_steps_per_second': 2.478, 'total_flos': 8.200397389765018e+16, 'train_loss': 0.13036766557086096, 'epoch': 3.0})

In [84]:
# Save the model and tokenizer
model_save_path = "./trained_model"

# Save the model weights & configuration
model.save_pretrained(model_save_path)

# Save the tokenizer
tokenizer.save_pretrained(model_save_path)


('./trained_model/tokenizer_config.json',
 './trained_model/special_tokens_map.json',
 './trained_model/vocab.txt',
 './trained_model/added_tokens.json',
 './trained_model/tokenizer.json')

In [93]:
sample_headlines = [
    "Tips for a healthier lifestyle",
    "Local community garden promotes healthy living",
    "Bank robbery in downtown area",
    "Police investigate theft at local mall",
    "Government passes new healthcare policy",
    "City implements new recycling program",
    "New movie breaks box office records",
    "Celebrity couple announces divorce",

    "Scientists develop advanced renewable battery",
    "NASA unveils plans for Mars mission",
    "Peace talks held between neighboring countries",
    "International summit addresses climate change",
    "New tax regulations impact small businesses",
    "Venture capital investment hits record high",


]

inputs = tokenizer(sample_headlines, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)
    preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
id2label = {
    0: 'WORLD',
    1: 'SPORTS',
    2: 'BUSINESS',
    3: 'SCI/TECH',
    4: 'HEALTHY LIVING',
    5: 'CRIME',
    6: 'POLITICS',
    7: 'ENTERTAINMENT'
}

predicted_labels = [id2label[i] for i in preds]

for headline, label in zip(sample_headlines, predicted_labels):
    print(f"Headline: {headline}\nPredicted Category: {label}\n")


Headline: Tips for a healthier lifestyle
Predicted Category: HEALTHY LIVING

Headline: Local community garden promotes healthy living
Predicted Category: HEALTHY LIVING

Headline: Bank robbery in downtown area
Predicted Category: CRIME

Headline: Police investigate theft at local mall
Predicted Category: CRIME

Headline: Government passes new healthcare policy
Predicted Category: POLITICS

Headline: City implements new recycling program
Predicted Category: POLITICS

Headline: New movie breaks box office records
Predicted Category: ENTERTAINMENT

Headline: Celebrity couple announces divorce
Predicted Category: ENTERTAINMENT

Headline: Scientists develop advanced renewable battery
Predicted Category: HEALTHY LIVING

Headline: NASA unveils plans for Mars mission
Predicted Category: SCI/TECH

Headline: Peace talks held between neighboring countries
Predicted Category: WORLD

Headline: International summit addresses climate change
Predicted Category: POLITICS

Headline: New tax regulations 