<a href="https://colab.research.google.com/github/chhavi10/Gitcourse/blob/main/Huggingface%20Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

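## Task 1: Using a Pre-trained BERT for Classification

Note: `bert-base-uncased` ships without a trained classification head, so the head is randomly initialised at load time (see the warning in the cell output). The predictions below are therefore an untrained baseline, not real sentiment predictions.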

In [5]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Seed torch BEFORE loading the model: the checkpoint has no classifier weights,
# so the classification head is randomly initialised on load. Without a seed the
# predicted labels change from run to run.
torch.manual_seed(42)

# Label mapping, registered on the model config below so that decoding reads it
# from the model instead of relying on a detached dictionary.
id2label = {0: "negative", 1: "positive"}
label2id = {label: idx for idx, label in id2label.items()}

# Load pre-trained model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Sample sentences
sentences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This movie is great!",
    "I don't like this book."
]

# Tokenize and prepare input (pad to the longest sentence in the batch)
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

# Perform inference without tracking gradients
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)

# Map predictions to labels.
# NOTE: the head is untrained, so these labels are a random baseline, not a
# meaningful sentiment prediction; fine-tune the model before relying on them.
predicted_labels = [model.config.id2label[pred.item()] for pred in predictions]

# Display results
for sentence, label in zip(sentences, predicted_labels):
    print(f"Sentence: '{sentence}' - Predicted Label: {label}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Sentence: 'I've been waiting for a HuggingFace course my whole life.' - Predicted Label: negative
Sentence: 'This movie is great!' - Predicted Label: negative
Sentence: 'I don't like this book.' - Predicted Label: negative


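## Task 2: Building a BERT Classifier Using Another Dataset

This cell evaluates `bert-base-uncased` on 50 toxic and 50 non-toxic prompts sampled from `lmsys/toxic-chat`. The classification head is still randomly initialised, so an accuracy close to 0.5 on this balanced sample is the expected untrained baseline.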

In [4]:
!pip install datasets transformers torch

import torch
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import random

# Set random seed for reproducibility
random.seed(42)

# Load the dataset
ds = load_dataset("lmsys/toxic-chat", "toxicchat0124")

# Select 50 examples per class
toxic_examples = ds["train"].filter(lambda x: x["toxicity"] == 1).shuffle(seed=42).select(range(50))
nontoxic_examples = ds["train"].filter(lambda x: x["toxicity"] == 0).shuffle(seed=42).select(range(50))

# Combine the datasets using concatenate_datasets
combined_examples = concatenate_datasets([toxic_examples, nontoxic_examples])

# Load the pre-trained BERT model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Tokenize the input text
inputs = tokenizer(combined_examples["user_input"], padding=True, truncation=True, return_tensors="pt")

# Get the model's predictions
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)

# Convert the true labels to a tensor
true_labels = torch.tensor(combined_examples["toxicity"])

# Calculate the accuracy score
accuracy = (predictions == true_labels).float().mean().item()
print(f"Accuracy: {accuracy:.4f}")



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy: 0.5100


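## Task 3: Multi-label Classification with BERT

Each URL in the sample carries a single class from the five most frequent ones; the cell encodes it as a one-hot target vector and scores every label independently by thresholding its logit at 0.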

In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from collections import Counter
import random

# Set random seed for reproducibility (torch seed fixes the random head init)
random.seed(42)
torch.manual_seed(42)

# Number of examples sampled for each of the selected classes
EXAMPLES_PER_CLASS = 20

# Load the dataset
ds = load_dataset("snats/url-classifications")

# Find the top 5 most frequent classes
label_distribution = Counter(ds["train"]["classification"])
top_5_classes = [label for label, _ in label_distribution.most_common(5)]
print(f"Top 5 classes: {top_5_classes}")

# Select up to EXAMPLES_PER_CLASS examples per class from the top 5 classes
selected_examples = []
for label in top_5_classes:
    # Filter examples by class and shuffle them with a fixed seed
    class_rows = ds["train"].filter(lambda x: x["classification"] == label).shuffle(seed=42)
    # Clamp the sample size so a class with fewer rows does not raise an index error
    sample_size = min(EXAMPLES_PER_CLASS, len(class_rows))
    examples = class_rows.select(range(sample_size))
    selected_examples.extend(examples)

# Create label mappings
id2label = {i: label for i, label in enumerate(top_5_classes)}
label2id = {label: i for i, label in id2label.items()}

# Load the pre-trained BERT model.
# problem_type is set so the model config matches the multi-hot targets and the
# per-logit thresholding used below; it does not change the inference outputs.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(top_5_classes),  # Set the number of output classes
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification",
)

# Tokenize the input text
inputs = tokenizer([example["url"] for example in selected_examples], padding=True, truncation=True, return_tensors="pt")

# Prepare true labels as a (num_examples, num_classes) one-hot matrix
true_labels = torch.zeros((len(selected_examples), len(top_5_classes)))
for i, example in enumerate(selected_examples):
    label_index = label2id[example["classification"]]
    true_labels[i, label_index] = 1

# Get the model's predictions
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    # Convert logits to binary predictions (0 or 1);
    # logit > 0 is equivalent to sigmoid(logit) > 0.5
    predictions = (outputs.logits > 0).float()

# Calculate overall accuracy.
# NOTE: this is element-wise accuracy over the whole label matrix, so correctly
# predicted zeros count as hits; it is not the share of fully correct examples.
overall_accuracy = (predictions == true_labels).float().mean().item()
print(f"Overall Accuracy: {overall_accuracy:.4f}")

# Calculate accuracy per class (binary accuracy of each label column)
for label, idx in label2id.items():
    class_accuracy = (predictions[:, idx] == true_labels[:, idx]).float().mean().item()
    print(f"Accuracy for class {label}: {class_accuracy:.4f}")

Repo card metadata block was not found. Setting CardData to empty.


Top 5 classes: ['arts', 'education', 'business', 'government', 'social-sciences']


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Overall Accuracy: 0.3300
Accuracy for class arts: 0.2000
Accuracy for class education: 0.2000
Accuracy for class business: 0.8000
Accuracy for class government: 0.2000
Accuracy for class social-sciences: 0.2500
