<a href="https://colab.research.google.com/github/gregworks/Hands-on-Generative-AI/blob/main/Day-3/capstone_project_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install scikit-learn datasets



In [2]:
# Import necessary libraries
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2Model
import torch.nn as nn
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score


# Task 1: Dataset Exploration
# Load the dbpedia_14 dataset
dataset = load_dataset('dbpedia_14')

# Quick exploration
print(dataset['train'].shape)
print(dataset['train'].features)
print(dataset['train'][0])



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/106M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/13.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/560000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/70000 [00:00<?, ? examples/s]

(560000, 3)
{'label': ClassLabel(names=['Company', 'EducationalInstitution', 'Artist', 'Athlete', 'OfficeHolder', 'MeanOfTransportation', 'Building', 'NaturalPlace', 'Village', 'Animal', 'Plant', 'Album', 'Film', 'WrittenWork']), 'title': Value('string'), 'content': Value('string')}
{'label': 0, 'title': 'E. D. Abbott Ltd', 'content': ' Abbott of Farnham E D Abbott Limited was a British coachbuilding business based in Farnham Surrey trading under that name from 1929. A major part of their output was under sub-contract to motor vehicle manufacturers. Their business closed in 1972.'}


In [3]:
# Task 2: Data Pre-processing
# Tokenize the textual descriptions
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
tokenizer.pad_token = tokenizer.eos_token

def tokenize_and_format(examples):
    encodings = tokenizer(examples['content'], truncation=True, padding='max_length', max_length=256)
    encodings['labels'] = examples['label']
    return encodings

tokenized_datasets = dataset.map(tokenize_and_format, batched=True)
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Taking 0.5% of the training and test data
train_dataset_small = tokenized_datasets["train"].shuffle(seed=42).select(range(0, int(0.005 * len(tokenized_datasets["train"]))))
test_dataset_small = tokenized_datasets["test"].shuffle(seed=42).select(range(0, int(0.005 * len(tokenized_datasets["test"]))))

train_dataloader = DataLoader(train_dataset_small, shuffle=True, batch_size=8)
test_dataloader = DataLoader(test_dataset_small, batch_size=8)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Map:   0%|          | 0/560000 [00:00<?, ? examples/s]

Map:   0%|          | 0/70000 [00:00<?, ? examples/s]

In [4]:
# Task 3: Model Building
class GPT2ForClassification(nn.Module):
    def __init__(self, num_labels=14):
        super(GPT2ForClassification, self).__init__()
        self.gpt2 = GPT2Model.from_pretrained('gpt2-medium')
        self.classifier = nn.Linear(self.gpt2.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.gpt2(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        logits = self.classifier(hidden_states[:, -1])
        return logits

model = GPT2ForClassification().to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))



model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Task 4: Model Training
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()

num_epochs = 10  # Sample value. Can be increased as needed.
for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()

        inputs, masks, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        inputs, masks, labels = inputs.to(device), masks.to(device), labels.to(device)

        logits = model(inputs, masks)
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} completed!")



Epoch 1 completed!
Epoch 2 completed!


KeyboardInterrupt: 

In [7]:
loss.item()

0.014057064428925514

In [8]:
# Task 5: Model Evaluation
model.eval()

all_predictions = []
all_true_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        inputs, masks, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        inputs, masks, labels = inputs.to(device), masks.to(device), labels.to(device)

        logits = model(inputs, masks)
        _, preds = torch.max(logits, dim=1)

        all_predictions.extend(preds.cpu().numpy())
        all_true_labels.extend(labels.cpu().numpy())

# Compute accuracy and F1 score
accuracy = accuracy_score(all_true_labels, all_predictions)
f1 = f1_score(all_true_labels, all_predictions, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")



Accuracy: 0.9686
F1 Score: 0.9685


In [11]:


# Task 6: Prediction Example
# Get the label names from the dataset features
label_names = dataset['train'].features['label'].names

# Example text for prediction
example_text = "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France."

# Preprocess the example text
encoded_example = tokenizer(example_text, truncation=True, padding='max_length', max_length=256, return_tensors='pt')
input_ids = encoded_example['input_ids'].to(device)
attention_mask = encoded_example['attention_mask'].to(device)

# Make a prediction
model.eval()
with torch.no_grad():
    logits = model(input_ids, attention_mask)
    _, predicted_class_id = torch.max(logits, dim=1)

# Get the predicted label name
predicted_label = label_names[predicted_class_id.item()]

print(f"\nExample Text: {example_text}")
print(f"Predicted Label: {predicted_label}")


Example Text: The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France.
Predicted Label: Building
