# Introduction
Source code for exercise 3 - Fine-tuning Language Model

## Set up

In [1]:
!pip install transformers datasets torch matplotlib seaborn



In [None]:
from datasets import load_dataset
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer

In [None]:
# Load dataset
dataset = load_dataset('ag_news')

## Description of dataset

In [21]:
# 1. Check the structure (Splits)
print(dataset)

# 2. Check the class names
features = dataset['train'].features
print(f"Classes: {features['label'].names}")

# 3. Look at a sample structure
print(f"Sample text: {dataset['train'][0]['text']}")
print(f"Sample label: {dataset['train'][0]['label']} ({features['label'].names[dataset['train'][0]['label']]})")

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})
Classes: ['World', 'Sports', 'Business', 'Sci/Tech']
Sample text: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
Sample label: 2 (Business)


## 3.2 Stats and example

In [33]:
class_names = dataset['train'].features['label'].names

train_df = pd.DataFrame(dataset['train'])
test_df = pd.DataFrame(dataset['test'])

def get_stats(df):
    # Calculate samples per class
    class_counts = df.value_counts('label')

    dist = {class_names[i]: int(class_counts[i]) for i in range(len(class_names))}

    # Calculate text length statistics (number of characters)
    df['text_len'] = df['text'].apply(len)
    avg_len = df['text_len'].mean()

    return dist, avg_len

# 2. Generate Statistics
train_dist, train_avg_len = get_stats(train_df)
test_dist, test_avg_len = get_stats(test_df)

print("--- Summary Statistics ---")
print(f"Total Training Samples: {len(dataset['train'])}")
print(f"Total Testing Samples: {len(dataset['test'])}")
print(f"\nTraining Class Distribution: {train_dist}")
print(f"Testing Class Distribution: {test_dist}")
print(f"\nAvg. Text Length (Train): {train_avg_len:.2f} characters")

# 3. Representative Examples
print("\n--- Representative Examples ---")
for i in range(len(class_names)):
  example = train_df[train_df['label'] == i].iloc[0]['text']
  print(f'{class_names[i]}: {example}')



--- Summary Statistics ---
Total Training Samples: 120000
Total Testing Samples: 7600

Training Class Distribution: {'World': 30000, 'Sports': 30000, 'Business': 30000, 'Sci/Tech': 30000}
Testing Class Distribution: {'World': 1900, 'Sports': 1900, 'Business': 1900, 'Sci/Tech': 1900}

Avg. Text Length (Train): 236.48 characters

--- Representative Examples ---
World: Venezuelans Vote Early in Referendum on Chavez Rule (Reuters) Reuters - Venezuelans turned out early\and in large numbers on Sunday to vote in a historic referendum\that will either remove left-wing President Hugo Chavez from\office or give him a new mandate to govern for the next two\years.
Sports: Phelps, Thorpe Advance in 200 Freestyle (AP) AP - Michael Phelps took care of qualifying for the Olympic 200-meter freestyle semifinals Sunday, and then found out he had been added to the American team for the evening's 400 freestyle relay final. Phelps' rivals Ian Thorpe and Pieter van den Hoogenband and teammate Klete Keller w

## Fine tuning

In [35]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer
from datasets import load_dataset

# Load dataset
dataset = load_dataset('ag_news')

# Load model and tokenizer
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Preprocess data
def preprocess_function(examples):
    result = tokenizer(examples['text'], truncation=True, padding=True)
    result["labels"] = examples["label"]
    return result

encoded_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names # This removes 'text' and 'label'
)

# 5. Set the format for PyTorch
encoded_dataset.set_format("torch")

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"]
)

# Start training
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

RuntimeError: stack expects each tensor to be equal size, but got [175] at entry 0 and [194] at entry 1

In [None]:
from transformers import BertTokenizer, BertModel
import torch

model = BertModel.from_pretrained("bert-base-uncased", output_attentions=True)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Example input
inputs = tokenizer("The movie was fantastic!", return_tensors="pt")

# Get attentions
outputs = model(**inputs)
attentions = outputs.attentions # List of attention maps for each layer