In [1]:
!pip install transformers[torch] datasets scikit-learn
!pip install accelerate -U

import torch
import transformers
from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support



In [2]:
# Fetch the political-ideologies dataset and peek at one training record
# to see which fields each example carries.
dataset = load_dataset("JyotiNayak/political_ideologies")
first_example = dataset['train'][0]
print(first_example)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


{'statement': "Climate change, and the escalating environmental degradation we witness daily, is an urgent issue that requires immediate attention and collective effort. Renewable energy sources offer a sustainable and environment-friendly alternative that can significantly reduce our carbon footprint. It's also crucial to invest in and enforce policies that encourage recycling, conservation, and sustainable practices.", 'label': 1, 'issue_type': 1, '__index_level_0__': 465}


In [3]:
train_size = 0.8
validation_size = 0.1
test_size = 0.1

# Guard against editing one fraction and forgetting the others
# (tolerance absorbs floating-point rounding in the sum).
assert abs(train_size + validation_size + test_size - 1.0) < 1e-9, "split fractions must sum to 1"

# First carve off the test set, then take the validation set out of what is left.
# The second fraction is rescaled because it applies to the remaining
# (1 - test_size) share of the data, not to the full dataset.
# Splits are read by key instead of relying on the ordering of `.values()`.
holdout_split = dataset["train"].train_test_split(test_size=test_size, seed=42)
train_val_ds, test_ds = holdout_split["train"], holdout_split["test"]

validation_split = train_val_ds.train_test_split(test_size=validation_size / (1 - test_size), seed=42)
train_ds, val_ds = validation_split["train"], validation_split["test"]

# Create a new DatasetDict
split_ds = DatasetDict({
    'train': train_ds,
    'validation': val_ds,
    'test': test_ds
})

# Every original example must land in exactly one of the three splits.
assert len(train_ds) + len(val_ds) + len(test_ds) == len(dataset["train"]), "split sizes do not add up"

# Check the sizes of the splits
print(f"Train size: {len(split_ds['train'])}")
print(f"Validation size: {len(split_ds['validation'])}")
print(f"Test size: {len(split_ds['test'])}")

Train size: 2048
Validation size: 256
Test size: 256


In [5]:
checkpoint = "bert-base-uncased"  # You can also use another model like "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# The checkpoint has no classification head, so `classifier.weight` and
# `classifier.bias` are freshly (randomly) initialised when the model is built.
# Seed the RNGs first so those starting weights are the same on every run.
transformers.set_seed(42)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)  # Set `num_labels` to 2 for the ideologies


# Example preprocessing function
def preprocess_function(examples):
    """Tokenize the ``statement`` column of a batch of examples.

    Args:
        examples: A batch as passed by ``Dataset.map(..., batched=True)``;
            must contain a ``'statement'`` entry holding the texts.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts,
        truncated to at most 128 tokens per statement.

    Padding is deliberately NOT requested here. Padding inside ``map`` would
    pad every row to the longest statement in the whole map batch and store
    those pad tokens in the dataset; padding is instead left to the data
    collator, which pads each training batch only as far as it needs.
    """
    return tokenizer(examples['statement'], truncation=True, max_length=128)

# Collator that pads each batch dynamically at the moment it is assembled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Run the preprocessing function over every split, one batch of rows at a time
tokenized_datasets = split_ds.map(preprocess_function, batched=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/256 [00:00<?, ? examples/s]

In [6]:
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# The install above is unpinned, so use whichever keyword this version exposes
# instead of hard-coding one that may be rejected.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',  # directory for storing logs
    logging_steps=10,  # log every 10 steps
    **{eval_strategy_arg: "epoch"},  # run evaluation at the end of every epoch
)



In [7]:
def compute_metrics(pred):
    """Turn a prediction object into accuracy / F1 / precision / recall.

    Args:
        pred: Object exposing ``predictions`` (an array of per-class scores,
            or a tuple whose first element is that array) and ``label_ids``
            (the reference labels).

    Returns:
        dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``;
        the last three are computed with ``average='weighted'``.
    """
    scores = pred.predictions
    if isinstance(scores, tuple):
        # Only the first element of a tuple output is used for scoring.
        scores = scores[0]

    gold = pred.label_ids
    guesses = scores.argmax(-1)  # highest-scoring class along the last axis

    acc = accuracy_score(gold, guesses)
    prec, rec, f_score, _support = precision_recall_fscore_support(gold, guesses, average='weighted')

    metrics = {}
    metrics['accuracy'] = acc
    metrics['f1'] = f_score
    metrics['precision'] = prec
    metrics['recall'] = rec
    return metrics

In [8]:
# Collect everything the Trainer needs in one place, then build it
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Fine-tune; the returned TrainOutput is displayed as the cell's result
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.0656,0.058827,0.980469,0.980467,0.980495,0.980469
2,0.2197,0.083891,0.980469,0.980471,0.980743,0.980469
3,0.0637,0.089478,0.980469,0.98047,0.980502,0.980469


TrainOutput(global_step=768, training_loss=0.12066787326451352, metrics={'train_runtime': 168.6822, 'train_samples_per_second': 36.424, 'train_steps_per_second': 4.553, 'total_flos': 404027581056480.0, 'train_loss': 0.12066787326451352, 'epoch': 3.0})

In [11]:
# Headline numbers: score the fine-tuned model on the complete test split.
results = trainer.evaluate(tokenized_datasets["test"])
print("Test results:", results)

# Spot check: predicted vs. reference labels for the first 30 test rows.
example_batch = tokenized_datasets["test"].select(range(30))
predictions = trainer.predict(test_dataset=example_batch)
predicted_labels = predictions.predictions.argmax(axis=1)  # most likely class per row
true_labels = example_batch["label"]

print("Predicted labels:", predicted_labels)
print("True labels:", true_labels)

Test results: {'eval_loss': 0.1481233537197113, 'eval_accuracy': 0.96875, 'eval_f1': 0.9687077261267911, 'eval_precision': 0.9691431544865865, 'eval_recall': 0.96875, 'eval_runtime': 2.3111, 'eval_samples_per_second': 110.768, 'eval_steps_per_second': 13.846, 'epoch': 3.0}
Predicted labels: [0 1 1 1 1 0 0 0 1 1 0 0 0 0 0 0 1 1 0 0 0 1 1 1 0 1 1 1 1 1]
True labels: [0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1]
