<a href="https://colab.research.google.com/github/georgilos/Bert-for-text-classification/blob/main/Dataset_prep_and_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Proccessing the [https://huggingface.co/datasets/Paul/hatecheck] dataset and training bert-base-uncased on it


In [1]:
#intalling necessary libraries
!pip install torch datasets transformers accelerate -U

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-

##Dataset "Paul/hatecheck"

In [2]:
from datasets import DatasetDict, load_dataset, load_metric
dataset = load_dataset("Paul/hatecheck")
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/4.71k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/652k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/3728 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['functionality', 'case_id', 'test_case', 'label_gold', 'target_ident', 'direction', 'focus_words', 'focus_lemma', 'ref_case_id', 'ref_templ_id', 'templ_id', 'case_templ'],
        num_rows: 3728
    })
})


In [3]:
#Spliting the dataset into training and test sets by assigning 20% of rows to test set
split_dataset = dataset['test'].train_test_split(train_size=0.8)

#Creating a DatasetDict to have both training and validation sets
dataset = DatasetDict({
    'train': split_dataset['train'],
    'test': split_dataset['test']
})

#Structure of the new dataset
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['functionality', 'case_id', 'test_case', 'label_gold', 'target_ident', 'direction', 'focus_words', 'focus_lemma', 'ref_case_id', 'ref_templ_id', 'templ_id', 'case_templ'],
        num_rows: 2982
    })
    test: Dataset({
        features: ['functionality', 'case_id', 'test_case', 'label_gold', 'target_ident', 'direction', 'focus_words', 'focus_lemma', 'ref_case_id', 'ref_templ_id', 'templ_id', 'case_templ'],
        num_rows: 746
    })
})


In [4]:
#Keeping the necessary columns
columns_to_keep = ['test_case', 'label_gold']

#Remove other columns
dataset['train'] = dataset['train'].remove_columns([col for col in dataset['train'].column_names if col not in columns_to_keep])
dataset['test'] = dataset['test'].remove_columns([col for col in dataset['test'].column_names if col not in columns_to_keep])

#Renaming the columns kept
dataset = dataset.rename_column("test_case", "text")
dataset = dataset.rename_column("label_gold", "label")

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 2982
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 746
    })
})


In [5]:
#Mapping the label strings to integers
label_map = {'non-hateful': 0, 'hateful': 1}

#Function to map labels
def map_labels(example):
    example['label'] = label_map[example['label']]
    return example

#Apply label mapping function to both sets
mapped_train = dataset['train'].map(map_labels)
mapped_test = dataset['test'].map(map_labels)

#Replace original datasets with the mapped ones
dataset['train'] = mapped_train
dataset['test'] = mapped_test

Map:   0%|          | 0/2982 [00:00<?, ? examples/s]

Map:   0%|          | 0/746 [00:00<?, ? examples/s]

In [6]:
print(dataset['test'][51])

{'text': 'There have to be consequences for saying you would rape a trans person. ', 'label': 0}


In [7]:
print(dataset['train'][30])

{'text': "I'm going to hug a trans person tomorrow. ", 'label': 0}


In [8]:
#Creating a folder in Google Colab to save the trained model
import os

#Folder name
folder_name = "saved_models"

#Creating the folder if it doesn't already exist
if not os.path.exists(folder_name):
    os.makedirs(folder_name)
    print(f"Folder '{folder_name}' created.")
else:
    print(f"Folder '{folder_name}' already exists.")


Folder 'saved_models' created.


##Training Bert on the dataset

In [15]:
import torch
from transformers import BertTokenizer, TrainingArguments, BertForSequenceClassification, Trainer
import numpy as np
from datasets import load_metric

#Tokenizing the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_datasets = dataset.map(tokenize_function, batched=True)

#PreparING the dataset for training and testing
train_dataset = tokenized_datasets['train']
test_dataset = tokenized_datasets['test']

#Loading the pre-trained BERT model with a classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Step 7: Define compute_metrics function to compute accuracy
metric = load_metric("accuracy")

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return metric.compute(predictions=preds, references=p.label_ids)

#Seting up training arguments with reduced batch size, gradient accumulation, and mixed precision training
training_args = TrainingArguments(
    output_dir='./saved_models',  #Directory to save the model
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    gradient_accumulation_steps=4,  # Accumulate gradients over 4 steps
    fp16=True,  # Enable mixed precision training
)

#Initializing the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

#Training
trainer.train()

#Evaluation
results = trainer.evaluate()
print(results)

# Step 11: Save the trained model and tokenizer
model.save_pretrained('./saved_models')
tokenizer.save_pretrained('./saved_models')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Step,Training Loss
10,0.8105
20,0.7885
30,0.7169
40,0.6999
50,0.626
60,0.5668
70,0.5668
80,0.6061
90,0.5232
100,0.4963


{'eval_loss': 0.02676966041326523, 'eval_accuracy': 0.9946380697050938, 'eval_runtime': 1.6896, 'eval_samples_per_second': 441.532, 'eval_steps_per_second': 55.635, 'epoch': 2.991957104557641}


('./saved_models/tokenizer_config.json',
 './saved_models/special_tokens_map.json',
 './saved_models/vocab.txt',
 './saved_models/added_tokens.json')

In [14]:
#Loading the saved model and tokenizer
model_path = './saved_models'
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

#Function to classify new text
def classify_text(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_id = torch.argmax(logits, dim=-1).item()
    return predicted_class_id

#Classify new text
new_text = "Black people are bad" # <--- Enter text here
predicted_label = classify_text(new_text)
print(f"Predicted label: {predicted_label}")

Predicted label: 1
