<a href="https://colab.research.google.com/github/chineidu/NLP-Tutorial/blob/main/notebook/06_Transformers/07c_sequence_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install rich
!pip install transformers[torch]
!pip install torch datasets evaluate
!pip install seqeval

Collecting transformers[torch]
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[torch])
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers[torch])
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m52.9 MB/s

# Sentence Classification

In [2]:
# Built-in library
import re
import json
import logging
from typing import Any, Dict, List, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import pandas as pd
from rich import print
import torch

# Visualization
import matplotlib.pyplot as plt


# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

warnings.filterwarnings("ignore")

# Black code formatter (Optional)
# %load_ext lab_black

# auto reload imports
# %load_ext autoreload
# %autoreload 2

In [3]:
def set_up_logger(delim: str = "::") -> Any:
    """Request basic INFO-level logging and return this module's logger.

    Params:
    -------
        delim (str, default="::"): Token placed between the level name, the
            timestamp and the message of every log record.

    Returns:
    --------
        The logger registered under this module's ``__name__``.
    """
    record_layout = f"[%(levelname)s] {delim} %(asctime)s {delim} %(message)s"
    logging.basicConfig(format=record_layout, level=logging.INFO)
    return logging.getLogger(__name__)


# Global variable
logger = set_up_logger()


def load_data(*, filename: str, sep: str = ",") -> pd.DataFrame:
    """This is used to load the data.

    NB: Supported formats are 'csv' and 'parquet'. The format is chosen from
    the text after the last '.' in `filename`, compared case-insensitively
    (so 'data.CSV' is read as CSV). Any other extension is read as parquet.

    Params:
    -------
        filename (str): The filepath.
        sep (str, default=","): The separator. e.g ',', '\t', etc.
            Only used when the file is read as CSV.

    Returns:
    --------
        data (pd.DataFrame): The loaded dataframe.
    """
    # Lower-case the extension so '.CSV' / '.Csv' are not sent to the parquet reader.
    extension = filename.rsplit(".", 1)[-1].lower()
    if extension == "csv":
        data = pd.read_csv(filename, sep=sep)
    else:
        data = pd.read_parquet(filename)
    logger.info(f"Shape of data: {data.shape}\n")
    return data

### Prepare Data

```text
Create:
- Training data
- Validation data
- Test data

Labels
------
salary
gambling
loan
airtime
ussdTransactions
flightRisk
savingsAndInvestments
entertainment
spend
  - posSpend
  - atmSpend
  - mobileSpend
  - webSpend
```

In [4]:
from google.colab import drive


drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
fp: str = "/content/drive/MyDrive/My doc/Deep Learning/Data/trans_data.parquet"
N: int = 25_000

df: pd.DataFrame = load_data(filename=fp).rename(columns={"cleaned_labels": "label"})
df = df.sample(n=N, random_state=123).reset_index(drop=True)

df.head()

Unnamed: 0,date,description,amount,type,label
0,2021-12-14,ATM CASH WDL ROU,3000.0,D,spend
1,2022-05-18,PALMPAY LIMITED/Palmpay,16000.0,C,other
2,2022-04-16,TRF/Oloye/FRM OGUNLEYE WALE TO MUSA ISIAKA AYOMIDE- 058,5010.75,D,spendOnTransfers
3,2022-02-08,NEXTGEN,23026.880859,D,other
4,2022-03-16,FGN ELECTRONIC MONEY TRANSFER LEVY,50.0,D,spendOnTransfers


In [8]:
# Rename the labels
# `labels` is the list of distinct label names found in `df["label"]`.
labels: list[str] = list(df["label"].unique())
# id2label: integer class id -> label name (ids follow the order of `labels`).
id2label: dict[int, str] = {idx: key for idx, key in enumerate(labels)}
# label2id: label name -> integer class id (the inverse of `id2label`).
label2id: dict[str, int] = {key: idx for idx, key in id2label.items()}

print(f"label2id: {label2id}")

print(f"id2label: {id2label}")

In [9]:
# Map the labels
# Replaces each label name in `df["label"]` with its integer id from `label2id`.
# NOTE(review): this overwrites the column in place, so the cell is not
# re-runnable — a second run would look the integer ids up in `label2id`
# (which is keyed by label name) and presumably yield NaN. Reload `df` first.
df["label"] = df["label"].map(label2id)

df.head()

Unnamed: 0,date,description,amount,type,label
0,2021-12-14,ATM CASH WDL ROU,3000.0,D,0
1,2022-05-18,PALMPAY LIMITED/Palmpay,16000.0,C,1
2,2022-04-16,TRF/Oloye/FRM OGUNLEYE WALE TO MUSA ISIAKA AYOMIDE- 058,5010.75,D,2
3,2022-02-08,NEXTGEN,23026.880859,D,1
4,2022-03-16,FGN ELECTRONIC MONEY TRANSFER LEVY,50.0,D,2


### Split Data

```text
Split the data into:
- Training data
- Validation data
- Test data
```

In [10]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict


RANDOM_STATE: int = 123
TEST_SIZE: float = 0.08

X_t, X_test = train_test_split(df, test_size=TEST_SIZE, random_state=RANDOM_STATE)

X_t.shape, X_test.shape

((23000, 5), (2000, 5))

In [11]:
X_train, X_validation = train_test_split(
    X_t, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
X_train.shape, X_validation.shape, X_test.shape

((21160, 5), (1840, 5), (2000, 5))

In [12]:
# Create dataset objects
# Each split is a single `Dataset` built from its dataframe; the splits are
# only combined into a `DatasetDict` in a later cell.
train_dataset: Dataset = Dataset.from_pandas(df=X_train)
validation_dataset: Dataset = Dataset.from_pandas(df=X_validation)
test_dataset: Dataset = Dataset.from_pandas(df=X_test)

train_dataset

Dataset({
    features: ['date', 'description', 'amount', 'type', 'label', '__index_level_0__'],
    num_rows: 21160
})

In [13]:
# Create a dataset dict object
dataset: DatasetDict = DatasetDict(
    {"train": train_dataset, "validation": validation_dataset, "test": test_dataset}
)

dataset

DatasetDict({
    train: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', '__index_level_0__'],
        num_rows: 21160
    })
    validation: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', '__index_level_0__'],
        num_rows: 1840
    })
    test: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', '__index_level_0__'],
        num_rows: 2000
    })
})

In [14]:
# Remove column
dataset = dataset.remove_columns(["__index_level_0__"])
dataset

DatasetDict({
    train: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label'],
        num_rows: 21160
    })
    validation: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label'],
        num_rows: 1840
    })
    test: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label'],
        num_rows: 2000
    })
})

In [15]:
print(dataset.get("train")[0])

print(dataset.get("validation")[0])

print(dataset.get("test")[0])

In [16]:
def lower_case_slower(example: dict[str, Any]) -> dict[str, Any]:
    """Lowercase the description of a single example.

    This variant handles one record per call, which is why it is the MUCH
    SLOWER alternative to the batched `lower_case`.
    """
    text = example.get("description")
    lowered = text.lower()
    return {"description": lowered}


def lower_case(example: dict[str, Any]) -> dict[str, Any]:
    """Lowercase every description in a batch of examples.

    `example["description"]` is iterated, so it is expected to hold one text
    per row of the batch.
    """
    lowered = []
    for text in example.get("description"):
        lowered.append(text.lower())
    return {"description": lowered}


def add_description_length(example: dict[str, Any]) -> dict[str, Any]:
    """Compute the length of every description in a batch.

    Returns a dict with a single `description_length` entry holding one
    `len()` value per description, in the same order.
    """
    lengths = []
    for text in example.get("description"):
        lengths.append(len(text))
    return {"description_length": lengths}

In [17]:
# Much slower
# dataset_1 = dataset.map(lower_case_slower)
# dataset_1

In [18]:
# Much faster!
dataset_1 = dataset.map(lower_case, batched=True)
dataset_1

Map:   0%|          | 0/21160 [00:00<?, ? examples/s]

Map:   0%|          | 0/1840 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label'],
        num_rows: 21160
    })
    validation: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label'],
        num_rows: 1840
    })
    test: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label'],
        num_rows: 2000
    })
})

In [19]:
dataset_1 = dataset_1.map(add_description_length, batched=True)
dataset_1

Map:   0%|          | 0/21160 [00:00<?, ? examples/s]

Map:   0%|          | 0/1840 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length'],
        num_rows: 21160
    })
    validation: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length'],
        num_rows: 1840
    })
    test: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length'],
        num_rows: 2000
    })
})

In [20]:
print(dataset_1.get("train")[0])

print(dataset_1.get("validation")[0])

print(dataset_1.get("test")[0])

In [21]:
# Drop examples whose description is shorter than THRESHOLD characters
# (`description_length` holds the `len()` of each description).
THRESHOLD: int = 10
dataset_1 = dataset_1.filter(lambda x: x.get("description_length") >= THRESHOLD)
dataset_1

Filter:   0%|          | 0/21160 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1840 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length'],
        num_rows: 20398
    })
    validation: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length'],
        num_rows: 1774
    })
    test: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length'],
        num_rows: 1939
    })
})

In [22]:
print(dataset_1.get("validation")[0])

In [23]:
def concat_data(example: dict[str, Any]) -> dict[str, Any]:
    """Build a `body` column by joining date, type, amount and description.

    Works on a batch: the four columns are zipped together, and each row is
    rendered as its values separated by " \\n " in that order.
    """
    rows = zip(
        example.get("date"),
        example.get("type"),
        example.get("amount"),
        example.get("description"),
    )
    body: list[str] = []
    # `kind` rather than `type`, so the builtin is not shadowed inside the loop.
    for date, kind, amount, description in rows:
        body.append(f"{date} \n {kind} \n {amount} \n {description}")
    return {"body": body}

In [24]:
dataset_1 = dataset_1.map(concat_data, batched=True)
dataset_1

Map:   0%|          | 0/20398 [00:00<?, ? examples/s]

Map:   0%|          | 0/1774 [00:00<?, ? examples/s]

Map:   0%|          | 0/1939 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length', 'body'],
        num_rows: 20398
    })
    validation: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length', 'body'],
        num_rows: 1774
    })
    test: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length', 'body'],
        num_rows: 1939
    })
})

In [25]:
print(dataset_1.get("train")[0])

### Tokenize Data

In [26]:
from transformers import AutoTokenizer


MODEL_CHECKPOINT: str = "bert-base-cased"
tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)


def tokenize_function(examples: dict[str, Any]) -> dict[str, Any]:
    """Tokenize the `body` text of a batch with the module-level tokenizer.

    Truncation is enabled and the tokenizer's result is returned unchanged
    (a dict containing the input_ids, token_type_ids and attention_mask).
    """
    texts = examples.get("body")
    return tokenizer(texts, truncation=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [27]:
FEATURES: list[str] = [
    "date",
    "description",
    "amount",
    "type",
    "description_length",
    "body",
]

tokenized_dataset: DatasetDict = dataset_1.map(
    tokenize_function,
    batched=True,
    remove_columns=FEATURES,
)
tokenized_dataset

Map:   0%|          | 0/20398 [00:00<?, ? examples/s]

Map:   0%|          | 0/1774 [00:00<?, ? examples/s]

Map:   0%|          | 0/1939 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 20398
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1774
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1939
    })
})

In [28]:
print(tokenized_dataset.get("train")[0])

In [30]:
# Number of unique labels
N_LABELS: int = len(tokenized_dataset.get("train").unique("label"))

N_LABELS

11

In [31]:
from transformers import DataCollatorWithPadding


data_collator: DataCollatorWithPadding = DataCollatorWithPadding(tokenizer=tokenizer)

In [32]:
# Apply the data collator
# Collates the first two training examples into one batch. The result is read
# as a mapping (`batch.get("input_ids")`) further down, so it is annotated as
# a dict rather than a bare tensor.
batch: dict[str, Any] = data_collator([tokenized_dataset["train"][i] for i in range(2)])

print(batch)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [33]:
# The 2nd idx has been padded with 0s
for row in batch.get("input_ids"):
    print(row)

In [54]:
from sklearn.metrics import recall_score, accuracy_score
from datasets import load_metric


metric = load_metric("seqeval")


def compute_metrics(eval_preds: tuple) -> dict[str, Any]:
    """This is used to calculate the evaluation metrics.

    Each example carries exactly one class id, so the scores are computed
    directly from the id arrays. (seqeval scores sequences of string tags for
    token classification; it is not suited to a flat array of integer class
    ids, which is what sequence classification produces.)

    Params:
    -------
        eval_preds (tuple): A `(logits, labels)` pair. The predicted class of
            each example is the argmax of `logits` over its last axis.

    Returns:
    --------
        dict[str, Any]:
            - "accuracy": fraction of examples whose predicted id equals the label.
            - "recall": macro recall, i.e. the unweighted mean of the per-class
              recall over the classes that occur in `labels`.
    """
    logits, labels = eval_preds
    labels = np.asarray(labels)

    if labels.size == 0:
        # Nothing to score; avoid the NaN that a mean over an empty array gives.
        return {"accuracy": 0.0, "recall": 0.0}

    predictions = np.argmax(logits, axis=-1)
    hits = predictions == labels
    accuracy = float(hits.mean())

    # Recall of a class = share of that class's examples that were predicted correctly.
    per_class_recall = [
        float(hits[labels == class_id].mean()) for class_id in np.unique(labels)
    ]
    recall = float(np.mean(per_class_recall))

    return {"accuracy": accuracy, "recall": recall}

In [52]:
# Sanity-check the metric on a hand-made example built from the unique label ids.
# NOTE(review): this rebinds `labels`, which an earlier cell used for the list
# of label names — confirm nothing later relies on the old value.
labels: list[int] = tokenized_dataset.get("train").unique("label")
predictions: list[int] = labels.copy()

# Simulate prediction
# Overwrite three positions so that some predictions may differ from the references.
predictions[3] = 8
predictions[4] = 0
predictions[5] = 0
# NOTE(review): `metric` is the seqeval metric loaded in the previous cell;
# seqeval is intended for sequences of string tags — confirm that integer
# class ids are meaningful input here.
print(metric.compute(predictions=[predictions], references=[labels]))

### Define Model

In [59]:
from transformers import AutoModelForSequenceClassification


# Pass the label mappings to AutoModelForSequenceClassification.from_pretrained()
# so that they are set in the model's configuration and then properly saved and
# uploaded to the Hub; predictions can then be decoded back to label names.
model: AutoModelForSequenceClassification = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=N_LABELS,
    id2label=id2label,
    label2id=label2id,
)

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
from huggingface_hub import notebook_login

# Login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [63]:
from transformers import TrainingArguments


# Training hyper-parameters.
# NOTE(review): the output directory says "ner", but this notebook fine-tunes a
# sequence classifier; with `push_to_hub=True` this name is presumably also
# used for the Hub repository — confirm it is intended.
OUTPUT_DIR: str = "bert-finetuned-ner"
# The same strategy is used for evaluation and for checkpoint saving.
STRATEGY: str = "epoch"
LEARNING_RATE: float = 2e-5
NUM_EPOCHS: int = 3
WEIGHT_DECAY: float = 0.01


args: TrainingArguments = TrainingArguments(
    OUTPUT_DIR,
    evaluation_strategy=STRATEGY,
    save_strategy=STRATEGY,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    # NOTE(review): batch size is an inline literal, unlike the other settings above.
    per_device_train_batch_size= 16,
    weight_decay=WEIGHT_DECAY,
    push_to_hub=True,
)

In [None]:
from transformers import Trainer


# Train the model!
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset.get("train"),
    eval_dataset=tokenized_dataset.get("validation"),
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

Epoch,Training Loss,Validation Loss


0.625