<a href="https://colab.research.google.com/github/chineidu/NLP-Tutorial/blob/main/notebook/06_Transformers/07c_sequence_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install rich
!pip install transformers[torch]
!pip install torch datasets evaluate
!pip install seqeval



# Sequence Classification

In [2]:
# Built-in library
import re
import json
import logging
from typing import Any, Dict, List, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import pandas as pd
from rich import print
import torch

# Visualization
import matplotlib.pyplot as plt


# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

warnings.filterwarnings("ignore")

# Black code formatter (Optional)
# %load_ext lab_black

# auto reload imports
# %load_ext autoreload
# %autoreload 2

In [3]:
def set_up_logger(delim: str = "::") -> logging.Logger:
    """Configure root logging at INFO level and return this module's logger.

    Params:
    -------
        delim (str, default="::"): Separator placed between the level name,
            the timestamp and the message of every log record.

    Returns:
    --------
        logger (logging.Logger): The logger named after the current module.
    """
    format_ = f"[%(levelname)s] {delim} %(asctime)s {delim} %(message)s"
    # `basicConfig` silently does nothing when the root logger already has
    # handlers; `force=True` replaces them so the level and format requested
    # here are always applied.
    logging.basicConfig(level=logging.INFO, format=format_, force=True)
    return logging.getLogger(__name__)


# Global variable
logger = set_up_logger()


def load_data(*, filename: str, sep: str = ",") -> pd.DataFrame:
    """Load a tabular file into a DataFrame and log its shape.

    NB: Supported formats are 'csv' and 'parquet'. Anything that does not end
    in '.csv' (compared case-insensitively) is handed to the parquet reader.

    Params:
    -------
        filename (str): The filepath. A `pathlib.Path` is accepted as well.
        sep (str, default=","): The separator. e.g ',', '\t', etc.
            Only used when reading a csv file.

    Returns:
    --------
        data (pd.DataFrame): The loaded dataframe.
    """
    from pathlib import Path

    # Case-insensitive extension check so e.g. 'DATA.CSV' is not sent to the
    # parquet reader by mistake.
    extension = Path(filename).suffix.lower()
    data = (
        pd.read_csv(filename, sep=sep)
        if extension == ".csv"
        else pd.read_parquet(filename)
    )
    logger.info(f"Shape of data: {data.shape}\n")
    return data

### Prepare Data

```text
Create:
- Training data
- Validation data
- Test data

Labels
------
salary
gambling
loan
airtime
ussdTransactions
flightRisk
savingsAndInvestments
entertainment
spend
  - posSpend
  - atmSpend
  - mobileSpend
  - webSpend
```

In [4]:
from google.colab import drive


drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Absolute Google Drive path: only resolvable after the drive has been mounted
# at /content/drive by the previous cell.
fp: str = "/content/drive/MyDrive/My doc/Deep Learning/Data/trans_data.parquet"
# Number of rows to sample from the full file.
N: int = 25_000

# The source column `cleaned_labels` is renamed to `label` for the rest of the notebook.
df: pd.DataFrame = load_data(filename=fp).rename(columns={"cleaned_labels": "label"})
# Fixed seed so the same N rows are drawn on every run; the index is rebuilt from 0.
df = df.sample(n=N, random_state=123).reset_index(drop=True)

df.head()

Unnamed: 0,date,description,amount,type,label
0,2021-12-14,ATM CASH WDL ROU,3000.0,D,spend
1,2022-05-18,PALMPAY LIMITED/Palmpay,16000.0,C,other
2,2022-04-16,TRF/Oloye/FRM OGUNLEYE WALE TO MUSA ISIAKA AYOMIDE- 058,5010.75,D,spendOnTransfers
3,2022-02-08,NEXTGEN,23026.880859,D,other
4,2022-03-16,FGN ELECTRONIC MONEY TRANSFER LEVY,50.0,D,spendOnTransfers


In [6]:
# Rename the labels
# `labels` is a list of the distinct label names found in `df`; the integer ids
# below are simply their positions in that list.
labels: list[str] = list(df["label"].unique())
id2label: dict[int, str] = {idx: key for idx, key in enumerate(labels)}
# Invert id2label: while iterating its items, `idx` is the integer id and
# `key` is the label name, so the result maps name -> id.
label2id: dict[str, int] = {key: idx for idx, key in id2label.items()}

print(f"label2id: {label2id}")

print(f"id2label: {id2label}")

In [7]:
# Map the labels
# Only map while the column still holds label names: the integer ids are not
# keys of `label2id`, so re-running an unguarded `.map` would turn every value
# into NaN.
if not pd.api.types.is_integer_dtype(df["label"]):
    df["label"] = df["label"].map(label2id)
assert df["label"].notna().all(), "Found labels that are missing from label2id"

df.head()

Unnamed: 0,date,description,amount,type,label
0,2021-12-14,ATM CASH WDL ROU,3000.0,D,0
1,2022-05-18,PALMPAY LIMITED/Palmpay,16000.0,C,1
2,2022-04-16,TRF/Oloye/FRM OGUNLEYE WALE TO MUSA ISIAKA AYOMIDE- 058,5010.75,D,2
3,2022-02-08,NEXTGEN,23026.880859,D,1
4,2022-03-16,FGN ELECTRONIC MONEY TRANSFER LEVY,50.0,D,2


### Split Data

```text
Split the data into:
- Training data
- Validation data
- Test data
```

In [8]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict


# Seed and hold-out fraction shared by both splits in this section.
RANDOM_STATE: int = 123
TEST_SIZE: float = 0.08

# First split: hold out TEST_SIZE of `df` as the test set; `X_t` is the
# remainder, which is split again into train / validation in the next cell.
X_t, X_test = train_test_split(df, test_size=TEST_SIZE, random_state=RANDOM_STATE)

X_t.shape, X_test.shape

((23000, 5), (2000, 5))

In [9]:
# Second split: the validation set is TEST_SIZE of `X_t` (the data left after
# removing the test set), not TEST_SIZE of the full `df`.
X_train, X_validation = train_test_split(
    X_t, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
X_train.shape, X_validation.shape, X_test.shape

((21160, 5), (1840, 5), (2000, 5))

In [10]:
# Create dataset objects
# Each split is a single `Dataset` built from its pandas frame; the three are
# grouped into one `DatasetDict` afterwards.
train_dataset: Dataset = Dataset.from_pandas(df=X_train)
validation_dataset: Dataset = Dataset.from_pandas(df=X_validation)
test_dataset: Dataset = Dataset.from_pandas(df=X_test)

train_dataset

Dataset({
    features: ['date', 'description', 'amount', 'type', 'label', '__index_level_0__'],
    num_rows: 21160
})

In [11]:
# Create a dataset dict object
dataset: DatasetDict = DatasetDict(
    {"train": train_dataset, "validation": validation_dataset, "test": test_dataset}
)

dataset

DatasetDict({
    train: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', '__index_level_0__'],
        num_rows: 21160
    })
    validation: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', '__index_level_0__'],
        num_rows: 1840
    })
    test: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', '__index_level_0__'],
        num_rows: 2000
    })
})

In [12]:
# Remove column
# `__index_level_0__` appears among the features once the frames are converted
# with `Dataset.from_pandas`; it is not a model input, so drop it from every split.
dataset = dataset.remove_columns(["__index_level_0__"])
dataset

DatasetDict({
    train: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label'],
        num_rows: 21160
    })
    validation: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label'],
        num_rows: 1840
    })
    test: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label'],
        num_rows: 2000
    })
})

In [13]:
print(dataset.get("train")[0])

print(dataset.get("validation")[0])

print(dataset.get("test")[0])

In [14]:
def lower_case_slower(example: dict[str, Any]) -> dict[str, Any]:
    """Lowercase the `description` of a single example.

    This is the non-batched variant (one row per call), which the notebook
    marks as MUCH SLOWER than `lower_case`.
    """
    description = example.get("description")
    return {"description": description.lower()}


def lower_case(example: dict[str, Any]) -> dict[str, Any]:
    """Lowercase every `description` in a batch.

    `example` holds a list of values per column; the returned dict carries the
    lowercased descriptions in the same order.
    """
    lowered = []
    for text in example.get("description"):
        lowered.append(text.lower())
    return {"description": lowered}


def add_description_length(example: dict[str, Any]) -> dict[str, Any]:
    """Compute the length of every `description` in a batch.

    Returns a dict with a single `description_length` column holding one
    length per row.
    """
    lengths = list(map(len, example.get("description")))
    return {"description_length": lengths}

In [15]:
# Much slower
# dataset_1 = dataset.map(lower_case_slower)
# dataset_1

In [16]:
# Much faster!
dataset_1 = dataset.map(lower_case, batched=True)
dataset_1

Map:   0%|          | 0/21160 [00:00<?, ? examples/s]

Map:   0%|          | 0/1840 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label'],
        num_rows: 21160
    })
    validation: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label'],
        num_rows: 1840
    })
    test: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label'],
        num_rows: 2000
    })
})

In [17]:
dataset_1 = dataset_1.map(add_description_length, batched=True)
dataset_1

Map:   0%|          | 0/21160 [00:00<?, ? examples/s]

Map:   0%|          | 0/1840 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length'],
        num_rows: 21160
    })
    validation: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length'],
        num_rows: 1840
    })
    test: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length'],
        num_rows: 2000
    })
})

In [18]:
print(dataset_1.get("train")[0])

print(dataset_1.get("validation")[0])

print(dataset_1.get("test")[0])

In [19]:
# Drop rows whose description is shorter than THRESHOLD characters
THRESHOLD: int = 10
# Batched predicate: it receives a list of lengths and returns one boolean per
# row, in line with the batched `map` calls used for the other transforms.
dataset_1 = dataset_1.filter(
    lambda batch: [length >= THRESHOLD for length in batch.get("description_length")],
    batched=True,
)
dataset_1

Filter:   0%|          | 0/21160 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1840 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length'],
        num_rows: 20398
    })
    validation: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length'],
        num_rows: 1774
    })
    test: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length'],
        num_rows: 1939
    })
})

In [20]:
print(dataset_1.get("validation")[0])

In [21]:
def concat_data(example: dict[str, Any]) -> dict[str, Any]:
    """Build the `body` text column for a batch of rows.

    For every row the date, type, amount and description are joined, in that
    order, with " \\n " as the separator.
    """
    bodies: list[str] = []
    rows = zip(
        example.get("date"),
        example.get("type"),
        example.get("amount"),
        example.get("description"),
    )
    for txn_date, txn_type, txn_amount, txn_text in rows:
        bodies.append(f"{txn_date} \n {txn_type} \n {txn_amount} \n {txn_text}")
    return {"body": bodies}

In [22]:
dataset_1 = dataset_1.map(concat_data, batched=True)
dataset_1

Map:   0%|          | 0/20398 [00:00<?, ? examples/s]

Map:   0%|          | 0/1774 [00:00<?, ? examples/s]

Map:   0%|          | 0/1939 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length', 'body'],
        num_rows: 20398
    })
    validation: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length', 'body'],
        num_rows: 1774
    })
    test: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length', 'body'],
        num_rows: 1939
    })
})

In [23]:
print(dataset_1.get("train")[0])

### Tokenize Data

In [24]:
from transformers import AutoTokenizer


# NOTE(review): the descriptions are lowercased earlier in the notebook, yet a
# *cased* checkpoint is loaded here — confirm this combination is intended.
MODEL_CHECKPOINT: str = "bert-base-cased"
tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)


def tokenize_function(examples: dict[str, Any]) -> dict[str, Any]:
    """Tokenize the `body` column of a batch with the module-level tokenizer.

    Truncation is enabled. The result is a dict containing the input_ids,
    token_type_ids and attention_mask.
    """
    texts = examples.get("body")
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [25]:
# Raw columns that are no longer needed once the text has been tokenized.
# `label` is deliberately absent from this list so it is kept.
FEATURES: list[str] = [
    "date",
    "description",
    "amount",
    "type",
    "description_length",
    "body",
]

# Tokenize every split in batches and drop the raw columns in the same pass,
# leaving `label` plus the tokenizer outputs.
tokenized_dataset: DatasetDict = dataset_1.map(
    tokenize_function,
    batched=True,
    remove_columns=FEATURES,
)
tokenized_dataset

Map:   0%|          | 0/20398 [00:00<?, ? examples/s]

Map:   0%|          | 0/1774 [00:00<?, ? examples/s]

Map:   0%|          | 0/1939 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 20398
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1774
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1939
    })
})

In [26]:
print(tokenized_dataset.get("train")[0])

In [27]:
# Number of unique labels
# Size the classification head from the full label map rather than from the
# labels that happen to remain in the filtered training split, so it always
# agrees with the id2label / label2id maps passed to the model.
N_LABELS: int = len(id2label)

N_LABELS

11

In [28]:
from transformers import DataCollatorWithPadding


data_collator: DataCollatorWithPadding = DataCollatorWithPadding(tokenizer=tokenizer)

In [29]:
# Apply the data collator
# The collator returns a dict-like batch (it is queried with
# `.get("input_ids")` in the next cell), not a single tensor.
batch: Any = data_collator([tokenized_dataset["train"][i] for i in range(2)])

print(batch)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [30]:
# The 2nd idx has been padded with 0s
for row in batch.get("input_ids"):
    print(row)

In [31]:
from sklearn.metrics import recall_score, accuracy_score
import evaluate


# `datasets.load_metric` has been deprecated in favour of the `evaluate`
# package (installed in the first cell), so load the metric through it.
metric = evaluate.load("seqeval")


def compute_metrics(eval_preds: tuple) -> dict[str, Any]:
    """Compute the accuracy for a `(logits, labels)` pair.

    The predicted class of each row is the argmax of its logits over the last
    axis; accuracy is the fraction of rows where it equals the label.
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)
    is_correct = predicted_ids == labels
    return {"accuracy": np.mean(is_correct)}

In [32]:
# Sanity-check `compute_metrics` on simulated predictions. Dedicated names are
# used so the `labels` list of class names defined earlier is not overwritten.
true_ids: list[int] = tokenized_dataset.get("train").unique("label")
simulated_ids: list[int] = true_ids.copy()

# Simulate prediction
simulated_ids[3] = 8
simulated_ids[4] = 0
simulated_ids[5] = 0

# Build one-hot "logits" whose argmax is the simulated class id. seqeval scores
# entity spans in tagged sequences, so it is not meaningful for plain class
# ids; the accuracy used during training is checked instead.
n_classes: int = max(true_ids + simulated_ids) + 1
simulated_logits = np.eye(n_classes)[simulated_ids]
print(compute_metrics((simulated_logits, np.array(true_ids))))

### Define Model

In [47]:
from transformers import AutoModelForSequenceClassification


# Pass the label maps (id2label / label2id) to
# AutoModelForSequenceClassification.from_pretrained(); they are stored in the
# model's configuration and are therefore saved and uploaded to the Hub with it.
model: AutoModelForSequenceClassification = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT, num_labels=N_LABELS, id2label=id2label, label2id=label2id
)

In [34]:
from huggingface_hub import notebook_login

# Login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [48]:
from transformers import TrainingArguments


# Training hyper-parameters.
# OUTPUT_DIR is passed as the first TrainingArguments argument; the same name
# shows up as the Hub repository in the URL returned by `push_to_hub` later on.
OUTPUT_DIR: str = "bert-finetuned-sequence-classification"
# One strategy is used for both evaluation and checkpoint saving.
STRATEGY: str = "epoch"
LEARNING_RATE: float = 2e-5
NUM_EPOCHS: int = 3
WEIGHT_DECAY: float = 0.01


args: TrainingArguments = TrainingArguments(
    OUTPUT_DIR,
    evaluation_strategy=STRATEGY,
    save_strategy=STRATEGY,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    # The only hyper-parameter given inline instead of as a named constant.
    per_device_train_batch_size= 16,
    weight_decay=WEIGHT_DECAY,
    # Requires the Hub login performed in the previous cell.
    push_to_hub=True,
)

In [None]:
from transformers import Trainer


# Train the model!
# Trains on the tokenized train split and evaluates on the validation split;
# `compute_metrics` reports accuracy, and the collator pads each batch.
# The test split is not used here.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset.get("train"),
    eval_dataset=tokenized_dataset.get("validation"),
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

Epoch,Training Loss,Validation Loss


In [37]:
trainer.push_to_hub(commit_message="Training complete")

'https://huggingface.co/chineidu/bert-finetuned-sequence-classification/tree/main/'

In [38]:
from transformers import Pipeline, pipeline


TASK: str = "text-classification"
# Use a dedicated name for the fine-tuned Hub repository. Reassigning
# MODEL_CHECKPOINT here would make a later re-run of the model-definition cell
# start from the fine-tuned weights instead of the base checkpoint.
HUB_MODEL_ID: str = "chineidu/bert-finetuned-sequence-classification"
# `pipeline` is a factory function; the object it returns is a `Pipeline`.
sequence_classifier: Pipeline = pipeline(task=TASK, model=HUB_MODEL_ID)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [40]:
dir(sequence_classifier)

['__abstractmethods__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_batch_size',
 '_ensure_tensor_on_device',
 '_forward',
 '_forward_params',
 '_num_workers',
 '_postprocess_params',
 '_preprocess_params',
 '_sanitize_parameters',
 'binary_output',
 'call_count',
 'check_model_type',
 'default_input_names',
 'device',
 'device_placement',
 'ensure_tensor_on_device',
 'feature_extractor',
 'forward',
 'framework',
 'function_to_apply',
 'get_inference_context',
 'get_iterator',
 'image_processor',
 'iterate',
 'model',
 'modelcard',
 'postprocess',
 'predict',
 'preprocess',
 'return_all_scores',
 'run_multi',
 'run_single

In [39]:
# NOTE(review): the training text was built by `concat_data` as
# "date \n type \n amount \n description" with the description lowercased,
# while this input is a bare upper-case description — confirm whether
# inference inputs should go through the same preprocessing.
text: str = "FGN ELECTRONIC MONEY TRANSFER LEVY"
print(sequence_classifier(text))

In [45]:
text: str = "TRF/Loan/FRM JOHN DOE STEVE TO (IBADAN) FRANK LAMPS INTEREST FREE MONATAN - 301	57553.750000	D	"
print(sequence_classifier(text))

In [46]:
text: str = "oct/sal	125000.00	C 2020/10/28	"
print(sequence_classifier(text))

In [42]:
print(f"label2id: {label2id}")

print(f"id2label: {id2label}")