<a href="https://colab.research.google.com/github/chineidu/NLP-Tutorial/blob/main/notebook/06_Transformers/07c_sequence_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the third-party packages used by this notebook.
# NOTE(review): no versions are pinned, so a fresh run may resolve to newer releases.
!pip install rich
!pip install transformers[torch]
!pip install torch datasets evaluate
!pip install seqeval

Collecting transformers[torch]
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[torch])
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers[torch])
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m54.1 MB/s

# Sentence Classification

In [2]:
# Built-in library
import re
import json
import logging
from typing import Any, Dict, List, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import pandas as pd
from rich import print
import torch

# Visualization
import matplotlib.pyplot as plt


# Pandas display limits: show up to 1,000 rows/columns and 600 characters per cell.
pd.set_option("display.max_rows", 1_000)
pd.set_option("display.max_columns", 1_000)
pd.set_option("display.max_colwidth", 600)

# Suppress all warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Black code formatter (Optional)
# %load_ext lab_black

# auto reload imports
# %load_ext autoreload
# %autoreload 2

In [3]:
def set_up_logger(delim: str = "::") -> Any:
    """Configure basic INFO-level logging and return this module's logger.

    Params:
    -------
        delim (str, default="::"): Separator written between the level name,
            the timestamp and the message of each log record.

    Returns:
    --------
        The logger registered under this module's ``__name__``.
    """
    log_format: str = f"[%(levelname)s] {delim} %(asctime)s {delim} %(message)s"
    # NB: `basicConfig` does nothing if the root logger already has handlers.
    logging.basicConfig(format=log_format, level=logging.INFO)
    return logging.getLogger(__name__)


# Global variable
# Logger created once here and used by `load_data` to report the shape of the loaded data.
logger = set_up_logger()


def load_data(*, filename: str, sep: str = ",") -> pd.DataFrame:
    """Load a tabular file into a DataFrame and log its shape.

    NB: Supported formats are 'csv' and 'parquet'. The format is chosen from
    the file extension, compared case-insensitively: a 'csv' extension is read
    with `pd.read_csv`, anything else is read with `pd.read_parquet`.

    Params:
    -------
        filename (str): The filepath. Path-like objects are also accepted.
        sep (str, default=","): The separator, e.g. a comma or a tab.
            Only used when reading csv files.

    Returns:
    --------
        data (pd.DataFrame): The loaded dataframe.
    """
    # Lower-case the extension so that e.g. 'DATA.CSV' is not mistaken for a
    # parquet file; `str()` lets `pathlib.Path` inputs through as well.
    extension: str = str(filename).rsplit(".", 1)[-1].lower()
    if extension == "csv":
        data = pd.read_csv(filename, sep=sep)
    else:
        data = pd.read_parquet(filename)
    logger.info(f"Shape of data: {data.shape}\n")
    return data

### Prepare Data

```text
Create:
- Training data
- Validation data
- Test data

Labels
------
salary
gambling
loan
airtime
ussdTransactions
flightRisk
savingsAndInvestments
entertainment
spend
  - posSpend
  - atmSpend
  - mobileSpend
  - webSpend
```

In [4]:
from google.colab import drive


# Mount Google Drive at /content/drive so the data file stored there can be read.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Location of the transactions file on the mounted Google Drive.
fp: str = "/content/drive/MyDrive/My doc/Deep Learning/Data/trans_data.parquet"

# Load the file, then expose the target column under the name `labels`.
df: pd.DataFrame = load_data(filename=fp)
df = df.rename(columns={"cleaned_labels": "labels"})

# Preview the first rows.
df.head()

Unnamed: 0,date,description,amount,type,labels
0,2022-05-29,POS/WEB PMT T MODERNCOMMUNICATI 000104 2070849Y NG,20000.0,D,spend
1,2022-03-19,POS/WEB PMT NULL LA NG,13100.0,D,spend
2,2022-01-14,FGN ELECTRONIC MONEY TRANSFER LEVY,100.0,D,spendOnTransfers
3,2022-02-19,Quantum USSD,20026.880859,D,ussd
4,2022-05-28,POS/WEB PMT BETWAY NG/1853277904 PSTK LANG,10000.0,D,gambling


In [6]:
# Build the label <-> integer id lookups from the distinct values of the
# `labels` column, numbered in the order `unique()` returns them.
labels: list[str] = list(df["labels"].unique())
id2label: dict[int, str] = {idx: key for idx, key in enumerate(labels)}
label2id: dict[str, int] = {key: idx for idx, key in id2label.items()}

print(f"label2id: {label2id}")

print(f"id2label: {id2label}")

In [7]:
# Map the labels
# Replace each string label with its integer id from `label2id`.
# The dtype guard keeps this cell idempotent: without it, running the cell a
# second time would look the already-encoded ids up in `label2id` (whose keys
# are the label names) and turn the whole column into NaN.
if not pd.api.types.is_integer_dtype(df["labels"]):
    df["labels"] = df["labels"].map(label2id)

df.head()

Unnamed: 0,date,description,amount,type,labels
0,2022-05-29,POS/WEB PMT T MODERNCOMMUNICATI 000104 2070849Y NG,20000.0,D,0
1,2022-03-19,POS/WEB PMT NULL LA NG,13100.0,D,0
2,2022-01-14,FGN ELECTRONIC MONEY TRANSFER LEVY,100.0,D,1
3,2022-02-19,Quantum USSD,20026.880859,D,2
4,2022-05-28,POS/WEB PMT BETWAY NG/1853277904 PSTK LANG,10000.0,D,3


### Split Data

```text
Split the data into:
- Training data
- Validation data
- Test data
```

In [8]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict


# Seed and hold-out fraction; both are reused by the second split further down.
RANDOM_STATE: int = 123
TEST_SIZE: float = 0.08

# First split: hold out TEST_SIZE of all rows as the test set.
X_t, X_test = train_test_split(df, test_size=TEST_SIZE, random_state=RANDOM_STATE)

X_t.shape, X_test.shape

((460000, 5), (40000, 5))

In [9]:
# Second split: carve the validation set out of the remaining rows (`X_t`),
# so it is TEST_SIZE of `X_t` rather than of the full dataframe.
X_train, X_validation = train_test_split(
    X_t, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
X_train.shape, X_validation.shape, X_test.shape

((423200, 5), (36800, 5), (40000, 5))

In [10]:
# Create dataset objects
# Each split becomes its own `Dataset`. The dataframe index is carried over as
# an extra `__index_level_0__` column (visible in the output below).
train_dataset: Dataset = Dataset.from_pandas(df=X_train)
validation_dataset: Dataset = Dataset.from_pandas(df=X_validation)
test_dataset: Dataset = Dataset.from_pandas(df=X_test)

train_dataset

Dataset({
    features: ['date', 'description', 'amount', 'type', 'labels', '__index_level_0__'],
    num_rows: 423200
})

In [11]:
# Create a dataset dict object
# Bundle the three splits under the keys "train", "validation" and "test".
dataset: DatasetDict = DatasetDict(
    {"train": train_dataset, "validation": validation_dataset, "test": test_dataset}
)

dataset

DatasetDict({
    train: Dataset({
        features: ['date', 'description', 'amount', 'type', 'labels', '__index_level_0__'],
        num_rows: 423200
    })
    validation: Dataset({
        features: ['date', 'description', 'amount', 'type', 'labels', '__index_level_0__'],
        num_rows: 36800
    })
    test: Dataset({
        features: ['date', 'description', 'amount', 'type', 'labels', '__index_level_0__'],
        num_rows: 40000
    })
})

In [None]:
# Remove column
# Drop the `__index_level_0__` column (the carried-over dataframe index) from every split.
dataset = dataset.remove_columns(["__index_level_0__"])
dataset

DatasetDict({
    train: Dataset({
        features: ['date', 'description', 'amount', 'type', 'labels'],
        num_rows: 423200
    })
    validation: Dataset({
        features: ['date', 'description', 'amount', 'type', 'labels'],
        num_rows: 36800
    })
    test: Dataset({
        features: ['date', 'description', 'amount', 'type', 'labels'],
        num_rows: 40000
    })
})

In [None]:
# Show the first record of each split to sanity-check the contents.
print(dataset.get("train")[0])

print(dataset.get("validation")[0])

print(dataset.get("test")[0])

In [None]:
def lower_case_slower(example: dict[str, Any]) -> dict[str, Any]:
    """Lowercase the description of a single record.

    It's a MUCH SLOWER alternative to `lower_case`, because it handles one
    description per call instead of a list of them.
    """
    text = example.get("description")
    return {"description": text.lower()}


def lower_case(example: dict[str, Any]) -> dict[str, Any]:
    """Lowercase every description in a batch of records.

    `example["description"]` is expected to be an iterable of strings; the
    returned dict holds the lowercased values in the same order.
    """
    lowered = []
    for text in example.get("description"):
        lowered.append(text.lower())
    return {"description": lowered}


def add_description_length(example: dict[str, Any]) -> dict[str, Any]:
    """Compute the length of every description in a batch of records.

    Returns a dict with a single `description_length` key holding one length
    per description, in the same order.
    """
    descriptions = example.get("description")
    return {"description_length": list(map(len, descriptions))}

In [None]:
# Much slower
# dataset_1 = dataset.map(lower_case_slower)
# dataset_1

In [None]:
# Much faster!
# With batched=True, `lower_case` is handed lists of descriptions rather than
# one record per call.
dataset_1 = dataset.map(lower_case, batched=True)
dataset_1

Map:   0%|          | 0/423200 [00:00<?, ? examples/s]

Map:   0%|          | 0/36800 [00:00<?, ? examples/s]

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['date', 'description', 'amount', 'type', 'labels'],
        num_rows: 423200
    })
    validation: Dataset({
        features: ['date', 'description', 'amount', 'type', 'labels'],
        num_rows: 36800
    })
    test: Dataset({
        features: ['date', 'description', 'amount', 'type', 'labels'],
        num_rows: 40000
    })
})

In [None]:
# Add a `description_length` column (number of characters per description) to every split.
dataset_1 = dataset_1.map(add_description_length, batched=True)
dataset_1

Map:   0%|          | 0/423200 [00:00<?, ? examples/s]

Map:   0%|          | 0/36800 [00:00<?, ? examples/s]

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['date', 'description', 'amount', 'type', 'labels', 'description_length'],
        num_rows: 423200
    })
    validation: Dataset({
        features: ['date', 'description', 'amount', 'type', 'labels', 'description_length'],
        num_rows: 36800
    })
    test: Dataset({
        features: ['date', 'description', 'amount', 'type', 'labels', 'description_length'],
        num_rows: 40000
    })
})

In [None]:
# Show the first record of each split after lowercasing and adding the length column.
print(dataset_1.get("train")[0])

print(dataset_1.get("validation")[0])

print(dataset_1.get("test")[0])

In [None]:
# Drop records whose description is shorter than THRESHOLD (10) characters;
# only rows with `description_length >= THRESHOLD` are kept.
THRESHOLD: int = 10
dataset_1 = dataset_1.filter(lambda x: x.get("description_length") >= THRESHOLD)
dataset_1

Filter:   0%|          | 0/423200 [00:00<?, ? examples/s]

Filter:   0%|          | 0/36800 [00:00<?, ? examples/s]

Filter:   0%|          | 0/40000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['date', 'description', 'amount', 'type', 'labels', 'description_length'],
        num_rows: 408029
    })
    validation: Dataset({
        features: ['date', 'description', 'amount', 'type', 'labels', 'description_length'],
        num_rows: 35476
    })
    test: Dataset({
        features: ['date', 'description', 'amount', 'type', 'labels', 'description_length'],
        num_rows: 38555
    })
})

In [None]:
# Show the first validation record that survived the length filter.
print(dataset_1.get("validation")[0])

In [None]:
def concat_data(example: dict[str, Any]) -> dict[str, Any]:
    """Build the `body` text for every record in a batch.

    For each record the date, type, amount and description are rendered in
    that order, separated by a newline padded with one space on each side.
    The result is returned under the single key `body`.
    """
    columns = (
        example.get("date"),
        example.get("type"),
        example.get("amount"),
        example.get("description"),
    )
    bodies: list[str] = []
    for txn_date, txn_type, txn_amount, txn_description in zip(*columns):
        bodies.append(f"{txn_date} \n {txn_type} \n {txn_amount} \n {txn_description}")
    return {"body": bodies}

In [None]:
# Add the combined `body` column (date, type, amount and description) to every split.
dataset_1 = dataset_1.map(concat_data, batched=True)
dataset_1

Map:   0%|          | 0/408029 [00:00<?, ? examples/s]

Map:   0%|          | 0/35476 [00:00<?, ? examples/s]

Map:   0%|          | 0/38555 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['date', 'description', 'amount', 'type', 'labels', 'description_length', 'body'],
        num_rows: 408029
    })
    validation: Dataset({
        features: ['date', 'description', 'amount', 'type', 'labels', 'description_length', 'body'],
        num_rows: 35476
    })
    test: Dataset({
        features: ['date', 'description', 'amount', 'type', 'labels', 'description_length', 'body'],
        num_rows: 38555
    })
})

In [None]:
# Inspect one training record, including the newly built `body` text.
print(dataset_1.get("train")[0])

### Tokenize Data

In [None]:
from transformers import AutoTokenizer


# Pretrained checkpoint whose tokenizer is loaded below.
# NOTE(review): the descriptions were lowercased earlier in this notebook, yet a
# *cased* checkpoint is used here — confirm this is intentional.
MODEL_CHECKPOINT: str = "bert-base-cased"
tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)


def tokenize_function(examples: dict[str, Any]) -> dict[str, Any]:
    """Tokenize the `body` text of a batch with the module-level `tokenizer`.

    Truncation is enabled. With the tokenizer used in this notebook the result
    holds the input_ids, token_type_ids and attention_mask.
    """
    bodies = examples.get("body")
    encoded = tokenizer(bodies, truncation=True)
    return encoded

In [None]:
# Columns dropped once tokenization is done, leaving only `labels` and the
# tokenizer outputs in each split.
FEATURES: list[str] = [
    "date",
    "description",
    "amount",
    "type",
    "description_length",
    "body",
]

# Tokenize every split in batches and remove the columns listed above.
tokenized_dataset: DatasetDict = dataset_1.map(
    tokenize_function,
    batched=True,
    remove_columns=FEATURES,
)
tokenized_dataset

Map:   0%|          | 0/408029 [00:00<?, ? examples/s]

Map:   0%|          | 0/35476 [00:00<?, ? examples/s]

Map:   0%|          | 0/38555 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408029
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 35476
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 38555
    })
})

In [None]:
# Inspect one tokenized training record.
print(tokenized_dataset.get("train")[0])

In [None]:
# Number of unique labels, counted on the training split only.
N_LABELS: int = len(tokenized_dataset.get("train").unique("labels"))

N_LABELS

11

In [None]:
from transformers import DataCollatorWithPadding


# Collator built on the same tokenizer; it pads the examples of a batch so
# they can be stacked together (see the padded rows printed further down).
data_collator: DataCollatorWithPadding = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Apply the data collator
# Collate the first two training examples into one batch. The result is a
# mapping-like object (it is read with `.get("input_ids")` in the next cell),
# not a bare tensor.
batch: dict[str, Any] = data_collator([tokenized_dataset["train"][i] for i in range(2)])

print(batch)

In [None]:
# Print the token ids of each example in the batch.
# Per the original run, the 2nd row is the one that has been padded with 0s.
for row in batch.get("input_ids"):
    print(row)