<a href="https://colab.research.google.com/github/chineidu/NLP-Tutorial/blob/main/notebook/06_Transformers/07c_seq_classif_wf_polars.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install rich
!pip install polars[pyarrow]
!pip install transformers[torch]
!pip install torch datasets evaluate
!pip install seqeval

Collecting transformers[torch]
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[torch])
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers[torch])
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m68.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m70.6 MB/s

# Sentence Classification

In [2]:
# Built-in library
import re
import json
import logging
from typing import Any, Dict, List, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import pandas as pd
import polars as pl
from rich import print
import torch

# Visualization
import matplotlib.pyplot as plt


# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

warnings.filterwarnings("ignore")

# Black code formatter (Optional)
# %load_ext lab_black

# auto reload imports
# %load_ext autoreload
# %autoreload 2

### Prepare Data

```text
Create:
- Training data
- Validation data
- Test data

```

In [3]:
from google.colab import drive


drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
fp: str = "/content/drive/MyDrive/My doc/Deep Learning/Data/training_data_2.parquet"
N: int = 45_000

df: pl.DataFrame = pl.read_parquet(source=fp).rename({"tags": "label"})
df = df.sample(n=N, shuffle=True, seed=123)

df.head()

customer_id,nuban,date,description,amount,type,label
str,i64,str,str,f64,str,str
"""39396""",1,"""2021-11-16""","""MOBILE BANKING…",3100.0,"""Debit""","""spend.mobileSp…"
"""38727""",1,"""2022-04-12""","""POS/WEB PMT JE…",3220.0,"""Debit""","""spend.shopping…"
"""40065""",1,"""2022-08-27""","""Paystack/antam…",6200.0,"""Credit""","""behavioural.lo…"
"""40826""",1,"""2022-06-20""","""Amt includes C…",2621.5,"""Debit""","""spend.ussdTran…"
"""40187""",1,"""2022-03-19""","""POS/WEB PMT T …",10000.0,"""Debit""","""spend.shopping…"


In [None]:
# Rename the labels
labels: dict[str, Any] = list(df["label"].unique())
id2label: dict[str, Any] = {idx: key for idx, key in enumerate(labels)}
label2id: dict[str, Any] = {key: idx for idx, key in id2label.items()}

print(f"label2id: {label2id}")

print(f"id2label: {id2label}")

In [None]:
# Map the labels
df["label"] = df["label"].map(label2id)

df.head()

Unnamed: 0,date,description,amount,type,label
0,2022-08-28,TRF/POS/FRM OFFONG EMMANUEL TO Emmanuel Nso Offong- 611,92653.75,D,0
1,2022-09-20,Amt includes COMM & VAT/USSD/TAOFEEK OLAWALE HAMZAT,14000.0,C,1
2,2022-12-12,elias tosin enifeni/Transfer from to KOLEOSHO SODIQ,50000.0,C,1
3,2022-06-30,TRF//FRM ABDULLAHI NASIRU TO BELLO SALMANU - 033,5126.879883,D,0
4,2022-07-21,TRF FRM OLUWASEUN ADENIYI IDOWU\]. Kemi Fadare IFO ADIGUN ISIAKA SEGUN,33750.0,C,1


### Split Data

```text
Split the data into:
- Training data
- Validation data
- Test data
```

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict


RANDOM_STATE: int = 123
TEST_SIZE: float = 0.08

X_t, X_test = train_test_split(df, test_size=TEST_SIZE, random_state=RANDOM_STATE)

X_t.shape, X_test.shape

((19320, 5), (1680, 5))

In [None]:
X_train, X_validation = train_test_split(
    X_t, test_size=TEST_SIZE, random_state=RANDOM_STATE
)
X_train.shape, X_validation.shape, X_test.shape

((17774, 5), (1546, 5), (1680, 5))

In [None]:
# Create dataset objects
train_dataset: DatasetDict = Dataset.from_pandas(df=X_train)
validation_dataset: DatasetDict = Dataset.from_pandas(df=X_validation)
test_dataset: DatasetDict = Dataset.from_pandas(df=X_test)

train_dataset

Dataset({
    features: ['date', 'description', 'amount', 'type', 'label', '__index_level_0__'],
    num_rows: 17774
})

In [None]:
# Create a dataset dict object
dataset: DatasetDict = DatasetDict(
    {"train": train_dataset, "validation": validation_dataset, "test": test_dataset}
)

dataset

DatasetDict({
    train: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', '__index_level_0__'],
        num_rows: 17774
    })
    validation: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', '__index_level_0__'],
        num_rows: 1546
    })
    test: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', '__index_level_0__'],
        num_rows: 1680
    })
})

In [None]:
# Remove column
dataset = dataset.remove_columns(["__index_level_0__"])
dataset

DatasetDict({
    train: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label'],
        num_rows: 17774
    })
    validation: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label'],
        num_rows: 1546
    })
    test: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label'],
        num_rows: 1680
    })
})

In [None]:
print(dataset.get("train")[0])

print(dataset.get("validation")[0])

print(dataset.get("test")[0])

In [None]:
def lower_case_slower(example: dict[str, Any]) -> dict[str, Any]:
    """Convert the text to lowercase. It's a MUCH SLOWER version."""
    return {"description": example.get("description").lower()}

# For a faster implementation, use a list comprehension
def lower_case(example: dict[str, Any]) -> dict[str, Any]:
    """Convert the text to lowercase."""
    return {"description": [val.lower() for val in example.get("description")]}


def add_description_length(example: dict[str, Any]) -> dict[str, Any]:
    """This is used to add the length of the description to the dataset."""
    return {"description_length": [len(val) for val in example.get("description")]}

In [None]:
# Much slower
# dataset_1 = dataset.map(lower_case_slower)
# dataset_1

In [None]:
# Much faster!
dataset_1 = dataset.map(lower_case, batched=True)
dataset_1

Map:   0%|          | 0/17774 [00:00<?, ? examples/s]

Map:   0%|          | 0/1546 [00:00<?, ? examples/s]

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label'],
        num_rows: 17774
    })
    validation: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label'],
        num_rows: 1546
    })
    test: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label'],
        num_rows: 1680
    })
})

In [None]:
dataset_1 = dataset_1.map(add_description_length, batched=True)
dataset_1

Map:   0%|          | 0/17774 [00:00<?, ? examples/s]

Map:   0%|          | 0/1546 [00:00<?, ? examples/s]

Map:   0%|          | 0/1680 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length'],
        num_rows: 17774
    })
    validation: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length'],
        num_rows: 1546
    })
    test: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length'],
        num_rows: 1680
    })
})

In [None]:
print(dataset_1.get("train")[0])

print(dataset_1.get("validation")[0])

print(dataset_1.get("test")[0])

In [None]:
# Drop descriptions that are less than 10
THRESHOLD: int = 10
dataset_1 = dataset_1.filter(lambda x: x.get("description_length") >= THRESHOLD)
dataset_1

Filter:   0%|          | 0/17774 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1546 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1680 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length'],
        num_rows: 17742
    })
    validation: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length'],
        num_rows: 1542
    })
    test: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length'],
        num_rows: 1677
    })
})

In [None]:
print(dataset_1.get("validation")[0])

In [None]:
def concat_data(example: dict[str, Any]) -> dict[str, Any]:
    """This is used to create a new column by combining columns in the dataset."""

    result: dict[str, Any] = {
        "body": [
            f"{date} \n {type} \n {amount} \n {description}"
            for (date, type, amount, description) in zip(
                example.get("date"),
                example.get("type"),
                example.get("amount"),
                example.get("description"),
            )
        ]
    }
    return result

In [None]:
dataset_1 = dataset_1.map(concat_data, batched=True)
dataset_1

Map:   0%|          | 0/17742 [00:00<?, ? examples/s]

Map:   0%|          | 0/1542 [00:00<?, ? examples/s]

Map:   0%|          | 0/1677 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length', 'body'],
        num_rows: 17742
    })
    validation: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length', 'body'],
        num_rows: 1542
    })
    test: Dataset({
        features: ['date', 'description', 'amount', 'type', 'label', 'description_length', 'body'],
        num_rows: 1677
    })
})

In [None]:
print(dataset_1.get("train")[0])

### Tokenize Data

In [None]:
from transformers import AutoTokenizer


MODEL_CHECKPOINT: str = "bert-base-cased"
tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)


def tokenize_function(examples: dict[str, Any]) -> dict[str, Any]:
    """This is used to tokenize the text. It returns a dict containing the
    input_ids, token_type_ids and attention_mask."""
    return tokenizer(examples.get("body"), truncation=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
FEATURES: list[str] = [
    "date",
    "description",
    "amount",
    "type",
    "description_length",
    "body",
]

tokenized_dataset: DatasetDict = dataset_1.map(
    tokenize_function,
    batched=True,
    remove_columns=FEATURES,
)
tokenized_dataset

Map:   0%|          | 0/17742 [00:00<?, ? examples/s]

Map:   0%|          | 0/1542 [00:00<?, ? examples/s]

Map:   0%|          | 0/1677 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 17742
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1542
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1677
    })
})

In [None]:
print(tokenized_dataset.get("train")[0])

In [None]:
# Number of unique labels
N_LABELS: int = len(tokenized_dataset.get("train").unique("label"))

N_LABELS

4

In [None]:
from transformers import DataCollatorWithPadding


data_collator: DataCollatorWithPadding = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Apply the data collator
batch: torch.Tensor = data_collator([tokenized_dataset["train"][i] for i in range(2)])

print(batch)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# The 2nd idx has been padded with 0s
for row in batch.get("input_ids"):
    print(row)

In [None]:
from sklearn.metrics import recall_score, accuracy_score
from datasets import load_metric


metric = load_metric("seqeval")


def compute_metrics(eval_preds: tuple) -> dict[str, Any]:
    """This is used to calculate the evaluation metrics."""
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    accuracy = np.mean(predictions == labels)

    return {
        "accuracy": accuracy
    }

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [None]:
labels: list[int] = tokenized_dataset.get("train").unique("label")
predictions: list[str] = labels.copy()

# Simulate prediction
predictions[0] = 2
predictions[2] = 3
print(metric.compute(predictions=[predictions], references=[labels]))

### Define Model

In [None]:
from transformers import AutoModelForSequenceClassification


# Now we can just pass them to the AutoModelForTokenClassification.from_pretrained() method,
# and they will be set in the model’s configuration and then properly saved and uploaded to the Hub:
model: AutoModelForSequenceClassification = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT, num_labels=N_LABELS, id2label=id2label, label2id=label2id
)

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from huggingface_hub import notebook_login


# Login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import TrainingArguments


OUTPUT_DIR: str = "bert-finetuned-sequence-classification"
STRATEGY: str = "epoch"
LEARNING_RATE: float = 2e-5
NUM_EPOCHS: int = 3
WEIGHT_DECAY: float = 0.01


args: TrainingArguments = TrainingArguments(
    OUTPUT_DIR,
    evaluation_strategy=STRATEGY,
    save_strategy=STRATEGY,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size= 16,
    weight_decay=WEIGHT_DECAY,
    push_to_hub=True,
)

In [None]:
from transformers import Trainer


# Train the model!
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset.get("train"),
    eval_dataset=tokenized_dataset.get("validation"),
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.025,0.017316,0.997406
2,0.011,0.015231,0.997406
3,0.0035,0.005631,0.999351


TrainOutput(global_step=3327, training_loss=0.028557314828311427, metrics={'train_runtime': 629.0739, 'train_samples_per_second': 84.61, 'train_steps_per_second': 5.289, 'total_flos': 1447164541020432.0, 'train_loss': 0.028557314828311427, 'epoch': 3.0})

In [None]:
trainer.push_to_hub(commit_message="Training complete")

'https://huggingface.co/chineidu/bert-finetuned-sequence-classification/tree/main/'

In [None]:
from transformers import pipeline


TASK: str = "text-classification"
MODEL_CHECKPOINT: str = "chineidu/bert-finetuned-sequence-classification"
sequence_classifier: pipeline = pipeline(task=TASK, model=MODEL_CHECKPOINT)

Downloading (…)lve/main/config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/669k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
text: str = "TRF/Loan/FRM JOHN DOE STEVE TO (IBADAN) FRANK LAMPS INTEREST FREE MONATAN - 301	57553.750000	D	"
print(sequence_classifier(text))

In [None]:
text: str = "oct/sal	125000.00	C 2020/10/28	"
print(sequence_classifier(text))

In [None]:
text: str ="POS/WEB PMT BETWAY NG/1853277904 PSTK LANG	10000.000000	C"
print(sequence_classifier(text))

In [None]:
X_test.head(10)

Unnamed: 0,date,description,amount,type,label
6561,2022-07-13,Tochi Kingsley Iwuoha/Transfer from to IKEH EKENE,100000.0,C,1
18315,2022-08-02,SANUSI SAHEED BABAFEMI/via GTWORLD Omismoney,78900.0,C,1
6798,2021-12-17,Quantum USSD,20021.5,D,3
17954,2022-02-01,Quantum USSD,5010.75,D,3
18030,2022-05-20,FGN ELECTRONIC MONEY TRANSFER LEVY,50.0,D,0
2124,2021-10-31,Quantum USSD,130.0,D,3
3698,2022-12-11,ROFIAT TEMITOPE AYOADE/MOB/ADENUGA HAWAU/UTO/13593760708/Diva,40000.0,C,1
11286,2022-11-01,Interest On Loan 099PDLP2230500GI 221101075051B1F4,2000.0,D,2
20415,2022-04-23,Amt includes COMM & VAT/USSD_NIP/SONIBE CHIBUZOR BLESSING,2310.75,D,3
7974,2022-06-26,TRF//FRM AKINSOLA LATEEF A. TO AKINSOLA LATEEF AYINDE- 058,30026.880859,D,0


In [None]:
X_test.loc[X_test["label"].isin([0, 2, 3])].sample(n=30, random_state=13)

Unnamed: 0,date,description,amount,type,label
2727,2022-03-17,Amt includes COMM & VAT/USSD_NIP/MCDONALD OMOJEFE OBIKU,18026.880859,D,3
14072,2022-05-27,"ATM WDL @10441774 GBOKO BRANCH BENUE STATE, NG REF:675690/214716675690",10000.0,D,2
1617,2022-09-17,TRF/Micor loan from Ade oba/FRM ADERIBIGBE SUN ALADE TO OYESIJI JULIUS - 076,30026.880859,D,2
16168,2022-08-09,TRF//FRM GODWIN MORENIKEJI TO DAMILOLA OLAMIPOSI - 033,1410.75,D,0
3747,2022-01-18,Quantum USSD,3110.75,D,3
3658,2022-05-14,TRF/FM/FRM OLUDARE OLUFEMI TO OLALEKAN LAMIDI MUSE - 221,5010.75,D,0
13434,2021-11-16,Quantum USSD,10.75,D,3
18787,2022-07-05,Amt includes COMM & VAT/USSD/MARYAM ADEFABI ABUBAKAR,17521.5,D,3
1332,2022-12-26,TRF/Goodwin Isaalah Onyewonsa/FRM ISAALAH NDUKA GODWIN TO MIRIAN EMEAGWALI OGUGUA - 033,2010.75,D,0
18330,2022-12-19,"ATM WDL @10441796 KARU BRANCH ABUJA, NIGERING REF:741746/235312741746",40000.0,D,2


In [None]:
texts: list[str] = ["TRF/Loan payment by Oladosu Taiwo Mariam/FRM OLADOSU MARIAM TO MARIAM TAIWO OLADOSU- 305	96453.750000	D",
                    "Amt includes COMM & VAT/USSD/FATIMAH ABIOLA ABDULWAHEED	10021.500000	D",
                    "POS/WEB PMT BRANCH INT/1828853631 PSTK LANG	200000.000000	D",
                    "TRF/Tr/FRM ADEBAYO ADENEKAN TO OLUWASEGUN M CLEMENT - 032	2510.750000	D",
                    "ATM WDL @10441796 KARU BRANCH ABUJA, NIGERING REF:741746/235312741746	40000.000000	D",
                    ]
print(sequence_classifier(texts))