## Data Preparation For Sentence Classification

In [1]:
# Built-in library
import re
import json
import logging
from typing import Any, Dict, List, Optional, Union
import logging
import warnings

# Standard imports
import numpy as np
import pandas as pd
from rich import print
import torch

# Visualization
import matplotlib.pyplot as plt


# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
def set_up_logger(delim: str = "::") -> Any:
    """This is used to create a basic logger."""

    format_ = f"[%(levelname)s] {delim} %(asctime)s {delim} %(message)s"
    logging.basicConfig(level=logging.INFO, format=format_)
    logger = logging.getLogger(__name__)
    return logger


# Global variable
logger = set_up_logger()


def load_data(*, filename: str, sep: str = ",") -> pd.DataFrame:
    """This is used to load the data.

    NB: Supported formats are 'csv' and 'parquet'.

    Params:
    -------
        filename (str): The filepath.
        sep (str, default=","): The separator. e.g ',', '\t', etc

    Returns:
    --------
        data (pd.DataFrame): The loaded dataframe.
    """
    data = (
        pd.read_csv(filename, sep=sep)
        if filename.split(".")[-1] == "csv"
        else pd.read_parquet(filename)
    )
    logger.info(f"Shape of data: {data.shape}\n")
    return data

### Prepare Data

```text
Create:
- Training data
- Validation data
- Test data

Labels
------
salary
gambling
loan
airtime
ussdTransactions
flightRisk
savingsAndInvestments
entertainment
spend
  - posSpend
  - atmSpend
  - mobileSpend
  - webSpend
```

In [3]:
fp: str = "../../data/trans_TAGS_17.parquet"
N: int = 800_000
df: pd.DataFrame = (
    load_data(filename=fp).sample(n=N, random_state=123).reset_index(drop=True)
)

df.head()

[INFO] :: 2023-11-05 20:12:56,650 :: Shape of data: (2390838, 7)



Unnamed: 0,customer_id,nuban,date,description,amount,type,tags
0,40837,1,2022-04-19,TRF/Ac/fan/FRM GIBSON PAUL U TO CHRIS-CHUKKAS ELECTRONICS LTD- 070,163053.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.transactionBetween100000And500000, transactionpattern.mostFrequentBalanceRange]"
1,39005,1,2022-08-30,TRF/Friday /FRM JONAH FRIDAY TO ONYENAUCHEYA,5010.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.recurringExpense, transactionpattern.transactionLessThan10000, transactionpattern.mostFrequentTransactionRange, transactionpattern.mostFrequentBalanceRange]"
2,40454,1,2022-07-05,Amt includes COMM & VAT/USSD/AISHAT ABIODUN BELLO,3021.5,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.bankCharges, spend.ussdTransactions, behavioural.topRecipient, transactionpattern.recurringExpense, transactionpattern.transactionLessThan10000, transactionpattern.mostFrequentTransactionRange, transactionpattern.mostFrequentBalanceRange]"
3,39318,1,2022-07-08,TRF/For clothes/FRM AJAKAYE WUNMI R TO OLALEYE OPEOLUWA LYDIA,24000.0,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.transactionBetween10000And100000, transactionpattern.mostFrequentTransactionRange, transactionpattern.mostFrequentBalanceRange]"
4,38851,1,2022-04-09,WT|KODIRIC GLOBAL ENTERPRAMAC NG,100.0,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.airtime, transactionpattern.transactionLessThan10000, transactionpattern.mostFrequentTransactionRange, transactionpattern.mostFrequentBalanceRange]"


In [4]:
def extract_tags(data: pd.DataFrame, pattern: str) -> pd.Series:
    data = data.copy()

    result: np.ndarray = (
        data["tags"].astype("str").str.extract(pat=pattern, flags=re.I).to_numpy()
    ).flatten()
    result: pd.Series = pd.Series(data=result).astype(str)
    return result

In [5]:
df_1: pd.DataFrame = df.copy()

KEYWORD: str = "salary"
PATTERN: str = r"(salary\w{0,10})"
df_1[f"{KEYWORD}_label"] = extract_tags(data=df_1, pattern=PATTERN)

df_1.head()

Unnamed: 0,customer_id,nuban,date,description,amount,type,tags,salary_label
0,40837,1,2022-04-19,TRF/Ac/fan/FRM GIBSON PAUL U TO CHRIS-CHUKKAS ELECTRONICS LTD- 070,163053.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.transactionBetween100000And500000, transactionpattern.mostFrequentBalanceRange]",
1,39005,1,2022-08-30,TRF/Friday /FRM JONAH FRIDAY TO ONYENAUCHEYA,5010.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.recurringExpense, transactionpattern.transactionLessThan10000, transactionpattern.mostFrequentTransactionRange, transactionpattern.mostFrequentBalanceRange]",
2,40454,1,2022-07-05,Amt includes COMM & VAT/USSD/AISHAT ABIODUN BELLO,3021.5,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.bankCharges, spend.ussdTransactions, behavioural.topRecipient, transactionpattern.recurringExpense, transactionpattern.transactionLessThan10000, transactionpattern.mostFrequentTransactionRange, transactionpattern.mostFrequentBalanceRange]",
3,39318,1,2022-07-08,TRF/For clothes/FRM AJAKAYE WUNMI R TO OLALEYE OPEOLUWA LYDIA,24000.0,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.transactionBetween10000And100000, transactionpattern.mostFrequentTransactionRange, transactionpattern.mostFrequentBalanceRange]",
4,38851,1,2022-04-09,WT|KODIRIC GLOBAL ENTERPRAMAC NG,100.0,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.airtime, transactionpattern.transactionLessThan10000, transactionpattern.mostFrequentTransactionRange, transactionpattern.mostFrequentBalanceRange]",


In [6]:
KEYWORD: str = "loan"
PATTERN: str = r"(loan\w{0,10})"
df_1[f"{KEYWORD}_label"] = extract_tags(data=df_1, pattern=PATTERN)

df_1.head(2)

Unnamed: 0,customer_id,nuban,date,description,amount,type,tags,salary_label,loan_label
0,40837,1,2022-04-19,TRF/Ac/fan/FRM GIBSON PAUL U TO CHRIS-CHUKKAS ELECTRONICS LTD- 070,163053.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.transactionBetween100000And500000, transactionpattern.mostFrequentBalanceRange]",,
1,39005,1,2022-08-30,TRF/Friday /FRM JONAH FRIDAY TO ONYENAUCHEYA,5010.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.recurringExpense, transactionpattern.transactionLessThan10000, transactionpattern.mostFrequentTransactionRange, transactionpattern.mostFrequentBalanceRange]",,


In [7]:
KEYWORD: str = "gambling"
PATTERN: str = r"(gambling\w{0,5})"
df_1[f"{KEYWORD}_label"] = extract_tags(data=df_1, pattern=PATTERN)

df_1.head(2)

Unnamed: 0,customer_id,nuban,date,description,amount,type,tags,salary_label,loan_label,gambling_label
0,40837,1,2022-04-19,TRF/Ac/fan/FRM GIBSON PAUL U TO CHRIS-CHUKKAS ELECTRONICS LTD- 070,163053.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.transactionBetween100000And500000, transactionpattern.mostFrequentBalanceRange]",,,
1,39005,1,2022-08-30,TRF/Friday /FRM JONAH FRIDAY TO ONYENAUCHEYA,5010.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.recurringExpense, transactionpattern.transactionLessThan10000, transactionpattern.mostFrequentTransactionRange, transactionpattern.mostFrequentBalanceRange]",,,


In [8]:
KEYWORD: str = "airtime"
PATTERN: str = r"(\w{0,5}airtime\w{0,5})"
df_1[f"{KEYWORD}_label"] = extract_tags(data=df_1, pattern=PATTERN)

df_1.head(2)

Unnamed: 0,customer_id,nuban,date,description,amount,type,tags,salary_label,loan_label,gambling_label,airtime_label
0,40837,1,2022-04-19,TRF/Ac/fan/FRM GIBSON PAUL U TO CHRIS-CHUKKAS ELECTRONICS LTD- 070,163053.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.transactionBetween100000And500000, transactionpattern.mostFrequentBalanceRange]",,,,
1,39005,1,2022-08-30,TRF/Friday /FRM JONAH FRIDAY TO ONYENAUCHEYA,5010.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.recurringExpense, transactionpattern.transactionLessThan10000, transactionpattern.mostFrequentTransactionRange, transactionpattern.mostFrequentBalanceRange]",,,,


In [9]:
KEYWORD: str = "ussdTransactions"
PATTERN: str = r"(ussd)"
df_1[f"{KEYWORD}_label"] = extract_tags(data=df_1, pattern=PATTERN)

df_1.head(2)

Unnamed: 0,customer_id,nuban,date,description,amount,type,tags,salary_label,loan_label,gambling_label,airtime_label,ussdTransactions_label
0,40837,1,2022-04-19,TRF/Ac/fan/FRM GIBSON PAUL U TO CHRIS-CHUKKAS ELECTRONICS LTD- 070,163053.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.transactionBetween100000And500000, transactionpattern.mostFrequentBalanceRange]",,,,,
1,39005,1,2022-08-30,TRF/Friday /FRM JONAH FRIDAY TO ONYENAUCHEYA,5010.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.recurringExpense, transactionpattern.transactionLessThan10000, transactionpattern.mostFrequentTransactionRange, transactionpattern.mostFrequentBalanceRange]",,,,,


In [10]:
KEYWORD: str = "flightRisk"
PATTERN: str = r"(\w{0,10}flightRisk\w{0,10})"
df_1[f"{KEYWORD}_label"] = extract_tags(data=df_1, pattern=PATTERN)

df_1.head(2)

Unnamed: 0,customer_id,nuban,date,description,amount,type,tags,salary_label,loan_label,gambling_label,airtime_label,ussdTransactions_label,flightRisk_label
0,40837,1,2022-04-19,TRF/Ac/fan/FRM GIBSON PAUL U TO CHRIS-CHUKKAS ELECTRONICS LTD- 070,163053.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.transactionBetween100000And500000, transactionpattern.mostFrequentBalanceRange]",,,,,,
1,39005,1,2022-08-30,TRF/Friday /FRM JONAH FRIDAY TO ONYENAUCHEYA,5010.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.recurringExpense, transactionpattern.transactionLessThan10000, transactionpattern.mostFrequentTransactionRange, transactionpattern.mostFrequentBalanceRange]",,,,,,


In [11]:
KEYWORD: str = "spendOnTransfers"
PATTERN: str = r"(\w{0,10}spendOnTransfers\w{0,10})"
df_1[f"{KEYWORD}_label"] = extract_tags(data=df_1, pattern=PATTERN)

df_1.head(2)

Unnamed: 0,customer_id,nuban,date,description,amount,type,tags,salary_label,loan_label,gambling_label,airtime_label,ussdTransactions_label,flightRisk_label,spendOnTransfers_label
0,40837,1,2022-04-19,TRF/Ac/fan/FRM GIBSON PAUL U TO CHRIS-CHUKKAS ELECTRONICS LTD- 070,163053.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.transactionBetween100000And500000, transactionpattern.mostFrequentBalanceRange]",,,,,,,spendOnTransfers
1,39005,1,2022-08-30,TRF/Friday /FRM JONAH FRIDAY TO ONYENAUCHEYA,5010.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.recurringExpense, transactionpattern.transactionLessThan10000, transactionpattern.mostFrequentTransactionRange, transactionpattern.mostFrequentBalanceRange]",,,,,,,spendOnTransfers


In [12]:
KEYWORD: str = "entertainment"
PATTERN: str = r"(entertainment\w{0,10})"
df_1[f"{KEYWORD}_label"] = extract_tags(data=df_1, pattern=PATTERN)

df_1.head(2)

Unnamed: 0,customer_id,nuban,date,description,amount,type,tags,salary_label,loan_label,gambling_label,airtime_label,ussdTransactions_label,flightRisk_label,spendOnTransfers_label,entertainment_label
0,40837,1,2022-04-19,TRF/Ac/fan/FRM GIBSON PAUL U TO CHRIS-CHUKKAS ELECTRONICS LTD- 070,163053.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.transactionBetween100000And500000, transactionpattern.mostFrequentBalanceRange]",,,,,,,spendOnTransfers,
1,39005,1,2022-08-30,TRF/Friday /FRM JONAH FRIDAY TO ONYENAUCHEYA,5010.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.recurringExpense, transactionpattern.transactionLessThan10000, transactionpattern.mostFrequentTransactionRange, transactionpattern.mostFrequentBalanceRange]",,,,,,,spendOnTransfers,


In [13]:
KEYWORD: str = "spend"
PATTERN: str = r"(\w{0,40}spend)"
df_1[f"{KEYWORD}_label"] = extract_tags(data=df_1, pattern=PATTERN)

df_1.head(2)

Unnamed: 0,customer_id,nuban,date,description,amount,type,tags,salary_label,loan_label,gambling_label,airtime_label,ussdTransactions_label,flightRisk_label,spendOnTransfers_label,entertainment_label,spend_label
0,40837,1,2022-04-19,TRF/Ac/fan/FRM GIBSON PAUL U TO CHRIS-CHUKKAS ELECTRONICS LTD- 070,163053.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.transactionBetween100000And500000, transactionpattern.mostFrequentBalanceRange]",,,,,,,spendOnTransfers,,spend
1,39005,1,2022-08-30,TRF/Friday /FRM JONAH FRIDAY TO ONYENAUCHEYA,5010.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.recurringExpense, transactionpattern.transactionLessThan10000, transactionpattern.mostFrequentTransactionRange, transactionpattern.mostFrequentBalanceRange]",,,,,,,spendOnTransfers,,spend


In [14]:
KEYWORD: str = "bills"
PATTERN: str = r"(\w{0,10}bills\w{0,10})"
df_1[f"{KEYWORD}_label"] = extract_tags(data=df_1, pattern=PATTERN)

df_1.head(2)

Unnamed: 0,customer_id,nuban,date,description,amount,type,tags,salary_label,loan_label,gambling_label,airtime_label,ussdTransactions_label,flightRisk_label,spendOnTransfers_label,entertainment_label,spend_label,bills_label
0,40837,1,2022-04-19,TRF/Ac/fan/FRM GIBSON PAUL U TO CHRIS-CHUKKAS ELECTRONICS LTD- 070,163053.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.transactionBetween100000And500000, transactionpattern.mostFrequentBalanceRange]",,,,,,,spendOnTransfers,,spend,
1,39005,1,2022-08-30,TRF/Friday /FRM JONAH FRIDAY TO ONYENAUCHEYA,5010.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.recurringExpense, transactionpattern.transactionLessThan10000, transactionpattern.mostFrequentTransactionRange, transactionpattern.mostFrequentBalanceRange]",,,,,,,spendOnTransfers,,spend,


In [15]:
def extract_label(tags: list[str]) -> list[str]:
    """This is used to extract the tags from the list of tags."""
    result: list[str] = [val for val in tags if val != "nan"]
    if len(result) == 0:
        result = ["other"]
    return result

In [16]:
cols: list[str] = list(df_1.columns)[-10:]
df_1["list_labels"] = df_1.apply(lambda x: list(x[cols]), axis=1)

In [17]:
df_2: pd.DataFrame = df_1.copy()

df_2.head()

Unnamed: 0,customer_id,nuban,date,description,amount,type,tags,salary_label,loan_label,gambling_label,airtime_label,ussdTransactions_label,flightRisk_label,spendOnTransfers_label,entertainment_label,spend_label,bills_label,list_labels
0,40837,1,2022-04-19,TRF/Ac/fan/FRM GIBSON PAUL U TO CHRIS-CHUKKAS ELECTRONICS LTD- 070,163053.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.transactionBetween100000And500000, transactionpattern.mostFrequentBalanceRange]",,,,,,,spendOnTransfers,,spend,,"[nan, nan, nan, nan, nan, nan, spendOnTransfers, nan, spend, nan]"
1,39005,1,2022-08-30,TRF/Friday /FRM JONAH FRIDAY TO ONYENAUCHEYA,5010.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.recurringExpense, transactionpattern.transactionLessThan10000, transactionpattern.mostFrequentTransactionRange, transactionpattern.mostFrequentBalanceRange]",,,,,,,spendOnTransfers,,spend,,"[nan, nan, nan, nan, nan, nan, spendOnTransfers, nan, spend, nan]"
2,40454,1,2022-07-05,Amt includes COMM & VAT/USSD/AISHAT ABIODUN BELLO,3021.5,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.bankCharges, spend.ussdTransactions, behavioural.topRecipient, transactionpattern.recurringExpense, transactionpattern.transactionLessThan10000, transactionpattern.mostFrequentTransactionRange, transactionpattern.mostFrequentBalanceRange]",,,,,ussd,,,,spend,,"[nan, nan, nan, nan, ussd, nan, nan, nan, spend, nan]"
3,39318,1,2022-07-08,TRF/For clothes/FRM AJAKAYE WUNMI R TO OLALEYE OPEOLUWA LYDIA,24000.0,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.transactionBetween10000And100000, transactionpattern.mostFrequentTransactionRange, transactionpattern.mostFrequentBalanceRange]",,,,,,,spendOnTransfers,,spend,,"[nan, nan, nan, nan, nan, nan, spendOnTransfers, nan, spend, nan]"
4,38851,1,2022-04-09,WT|KODIRIC GLOBAL ENTERPRAMAC NG,100.0,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.airtime, transactionpattern.transactionLessThan10000, transactionpattern.mostFrequentTransactionRange, transactionpattern.mostFrequentBalanceRange]",,,,airtime,,,,,spend,,"[nan, nan, nan, airtime, nan, nan, nan, nan, spend, nan]"


In [18]:
df_2["cleaned_labels"] = df_2["list_labels"].apply(extract_label)

df_2.head(2)

Unnamed: 0,customer_id,nuban,date,description,amount,type,tags,salary_label,loan_label,gambling_label,airtime_label,ussdTransactions_label,flightRisk_label,spendOnTransfers_label,entertainment_label,spend_label,bills_label,list_labels,cleaned_labels
0,40837,1,2022-04-19,TRF/Ac/fan/FRM GIBSON PAUL U TO CHRIS-CHUKKAS ELECTRONICS LTD- 070,163053.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.transactionBetween100000And500000, transactionpattern.mostFrequentBalanceRange]",,,,,,,spendOnTransfers,,spend,,"[nan, nan, nan, nan, nan, nan, spendOnTransfers, nan, spend, nan]","[spendOnTransfers, spend]"
1,39005,1,2022-08-30,TRF/Friday /FRM JONAH FRIDAY TO ONYENAUCHEYA,5010.75,D,"[balance, behavioural.accountSweep, behavioural.inflowOutflowRate, debit, spend.spendOnTransfers, transactionpattern.recurringExpense, transactionpattern.transactionLessThan10000, transactionpattern.mostFrequentTransactionRange, transactionpattern.mostFrequentBalanceRange]",,,,,,,spendOnTransfers,,spend,,"[nan, nan, nan, nan, nan, nan, spendOnTransfers, nan, spend, nan]","[spendOnTransfers, spend]"


In [19]:
IMP_COLS: list[str] = ["date", "description", "amount", "type", "cleaned_labels"]
df_2 = df_2[IMP_COLS]

df_2.head()

Unnamed: 0,date,description,amount,type,cleaned_labels
0,2022-04-19,TRF/Ac/fan/FRM GIBSON PAUL U TO CHRIS-CHUKKAS ELECTRONICS LTD- 070,163053.75,D,"[spendOnTransfers, spend]"
1,2022-08-30,TRF/Friday /FRM JONAH FRIDAY TO ONYENAUCHEYA,5010.75,D,"[spendOnTransfers, spend]"
2,2022-07-05,Amt includes COMM & VAT/USSD/AISHAT ABIODUN BELLO,3021.5,D,"[ussd, spend]"
3,2022-07-08,TRF/For clothes/FRM AJAKAYE WUNMI R TO OLALEYE OPEOLUWA LYDIA,24000.0,D,"[spendOnTransfers, spend]"
4,2022-04-09,WT|KODIRIC GLOBAL ENTERPRAMAC NG,100.0,D,"[airtime, spend]"


In [20]:
def clean_salary(tags: list[str], type: str) -> list[str]:
    """This is used to clean the salary tags."""
    if type == "D":
        result: list[str] = [val for val in tags if val != "salaryEarner"]
        if len(result) == 0:
            result = ["other"]
    else:
        result = tags
    return result

In [21]:
df_2["cleaned_labels"] = df_2.apply(
    lambda x: clean_salary(tags=x["cleaned_labels"], type=x["type"]), axis=1
)

In [22]:
df_2["cleaned_labels"] = df_2["cleaned_labels"].str[0]

df_2.head()

Unnamed: 0,date,description,amount,type,cleaned_labels
0,2022-04-19,TRF/Ac/fan/FRM GIBSON PAUL U TO CHRIS-CHUKKAS ELECTRONICS LTD- 070,163053.75,D,spendOnTransfers
1,2022-08-30,TRF/Friday /FRM JONAH FRIDAY TO ONYENAUCHEYA,5010.75,D,spendOnTransfers
2,2022-07-05,Amt includes COMM & VAT/USSD/AISHAT ABIODUN BELLO,3021.5,D,ussd
3,2022-07-08,TRF/For clothes/FRM AJAKAYE WUNMI R TO OLALEYE OPEOLUWA LYDIA,24000.0,D,spendOnTransfers
4,2022-04-09,WT|KODIRIC GLOBAL ENTERPRAMAC NG,100.0,D,airtime


In [23]:
df_2["cleaned_labels"] = df_2["cleaned_labels"].apply(
    lambda x: "posOrWebSpend" if "pos" in x else ("posOrWebSpend" if "web" in x else x)
)

In [24]:
df_2.sample(n=20, random_state=8)

Unnamed: 0,date,description,amount,type,cleaned_labels
609289,2022-11-06,Principal Liquidation 099ILAT222790188 2210060613129E15,98333.328125,D,other
433243,2022-10-09,TRF//FRM ADEBOLA FUMILAYO TO TEMITOPE OLUWASEUN IYEGBALE - 033,2010.75,D,spendOnTransfers
467264,2022-08-01,TRF/Sb/FRM ADEYEMI ADEFEMI TO CHECKOUT PAYSTACK - 035,10026.879883,D,spendOnTransfers
649313,2021-10-14,FGN ELECTRONIC MONEY TRANSFER LEVY,100.0,D,spendOnTransfers
533450,2021-11-24,NEXTGEN,34026.878906,D,other
527518,2022-11-09,AIRTIME/ MTN/08134099813,300.0,D,airtime
702274,2022-09-25,POS/WEB PMT OPAY DIGITAL SERVICE DL 00NG,5600.0,D,spend
695796,2022-08-12,"ATM WDL @10442404 SOMOLU BRANCH LAGOS STATE, NG REF:920083/222417920083",20000.0,D,loanRepayments
290175,2022-10-11,901 Airtime Topup/+2348066867340/USSDNWUSSD133099807188524394,200.0,D,airtime
785440,2022-11-08,TRF/Boosting acc/FRM ADELEKE GANIYU ADE TO BEEBUH ZIMMY CONCEPT,20000.0,D,spendOnTransfers


In [25]:
df_2["cleaned_labels"].value_counts(normalize=True)

other               0.397595
spendOnTransfers    0.235674
spend               0.148984
airtime             0.110242
ussd                0.063702
loanRepayments      0.015315
salaryEarner        0.012034
gambling            0.007025
loanAmount          0.006606
flightRisk          0.002693
entertainment       0.000130
Name: cleaned_labels, dtype: float64

In [26]:
df_2.to_parquet("./my_data/trans_data_2.parquet", index=False)