# Imports

In [1]:
import pandas as pd
from typing import Union, List
from transformers import (
    BertConfig,
    BertModel,
    BertForSequenceClassification,
    BertTokenizer,
    BertTokenizerFast,
    Trainer,
    TrainingArguments
)
import torch
from torch.utils.data import DataLoader
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Init random seeds

In [2]:
def setSeeds(seed: int) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    
    return None

setSeeds(666)

# Constants

In [3]:
DATASET_PATH = "ML_test_case.xlsx"

# Functions

In [4]:
def readAllSheets(filePath: str) -> pd.DataFrame:
    dataframesDict = pd.read_excel(filePath, sheet_name=None)
    dataframesList = []
    for k in dataframesDict.keys():
        dataframesList.append(dataframesDict[k])
    return pd.concat(dataframesList).reset_index(drop=True)

In [5]:
def mapCodeToClass(inputDf: pd.DataFrame, codeCol: str) -> pd.DataFrame:
    codeToClassDict = {k: v for v, k in enumerate(inputDf[codeCol].unique())}
    inputDf["class"] = inputDf[codeCol].replace(codeToClassDict)

    return inputDf

In [6]:
def cleanNames(inputDf: pd.DataFrame, nameCol: Union[str, List[str]]) -> pd.DataFrame:
    if isinstance(nameCol, str):
        cols = [nameCol]
    else:
        cols = nameCol
    
    for col in cols:
        inputDf[col] = inputDf[col].astype('str')
        inputDf[col] = inputDf[col].str.replace(r'\d+', '', regex=True)
        inputDf[col] = inputDf[col].str.replace(r'[^\w\s]', ' ', regex=True)
        inputDf[col] = inputDf[col].str.replace(r'_', ' ', regex=True)
        inputDf[col] = inputDf[col].apply(lambda x: ' '.join(word for word in x.split() if len(word)>3))
    
    return inputDf

In [7]:
def unionNames(inputDf: pd.DataFrame, unionCols: List[str], labelCol: str) -> pd.DataFrame:
    dataframesList = []
    for col in unionCols:
        tmpDf = inputDf.copy()
        tmpDf = tmpDf[[col, labelCol]].drop_duplicates().rename(columns={col: "Ledger Name"})
        dataframesList.append(tmpDf)

    return pd.concat(dataframesList).reset_index(drop=True)

In [8]:
def computeMetrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    
    return {'accuracy': acc,}

# Load data

In [9]:
raw_dataset_df = readAllSheets(DATASET_PATH)
print(f"Shape of dataset: {raw_dataset_df.shape}")
print(f"Dataset columns: {', '.join(list(raw_dataset_df.columns))}")
raw_dataset_df

Shape of dataset: (2606, 4)
Dataset columns: Source Ledger Code, Source Ledger Name, Intelas Ledger Code, Intelas Ledger Name


Unnamed: 0,Source Ledger Code,Source Ledger Name,Intelas Ledger Code,Intelas Ledger Name
0,1100-1001,Cash - Operating,11000-110,Cash - Operating
1,1100-1002,Cash - Clearing,11000-120,Cash - Depository / Clearing
2,1100-1003,Cash - Money Market / Other,11000-110,Cash - Operating
3,1100-1005,Cash - Money Market / Other,11000-110,Cash - Operating
4,1100-1006,Cash - Money Market / Other,11000-110,Cash - Operating
...,...,...,...,...
2601,2136-0000,Insurance Payable,21000-900,A/P - General
2602,5461-0010,Management Services,56050-900,G&A Other - General
2603,4407-0060,Less: Extraordinary Bad Debt,43020-110,Write Offs
2604,4412-0002,Flooring Damage / Replace,44000-440,Damage Fees


# Process data

In [10]:
# map code to class
procesed_dataset_df = mapCodeToClass(raw_dataset_df, "Intelas Ledger Code")

# union names
procesed_dataset_df = unionNames(procesed_dataset_df, ["Source Ledger Name", "Intelas Ledger Name"], "class")

# clean names
procesed_dataset_df = cleanNames(procesed_dataset_df, "Ledger Name")
procesed_dataset_df

Unnamed: 0,Ledger Name,class
0,Cash Operating,0
1,Cash Clearing,1
2,Cash Money Market Other,0
3,Cash Borrowers Acct,0
4,Cash Payroll,0
...,...,...
1439,Convenience Credit Card,279
1440,Interest Income,280
1441,Exterior General,281
1442,Drainage,282


# Init tokenizer

In [11]:
modelName = "bert-base-uncased"
maxSeqLength = 512

In [12]:
tokenizer = BertTokenizer.from_pretrained(modelName, do_lower_case=True)

In [13]:
# tokenizer(procesed_dataset_df["Ledger Name"].to_list())

# Split dataset into train and eval

In [14]:
train_dataset_df, eval_dataset_df = train_test_split(procesed_dataset_df, test_size=0.2, random_state=42,
                                                     stratify=procesed_dataset_df["class"])
train_dataset_df = train_dataset_df.reset_index(drop=True)
eval_dataset_df = eval_dataset_df.reset_index(drop=True)

tokenizedTrain = tokenizer(train_dataset_df["Ledger Name"].to_list(), truncation=True,
                           padding=True, max_length=maxSeqLength)
tokenizedEval = tokenizer(eval_dataset_df["Ledger Name"].to_list(), truncation=True,
                          padding=True, max_length=maxSeqLength)

# Init custom PyTorch dataset and dataloader

In [15]:
class LedgerDataset(torch.utils.data.Dataset):
    def __init__(self, encodings: dict, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return self.labels.shape[0]

In [16]:
trainDataset = LedgerDataset(tokenizedTrain, train_dataset_df["class"])
evalDataset = LedgerDataset(tokenizedEval, eval_dataset_df["class"])

In [17]:
# train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# eval_dataloader = DataLoader(eval_dataset, batch_size=64, shuffle=True)

# Init train config, model and trainer

In [18]:
numClasses = procesed_dataset_df["class"].nunique()

if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [19]:
model = BertForSequenceClassification.from_pretrained(modelName, num_labels=numClasses).to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [20]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=50,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=10,               # log & save weights each logging_steps
    save_steps=2500,
    evaluation_strategy="steps",     # evaluate each `logging_steps`
)

In [21]:
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=trainDataset,         # training dataset
    eval_dataset=evalDataset,          # evaluation dataset
    compute_metrics=computeMetrics,     # the callback that computes metrics of interest
)

# Train BERT

In [22]:
trainer.train()

***** Running training *****
  Num examples = 1010
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3200


Step,Training Loss,Validation Loss,Accuracy
10,5.6663,5.721353,0.002304
20,5.7365,5.707302,0.004608
30,5.6926,5.686709,0.0
40,5.7095,5.664572,0.002304
50,5.6793,5.651411,0.002304
60,5.6447,5.64274,0.002304
70,5.6348,5.634471,0.002304
80,5.6054,5.626318,0.002304
90,5.638,5.612094,0.006912
100,5.5842,5.598743,0.023041


***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evalua

  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434

  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
***** Running Evaluation *****
  Num examples = 434
  Batch size = 20
**

TrainOutput(global_step=3200, training_loss=1.6730272710323333, metrics={'train_runtime': 277.9975, 'train_samples_per_second': 181.656, 'train_steps_per_second': 11.511, 'total_flos': 312205096368000.0, 'train_loss': 1.6730272710323333, 'epoch': 50.0})