# Imports

In [1]:
import pandas as pd
from typing import Union, List
from transformers import (
    BertConfig,
    BertModel,
    BertForSequenceClassification,
    BertTokenizer,
    BertTokenizerFast,
    Trainer,
    TrainingArguments
)
import torch
from torch.utils.data import DataLoader
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Init random seeds

In [2]:
def setSeeds(seed: int) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    
    return None

setSeeds(666)

# Constants

In [3]:
DATASET_PATH = "ML_test_case.xlsx"

# Functions

In [4]:
def readAllSheets(filePath: str) -> pd.DataFrame:
    dataframesDict = pd.read_excel(filePath, sheet_name=None)
    dataframesList = []
    for k in dataframesDict.keys():
        dataframesList.append(dataframesDict[k])
    return pd.concat(dataframesList).reset_index(drop=True)

In [5]:
def mapCodeToClass(inputDf: pd.DataFrame, codeCol: str) -> pd.DataFrame:
    codeToClassDict = {k: v for v, k in enumerate(inputDf[codeCol].unique())}
    inputDf["class"] = inputDf[codeCol].replace(codeToClassDict)

    return inputDf

In [6]:
def cleanNames(inputDf: pd.DataFrame, nameCol: Union[str, List[str]]) -> pd.DataFrame:
    if isinstance(nameCol, str):
        cols = [nameCol]
    else:
        cols = nameCol
    
    for col in cols:
        inputDf[col] = inputDf[col].astype('str')
        inputDf[col] = inputDf[col].str.replace('/', ' ', regex=False)
        inputDf[col] = inputDf[col].str.replace(',', ' ', regex=False)
        inputDf[col] = inputDf[col].str.replace('-', ' ', regex=False)
        inputDf[col] = inputDf[col].str.replace('–', ' ', regex=False)
        inputDf[col] = inputDf[col].str.replace('&', ' ', regex=False)
        inputDf[col] = inputDf[col].str.replace('(', ' ', regex=False)
        inputDf[col] = inputDf[col].str.replace(')', ' ', regex=False)
        inputDf[col] = inputDf[col].str.replace(':', ' ', regex=False)
        inputDf[col] = inputDf[col].str.replace("'s'", ' ', regex=False)
#         inputDf[col] = inputDf[col].str.replace(r'\d+', '', regex=True)
#         inputDf[col] = inputDf[col].str.replace(r'[^\w\s]', ' ', regex=True)
#         inputDf[col] = inputDf[col].str.replace(r'_', '', regex=True)
        inputDf[col] = inputDf[col].str.lower()
        inputDf[col] = inputDf[col].str.replace(r'(?:^|\s)p r(?:^|\s|$)', ' p/r ', regex=True)
        inputDf[col] = inputDf[col].str.replace(r'(?:^|\s)r m(?:^|\s|$)', ' r&m ', regex=True)
        inputDf[col] = inputDf[col].str.replace(r'(?:^|\s)g a(?:^|\s|$)', ' g&a ', regex=True)
        inputDf[col] = inputDf[col].str.replace(r'(?:^|\s)a r(?:^|\s|$)', ' a/r ', regex=True)
        inputDf[col] = inputDf[col].str.replace(r'(?:^|\s)a p(?:^|\s|$)', ' a/p ', regex=True)
        inputDf[col] = inputDf[col].str.replace('fas 141', 'fas-141', regex=False)
        inputDf = inputDf[inputDf[col] != ' ']
#         inputDf[col] = inputDf[col].apply(lambda x: ' '.join(word for word in x.split() if len(word)>3))
    
    return inputDf

In [7]:
def unionNames(inputDf: pd.DataFrame, unionCols: List[str], labelCol: str) -> pd.DataFrame:
    dataframesList = []
    for col in unionCols:
        tmpDf = inputDf.copy()
#         tmpDf = tmpDf[[col, labelCol]].drop_duplicates().rename(columns={col: "Ledger Name"})
        tmpDf = tmpDf[[col, labelCol]].rename(columns={col: "Ledger Name"})
        dataframesList.append(tmpDf)

    return pd.concat(dataframesList).reset_index(drop=True)

In [8]:
def computeMetrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    
    return {'accuracy': acc,}

# Load data

In [9]:
raw_dataset_df = readAllSheets(DATASET_PATH)
print(f"Shape of dataset: {raw_dataset_df.shape}")
print(f"Dataset columns: {', '.join(list(raw_dataset_df.columns))}")
raw_dataset_df

Shape of dataset: (2606, 4)
Dataset columns: Source Ledger Code, Source Ledger Name, Intelas Ledger Code, Intelas Ledger Name


Unnamed: 0,Source Ledger Code,Source Ledger Name,Intelas Ledger Code,Intelas Ledger Name
0,1100-1001,Cash - Operating,11000-110,Cash - Operating
1,1100-1002,Cash - Clearing,11000-120,Cash - Depository / Clearing
2,1100-1003,Cash - Money Market / Other,11000-110,Cash - Operating
3,1100-1005,Cash - Money Market / Other,11000-110,Cash - Operating
4,1100-1006,Cash - Money Market / Other,11000-110,Cash - Operating
...,...,...,...,...
2601,2136-0000,Insurance Payable,21000-900,A/P - General
2602,5461-0010,Management Services,56050-900,G&A Other - General
2603,4407-0060,Less: Extraordinary Bad Debt,43020-110,Write Offs
2604,4412-0002,Flooring Damage / Replace,44000-440,Damage Fees


# Process data

In [10]:
# map code to class
procesed_dataset_df = mapCodeToClass(raw_dataset_df, "Intelas Ledger Code")

# union names
procesed_dataset_df = unionNames(procesed_dataset_df, ["Source Ledger Name", "Intelas Ledger Name"], "class")

# clean names
procesed_dataset_df = cleanNames(procesed_dataset_df, "Ledger Name")
procesed_dataset_df["Ledger Name"] = procesed_dataset_df["Ledger Name"].str.split()
procesed_dataset_df = procesed_dataset_df.dropna()
# procesed_dataset_df = procesed_dataset_df[procesed_dataset_df["class"] != 256]
# procesed_dataset_df.style

vocabList = ((procesed_dataset_df[["Ledger Name"]].explode("Ledger Name").groupby(["Ledger Name"]).size())
             .reset_index().sort_values([0]).reset_index(drop=True))["Ledger Name"].to_list()
vocabList = [v+"\n" for v in vocabList] + [
    "[PAD]\n", "[EOS]\n","[UNK]\n","[CLS]\n","[SEP]\n","[MASK]\n"
]

procesed_dataset_df["Ledger Name"] = procesed_dataset_df["Ledger Name"].apply(' '.join)

# len(vocabList)

procesed_dataset_df.style

# (procesed_dataset_df[["Ledger Name"]].explode("Ledger Name").groupby(["Ledger Name"]).size())
#  .reset_index().sort_values([0]).reset_index(drop=True)

Unnamed: 0,Ledger Name,class
0,cash operating,0
1,cash clearing,1
2,cash money market other,0
3,cash money market other,0
4,cash money market other,0
5,cash borrowers acct,0
6,cash money market other,0
7,cash payroll,0
8,cash petty,0
9,tenant's security deposits,2


In [11]:
# save vocab to file

with open('vocab.txt', 'w') as fp:
    fp.writelines(vocabList)

# Try w2v model

In [214]:
from gensim.models import Word2Vec

# w2vModel = Word2Vec(sentences=procesed_dataset_df["Ledger Name"].to_list(), vector_size=100, window=3,
#                     min_count=1, workers=16)

w2vModel = Word2Vec(vector_size=256, window=3, min_count=1, workers=16, epochs=1000)
w2vModel.build_vocab(procesed_dataset_df["Ledger Name"].to_list())
w2vModel.train(procesed_dataset_df["Ledger Name"].to_list(), total_examples=w2vModel.corpus_count * 10,
               epochs=w2vModel.epochs)

(3171155, 4097000)

In [215]:
# w2vModel.epochs

In [216]:
# w2vModel.wv["tax"]

In [217]:
# w2vModel.wv["cash"]

In [229]:
train_ex, test_ex = train_test_split(procesed_dataset_df, test_size=0.2, random_state=42,
                                     stratify=procesed_dataset_df["class"])

In [230]:
namesEmbeddingsTrain = []
classTrain = []

namesEmbeddingsTest = []
classTest = []

for example, cls in zip(train_ex["Ledger Name"].to_list(), train_ex["class"].to_list()):
#     print(example)
    tmpEmb = w2vModel.wv[example[0]].copy()
    countEmb = 1
    for tkn in example[1:]:
        tmpEmb += w2vModel.wv[tkn]
        countEmb += 1
    namesEmbeddingsTrain.append(tmpEmb / countEmb)
    classTrain.append(cls)

for example, cls in zip(test_ex["Ledger Name"].to_list(), test_ex["class"].to_list()):
#     print(example)
    tmpEmb = w2vModel.wv[example[0]].copy()
    countEmb = 1
    for tkn in example[1:]:
        tmpEmb += w2vModel.wv[tkn]
        countEmb += 1
    namesEmbeddingsTest.append(tmpEmb / countEmb)
    classTest.append(cls)

namesEmbeddings

[array([ 0.71240556,  0.37256962,  0.14753371, -0.01765937,  0.47132885,
         0.48296577,  0.26379532, -0.58417046,  0.13050896,  0.50053895,
        -0.47135973, -0.08572936, -0.15763213, -0.838324  ,  0.21275447,
         0.04423618,  0.67260504, -0.17510247,  0.49461004,  0.29851362,
        -0.63838625, -0.3202011 , -0.38832974, -0.53037566, -0.590878  ,
         0.23586974, -0.3788829 ,  0.3422413 ,  0.3608721 , -0.28941444,
        -0.16399448, -0.10122569, -0.34352252, -0.0891241 , -0.04571253,
         0.08109123,  0.30427828, -0.13310292,  0.3234996 ,  0.08613831,
        -0.5987171 , -0.30584824, -0.2057757 , -0.23641047,  0.06963667,
        -0.40436614, -0.2620783 ,  0.44587475,  0.18358038,  0.6302149 ,
         0.34240866,  0.43575096,  0.61712116,  0.13312106, -0.15480107,
         0.16768402,  0.11397605, -0.01518968, -0.15580377, -0.17554861,
        -0.23948762,  0.06089848, -0.17589661, -0.27766135, -0.10056146,
        -0.29488844, -0.1164066 ,  0.63589764,  0.0

# K-neighboors

In [231]:
knnModel = KNeighborsClassifier(n_neighbors=1, algorithm='brute', n_jobs=-1)

In [232]:
knnModel.fit(namesEmbeddingsTrain, classTrain)

In [233]:
ex_id = 29
print(procesed_dataset_df["class"].to_list()[ex_id])
knnModel.predict([namesEmbeddings[ex_id]])

17


array([17])

In [234]:
accuracy_score(
    classTest,
    knnModel.predict(namesEmbeddingsTest)
)

0.3737024221453287

# Init tokenizer

In [12]:
modelName = "bert-base-uncased"
maxSeqLength = 512

In [13]:
# tokenizer = BertTokenizer.from_pretrained(modelName, do_lower_case=True)
tokenizer = BertTokenizer.from_pretrained("./")
tokenizer

PreTrainedTokenizer(name_or_path='./', vocab_size=841, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [14]:
# tokenizer(procesed_dataset_df["Ledger Name"].to_list())

# Split dataset into train and eval

In [15]:
train_dataset_df, eval_dataset_df = train_test_split(procesed_dataset_df, test_size=0.2, random_state=42,
                                                     stratify=procesed_dataset_df["class"])
train_dataset_df = train_dataset_df.reset_index(drop=True)
eval_dataset_df = eval_dataset_df.reset_index(drop=True)

tokenizedTrain = tokenizer(train_dataset_df["Ledger Name"].to_list(), truncation=True,
                           padding=True, max_length=maxSeqLength)
tokenizedEval = tokenizer(eval_dataset_df["Ledger Name"].to_list(), truncation=True,
                          padding=True, max_length=maxSeqLength)

tokenizedEval

{'input_ids': [[838, 687, 839, 835, 835, 835, 835, 835, 835, 835], [838, 720, 698, 833, 839, 835, 835, 835, 835, 835], [838, 425, 504, 839, 835, 835, 835, 835, 835, 835], [838, 554, 837, 837, 832, 823, 839, 835, 835, 835], [838, 500, 687, 691, 839, 835, 835, 835, 835, 835], [838, 837, 837, 837, 834, 839, 835, 835, 835, 835], [838, 832, 831, 834, 839, 835, 835, 835, 835, 835], [838, 837, 837, 837, 824, 833, 839, 835, 835, 835], [838, 288, 830, 839, 835, 835, 835, 835, 835, 835], [838, 805, 735, 420, 660, 839, 835, 835, 835, 835], [838, 694, 832, 767, 831, 834, 839, 835, 835, 835], [838, 834, 783, 405, 839, 835, 835, 835, 835, 835], [838, 412, 839, 835, 835, 835, 835, 835, 835, 835], [838, 693, 839, 835, 835, 835, 835, 835, 835, 835], [838, 768, 822, 839, 835, 835, 835, 835, 835, 835], [838, 731, 837, 837, 837, 779, 828, 839, 835, 835], [838, 604, 822, 839, 835, 835, 835, 835, 835, 835], [838, 785, 657, 839, 835, 835, 835, 835, 835, 835], [838, 397, 719, 728, 839, 835, 835, 835, 835, 835

# Init custom PyTorch dataset and dataloader

In [16]:
class LedgerDataset(torch.utils.data.Dataset):
    def __init__(self, encodings: dict, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return self.labels.shape[0]

In [17]:
trainDataset = LedgerDataset(tokenizedTrain, train_dataset_df["class"])
evalDataset = LedgerDataset(tokenizedEval, eval_dataset_df["class"])
eval_dataset_df["class"].max()

276

In [18]:
# train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# eval_dataloader = DataLoader(eval_dataset, batch_size=64, shuffle=True)

# Init train config, model and trainer

In [19]:
numClasses = procesed_dataset_df["class"].nunique()

if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
    
numClasses

284

In [20]:
# model = BertForSequenceClassification.from_pretrained(modelName, num_labels=numClasses).to(device)

modelConfig = BertConfig(vocab_size=841, max_position_embeddings=maxSeqLength, num_labels=numClasses)
model = BertForSequenceClassification(modelConfig)

In [21]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=50,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=10,               # log & save weights each logging_steps
    save_steps=2500,
    evaluation_strategy="steps",     # evaluate each `logging_steps`
)

In [22]:
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=trainDataset,         # training dataset
    eval_dataset=evalDataset,          # evaluation dataset
    compute_metrics=computeMetrics,     # the callback that computes metrics of interest
)

# Train BERT

In [23]:
trainer.train()

***** Running training *****
  Num examples = 4169
  Num Epochs = 50
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 6550


Step,Training Loss,Validation Loss,Accuracy
10,5.6892,5.671454,0.004794
20,5.6762,5.627806,0.004794
30,5.6129,5.561757,0.063279
40,5.5084,5.482131,0.065197
50,5.4973,5.4356,0.065197
60,5.4427,5.405648,0.065197
70,5.476,5.377679,0.065197
80,5.4208,5.344314,0.065197
90,5.3917,5.315601,0.065197
100,5.3362,5.291916,0.065197


***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** 

  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num 

***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** 

***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** 

  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num 

  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num examples = 1043
  Batch size = 20
***** Running Evaluation *****
  Num 

TrainOutput(global_step=6550, training_loss=0.4286290767948136, metrics={'train_runtime': 805.221, 'train_samples_per_second': 258.873, 'train_steps_per_second': 8.134, 'total_flos': 1181304745407600.0, 'train_loss': 0.4286290767948136, 'epoch': 50.0})

In [27]:
trainer.predict(evalDataset).label_ids

***** Running Prediction *****
  Num examples = 1043
  Batch size = 20


array([[ 36],
       [143],
       [141],
       ...,
       [155],
       [  9],
       [ 20]])

In [28]:
eval_dataset_df["class"]

0        36
1       143
2       141
3         7
4         6
       ... 
1038     38
1039     28
1040    155
1041      9
1042     20
Name: class, Length: 1043, dtype: int64