# Imports

In [89]:
import pandas as pd
from typing import Union, List
from transformers import BertConfig, BertModel, BertForSequenceClassification, BertTokenizerFast
import torch
from torch.utils.data import DataLoader
import numpy as np
import random
from sklearn.model_selection import train_test_split

# Init random seeds

In [74]:
def setSeeds(seed: int) -> None:
    """Seed every random number generator this notebook relies on.

    The same value is applied, in order, to Python's ``random`` module,
    NumPy's global generator, PyTorch's default generator and all CUDA
    generators.

    Args:
        seed: Integer seed shared by all generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for applySeed in seeders:
        applySeed(seed)


setSeeds(666)

# Constants

In [3]:
# Excel workbook with the ledger-mapping data. The path is relative, so it is
# resolved against the notebook's working directory.
DATASET_PATH = "ML_test_case.xlsx"

# Functions

In [4]:
def readAllSheets(filePath: str) -> pd.DataFrame:
    """Read every sheet of an Excel workbook and stack them into one frame.

    Args:
        filePath: Path of the workbook to read.

    Returns:
        The rows of all sheets concatenated in sheet order, with a fresh
        0..n-1 index.
    """
    # sheet_name=None makes read_excel return a {sheet name: DataFrame} mapping.
    sheetFrames = pd.read_excel(filePath, sheet_name=None)
    return pd.concat(list(sheetFrames.values()), ignore_index=True)

In [32]:
def mapCodeToClass(inputDf: pd.DataFrame, codeCol: str) -> pd.DataFrame:
    """Add an integer ``class`` column derived from a code column.

    Each distinct value of ``codeCol`` gets a consecutive integer id
    (0, 1, 2, ...) in order of first appearance.

    Args:
        inputDf: Frame containing ``codeCol``. It is not modified.
        codeCol: Name of the column holding the codes to encode.

    Returns:
        A copy of ``inputDf`` with an additional ``class`` column.
    """
    # unique() keeps first-appearance order, so ids are stable for a given input.
    codeToClassDict = {
        code: classId for classId, code in enumerate(inputDf[codeCol].unique())
    }

    # Work on a copy so the caller's (raw) frame is left untouched.
    outputDf = inputDf.copy()
    # map() yields an integer column directly; replace() on an object column
    # only does so through silent downcasting, which pandas has deprecated.
    outputDf["class"] = outputDf[codeCol].map(codeToClassDict)

    return outputDf

In [63]:
def cleanNames(
    inputDf: pd.DataFrame,
    nameCol: Union[str, List[str]],
    minWordLength: int = 4,
) -> pd.DataFrame:
    """Normalize free-text name columns.

    For every requested column the values are cast to ``str`` and then:
    digit runs are removed, every character that is neither a word character
    nor whitespace is replaced by a space, underscores are replaced by a
    space, and finally only whitespace-separated words with at least
    ``minWordLength`` characters are kept (joined by single spaces).

    Args:
        inputDf: Frame containing the column(s) to clean. It is not modified.
        nameCol: A single column name or a list of column names.
        minWordLength: Minimum number of characters a word needs to be kept.
            The default of 4 matches the previous hard-coded ``len(word) > 3``.

    Returns:
        A copy of ``inputDf`` with the requested columns cleaned.
    """
    cols = [nameCol] if isinstance(nameCol, str) else list(nameCol)

    # Work on a copy so the caller's frame is left untouched.
    outputDf = inputDf.copy()
    for col in cols:
        outputDf[col] = (
            outputDf[col]
            .astype("str")
            # Digits are deleted (not spaced), so "a1b" collapses to "ab".
            .str.replace(r"\d+", "", regex=True)
            .str.replace(r"[^\w\s]", " ", regex=True)
            # "_" counts as a word character, so it needs its own pass.
            .str.replace("_", " ", regex=False)
            .map(
                lambda name: " ".join(
                    word for word in name.split() if len(word) >= minWordLength
                )
            )
        )

    return outputDf

In [64]:
def unionNames(
    inputDf: pd.DataFrame,
    unionCols: List[str],
    labelCol: str,
    nameCol: str = "Ledger Name",
) -> pd.DataFrame:
    """Stack several name columns into one, keeping each row's label.

    For every column in ``unionCols`` the distinct ``(name, label)`` pairs are
    taken and the name column is renamed to ``nameCol``; the per-column
    results are then concatenated in the order of ``unionCols``. Duplicates
    are removed within each source column only, not across columns.

    Args:
        inputDf: Frame containing ``unionCols`` and ``labelCol``. Not modified.
        unionCols: Names of the columns to stack.
        labelCol: Name of the label column carried along with each name.
        nameCol: Name of the stacked output column.

    Returns:
        A two-column frame (``nameCol``, ``labelCol``) with a fresh 0..n-1 index.
    """
    # Selecting [[col, labelCol]] already yields a new frame, so no copy of the
    # full input is needed.
    stackedFrames = [
        inputDf[[col, labelCol]].drop_duplicates().rename(columns={col: nameCol})
        for col in unionCols
    ]

    return pd.concat(stackedFrames, ignore_index=True)

# Load data

In [65]:
# Load every sheet of the workbook at DATASET_PATH into a single DataFrame.
raw_dataset_df = readAllSheets(DATASET_PATH)
# Quick sanity summary: (rows, columns) and the column names.
print(f"Shape of dataset: {raw_dataset_df.shape}")
print(f"Dataset columns: {', '.join(list(raw_dataset_df.columns))}")
# Bare last expression so the notebook renders the frame as the cell output.
raw_dataset_df

Shape of dataset: (2606, 4)
Dataset columns: Source Ledger Code, Source Ledger Name, Intelas Ledger Code, Intelas Ledger Name


Unnamed: 0,Source Ledger Code,Source Ledger Name,Intelas Ledger Code,Intelas Ledger Name
0,1100-1001,Cash - Operating,11000-110,Cash - Operating
1,1100-1002,Cash - Clearing,11000-120,Cash - Depository / Clearing
2,1100-1003,Cash - Money Market / Other,11000-110,Cash - Operating
3,1100-1005,Cash - Money Market / Other,11000-110,Cash - Operating
4,1100-1006,Cash - Money Market / Other,11000-110,Cash - Operating
...,...,...,...,...
2601,2136-0000,Insurance Payable,21000-900,A/P - General
2602,5461-0010,Management Services,56050-900,G&A Other - General
2603,4407-0060,Less: Extraordinary Bad Debt,43020-110,Write Offs
2604,4412-0002,Flooring Damage / Replace,44000-440,Damage Fees


# Process data

In [83]:
# Processing pipeline, applied to the raw frame in order:
#   1. map each "Intelas Ledger Code" to an integer "class"
#   2. stack the source and Intelas names into one "Ledger Name" column
#   3. clean the stacked names
procesed_dataset_df = (
    raw_dataset_df
    .pipe(mapCodeToClass, "Intelas Ledger Code")
    .pipe(unionNames, ["Source Ledger Name", "Intelas Ledger Name"], "class")
    .pipe(cleanNames, "Ledger Name")
)
procesed_dataset_df

Unnamed: 0,Ledger Name,class
0,Cash Operating,0
1,Cash Clearing,1
2,Cash Money Market Other,0
3,Cash Borrowers Acct,0
4,Cash Payroll,0
...,...,...
1439,Convenience Credit Card,279
1440,Interest Income,280
1441,Exterior General,281
1442,Drainage,282


# Split dataset into train and eval

In [79]:
# The training frame is the complete processed dataset.
train_dataset_df = procesed_dataset_df.copy()

# Only the held-out part of the stratified split is kept (index 1 of the
# returned pair); the complementary part is discarded.
# NOTE(review): the eval rows are therefore also present in the training
# frame, so eval metrics measure fit on seen data - confirm this is intended.
eval_dataset_df = train_test_split(
    train_dataset_df,
    test_size=0.2,
    random_state=42,
    stratify=train_dataset_df["class"],
)[1]

# Init custom PyTorch dataset and dataloader

In [85]:
class LedgerDataset(torch.utils.data.Dataset):
    """Dataset pairing feature rows with their class labels.

    Args:
        data: Feature columns, one row per sample.
        labels: One label per sample, aligned positionally with ``data``.
    """

    def __init__(self, data: pd.DataFrame, labels: pd.Series):
        self.data = data.to_numpy()
        self.labels = labels.to_numpy()

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the sample at position ``idx``.

        For a single-column frame ``features`` is the bare cell value (e.g. the
        name string); for several columns it is a list of the row's values.
        """
        features = self.data[idx]
        if self.data.ndim == 2 and features.ndim == 1:
            # A DataFrame row comes back as a NumPy array (object dtype for
            # text), which DataLoader's default collate function refuses to
            # batch. Unwrap it into plain Python values instead.
            features = features.tolist()
            if len(features) == 1:
                features = features[0]
        return features, self.labels[idx]

    def __len__(self):
        """Number of samples, taken from the label array."""
        return self.labels.shape[0]

In [86]:
# Features are every column except "class"; labels are the "class" column of
# the SAME frame, so rows and labels stay aligned.
train_dataset = LedgerDataset(train_dataset_df.drop(columns=["class"]), train_dataset_df["class"])
# Fixed: the eval labels previously came from train_dataset_df, which gave the
# eval set misaligned labels and a length larger than its feature rows.
eval_dataset = LedgerDataset(eval_dataset_df.drop(columns=["class"]), eval_dataset_df["class"])

In [87]:
# Training batches are reshuffled on every pass over the data.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# Evaluation keeps a fixed order: shuffling adds nothing to the metrics and
# makes per-batch results non-reproducible between runs.
eval_dataloader = DataLoader(eval_dataset, batch_size=64, shuffle=False)

# Init tokenizer, model and trainer

In [93]:
# Pretrained checkpoint used for both the tokenizer and the classifier.
modelName = "bert-base-uncased"
maxSeqLength = 512
# One output per distinct class id in the processed dataset.
numClasses = procesed_dataset_df["class"].nunique()

# Use the first CUDA device when one is available, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [91]:
# Tokenizer loaded from the same pretrained checkpoint as the model (modelName);
# do_lower_case=True is passed explicitly for the uncased checkpoint.
tokenizer = BertTokenizerFast.from_pretrained(modelName, do_lower_case=True)

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [94]:
# Pretrained BERT with a sequence-classification head producing numClasses
# outputs, moved to the selected device. The load log below reports weights
# that were not initialized from the checkpoint (presumably the new
# classification head), so the model must be fine-tuned before use.
model = BertForSequenceClassification.from_pretrained(modelName, num_labels=numClasses).to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at