# Imports

In [30]:
import pandas as pd
from typing import Union, List

# Constants

In [2]:
# Excel workbook passed to readAllSheets in the "Load data" section.
# It is a relative path, so it is resolved against the current working directory.
DATASET_PATH = "ML_test_case.xlsx"

# Functions

In [15]:
def readAllSheets(filePath: str) -> pd.DataFrame:
    """Load every sheet of an Excel workbook and stack them into one frame.

    Args:
        filePath: Path to the Excel file, handed to ``pd.read_excel``.

    Returns:
        The rows of all sheets concatenated in workbook order, with a fresh
        0..n-1 index replacing the per-sheet indices.
    """
    # sheet_name=None makes pandas return all sheets as {sheet name: DataFrame}.
    sheetFrames = pd.read_excel(filePath, sheet_name=None)
    # ignore_index=True discards the per-sheet indices and renumbers the rows.
    return pd.concat(list(sheetFrames.values()), ignore_index=True)

In [26]:
def mapCodeToClass(inputDf: pd.DataFrame, codeCol: str) -> pd.DataFrame:
    """Return a copy of ``inputDf`` with an integer ``"class"`` column added.

    Each distinct value of ``codeCol`` receives an integer label, numbered
    from 0 in order of first appearance in the column.

    Args:
        inputDf: Frame containing the code column. It is left unmodified.
        codeCol: Name of the column whose values are turned into labels.

    Returns:
        A new DataFrame holding every original column plus ``"class"``.

    Raises:
        KeyError: If ``codeCol`` is not a column of ``inputDf``.
    """
    if codeCol not in inputDf.columns:
        raise KeyError(f"Column {codeCol!r} not found in input DataFrame")

    # unique() keeps first-appearance order, so labels are stable for a given input.
    codeToClassDict = {
        code: label for label, code in enumerate(inputDf[codeCol].unique())
    }

    # Work on a copy so the caller's frame (e.g. the raw dataset) is not
    # silently given a new column; this keeps the calling cell idempotent.
    outputDf = inputDf.copy()
    # map() is a direct per-value lookup; every value is a key of the dict
    # because the dict was built from this same column.
    outputDf["class"] = outputDf[codeCol].map(codeToClassDict)
    return outputDf

In [32]:
def cleanNames(inputDf: pd.DataFrame, nameCol: Union[str, List[str]]) -> pd.DataFrame:
    """Normalise ``nameCol`` to a list of column names; cleaning is not implemented.

    Args:
        inputDf: Frame holding the name column(s). Currently unused.
        nameCol: A single column name or a list of column names.

    NOTE(review): this function is unfinished. It only builds the list of
    columns and then falls off the end, so it returns ``None`` despite the
    ``pd.DataFrame`` annotation. The intended cleaning rules are not shown
    anywhere in this notebook; confirm them before implementing.
    """
    # Accept either one column name or several; always end up with a list.
    cols = [nameCol] if isinstance(nameCol, str) else nameCol

# Load data

In [23]:
# Read the workbook at DATASET_PATH into a single combined frame.
raw_dataset_df = readAllSheets(DATASET_PATH)
# Quick sanity summary: the frame's shape and its column names.
print(f"Shape of dataset: {raw_dataset_df.shape}")
print(f"Dataset columns: {', '.join(list(raw_dataset_df.columns))}")
# Bare last expression so the notebook renders the frame as a table.
raw_dataset_df

Shape of dataset: (2606, 4)
Dataset columns: Source Ledger Code, Source Ledger Name, Intelas Ledger Code, Intelas Ledger Name


Unnamed: 0,Source Ledger Code,Source Ledger Name,Intelas Ledger Code,Intelas Ledger Name
0,1100-1001,Cash - Operating,11000-110,Cash - Operating
1,1100-1002,Cash - Clearing,11000-120,Cash - Depository / Clearing
2,1100-1003,Cash - Money Market / Other,11000-110,Cash - Operating
3,1100-1005,Cash - Money Market / Other,11000-110,Cash - Operating
4,1100-1006,Cash - Money Market / Other,11000-110,Cash - Operating
...,...,...,...,...
2601,2136-0000,Insurance Payable,21000-900,A/P - General
2602,5461-0010,Management Services,56050-900,G&A Other - General
2603,4407-0060,Less: Extraordinary Bad Debt,43020-110,Write Offs
2604,4412-0002,Flooring Damage / Replace,44000-440,Damage Fees


# Process data

In [29]:
# Add an integer "class" label column derived from the "Intelas Ledger Code" values.
# NOTE(review): "procesed" is a typo for "processed"; the name is left as-is
# because later cells may reference it.
procesed_dataset_df = mapCodeToClass(raw_dataset_df, "Intelas Ledger Code")
# Bare expression so the notebook renders the labelled frame.
procesed_dataset_df

# TODO: clean names - no name-cleaning step is applied in this cell yet.

Unnamed: 0,Source Ledger Code,Source Ledger Name,Intelas Ledger Code,Intelas Ledger Name,class
0,1100-1001,Cash - Operating,11000-110,Cash - Operating,0
1,1100-1002,Cash - Clearing,11000-120,Cash - Depository / Clearing,1
2,1100-1003,Cash - Money Market / Other,11000-110,Cash - Operating,0
3,1100-1005,Cash - Money Market / Other,11000-110,Cash - Operating,0
4,1100-1006,Cash - Money Market / Other,11000-110,Cash - Operating,0
...,...,...,...,...,...
2601,2136-0000,Insurance Payable,21000-900,A/P - General,247
2602,5461-0010,Management Services,56050-900,G&A Other - General,142
2603,4407-0060,Less: Extraordinary Bad Debt,43020-110,Write Offs,187
2604,4412-0002,Flooring Damage / Replace,44000-440,Damage Fees,52
