In [4]:
import pandas as pd
import re
import spacy
import json
from random import randint
from spacy.lang.en.stop_words import STOP_WORDS
from typing import Any, Optional, NamedTuple
from string import punctuation
from datetime import date
from itertools import combinations_with_replacement
from collections import Counter, defaultdict

In [5]:
nlp = spacy.load("en_core_web_lg")

In [18]:
d = pd.read_csv('data/transaction_texts.csv')

In [6]:
class Transaction:
    
    def __init__(self, account_category, account_name, description):
        
        self.account_category: Optional[str] = account_category
        self.account_name: Optional[str] = account_name
        self.description: Optional[str] = description
            
        self.docs = defaultdict()
        self.labels = defaultdict()
    
    def __str__(self):
        return f'{"acc.cat:":<10}{self.account_category}\n{"acc.name:":<10}{self.account_name}\n{"desc:":<10}{self.description}'
    
    def similarity(self, another_transaction):
        
        sim_acc_cat = self.docs['account_category'].similarity(another_transaction.docs['account_category'])
        sim_acc_name = self.docs['account_name'].similarity(another_transaction.docs['account_name'])
        sim_desc = self.docs['description'].similarity(another_transaction.docs['description'])
        
        print(f'similarity: acc.cat: {sim_acc_cat:.4f} acc.name: {sim_acc_name:.4f} description: {sim_desc:.4f}')
        
        return (sim_acc_cat, sim_acc_name, sim_desc)

In [63]:
class Parser:
    
    def __init__(self):
        
        self.abbrs = json.load(open('data/acdic.json'))
        self.syns = {'revenue': set('revenue earnings gain income incoming proceeds profit return yield'.split()),
                     'expense': set('expense charge expenditure obligation spending spend overhead surcharge'.split()),
                     'accrue': set('accrue accumulate amass collect gather aggregate hoard'.split()),
                     'consolidation': set('consolidation merger strengthening unification amalgamation'.split()),
                     'adjustment': set('adjustment adjust adj alteration modification readjustment'.split()),
                     'recurring': set('regular daily monthly weekly fortnightly yearly repeat routine recurring'.split()),
                     'defer': set('adjourn delay postpone suspend'.split()),
                     'payroll': set('payroll salary wages pay remuneration paycheck earning'.split()),
                     'liability': set('liability debt burden obligation owing uninvoiced'.split()),
                     'rent': set('rent lease rental'.split()),
                     'maintenance': set('maintenance repair'.split()),
                     'month': set('apr april aug august dec december feb february jan january jul july jun june mar march may nov november oct october sep september'.split()),
                     'food': set('food meal dinner lunch restaurant cafe brunch breakfast catering'.split()),
                     'reverse': set('reverse back return inverse converse'.split())}
        
        for k in self.syns:
            syns_upd = set()
            for w in nlp(' '.join(self.syns[k])):
                syns_upd.add(w.lemma_.lower())
            self.syns[k] = syns_upd
    
    def run(self, st):
        
        _labs = defaultdict(set) 
        _doc = None
        
        if (not isinstance(st, str)) or (not st.strip()):
            return (_docs, _labs)
        
        _labs['dates'] = self.find_dates(st)
        
        st = st.lower().translate({ord(sep): '' for sep in './'})
        
        st = st.translate(str.maketrans({_: ' ' for _ in punctuation}))
        
        # unfold abbreviations
        st = ' '.join([self.abbrs.get(w, w).lower().replace(',','') for w in st.split()])
        
        # remove numbers
        st = ' '.join([w for w in st.split() if w.isalpha()])
        
        # remove stopwords
        st = ' '.join([w for w in st.split() if w not in STOP_WORDS])
        
        # replace multiple consecutive white spaces with a single one
        st = re.sub(r'\s+', ' ', st).strip()
        
        if st:
            _doc = nlp(st)
        
            for what in self.syns:
                for w in _doc:
                    if w.lemma_ in self.syns[what]:
                        _labs['labels'].add(what)
            
        return (_doc, _labs)
    
    def find_dates(self, st):
        
        plausible = dict(day=set(range(1,32)), 
                         month=set(range(1,13)), 
                         year=set(range(19,23)) | set(range(2019, 2023)))
        
        # try to create a set of found dates using a single - as separator 
        try:
            _dates = {re.sub(r'[-\/.]+','-', d) for d in re.findall(r'\b\d{1,2}[-\/.]+\d{1,2}[-\/.]+\d{4}\b|'
                                                                           r'\b\d{4}[-\/.]+\d{1,2}[-\/.]+\d{1,2}\b|'
                                                                           r'\b\d{1,2}[-\/.]+\d{1,2}[-\/.]+\d{1,2}\b', st)}     
        except:
            return None
        
        # dates will be gathered in this set if they are valid
        dates = []
        
        for _date in _dates:
            
            _date_parts = _date.split('-')
            
            position_cands = defaultdict(set)
            
            for i, p in enumerate(_date_parts):
                for q in 'year month day'.split():
                    if int(p) in plausible[q]:
                        position_cands[i].add(q)
            
            if set(position_cands) != set(range(len(_date_parts))):
                continue
            
            for p0 in position_cands[0]:
                for p1 in position_cands[1] - {p0}:
                    for p2 in position_cands[2] - {p0} - {p1}:
                        
                        date_as_dict = {p0: int(_date_parts[0]), p1: int(_date_parts[1]), p2: int(_date_parts[2])}
                        
                        # make sure year is presented as 20YY
                        for _ in date_as_dict:
                            if (_ == 'year') and (len(str(date_as_dict[_])) == 2):
                                date_as_dict.update({_: int('20'+str(date_as_dict[_]))})
                                
                        dates.append(date(**date_as_dict))

        return dates   

In [64]:
p = Parser()

In [65]:
p.run(' ewfefe 2141x !@@@!2 2021--12-4 a.B 3/1452-1-23m  1.1.22 rev accrued maintenance&repairs -- reversed')

(ewfefe ab revenue accrued maintenance repairs reversed,
 defaultdict(set,
             {'dates': [datetime.date(2022, 1, 1),
               datetime.date(2022, 1, 1),
               datetime.date(2021, 4, 12),
               datetime.date(2021, 12, 4)],
              'labels': {'accrue', 'maintenance', 'revenue', 'reverse'}}))

In [66]:
t1 = Transaction(**d.iloc[randint(0, len(d)-1)])
print(t1)
t1.docs['account_category'], t1.labels['account_category']= p.run(t1.account_category)
t1.docs['account_name'], t1.labels['account_name']= p.run(t1.account_name)
t1.docs['description'], t1.labels['description']= p.run(t1.description)

acc.cat:  Revenue - Services
acc.name: Wine & Beer - Lunch
desc:     Period 9 daily sales


In [68]:
t1.labels

defaultdict(None,
            {'account_category': defaultdict(set,
                         {'dates': [], 'labels': {'revenue'}}),
             'account_name': defaultdict(set,
                         {'dates': [], 'labels': {'food'}}),
             'description': defaultdict(set,
                         {'dates': [], 'labels': {'recurring'}})})

In [69]:
t2 = Transaction(**d.iloc[randint(0, len(d)-1)])
t2.docs['account_category'], t2.labels['account_category']= p.run(t2.account_category)
t2.docs['account_name'], t2.labels['account_name']= p.run(t2.account_name)
t2.docs['description'], t2.labels['description']= p.run(t2.description)

In [70]:
print(t2)

acc.cat:  Benefit Expense
acc.name: FICA
desc:     UltiPro Payroll 09.13.2019


In [71]:
t1.similarity(t2)

similarity: acc.cat: 0.6901 acc.name: -0.1082 description: 0.4283


(0.6900576326013322, -0.10820845111193943, 0.4282828817361174)

In [72]:
t2.labels

defaultdict(None,
            {'account_category': defaultdict(set,
                         {'dates': [], 'labels': {'expense'}}),
             'account_name': defaultdict(set, {'dates': []}),
             'description': defaultdict(set,
                         {'dates': [datetime.date(2019, 9, 13)],
                          'labels': {'payroll'}})})