In [1]:
import pandas as pd
import re
import spacy
import json
from random import randint
from spacy.lang.en.stop_words import STOP_WORDS
from typing import Any, Optional, NamedTuple
from string import punctuation
from datetime import date
from itertools import combinations_with_replacement
from collections import Counter, defaultdict
from calendar import month_name, month_abbr

In [2]:
from spacy.language import Language
from spacy.util import compile_infix_regex

In [3]:
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS

In [5]:
nlp = spacy.load('en_core_web_lg' 
#                  exclude=['parser', 'ner']  # no need for dependency labels or POS
                )

In [5]:
d = pd.read_csv('data/transaction_texts.csv')

In [6]:
from spacy import displacy

In [12]:
doc=nlp('liability accrued CBA loans aug, 2020')
displacy.render(doc, style='ent', jupyter=True)

In [6]:
d.head()

Unnamed: 0,account_category,account_name,description
0,Revenue - Subscriptions,Revenue-Software Subscription (On Premise)-Unb...,ACP ASC 605 Reversals - 05/31/2018
1,Deferred Revenue - Current,AR - Unbilled - Subscription,Consolidation
2,Accrued Taxes,Payroll-Worker's Comp Payable,1197
3,Fringe,Employer SSec,"Reversed -- JOHNSON, CHERYL LYNN"
4,Cash and Cash Equivalents,Chase Operating,1.00E+11


In [42]:
@Language.component('date_finder')
def find_dates(doc):
        
    plausible = dict(day=set(range(1,32)), 
                     month=set(range(1,13)), 
                     year=set(range(19,23)) | set(range(2019, 2023)))
    
    _dates = set()
    
    st = ' '.join([t.text for t in doc])
    
    try:
        _dates |= {re.sub(r'[-\/.]+','-', d) for d in re.findall(r'\b\d{1,2}[-\/.]+\d{1,2}[-\/.]+\d{4}\b|'
                                                                 r'\b\d{4}[-\/.]+\d{1,2}[-\/.]+\d{1,2}\b|'
                                                                 r'\b\d{1,2}[-\/.]+\d{1,2}[-\/.]+\d{1,2}\b', st)}
        _dates |= {d + '-' + str(list(month_abbr).index(m.title())) + '-' + '20' + y 
                              for d, m, y in re.findall(r'\b(\d{1,2})(' + '|'.join([m for m in month_abbr[1:]]) + r')(\d{2}\b)', st, flags=re.IGNORECASE)}
    except:
        pass
    
    if not _dates:
        return None
    
    # dates will be gathered in this set if they are valid
    dates = []
    
    for _date in _dates:
        
        _date_parts = _date.split('-')
        
        position_cands = defaultdict(set)
        
        for i, p in enumerate(_date_parts):
            for q in 'year month day'.split():
                if int(p) in plausible[q]:
                    position_cands[i].add(q)
        
        if set(position_cands) != set(range(len(_date_parts))):
            continue
        
        for p0 in position_cands[0]:
            for p1 in position_cands[1] - {p0}:
                for p2 in position_cands[2] - {p0} - {p1}:
                    
                    date_as_dict = {p0: int(_date_parts[0]), p1: int(_date_parts[1]), p2: int(_date_parts[2])}
                    
                    # make sure year is presented as 20YY
                    for _ in date_as_dict:
                        if (_ == 'year') and (len(str(date_as_dict[_])) == 2):
                            date_as_dict.update({_: int('20'+str(date_as_dict[_]))})
                            
                    dates.append(date(**date_as_dict))
    doc.user_data |= {'dates': dates}
    
    return doc

In [46]:
nlp.add_pipe('date_finder', name='find_dates', first=True)

<function __main__.find_dates(doc)>

In [None]:
class Transaction:
    
    def __init__(self, account_category, account_name, description):
        
        self.account_category: Optional[str] = account_category
        self.account_name: Optional[str] = account_name
        self.description: Optional[str] = description
            
        self.docs = defaultdict()
        self.labels = defaultdict()
    
    def __str__(self):
        return f'{"acc.cat:":<10}{self.account_category}\n{"acc.name:":<10}{self.account_name}\n{"desc:":<10}{self.description}'
    
    def similarity(self, another_transaction):
        
        sim_acc_cat = self.docs['account_category'].similarity(another_transaction.docs['account_category'])
        sim_acc_name = self.docs['account_name'].similarity(another_transaction.docs['account_name'])
        sim_desc = self.docs['description'].similarity(another_transaction.docs['description'])
        
        print(f'similarity: acc.cat: {sim_acc_cat:.4f} acc.name: {sim_acc_name:.4f} description: {sim_desc:.4f}')
        
        return (sim_acc_cat, sim_acc_name, sim_desc)

In [None]:
class Parser:
    
    def __init__(self):
        
        self.abbrs = json.load(open('data/acdic.json'))
        self.syns = {'revenue': set('revenue unbilled earnings gain income incoming proceeds profit return yield unrealized'.split()),
                     'expense': set('expense charge expenditure obligation spending spend overhead surcharge cost'.split()),
                     'accrue': set('accrue accumulate amass collect gather aggregate hoard'.split()),
                     'consolidation': set('consolidation merger strengthening unification amalgamation'.split()),
                     'adjustment': set('adjustment adjust adj alteration correction modification readjustment'.split()),
                     'recurring': set('regular periodic monthly weekly fortnightly yearly repeat routine recurring'.split()),
                     'defer': set('adjourn delay postpone suspend'.split()),
                     'subscription': set('subscription'.split()),
                     'payroll': set('payroll salary wages pay remuneration paycheck earning'.split()),
                     'liability': set('liability debt burden obligation owing uninvoiced'.split()),
                     'rent': set('rent lease rental'.split()),
                     'depreciation': set('depreciation devaluation markdown deflation'.split()),
                     'tax': set('tax duty levy toll tariff excise'.split()),
                     'maintenance': set('maintenance repair'.split()),
                     'entertainment': set('entertainment recreation party'.split()),
                     'prepaid': set('prepaid'.split()),
                     'month': set(map(lambda x: x.lower(), set(month_name[1:]) | set(month_abbr[1:]))),
                     'food': set('food meal dinner lunch restaurant cafe brunch breakfast catering wine beer drinks'.split()),
                     'reverse': set('reverse back return inverse converse'.split())}
        
        for k in self.syns:
            syns_upd = set()
            for w in nlp(' '.join(self.syns[k])):
                syns_upd.add(w.lemma_.lower())
            self.syns[k] = syns_upd
    
    def run(self, st):
        
        _labs = defaultdict(set) 
        _doc = None
        
        if (not isinstance(st, str)) or (not st.strip()):
            return (_doc, _labs)
        
        _labs['dates'] = self.find_dates(st)
        
        st = st.lower().translate({ord(sep): '' for sep in './'})
        
        st = st.translate(str.maketrans({_: ' ' for _ in punctuation}))
        
        # unfold abbreviations
        st = ' '.join([self.abbrs.get(w, w).lower().replace(',','') for w in st.split()])
        
        # remove numbers
        st = ' '.join([w for w in st.split() if w.isalpha()])
        
        # remove stopwords
        st = ' '.join([w for w in st.split() if w not in STOP_WORDS])
        
        # replace multiple consecutive white spaces with a single one
        st = re.sub(r'\s+', ' ', st).strip()
        
        if st:
            _doc = nlp(st)
        
            for what in self.syns:
                for w in _doc:
                    if w.lemma_ in self.syns[what]:
                        _labs['labels'].add(what)
            
        return (_doc, _labs)
    
    def find_dates(self, st):
        
        plausible = dict(day=set(range(1,32)), 
                         month=set(range(1,13)), 
                         year=set(range(19,23)) | set(range(2019, 2023)))
        
        _dates = set()
        
        # try to create a set of found dates using a single - as separator 
        try:
            _dates |= {re.sub(r'[-\/.]+','-', d) for d in re.findall(r'\b\d{1,2}[-\/.]+\d{1,2}[-\/.]+\d{4}\b|'
                                                                     r'\b\d{4}[-\/.]+\d{1,2}[-\/.]+\d{1,2}\b|'
                                                                     r'\b\d{1,2}[-\/.]+\d{1,2}[-\/.]+\d{1,2}\b', st)}
        except:
            pass
        
        try:
            _dates |= {d + '-' + str(list(month_abbr).index(m.title())) + '-' + '20' + y 
                                  for d, m, y in re.findall(r'\b(\d{1,2})(' + '|'.join([m for m in month_abbr[1:]]) + r')(\d{2}\b)', st, flags=re.IGNORECASE)}
        except:
            pass
        
        if not _dates:
            return None
        
        # dates will be gathered in this set if they are valid
        dates = []
        
        for _date in _dates:
            
            _date_parts = _date.split('-')
            
            position_cands = defaultdict(set)
            
            for i, p in enumerate(_date_parts):
                for q in 'year month day'.split():
                    if int(p) in plausible[q]:
                        position_cands[i].add(q)
            
            if set(position_cands) != set(range(len(_date_parts))):
                continue
            
            for p0 in position_cands[0]:
                for p1 in position_cands[1] - {p0}:
                    for p2 in position_cands[2] - {p0} - {p1}:
                        
                        date_as_dict = {p0: int(_date_parts[0]), p1: int(_date_parts[1]), p2: int(_date_parts[2])}
                        
                        # make sure year is presented as 20YY
                        for _ in date_as_dict:
                            if (_ == 'year') and (len(str(date_as_dict[_])) == 2):
                                date_as_dict.update({_: int('20'+str(date_as_dict[_]))})
                                
                        dates.append(date(**date_as_dict))
        
        

        return dates   

In [None]:
p = Parser()

In [36]:
w1 = {'a':[3,4],'b':2}
w2 = {'c': 5}

In [38]:
w1 |= w2

In [39]:
w1

{'a': [3, 4], 'b': 2, 'c': 5}

In [None]:
p.run(' ewfefe 2141x !@@@!2 2021--12-4 a.B 3/1452-1-23m  1FEb24  1.1.22 rev accrued maintenance&repairs -- reversed')

In [None]:
t2 = Transaction(**d.iloc[randint(0, len(d)-1)])
t2.docs['account_category'], t2.labels['account_category']= p.run(t2.account_category)
t2.docs['account_name'], t2.labels['account_name']= p.run(t2.account_name)
t2.docs['description'], t2.labels['description']= p.run(t2.description)

In [73]:
from tokenizers import Tokenizer, BertWordPieceTokenizer

In [69]:
from tokenizers import normalizers
from tokenizers import pre_tokenizers

In [74]:
tokenizer = BertWordPieceTokenizer()

In [75]:
tokenizer.encode(' ewfefe 2141x !@@@!2 2021--12-4 a.B 3/1452-1-23m  1FEb24  1.1.22 rev accrued maintenance&repairs -- reversed')

Exception: WordPiece error: Missing [UNK] token from the vocabulary

In [68]:
normalizers.BertNormalizer().normalize_str('webb21 333388482---1 21//01/21 WIQS%&&')

'webb21 333388482---1 21//01/21 wiqs%&&'

In [60]:
nlp.tokenizer.rules

{'\t': [{65: '\t'}],
 '\n': [{65: '\n'}],
 ' ': [{65: ' '}],
 "'": [{65: "'"}],
 "''": [{65: "''"}],
 "'Cause": [{65: "'Cause", 67: 'because'}],
 "'Cos": [{65: "'Cos", 67: 'because'}],
 "'Coz": [{65: "'Coz", 67: 'because'}],
 "'Cuz": [{65: "'Cuz", 67: 'because'}],
 "'S": [{65: "'S", 67: "'s"}],
 "'bout": [{65: "'bout", 67: 'about'}],
 "'cause": [{65: "'cause", 67: 'because'}],
 "'cos": [{65: "'cos", 67: 'because'}],
 "'coz": [{65: "'coz", 67: 'because'}],
 "'cuz": [{65: "'cuz", 67: 'because'}],
 "'d": [{65: "'d"}],
 "'em": [{65: "'em", 67: 'them'}],
 "'ll": [{65: "'ll", 67: 'will'}],
 "'nuff": [{65: "'nuff", 67: 'enough'}],
 "'re": [{65: "'re", 67: 'are'}],
 "'s": [{65: "'s", 67: "'s"}],
 '(*_*)': [{65: '(*_*)'}],
 '(-8': [{65: '(-8'}],
 '(-:': [{65: '(-:'}],
 '(-;': [{65: '(-;'}],
 '(-_-)': [{65: '(-_-)'}],
 '(._.)': [{65: '(._.)'}],
 '(:': [{65: '(:'}],
 '(;': [{65: '(;'}],
 '(=': [{65: '(='}],
 '(>_<)': [{65: '(>_<)'}],
 '(^_^)': [{65: '(^_^)'}],
 '(o:': [{65: '(o:'}],
 '(¬_¬)': [{6

In [4]:
from spacy.tokens import Doc

In [None]:
infix_re = compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_re.finditer

In [51]:
do = nlp(' ewfefe 2141x !@@@!2 2021--12-4 a.B 3/1452-1-23m  1FEb24  1.1.22 rev accrued maintenance&repairs -- reversed')

In [53]:
[t.lemma_ for t in do]

[' ',
 'ewfefe',
 '2141x',
 '!',
 '@@@!2',
 '2021',
 '-',
 '-12',
 '-',
 '4',
 'a.',
 'b',
 '3/1452',
 '-',
 '1',
 '-',
 '23',
 'm',
 ' ',
 '1feb24',
 ' ',
 '1.1.22',
 'rev',
 'accrue',
 'maintenance&repairs',
 '--',
 'reverse']

In [52]:
nlp.pipeline

[('find_dates', <function __main__.find_dates(doc)>),
 ('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7fcf48bb9d60>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7fd0101ef0e0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7fcff5419740>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7fcff6968ac0>)]

In [49]:
do.user_data

{'dates': [datetime.date(2022, 1, 1), datetime.date(2022, 1, 1)]}

In [50]:
[t.label_ for t in do.ents]

[]

In [11]:
do.user_data

{'labels': ['maintenance', 'cash']}

In [None]:
print(t2.labels)

In [None]:
t1.similarity(t2)

In [None]:
t2.labels

In [None]:
d.head()

In [None]:
all_trans = []

for r in d.iterrows():
    
    t2 = Transaction(**r[1][['account_name', 'account_category','description']])
    t2.docs['account_category'], t2.labels['account_category']= p.run(t2.account_category)
    t2.docs['account_name'], t2.labels['account_name']= p.run(t2.account_name)
    t2.docs['description'], t2.labels['description']= p.run(t2.description)
    
    for k in t2.labels:
        if 'subscription' in t2.labels[k]['labels']:
            all_trans.append(r[1])
            print(len(all_trans))
            
    if len(all_trans) == 100:
        break

In [None]:
all_trans