In [541]:
import pandas as pd
import re
import spacy
import json
from random import randint
from spacy.lang.en.stop_words import STOP_WORDS
from typing import Any, Optional, NamedTuple
from string import punctuation
from datetime import date
from itertools import combinations_with_replacement
from collections import Counter, defaultdict

In [368]:
# Load the spaCy pipeline once; Parser.run calls this module-level `nlp`
# on the cleaned text of every field it parses.
nlp = spacy.load("en_core_web_lg")

In [369]:
# Transaction records. Rows are later unpacked with Transaction(**row), so the
# column names must match Transaction's constructor arguments.
d = pd.read_csv('data/transaction_texts.csv')

In [732]:
class Transaction:
    """A single transaction described by three free-text fields.

    `docs` and `labels` start empty; the caller fills them per field name
    ('account_category', 'account_name', 'description') with parse results.
    """

    def __init__(self, account_category, account_name, description):
        # Containers for per-field parse results, populated after construction.
        self.docs = defaultdict()
        self.labels = defaultdict()

        # Raw text fields exactly as supplied by the caller.
        self.account_category: Optional[str] = account_category
        self.account_name: Optional[str] = account_name
        self.description: Optional[str] = description

    def __str__(self):
        """Return the three raw fields, one per line, behind a 10-wide tag."""
        rows = (
            ("acc.cat:", self.account_category),
            ("acc.name:", self.account_name),
            ("desc:", self.description),
        )
        return '\n'.join(f'{tag:<10}{value}' for tag, value in rows)

    def similarity(self, another_transaction):
        """Compare this transaction with another one, field by field.

        Both transactions must already have an entry in `docs` for each of
        the three fields (a KeyError is raised otherwise). The three scores
        are printed and returned as
        (account_category, account_name, description).
        """
        scores = tuple(
            self.docs[field].similarity(another_transaction.docs[field])
            for field in ('account_category', 'account_name', 'description')
        )
        cat_score, name_score, desc_score = scores

        print(f'similarity: acc.cat: {cat_score:.4f} acc.name: {name_score:.4f} description: {desc_score:.4f}')

        return scores

In [733]:
class Parser:
    """Clean free-text transaction fields and pull simple labels out of them.

    `run` is the entry point: it normalises a string, records any dates and
    synonym-group labels it finds, and returns the spaCy Doc of the cleaned
    text together with those labels.
    """

    # Three whitespace-separated digit groups shaped like a date:
    # (1-2, 1-2, 2 digits), (4, 1-2, 1-2 digits) or (1-2, 1-2, 4 digits).
    _DATE_RE = re.compile(r'(\b\d{1,2}\s{1}\d{1,2}\s{1}\d{2}\b)|'
                          r'(\b\d{4}\s{1}\d{1,2}\s{1}\d{1,2}\b)|'
                          r'(\b\d{1,2}\s{1}\d{1,2}\s{1}\d{4}\b)')

    # Values a number must take to be considered a day, month or year.
    # Years are accepted either as two digits (19-21) or four (2019-2021).
    _PLAUSIBLE = dict(days=set(range(1, 32)),
                      months=set(range(1, 13)),
                      years=set(range(19, 22)) | set(range(2019, 2022)))

    # Translation table mapping every ASCII punctuation character to a space.
    _PUNCT_TO_SPACE = str.maketrans(punctuation, ' ' * len(punctuation))

    def __init__(self, abbr_path='data/acdic.json'):
        """Load the abbreviation dictionary and define the synonym groups.

        Args:
            abbr_path: JSON file mapping an abbreviation to its expansion.
                Defaults to the path the notebook has always used.
        """
        # Context manager so the file handle is closed deterministically.
        with open(abbr_path) as fh:
            self.abbrs = json.load(fh)

        # label -> words that trigger it (see extract_labels)
        self.syns = {'revenue': set('revenue earnings gain income incoming proceeds profit return yield'.split()),
                     'expense': set('expense charge expenditure obligation spending spend overhead surcharge'.split()),
                     'accrue': set('accrue accumulate amass collect gather aggregate hoard'.split()),
                     'consolidation': set('consolidation merger strengthening unification amalgamation'.split()),
                     'adjustment': set('adjustment adjust adj alteration modification readjustment'.split()),
                     'recurring': set('regular daily monthly weekly fortnightly yearly repeat routine recurring'.split()),
                     'payroll': set('payroll salary wages pay remuneration paycheck earning'.split()),
                     'liability': set('liability debt burden obligation owing uninvoiced'.split()),
                     'month': set('apr april aug august dec december feb february jan january jul july jun june mar march may nov november oct october sep september'.split()),
                     'food': set('food meal dinner lunch restaurant cafe brunch breakfast catering'.split())}

    def run(self, st):
        """Normalise `st` and return `(nlp(cleaned_text), labels)`.

        `labels` is a defaultdict that holds a 'dates' entry when find_date
        returns a non-empty set and a 'types' entry when extract_labels finds
        at least one synonym group; otherwise it stays empty.

        Args:
            st: Raw field text. Non-string values (None, or the float NaN
                pandas uses for missing cells) are treated as empty text.
        """
        labels = defaultdict()

        # Transaction fields are Optional[str]; missing values become ''.
        if not isinstance(st, str):
            st = ''

        # all to lower case
        st = st.lower()

        # remove /
        # NOTE(review): '/' is deleted rather than turned into a space, so a
        # slash-separated date such as 12/31/2017 collapses into one digit run
        # that find_date cannot match - confirm this is intended.
        st = st.replace('/', '')

        # replace punctuation with a single white space
        st = st.translate(self._PUNCT_TO_SPACE)

        # replace multiple consecutive white spaces with a single one
        st = re.sub(r'\s+', ' ', st)

        # dates must be searched before the numbers are dropped below
        extracted_dates = self.find_date(st)
        if extracted_dates:
            labels['dates'] = extracted_dates

        # unfold abbreviations; an expansion may contain several words, so the
        # text is re-joined and split again before filtering word by word
        st = ' '.join(self.abbrs.get(w, w) for w in st.split())

        # remove numbers (anything not purely alphabetic) and stopwords
        st = ' '.join(w for w in st.split()
                      if w.isalpha() and w not in STOP_WORDS)

        extracted_labels = self.extract_labels(st)
        if extracted_labels:
            labels['types'] = extracted_labels

        return (nlp(st), labels)

    def find_date(self, st):
        """Return every plausible calendar date for the first date-shaped match.

        The match is three whitespace-separated numbers. Each number is tried
        as a day, a month and a year against `_PLAUSIBLE`, and every valid
        (year, month, day) combination is returned, because the field order
        of the original text is unknown.

        Returns:
            None when `st` contains no date-shaped text (or is not a string);
            otherwise a set of `datetime.date`, which is empty when no valid
            combination exists (for example the year is outside the
            plausible range).
        """
        if not isinstance(st, str):
            return None

        found = self._DATE_RE.search(st)
        if found is None:
            return None

        numbers = [int(part) for part in found.group(0).split()]
        candidates = {role: {n for n in numbers if n in allowed}
                      for role, allowed in self._PLAUSIBLE.items()}

        years = candidates['years']
        if not years:
            # Date-shaped text without a plausible year: nothing to build.
            return set()

        # A four-digit year is unambiguous, so it overrides any two-digit
        # number that merely happens to fall in the 19-21 range.
        if max(years) >= 2019:
            years = {max(years)}

        dates = set()
        for year in years:
            if year < 100:
                year += 2000  # two-digit years are taken to mean 20yy

            for month in candidates['months']:
                for day in candidates['days']:
                    try:
                        dates.add(date(year=year, month=month, day=day))
                    except ValueError:
                        # Impossible combination such as 30 February.
                        continue

        return dates

    def extract_labels(self, st):
        """Return the synonym-group names with at least one word in `st`.

        Returns None (not an empty set) when no group matches.
        """
        words = set(st.split())
        labels = {group for group, synonyms in self.syns.items()
                  if synonyms & words}
        return labels or None

In [734]:
# One parser instance reused for every field; constructing it reads the
# abbreviation dictionary from disk.
p = Parser()

In [743]:
# Pick one row at random and parse each of its three text fields.
# NOTE(review): randint is unseeded, so re-running this cell samples a different row.
t1 = Transaction(**d.iloc[randint(0, len(d) - 1)])
for _field in ('account_category', 'account_name', 'description'):
    t1.docs[_field], t1.labels[_field] = p.run(getattr(t1, _field))

In [744]:
# Show the raw (unparsed) fields of the first sampled transaction.
print(t1)

acc.cat:  Deferred Revenue - Current
acc.name: Deferred Revenue-Maintenance and support (Software)-Billed
desc:     ACP 606 Reversals - 12/31/2017


In [745]:
# Labels found in t1's description; an empty mapping means Parser.run
# recorded neither dates nor synonym-group types for it.
t1.labels['description']

defaultdict(None, {})

In [746]:
# Sample a second random row and parse it the same way, for comparison with t1.
# NOTE(review): randint is unseeded, so re-running this cell samples a different row.
t2 = Transaction(**d.iloc[randint(0, len(d) - 1)])
for _field in ('account_category', 'account_name', 'description'):
    t2.docs[_field], t2.labels[_field] = p.run(getattr(t2, _field))

In [747]:
# Show the raw (unparsed) fields of the second sampled transaction.
print(t2)

acc.cat:  Revenue - Licenses
acc.name: Revenue-Cloud Service Subscription-Billed
desc:     ACP ASC 605 Reversals - 03/31/2017


In [748]:
# Field-by-field similarity of the two parsed transactions; the call prints the
# three scores and also returns them as a tuple.
t1.similarity(t2)

similarity: acc.cat: 0.7591 acc.name: 0.8033 description: 0.8501


(0.7591434030560957, 0.8032778786216105, 0.8500671352968832)

In [750]:
# Every label recorded for t1, keyed by field name.
t1.labels

defaultdict(None,
            {'account_category': defaultdict(None, {'types': {'revenue'}}),
             'account_name': defaultdict(None, {'types': {'revenue'}}),
             'description': defaultdict(None, {})})

In [754]:
# Tokens left in t2's description after cleaning and abbreviation expansion.
[token.text for token in t2.docs['description']]

['average',
 'collection',
 'period',
 'accounting',
 'standards',
 'committee',
 'reversals']