In [1]:
import pandas as pd
import re
import spacy
import json
from random import randint
from spacy.lang.en.stop_words import STOP_WORDS
from typing import Any, Optional, NamedTuple
from string import punctuation

In [2]:
# Load the transaction texts; later cells read the account_category,
# account_name and description columns of this frame.
d = pd.read_csv('data/transaction_texts.csv')

In [124]:
# Three whitespace-separated digit groups, bounded by word boundaries.
# The adjacent raw literals are concatenated by the parser into one pattern
# with three alternatives (capture groups 1-3):
date_ptrn = re.compile(
    r'(\b\d{1,2}\s{1}\d{1,2}\s{1}\d{2}\b)'    # 1-2 digits, 1-2 digits, exactly 2 digits
    r'|(\b\d{4}\s{1}\d{1,2}\s{1}\d{1,2}\b)'   # exactly 4 digits, 1-2 digits, 1-2 digits
    r'|(\b\d{1,2}\s{1}\d{1,2}\s{1}\d{4}\b)'   # 1-2 digits, 1-2 digits, exactly 4 digits
)

In [173]:
class TransactDescr:
    """One transaction record: account category, account name and description.

    All three fields are optional strings. ``normalize()`` cleans them in
    place; the ``_is_*`` helpers flag keyword hits in a normalized string and
    ``_get_date`` extracts a date from one.
    """

    # Maps every ASCII punctuation character to a space. Built once here
    # instead of on every _normalize call.
    _PUNCT_TO_SPACE = str.maketrans(dict.fromkeys(punctuation, ' '))

    # Keyword sets behind the _is_* flags (whole-token matches only).
    _REVENUE_WORDS = frozenset(
        'revenue earnings gain income incoming proceeds profit return yield'.split())
    # 'accrued' / 'accrual(s)' are included so that a category such as
    # "accrued taxes" is actually flagged by _is_accrued.
    _ACCRUED_WORDS = frozenset(
        'accrue accrued accrual accruals accumulate amass collect gather '
        'aggregate hoard'.split())
    _ADJUSTMENT_WORDS = frozenset(
        'adjustment adjust adj alteration modification readjustment'.split())
    _LIABILITY_WORDS = frozenset(
        'liability debt burden obligation owing uninvoiced'.split())

    # A 4-digit year is accepted when its first two digits are 19, 20 or 21.
    _CENTURY_PREFIXES = frozenset(range(19, 22))
    # A 2-digit year is accepted only for 19, 20 or 21 and is read as 20yy.
    _SHORT_YEARS = frozenset(range(19, 22))

    def __init__(self, account_category, account_name, description):

        self.account_category: Optional[str] = account_category
        self.account_name: Optional[str] = account_name
        self.description: Optional[str] = description

    def __str__(self):
        return f'{"acc.cat:":<10}{self.account_category}\n{"acc.name:":<10}{self.account_name}\n{"desc:":<10}{self.description}'

    def _normalize(self, st):
        """Lower-case ``st``, turn punctuation into spaces and squeeze whitespace.

        Non-string values (``None``, or the float NaN pandas uses for missing
        cells) are returned unchanged instead of raising AttributeError.
        """
        if not isinstance(st, str):
            return st

        st = st.lower().translate(self._PUNCT_TO_SPACE)

        # collapse runs of whitespace, then drop it from both ends
        return re.sub(r'\s+', ' ', st).strip()

    def normalize(self):
        """Normalize the three fields in place and return ``self`` for chaining."""
        self.account_category = self._normalize(self.account_category)
        self.account_name = self._normalize(self.account_name)
        self.description = self._normalize(self.description)

        return self

    @staticmethod
    def _has_any(st, words):
        """Return 1 if any whitespace-separated token of ``st`` is in ``words``, else 0.

        A non-string ``st`` (missing value) yields 0.
        """
        if not isinstance(st, str):
            return 0
        return int(not words.isdisjoint(st.split()))

    def _is_revenue(self, st):
        """Return 1 if ``st`` contains a revenue keyword as a whole token, else 0."""
        return self._has_any(st, self._REVENUE_WORDS)

    def _is_accrued(self, st):
        """Return 1 if ``st`` contains an accrual keyword as a whole token, else 0."""
        return self._has_any(st, self._ACCRUED_WORDS)

    def _is_adjustment(self, st):
        """Return 1 if ``st`` contains an adjustment keyword as a whole token, else 0."""
        return self._has_any(st, self._ADJUSTMENT_WORDS)

    def _is_liability(self, st):
        """Return 1 if ``st`` contains a liability keyword as a whole token, else 0."""
        return self._has_any(st, self._LIABILITY_WORDS)

    def _get_date(self, st):
        """Extract the first date found in ``st`` as ``(year, month, day)``.

        The candidate is the first match of the module-level ``date_ptrn``
        (three whitespace-separated digit groups). The year is the 4-digit
        group when there is one, otherwise the last group, which must be
        19, 20 or 21 and is returned as 2019-2021. The two remaining groups
        are assigned to month and day by range alone, in either order.

        Returns ``None`` when ``st`` is not a string, nothing matches, the
        year is out of range, or month and day cannot be told apart (two
        different values that are both <= 12).
        """
        if not isinstance(st, str):
            return None

        found = date_ptrn.search(st)
        if found is None:
            return None

        parts = found.group(0).split()

        # Locate the year token and keep the other two tokens in order.
        year_pos = next((i for i, p in enumerate(parts) if len(p) == 4), None)
        if year_pos is not None:
            year_txt = parts[year_pos]
            if int(year_txt[:2]) not in self._CENTURY_PREFIXES:
                return None
            year = int(year_txt)
            rest = parts[:year_pos] + parts[year_pos + 1:]
        else:
            short_year = int(parts[-1])
            if short_year not in self._SHORT_YEARS:
                return None
            year = 2000 + short_year
            rest = parts[:-1]

        if len(rest) != 2:
            return None
        first, second = int(rest[0]), int(rest[1])

        def plausible(mon, day):
            return 1 <= mon <= 12 and 1 <= day <= 31

        if first == second:
            # e.g. "05 05 2018": the order does not matter
            return (year, first, second) if plausible(first, second) else None

        as_written = plausible(first, second)
        swapped = plausible(second, first)
        if as_written == swapped:
            # neither reading is a valid date, or both are (ambiguous)
            return None
        return (year, first, second) if as_written else (year, second, first)
                        
                    
            

In [174]:
# Build a TransactDescr from the row at position 12. The Series is unpacked as
# keyword arguments, so its labels must match the __init__ parameter names.
# normalize() returns the same instance, which is why the call can be chained.
t = TransactDescr(**d.iloc[12]).normalize()

In [175]:
# print() goes through TransactDescr.__str__, giving the aligned three-line summary.
print(t)

acc.cat:  deferred revenue current
acc.name: deferred revenue maintenance and support software unbilled
desc:     acp 605 reversals 10 31 2016


In [176]:
# Try to pull a (year, month, day) tuple out of the normalized description.
t._get_date(t.description)

{19, 20, 21}


(2016, 10, 31)

In [69]:
# Display the acronym -> expansion mapping.
# NOTE(review): `ac` is only assigned by the json.load cell further down, so in
# document order this cell raises NameError on Restart & Run All; move it below that cell.
ac

{'aaa': 'american accounting association',
 'aapa': 'association of authorized public accountants',
 'aat': 'association of accounting technicians',
 'abb': 'activity based budgeting',
 'aca': 'associate of the institute of chartered accountants in england and wales',
 'acca': 'associate of the association of chartered certified accountants',
 'acis': 'associate of the institute of chartered secretaries and administrators',
 'acma': 'associate of the chartered institute of management accountants',
 'act': 'activity',
 'adr': 'american depositary receipt',
 'afaanz': 'accounting and finance association of australia and new zealand',
 'agm': 'annual general meeting',
 'aiab': 'associate of the intemational association of book keepers',
 'aicpa': 'accountancy investigation and discipline board',
 'aim': 'alternative investment market',
 'aje': 'adjusting journal entry',
 'amps': 'auction market preferred stock',
 'apacs': 'payment clearing services',
 'apb': 'auditing practices board',
 '

In [58]:
# NOTE(review): neither `a` nor `b` is assigned in any cell visible here; this looks
# like leftover scratch output and presumably fails on a fresh kernel. Confirm and remove.
a,b

(4, 5)

In [3]:
# Peek at the first five rows to check the loaded columns.
d.head()

Unnamed: 0,account_category,account_name,description
0,Revenue - Subscriptions,Revenue-Software Subscription (On Premise)-Unb...,ACP ASC 605 Reversals - 05/31/2018
1,Deferred Revenue - Current,AR - Unbilled - Subscription,Consolidation
2,Accrued Taxes,Payroll-Worker's Comp Payable,1197
3,Fringe,Employer SSec,"Reversed -- JOHNSON, CHERYL LYNN"
4,Cash and Cash Equivalents,Chase Operating,1.00E+11


In [68]:
# Load the acronym -> expansion mapping. The `with` block closes the file
# handle as soon as parsing finishes instead of leaving it to the garbage collector.
with open('data/acdic.json') as acdic_file:
    ac = json.load(acdic_file)

In [5]:
# Load the spaCy pipeline "en_core_web_lg"; that model package must be
# installed in the kernel's environment for this call to succeed.
nlp = spacy.load("en_core_web_lg")

In [6]:
def get_text(row_index: Optional[int] = None):
    """Return the content words of one transaction row as a single string.

    The row's account_category, account_name and description are joined,
    lower-cased and run through ``nlp``. Tokens are kept when they are tagged
    NOUN, VERB, ADJ, ADV or PRON, have a lemma longer than one character and
    are purely alphabetic.

    Parameters
    ----------
    row_index : int, optional
        Position of the row in ``d``. When omitted a row is picked at random,
        which is the original behaviour; pass an index for a repeatable result.

    Returns
    -------
    str
        The kept token texts joined by single spaces.
    """
    if row_index is None:
        row_index = randint(0, len(d) - 1)
    row = d.iloc[row_index]

    # Join only real strings: a missing cell is not a str, and concatenating
    # it with `+` would raise TypeError.
    fields = (row['account_category'], row['account_name'], row['description'])
    doc = nlp(' '.join(field for field in fields if isinstance(field, str)).lower())

    content_pos = {'NOUN', 'VERB', 'ADJ', 'ADV', 'PRON'}
    return ' '.join(
        token.text
        for token in doc
        if token.pos_ in content_pos
        and len(token.lemma_) > 1
        and token.text.isalpha()
    )

In [7]:
# Draw two random transaction texts, then parse each one into a spaCy Doc.
l1, l2 = get_text(), get_text()
doc1, doc2 = nlp(l1), nlp(l2)

In [8]:
# Show both sampled texts and the similarity score spaCy reports for their Docs.
print(f'1: {l1}\n2: {l2}\nsimilarity: {doc1.similarity(doc2)}')

1: intercompany payable inter program salishan inter entity due
2: deferred revenue current deferred revenue software subscription acp reversals
similarity: 0.6615397444949102
