In [221]:
import pandas as pd
import re
import spacy
import json
from random import randint
from spacy.lang.en.stop_words import STOP_WORDS
from typing import Any, Optional, NamedTuple
from string import punctuation
from datetime import date
from itertools import combinations_with_replacement

In [368]:
nlp = spacy.load("en_core_web_lg")

In [369]:
d = pd.read_csv('data/transaction_texts.csv')

In [529]:
class TransactDescr:
    
    def __init__(self, account_category, account_name, description):
        
        self.account_category: Optional[str] = account_category
        self.account_name: Optional[str] = account_name
        self.description: Optional[str] = description
    
    def __str__(self):
        return f'{"acc.cat:":<10}{self.account_category}\n{"acc.name:":<10}{self.account_name}\n{"desc:":<10}{self.description}'
    
    def _normalize(self, st):
        
        # all to lower case
        st = st.lower()
        
        # replace punctuation with a single white space
        st = st.translate(str.maketrans({_: ' ' for _ in punctuation}))
        
        # replace multiple consecutive white spaces with a single one
        st = re.sub(r'\s+', ' ', st)
        
        # strip white spaces on ends
        st = st.strip()
        
        # remove numbers
        st = self._remove_numbers(st)
        
        # remove stopwords
        st = self._remove_stopwords(st)
        
        # lemmatize
        st = self._lemmatize(st)
        
        return st
    
    def _remove_numbers(self, st):
        
        return ' '.join([w for w in st.split() if w.isalpha()])
    
    def _remove_stopwords(self, st):
        
        return ' '.join([w for w in st.split() if w not in STOP_WORDS])
    
    def _lemmatize(self, st):

        return ' '.join([t.lemma_ for t in nlp(st)])
    
    def normalize(self):
        
        self.account_category = self._normalize(self.account_category)
        self.account_name = self._normalize(self.account_name)
        self.description = self._normalize(self.description)
        
        return self
    
    def _is_revenue(self, st):
        
        syns = set('revenue earnings gain income incoming proceeds profit return yield'.split())
        
        if syns & set(st.split()):
            return 1
        else:
            return 0
        
    def _is_accrued(self, st):
        
        syns = set('accrue accumulate amass collect gather aggregate hoard'.split())
        
        if syns & set(st.split()):
            return 1
        else:
            return 0
    
    def _is_consolidation(self, st):
        
        syns = set('consolidation merger strengthening unification amalgamation '.split())
        
        if syns & set(st.split()):
            return 1
        else:
            return 0
    
    def _is_month(self, st):
        
        syns = set('january february march april may june july august september october november december'.split())
        syns |= {m[:3] for m in syns}
        
        if syns & set(st.split()):
            return 1
        else:
            return 0
    
    def _is_adjustment(self, st):
        
        syns = set('adjustment adjust adj alteration modification readjustment'.split())
        
        if syns & set(st.split()):
            return 1
        else:
            return 0
    
    def _is_payroll(self, st):
        
        syns = set('payroll salary wages pay remuneration paycheck earning'.split())
        
        if syns & set(st.split()):
            return 1
        else:
            return 0
    
    def _is_food(self, st):
        
        syns = set('food meal dinner lunch restaurant cafe brunch breakfast catering'.split())
        
        if syns & set(st.split()):
            return 1
        else:
            return 0
        
    def _is_liability(self, st):
        
        syns = set('liability debt burden obligation owing uninvoiced'.split())
        
        if syns & set(st.split()):
            return 1
        else:
            return 0
        
    def _find_date(self, st):
        
        try:
            _date = re.search(r'(\b\d{1,2}\s{1}\d{1,2}\s{1}\d{2}\b)|'
                              r'(\b\d{4}\s{1}\d{1,2}\s{1}\d{1,2}\b)|'
                              r'(\b\d{1,2}\s{1}\d{1,2}\s{1}\d{4}\b)', st).group(0)
        except:
            print('no dates here, move on..')
            return None
        
        plausible = dict(days=set(range(1,32)), 
                         months=set(range(1,13)), 
                         years=set(range(19,22)) | set(range(2019, 2022)))
        
        candidates = dict(days=set(), 
                          months=set(), 
                          years=set())
        
        for p in _date.split():
            for _ in 'years months days'.split():
                if int(p) in plausible[_]:
                    candidates[_].add(int(p))
        
        if max(candidates['years']) >= 2019:
            candidates['years'] = {max(candidates['years'])}
        
        _dates = set()
        
        for _y in candidates['years']:
            
            if len(str(_y)) == 2:
                _y = int('20' + str(_y))
                
            for _m in candidates['months']:
                for _d in candidates['days']:
                    _dates.add(date(year=_y, month=_m, day=_d))
        
        return _dates   

In [530]:
t = TransactDescr(**d.iloc[12]).normalize()

In [387]:
all_abbrs = []

for r in d.iterrows():
    abbrs = {_ for _ in r[1]['account_name'].lower().split() if len(_) in [2,3]}
    all_abbrs += list(abbrs)

In [390]:
li = [p.replace("/",'') for p in all_abbrs]

In [420]:
i = 0
for r in d.iterrows():
    if ' bch ' in r[1]['account_name'].lower():
        print(r[1]['account_name'])
        i += 1
        if i == 5:
            break

Investments - Ops Account - BCH - Trade Commissions
Unrealized G/(L) - BCH Reserve
Investments - Ops Account - BCH FV Adj
Unrealized G/(L) - BCH Operating
Investments - Ops Account - BCH FV Adj


In [393]:
from collections import Counter

In [None]:
Counter([_ for _ in li if _.isalpha() and _ not in STOP_WORDS]).most_common()

In [378]:
print(t)

acc.cat:  deferred revenue current
acc.name: deferred revenue maintenance support software unbilled
desc:     acp reversal


In [385]:
t._lemmatize('rabbits love carrots referred deferred higher')

'rabbit love carrot refer defer high'

In [68]:
ac = json.load(open('data/acdic.json'))

In [5]:
nlp = spacy.load("en_core_web_lg")

In [521]:
def get_text():
    
#     row = d.iloc[randint(0, len(d)-1)]
    row = TransactDescr(**d.iloc[randint(0, len(d)-1)]).normalize()

    doc = nlp(row.description)
    
    doc_norm = [(token.text, token.lemma_, token.pos_, token.has_vector) 
                     for token in doc if token.pos_ in ['NOUN', 'VERB', 'ADJ', 'ADV', 'PRON'] and 
                                 len(token.lemma_)>1 and 
                                 token.text.isalpha()]
    return ' '.join([_[0] for _ in doc_norm])

In [533]:
for _ in range(50):
    l1 = get_text()
    doc1 = nlp(l1)
    l2 = get_text()
    doc2 = nlp(l2)
    print(f'1: {l1}\n2: {l2}\nsimilarity: {doc1.similarity(doc2)}')

1: lyft charge phil bar fnd consulting
2: journal entry futa tax
similarity: 0.3976405099269043
1: llc
2: acp reversal
similarity: 0.21054081286453544
1: period daily sale
2: depreciation asset computer chromebook projector pay app serial number
similarity: 0.45400513977711354
1: vacation accrual
2: 
similarity: 0.0
1: acp reversal
2: consolidation
similarity: 0.2224955151386263
1: ledger entry
2: reverse rpc online giving cfw
similarity: 0.24317013205124913
1: paddywack dog supply
2: tel data exp kamlesh tiwari
similarity: 0.09723609420292094


  print(f'1: {l1}\n2: {l2}\nsimilarity: {doc1.similarity(doc2)}')


1: pos jan
2: cash
similarity: 0.1892912541509257
1: dental
2: vacation accrual
similarity: 0.2656496886205442
1: 
2: expense allocation vco
similarity: 0.0
1: 
2: consolidation
similarity: 0.0
1: acp reversal
2: consolidation
similarity: 0.2224955151386263
1: table game find money
2: acp reversal
similarity: 0.06188248479072436
1: cash
2: consolidation
similarity: 0.4657770986771067
1: consolidation
2: deduction hso
similarity: 0.16151838300021323
1: consolidation
2: account payable
similarity: 0.3603264977976946
1: purchase
2: 
similarity: 0.0
1: inter entity
2: acp reversal
similarity: 0.12222155909953979
1: report write
2: adj
similarity: -0.047283218275892484
1: reclassify people expense project gate foundation grant expense
2: period daily sale
similarity: 0.4844234555663616
1: adj
2: consolidation
similarity: 0.1341898331172192
1: journal entry allocate ppd work comp
2: reverse defer reclass
similarity: 0.3585636009202673
1: melinda
2: charge mar
similarity: 0.005602606914107825

1: period daily sale amex
2: dialpad monthly cell phone bill
similarity: 0.37131790764316164
