In [221]:
import pandas as pd
import re
import spacy
import json
from random import randint
from spacy.lang.en.stop_words import STOP_WORDS
from typing import Any, Optional, NamedTuple
from string import punctuation
from datetime import date
from itertools import combinations_with_replacement

In [368]:
# Load spaCy's large English pipeline once; the module-level `nlp` object is what
# TransactDescr._lemmatize and get_text call to tokenise / lemmatise text.
nlp = spacy.load("en_core_web_lg")

In [369]:
# Read the transaction texts into a DataFrame. Later cells read the columns
# 'account_category', 'account_name' and 'description' from it.
d = pd.read_csv('data/transaction_texts.csv')

In [379]:
class TransactDescr:
    """The three free-text fields of one transaction, plus text helpers.

    Attributes:
        account_category: category text, or None.
        account_name: account name text, or None.
        description: free-text description, or None.

    ``normalize()`` rewrites the three fields in place (lower-casing,
    punctuation/number/stop-word removal, lemmatisation). The ``_is_*``
    helpers flag keyword groups in a string, and ``_find_date`` extracts
    plausible calendar dates from a whitespace-separated digit triple.
    """

    # Keyword groups for the ``_is_*`` flags. Matching is exact equality
    # against the whitespace-separated tokens of the input string.
    _REVENUE_SYNS = frozenset('revenue earnings gain income incoming proceeds profit return yield'.split())
    _ACCRUED_SYNS = frozenset('accrue accumulate amass collect gather aggregate hoard'.split())
    _ADJUSTMENT_SYNS = frozenset('adjustment adjust adj alteration modification readjustment'.split())
    _LIABILITY_SYNS = frozenset('liability debt burden obligation owing uninvoiced'.split())

    # Translation table mapping every ASCII punctuation character to a space.
    _PUNCT_TO_SPACE = str.maketrans({ch: ' ' for ch in punctuation})

    # Three whitespace-separated number groups: "d d yy", "yyyy d d" or "d d yyyy".
    _DATE_RE = re.compile(r'(\b\d{1,2}\s{1}\d{1,2}\s{1}\d{2}\b)|'
                          r'(\b\d{4}\s{1}\d{1,2}\s{1}\d{1,2}\b)|'
                          r'(\b\d{1,2}\s{1}\d{1,2}\s{1}\d{4}\b)')

    # Values each date component may take. Years are accepted either as two
    # digits (19-21) or as four digits (2019-2021).
    _PLAUSIBLE_DAYS = frozenset(range(1, 32))
    _PLAUSIBLE_MONTHS = frozenset(range(1, 13))
    _PLAUSIBLE_YEARS = frozenset(range(19, 22)) | frozenset(range(2019, 2022))

    def __init__(self, account_category, account_name, description):
        """Store the three text fields exactly as given (no normalisation)."""

        self.account_category: Optional[str] = account_category
        self.account_name: Optional[str] = account_name
        self.description: Optional[str] = description

    def __str__(self):
        """Return the three fields on separate lines, labels padded to 10 columns."""
        return f'{"acc.cat:":<10}{self.account_category}\n{"acc.name:":<10}{self.account_name}\n{"desc:":<10}{self.description}'

    def _normalize(self, st):
        """Return a cleaned, lemmatised version of ``st``.

        Steps, in order: lower-case, punctuation -> space, collapse
        whitespace, strip, drop non-alphabetic tokens, drop stop words,
        lemmatise.

        Anything that is not a ``str`` (``None``, or a NaN from a missing
        CSV cell) is returned unchanged instead of raising, because the
        fields are declared ``Optional[str]``.
        """

        if not isinstance(st, str):
            return st

        # all to lower case
        st = st.lower()

        # replace punctuation with a single white space
        st = st.translate(self._PUNCT_TO_SPACE)

        # replace multiple consecutive white spaces with a single one
        st = re.sub(r'\s+', ' ', st)

        # strip white spaces on ends
        st = st.strip()

        # remove numbers
        st = self._remove_numbers(st)

        # remove stopwords
        st = self._remove_stopwords(st)

        # lemmatize
        st = self._lemmatize(st)

        return st

    def _remove_numbers(self, st):
        """Keep only the whitespace-separated tokens that are purely alphabetic.

        This drops plain numbers and also any mixed token such as ``3rd``.
        """

        return ' '.join([w for w in st.split() if w.isalpha()])

    def _remove_stopwords(self, st):
        """Drop every token that appears in spaCy's ``STOP_WORDS`` set."""

        return ' '.join([w for w in st.split() if w not in STOP_WORDS])

    def _lemmatize(self, st):
        """Run ``st`` through the module-level ``nlp`` pipeline and join the lemmas."""

        return ' '.join([t.lemma_ for t in nlp(st)])

    def normalize(self):
        """Normalise all three fields in place and return ``self`` for chaining."""

        self.account_category = self._normalize(self.account_category)
        self.account_name = self._normalize(self.account_name)
        self.description = self._normalize(self.description)

        return self

    @staticmethod
    def _has_any(st, keywords):
        """Return 1 if any whitespace-separated token of ``st`` is in ``keywords``, else 0."""

        return 1 if keywords.intersection(st.split()) else 0

    def _is_revenue(self, st):
        """Return 1 if ``st`` contains a revenue-related keyword, else 0."""

        return self._has_any(st, self._REVENUE_SYNS)

    def _is_accrued(self, st):
        """Return 1 if ``st`` contains an accrual-related keyword, else 0."""

        return self._has_any(st, self._ACCRUED_SYNS)

    def _is_adjustment(self, st):
        """Return 1 if ``st`` contains an adjustment-related keyword, else 0."""

        return self._has_any(st, self._ADJUSTMENT_SYNS)

    def _is_liability(self, st):
        """Return 1 if ``st`` contains a liability-related keyword, else 0."""

        return self._has_any(st, self._LIABILITY_SYNS)

    def _find_date(self, st):
        """Return every plausible calendar date encoded by the first digit triple in ``st``.

        The first run of three whitespace-separated numbers matching
        ``_DATE_RE`` is taken. Each number is tried as a day, a month and a
        year, so an ambiguous triple yields several candidate dates.

        Returns:
            ``None`` (after printing a notice) when ``st`` is not a string
            or holds no such triple; otherwise a set of ``datetime.date``.
            The set is empty when no number is a plausible year or no
            combination forms a real calendar date.
        """

        # A non-string input is treated like "no match", as the original
        # catch-all handler did, without swallowing unrelated errors.
        match = self._DATE_RE.search(st) if isinstance(st, str) else None
        if match is None:
            print('no dates here, move on..')
            return None

        numbers = [int(p) for p in match.group(0).split()]

        days = {n for n in numbers if n in self._PLAUSIBLE_DAYS}
        months = {n for n in numbers if n in self._PLAUSIBLE_MONTHS}
        years = {n for n in numbers if n in self._PLAUSIBLE_YEARS}

        # No plausible year: nothing to build (max() of an empty set would raise).
        if not years:
            return set()

        # A four-digit year is unambiguous, so it overrides any two-digit candidates.
        if max(years) >= 2019:
            years = {max(years)}

        _dates = set()

        for _y in years:

            # expand a two-digit year (19-21) to 2019-2021
            full_year = 2000 + _y if _y < 100 else _y

            for _m in months:
                for _d in days:
                    try:
                        _dates.add(date(year=full_year, month=_m, day=_d))
                    except ValueError:
                        # impossible combination such as 31 February; skip it
                        continue

        return _dates

In [380]:
# Build a TransactDescr from the row at position 12 of `d` by unpacking the row as
# keyword arguments, then normalise its fields in place.
# NOTE(review): this assumes the row's labels are exactly account_category,
# account_name and description — confirm against the CSV header.
t = TransactDescr(**d.iloc[12]).normalize()

In [387]:
# Collect candidate abbreviations: every 2- or 3-character whitespace-separated
# token of each lower-cased account name.
all_abbrs = []

# Iterate over the column itself rather than DataFrame.iterrows(), which builds a
# full Series for every row just to read one field.
for acc_name in d['account_name']:
    # The set de-duplicates within one name, so a token counts once per row.
    all_abbrs.extend({tok for tok in acc_name.lower().split() if len(tok) in (2, 3)})

In [390]:
# Remove every slash from the collected tokens (e.g. 'a/b' becomes 'ab').
li = [abbr.replace("/", '') for abbr in all_abbrs]

In [420]:
# Print the first five account names that contain ' bch ' (case-insensitive,
# surrounded by spaces), then stop scanning.
i = 0
for row_label, row in d.iterrows():
    acc_name = row['account_name']
    if ' bch ' not in acc_name.lower():
        continue
    print(acc_name)
    i += 1
    if i == 5:
        break

Investments - Ops Account - BCH - Trade Commissions
Unrealized G/(L) - BCH Reserve
Investments - Ops Account - BCH FV Adj
Unrealized G/(L) - BCH Operating
Investments - Ops Account - BCH FV Adj


In [393]:
from collections import Counter

In [396]:
# Frequency table of the collected tokens, keeping only purely alphabetic ones
# that are not spaCy stop words; most frequent first.
alpha_abbrs = (tok for tok in li if tok.isalpha() and tok not in STOP_WORDS)
Counter(alpha_abbrs).most_common()

[('ap', 2713286),
 ('ar', 2086554),
 ('rev', 763203),
 ('sd', 570957),
 ('er', 431418),
 ('tax', 390707),
 ('fs', 361162),
 ('svb', 209652),
 ('emp', 154799),
 ('ben', 152267),
 ('gas', 151638),
 ('sub', 149164),
 ('new', 133003),
 ('cos', 126708),
 ('cad', 116569),
 ('fx', 110818),
 ('usd', 110074),
 ('acc', 98563),
 ('td', 97874),
 ('use', 97409),
 ('llc', 90542),
 ('rpp', 80467),
 ('fso', 72943),
 ('fb', 71234),
 ('res', 70572),
 ('pto', 70050),
 ('rec', 60025),
 ('gwg', 57017),
 ('pay', 56803),
 ('cib', 56358),
 ('ltd', 53150),
 ('med', 51036),
 ('cr', 50851),
 ('iba', 50597),
 ('fee', 50428),
 ('ft', 45992),
 ('uk', 45980),
 ('hsa', 44570),
 ('air', 44392),
 ('dlp', 42943),
 ('iv', 41830),
 ('non', 40445),
 ('wip', 38447),
 ('wf', 38064),
 ('olo', 36507),
 ('aps', 36289),
 ('pr', 34975),
 ('exp', 34884),
 ('cc', 33628),
 ('iet', 31756),
 ('co', 31541),
 ('gst', 28950),
 ('mrr', 28838),
 ('mta', 28239),
 ('vol', 26993),
 ('net', 25760),
 ('wbf', 25473),
 ('ins', 25151),
 ('ic', 247

In [394]:
# Frequency table of every collected token with no filtering applied, most
# frequent first (stop words and tokens containing digits or symbols are included).
Counter(li).most_common()

[('ap', 2713286),
 ('and', 2333466),
 ('ar', 2086554),
 ('due', 1709613),
 ('(on', 1284498),
 ('rev', 763203),
 ('sd', 570957),
 ('to', 466737),
 ('er', 431418),
 ('tax', 390707),
 ('fs', 361162),
 ('go', 261410),
 ('svb', 209652),
 ('us', 174306),
 ('emp', 154799),
 ('ben', 152267),
 ('gas', 151638),
 ('sub', 149164),
 ('in', 139060),
 ('new', 133003),
 ('on', 127252),
 ('off', 126814),
 ('cos', 126708),
 ('cad', 116569),
 ('of', 112085),
 ('fx', 110818),
 ('usd', 110074),
 ('acc', 98563),
 ('3rd', 98424),
 ('td', 97874),
 ('use', 97409),
 ('llc', 90542),
 ('rpp', 80467),
 ('for', 78032),
 ('fso', 72943),
 ('fb', 71234),
 ('res', 70572),
 ('pto', 70050),
 ('14%', 61878),
 ('rec', 60025),
 ('gwg', 57017),
 ('pay', 56803),
 ('cib', 56358),
 ('10%', 55681),
 ('ltd', 53150),
 ('med', 51036),
 ('cr', 50851),
 ('g&a', 50828),
 ('iba', 50597),
 ('fee', 50428),
 ('t&e', 48530),
 ('12%', 47224),
 ('ft', 45992),
 ('uk', 45980),
 ('hsa', 44570),
 ('air', 44392),
 ('dlp', 42943),
 ('iv', 41830),


In [378]:
# Show the record through TransactDescr.__str__ (three labelled lines).
print(t)

acc.cat:  deferred revenue current
acc.name: deferred revenue maintenance support software unbilled
desc:     acp reversal


In [385]:
# Spot-check the lemmatiser on a hand-written phrase with plurals, past-tense
# forms and a comparative.
t._lemmatize('rabbits love carrots referred deferred higher')

'rabbit love carrot refer defer high'

In [68]:
# Load the JSON file into `ac`; the `with` block closes the file handle as soon
# as parsing is done instead of leaving it open.
with open('data/acdic.json') as acdic_file:
    ac = json.load(acdic_file)

In [5]:
# NOTE(review): this loads the same "en_core_web_lg" model that an earlier cell
# already assigns to `nlp`. The execution counter restarts here, which suggests a
# separate kernel session — confirm whether this duplicate load is still needed.
nlp = spacy.load("en_core_web_lg")

In [6]:
def get_text():
    """Return the filtered, lower-cased text of one randomly chosen row of ``d``.

    The row's account_category, account_name and description are joined with
    single spaces, lower-cased and passed through ``nlp``. A token is kept
    when its part of speech is NOUN, VERB, ADJ, ADV or PRON, its lemma is
    longer than one character, and its text is purely alphabetic. The kept
    token texts are returned joined by single spaces.
    """

    wanted_pos = ['NOUN', 'VERB', 'ADJ', 'ADV', 'PRON']

    # one random draw per call, uniform over the row positions of d
    sampled = d.iloc[randint(0, len(d)-1)]
    raw_text = sampled['account_category'] + ' ' + sampled['account_name'] + ' ' + sampled['description']

    kept_words = []
    for tok in nlp(raw_text.lower()):
        if tok.pos_ not in wanted_pos:
            continue
        if len(tok.lemma_) <= 1:
            continue
        if not tok.text.isalpha():
            continue
        kept_words.append(tok.text)

    return ' '.join(kept_words)

In [7]:
# Draw two random rows as filtered text and parse each one again with `nlp`, so
# the resulting Doc objects can be compared. No random seed is set, so the pair
# differs on every run.
l1 = get_text()
doc1 = nlp(l1)
l2 = get_text()
doc2 = nlp(l2)

In [8]:
# Show both sampled texts together with the score returned by doc1.similarity(doc2).
print(f'1: {l1}\n2: {l2}\nsimilarity: {doc1.similarity(doc2)}')

1: intercompany payable inter program salishan inter entity due
2: deferred revenue current deferred revenue software subscription acp reversals
similarity: 0.6615397444949102
