In [541]:
import pandas as pd
import re
import spacy
import json
from random import randint
from spacy.lang.en.stop_words import STOP_WORDS
from typing import Any, Optional, NamedTuple
from string import punctuation
from datetime import date
from itertools import combinations_with_replacement
from collections import Counter, defaultdict

In [368]:
# Load the spaCy pipeline once; Transaction.make_doc and get_text below call this global `nlp`.
nlp = spacy.load("en_core_web_lg")

In [369]:
# Transaction texts. Later cells read the `account_name` column and unpack whole rows
# into Transaction(account_category, account_name, description).
d = pd.read_csv('data/transaction_texts.csv')

In [549]:
class Transaction:
    """One transaction record made of three optional free-text fields.

    Attributes:
        account_category, account_name, description: the raw field values as given.
        doc: mapping from field name to the result of calling the global ``nlp``
            on the lower-cased field text; filled in by ``make_doc``.
        original: the string rendering of the record captured at construction time.
        labels: an initially empty set of labels.
    """

    # Fields that make_doc parses, in the order they are processed.
    _TEXT_FIELDS = ('account_category', 'account_name', 'description')

    def __init__(self, account_category, account_name, description):
        """Store the three raw fields and snapshot their printable form."""
        self.account_category: Optional[str] = account_category
        self.account_name: Optional[str] = account_name
        self.description: Optional[str] = description

        # No default factory is given, so this behaves like a plain dict.
        self.doc = defaultdict()

        self.original = self.__str__()

        self.labels = set()

    def __str__(self):
        """Return the three fields on separate lines, each after a 10-column label."""
        return f'{"acc.cat:":<10}{self.account_category}\n{"acc.name:":<10}{self.account_name}\n{"desc:":<10}{self.description}'

    def make_doc(self):
        """Parse every non-empty string field with ``nlp`` and store it in ``self.doc``.

        Fields that are missing are skipped. This includes ``None``, the empty
        string, and non-string placeholders such as the float NaN that pandas
        uses for missing cells; NaN is truthy, so a plain truthiness test would
        let it through to ``.lower()`` and fail.

        Returns:
            self, so calls can be chained.
        """
        for field in self._TEXT_FIELDS:
            text = getattr(self, field)
            if isinstance(text, str) and text:
                self.doc[field] = nlp(text.lower())

        return self

In [623]:
class Parser:
    """Clean a free-text transaction string and extract coarse labels from it.

    ``run`` lower-cases the text, strips punctuation, expands abbreviations and
    drops non-alphabetic tokens and stop words. Along the way it records any
    date-like number triple and any synonym-group label it finds.
    """

    # Translation table mapping every ASCII punctuation character to one space.
    _PUNCT_TO_SPACE = str.maketrans(punctuation, ' ' * len(punctuation))

    # Three whitespace-separated numbers with digit counts (1-2, 1-2, 2),
    # (4, 1-2, 1-2) or (1-2, 1-2, 4).
    _DATE_RE = re.compile(r'(\b\d{1,2}\s{1}\d{1,2}\s{1}\d{2}\b)|'
                          r'(\b\d{4}\s{1}\d{1,2}\s{1}\d{1,2}\b)|'
                          r'(\b\d{1,2}\s{1}\d{1,2}\s{1}\d{4}\b)')

    # Values each date component is allowed to take; years are limited to
    # 2019-2021 in either two- or four-digit form.
    _PLAUSIBLE = dict(days=set(range(1, 32)),
                      months=set(range(1, 13)),
                      years=set(range(19, 22)) | set(range(2019, 2022)))

    def __init__(self, abbrs_path='data/acdic.json'):
        """Load the abbreviation dictionary and define the synonym groups.

        Args:
            abbrs_path: path of the JSON file mapping an abbreviation to its
                expansion. Defaults to the file the notebook has always used.
        """
        # Context manager so the file handle is closed deterministically.
        with open(abbrs_path) as fh:
            self.abbrs = json.load(fh)

        self.syns = {'revenue': set('revenue earnings gain income incoming proceeds profit return yield'.split()),
                     'accrue': set('accrue accumulate amass collect gather aggregate hoard'.split()),
                     'consolidation': set('consolidation merger strengthening unification amalgamation'.split()),
                     'adjustment': set('adjustment adjust adj alteration modification readjustment'.split()),
                     'payroll': set('payroll salary wages pay remuneration paycheck earning'.split()),
                     'liability': set('liability debt burden obligation owing uninvoiced'.split()),
                     'month': {'apr', 'april', 'aug', 'august', 'dec', 'december', 'feb', 'february',
                               'jan', 'january', 'jul', 'july', 'jun','june', 'mar', 'march','may',
                               'nov', 'november', 'oct', 'october', 'sep', 'september'},
                     'food': set('food meal dinner lunch restaurant cafe brunch breakfast catering'.split())}

    def run(self, st):
        """Normalise ``st`` and collect the labels found in it.

        Args:
            st: the raw text to clean.

        Returns:
            A ``(cleaned_text, labels)`` tuple. ``labels`` may contain the key
            ``'dates'`` (a set of ``datetime.date``) and the key ``'types'``
            (a set of synonym-group names); a key is absent when nothing was found.
        """
        # No default factory is given, so this behaves like a plain dict.
        _labs = defaultdict()

        st = st.lower()

        # NOTE(review): slashes are deleted rather than turned into spaces, so a
        # slash-separated date such as "23/1/20" collapses to "23120" and is not
        # seen by find_date. Confirm this is intended.
        st = st.replace('/', '')

        # Punctuation becomes whitespace, then whitespace runs collapse to one space.
        st = st.translate(self._PUNCT_TO_SPACE)
        st = re.sub(r'\s+', ' ', st)

        # Dates must be looked up before the numeric tokens are dropped below.
        extracted_dates = self.find_date(st)
        if extracted_dates:
            _labs['dates'] = extracted_dates

        # Expand abbreviations, then split again so multi-word expansions are
        # filtered word by word.
        expanded = ' '.join(self.abbrs.get(w, w) for w in st.split())

        # Keep purely alphabetic tokens that are not stop words.
        st = ' '.join(w for w in expanded.split()
                      if w.isalpha() and w not in STOP_WORDS)

        extracted_labels = self.extract_labels(st)
        if extracted_labels:
            _labs['types'] = extracted_labels

        return (st, _labs)

    def find_date(self, st):
        """Return every calendar date the first date-like triple in ``st`` could mean.

        The three numbers are not assumed to be in any particular order. Each
        one is offered as a day, month and/or year wherever its value is
        plausible, and every valid combination is returned.

        Args:
            st: text in which the date parts are separated by single whitespace.

        Returns:
            ``None`` when ``st`` is not a string or holds no date-like triple.
            Otherwise a set of ``datetime.date`` objects, which is empty when no
            valid combination exists.
        """
        if not isinstance(st, str):
            return None

        match = self._DATE_RE.search(st)
        if match is None:
            return None

        candidates = dict(days=set(), months=set(), years=set())

        for part in match.group(0).split():
            value = int(part)
            for field, allowed in self._PLAUSIBLE.items():
                if value in allowed:
                    candidates[field].add(value)

        years = candidates['years']

        # A four-digit year is unambiguous, so it overrides two-digit candidates.
        # The emptiness check avoids calling max() on an empty set.
        if years and max(years) >= 2019:
            years = {max(years)}

        _dates = set()

        for _y in years:
            # Two-digit years are read as 20xx.
            if _y < 100:
                _y += 2000

            for _m in candidates['months']:
                for _d in candidates['days']:
                    try:
                        _dates.add(date(year=_y, month=_m, day=_d))
                    except ValueError:
                        # Impossible combination such as 31 April; skip it.
                        continue

        return _dates

    def extract_labels(self, st):
        """Return the names of all synonym groups that have a word in ``st``.

        Args:
            st: whitespace-separated, already normalised text.

        Returns:
            A non-empty set of group names, or ``None`` when nothing matched.
        """
        words = set(st.split())

        _labs = {what for what, synonyms in self.syns.items()
                 if any(w in synonyms for w in words)}

        return _labs or None

In [624]:
# Build the parser; its constructor reads data/acdic.json.
p = Parser()

In [625]:
# Try the parser on a hand-written sentence with mixed case, a dash-separated date and a month name.
sent = 'ACCrual payment 23-1-20 on the low GFT and also JUNE '
sentn, labs = p.run(sent)

In [626]:
# Cleaned text returned by Parser.run
print(sentn)

accrual payment low gft june


In [627]:
# Labels returned by Parser.run: candidate dates and matched synonym groups
labs

defaultdict(None,
            {'dates': {datetime.date(2020, 1, 1),
              datetime.date(2020, 1, 20),
              datetime.date(2020, 1, 23)},
             'types': {'month'}})

In [543]:
# Unpack the row at position 12 as keyword arguments; the row's index labels must
# therefore be exactly account_category, account_name and description.
t = Transaction(**d.iloc[12])

In [598]:
# Every full month name together with its three-letter prefix
syns = {
    form
    for month in 'january february march april may june july august september october november december'.split()
    for form in (month, month[:3])
}

In [599]:
# Inspect the generated month vocabulary
syns

{'apr',
 'april',
 'aug',
 'august',
 'dec',
 'december',
 'feb',
 'february',
 'jan',
 'january',
 'jul',
 'july',
 'jun',
 'june',
 'mar',
 'march',
 'may',
 'nov',
 'november',
 'oct',
 'october',
 'sep',
 'september'}

In [546]:
# Parse the transaction's text fields; make_doc returns the transaction itself
t.make_doc()

<__main__.Transaction at 0x7ffa8aa61ee0>

In [548]:
# Lemma of every token in the parsed account category
[_.lemma_ for _ in t.doc['account_category']]

['Deferred', 'revenue', '-', 'current']

In [540]:
# NOTE(review): this cell ran (In [540]) before the current Transaction definition (In [549]);
# the output below may come from an earlier version of the class — confirm on a fresh run.
print(t)

acc.cat:  deferred revenue current
acc.name: deferred revenue maintenance support software unbilled
desc:     acp reversal


In [387]:
# Gather, row by row, the distinct two- and three-character tokens of each account name.
# Tokens are de-duplicated within a row but may repeat across rows.
all_abbrs = []

for _idx, row in d.iterrows():
    abbrs = {tok for tok in row['account_name'].lower().split() if 2 <= len(tok) <= 3}
    all_abbrs.extend(abbrs)

In [390]:
# Drop '/' characters from every candidate abbreviation
li = [p.replace("/",'') for p in all_abbrs]

In [420]:
# Show the first five account names that contain "bch" as a space-delimited token
i = 0
for _idx, row in d.iterrows():
    name = row['account_name']
    if ' bch ' not in name.lower():
        continue
    print(name)
    i += 1
    if i == 5:
        break

Investments - Ops Account - BCH - Trade Commissions
Unrealized G/(L) - BCH Reserve
Investments - Ops Account - BCH FV Adj
Unrealized G/(L) - BCH Operating
Investments - Ops Account - BCH FV Adj


In [None]:
# Frequency of the purely alphabetic, non-stop-word candidates, most common first
Counter([_ for _ in li if _.isalpha() and _ not in STOP_WORDS]).most_common()

In [378]:
# NOTE(review): this cell ran (In [378]) before the current Transaction definition (In [549]);
# the output below may come from an earlier version of the class — confirm on a fresh run.
print(t)

acc.cat:  deferred revenue current
acc.name: deferred revenue maintenance support software unbilled
desc:     acp reversal


In [385]:
# NOTE(review): the Transaction class defined in this notebook has no `_lemmatize` method;
# this cell presumably ran against an earlier version of the class and would fail on a
# fresh kernel — confirm and either restore the method or remove the cell.
t._lemmatize('rabbits love carrots referred deferred higher')

'rabbit love carrot refer defer high'

In [68]:
# Abbreviation dictionary; opened in a context manager so the file handle is closed promptly.
with open('data/acdic.json') as fh:
    ac = json.load(fh)

In [5]:
# NOTE(review): repeats the pipeline load from the cell near the top of the notebook;
# one of the two is presumably redundant — confirm.
nlp = spacy.load("en_core_web_lg")

In [521]:
def get_text():
    """Return the content words of a randomly chosen transaction description.

    A random row of the global frame ``d`` is wrapped in ``TransactDescr`` and
    normalised, and its description is parsed with the global ``nlp``. Tokens
    are kept when they are a noun, verb, adjective, adverb or pronoun, have a
    lemma longer than one character, and are purely alphabetic. The surface
    forms of the kept tokens are joined with single spaces.

    NOTE(review): ``TransactDescr`` is not defined in the visible part of this
    notebook — presumably an earlier name of ``Transaction``; confirm.
    """
    content_pos = ['NOUN', 'VERB', 'ADJ', 'ADV', 'PRON']

    position = randint(0, len(d) - 1)
    record = TransactDescr(**d.iloc[position]).normalize()

    kept = []
    for tok in nlp(record.description):
        if tok.pos_ not in content_pos:
            continue
        if len(tok.lemma_) <= 1:
            continue
        if not tok.text.isalpha():
            continue
        kept.append(tok.text)

    return ' '.join(kept)

In [534]:
# Draw 50 random pairs of cleaned descriptions and print the similarity of each pair
for _ in range(50):
    l1, l2 = get_text(), get_text()
    doc1, doc2 = nlp(l1), nlp(l2)
    print(f'1: {l1}\n2: {l2}\nsimilarity: {doc1.similarity(doc2)}')

1: acp reversal
2: payroll
similarity: 0.11917300646492232
1: purchase
2: taxis
similarity: 0.12457224066823691
1: depreciation asset landscaping serial number
2: depreciation asset long point office wall reconfiguration serial number
similarity: 0.8528660305681611
1: 
2: consolidation
similarity: 0.0
1: consolidation
2: consolidation
similarity: 1.0
1: receivables sale entry
2: fund bmp
similarity: 0.30342735008984334
1: book provision certification charge report
2: consolidation
similarity: 0.39132950925975746


  print(f'1: {l1}\n2: {l2}\nsimilarity: {doc1.similarity(doc2)}')


1: consolidation
2: cash
similarity: 0.4657770986771067
1: accrue expense oct
2: consolidation
similarity: 0.4300465789305381
1: consolidation
2: acp reversal
similarity: 0.2224955151386263
1: purchase
2: prepay inv chair sale service amc chair chair apr oct
similarity: 0.4320984592578958
1: acp reversal
2: consolidation
similarity: 0.2224955151386263
1: depreciation asset co serial number
2: consolidation
similarity: 0.43216430252592875
1: reverse accrual adobe inv
2: consolidation
similarity: 0.3807563918103622
1: period daily sale
2: 
similarity: 0.0
1: inter entity
2: service allocation
similarity: 0.4607773646987202
1: consolidation
2: ledger entry
similarity: 0.26463612588798363
1: legacy mp donation dep
2: consolidation
similarity: 0.258098599502456
1: service allocation
2: consolidation
similarity: 0.4230199631327694
1: ultipro payroll
2: alloc active pt
similarity: 0.05631301193468577
1: merchant etf
2: acp rr reclass
similarity: 0.1648853551361504
1: contribution payable
2: c

1: period daily sale amex
2: dialpad monthly cell phone bill
similarity: 0.37131790764316164
