In [17]:
import re

import requests
import pdfplumber
import pandas as pd
from collections import namedtuple

In [18]:
Inv = namedtuple('Inv', 'vend_num vend_name inv_dt due_dt inv_amt net_amt description')

In [19]:
def download_file(url):
    local_filename = url.split('/')[-1]
    
    with requests.get(url) as r:
        with open(local_filename, 'wb') as f:
            f.write(r.content)
        
    return local_filename

In [20]:
ap_url = 'https://www.tabs3.com/support/sample/apreports.pdf'

In [21]:
ap = download_file(ap_url)

In [22]:
with pdfplumber.open(ap) as pdf:
    page = pdf.pages[16]
    text = page.extract_text()

In [23]:
print(text)

AccountsPayableReportPack
changeswillbereflectedunderthe"New"heading.Whenaninvoiceormanualcheckisdeleted,
theInvoiceAmountisautomaticallychangedtozeroandtheStatusfieldischangedto"D".
TotalTransactionCount Totalnumberoftransactionsonthelistincludingchangesanddeletions.Eachchangedtransaction
(i.e.,botholdandnew)countsasonetransaction.
1Notshownonsamplereport.
Invoice by Vendor List
Date: 08/25/2020 Invoice by Vendor List Page: 1
Jensen, Martin & Anderson, P.C.
Inv Due Inv Disc Disc Net
Voucher # Invoice # Date Date Amount Amount Date Amount 1099 Ref # Description Codes
200 United Parcel Service
200.01MLJ 3243387 032620 062620 75.00 P 75.00 1 Overnight document delivery * 1
3243387 032720 062720 75.00 P 75.00 3 Postage * 1
3243387 072620 072620 75.00 P 75.00 2 Overnight document delivery * 1
Vendor Totals 225.00 0.00 225.00
202 Software Technology, LLC
312205 082020 082520 490.00 P 490.00 1 Software Maintenance Agreement Tabs3 * 1
312206 082020 082520 239.00 P 239.00 2 Software Maintenanc

In [8]:
new_vend_re = re.compile(r'^\d{3} [A-Z].*')

In [9]:
for line in text.split('\n'):
    if new_vend_re.match(line):
        print(line)

200 United Parcel Service
202 Software Technology, LLC
203 Clerk of the County Court
204 Clerk of the District Court
225 Jackson/Wylinda
240 NELCO
250 D & B Real Estate Management Company
300 Thomson Reuters Payment Center
325 Professional Messenger Services
400 Sprint Local & Long Distance
700 Clean All Janitorial Services


In [15]:
for line in text.split('\n'):
    if new_vend_re.match(line):
        vend_num, *vend_name = line.split()
        vend_name = ' '.join(vend_name)
print(vend_num)
print(vend_name)

700
Clean All Janitorial Services


In [33]:
inv_line_re = re.compile(r'(\d{6}) (\d{6}) ([\d,]+\.\d{2}) [\sP]*([\d,]+\.\d{2}) [YN ]*\d (.*?) [*\s\d]')

In [38]:
line_items = []
for line in text.split('\n'):
    if new_vend_re.match(line):
        vend_num, *vend_name = line.split()
        vend_name = ' '.join(vend_name)    
    
    line = inv_line_re.search(line)
    if line:
        inv_dt = line.group(1)
        due_dt = line.group(2)
        inv_amt = line.group(3)
        net_amt = line.group(4)
        desc = line.group(5)
        line_items.append(Inv(vend_num, vend_name, inv_dt, due_dt, inv_amt, net_amt, desc))

In [40]:
df = pd.DataFrame(line_items)

In [41]:
df.head()

Unnamed: 0,vend_num,vend_name,inv_dt,due_dt,inv_amt,net_amt,description
0,200,United Parcel Service,32620,62620,75.0,75.0,Overnight document delivery
1,200,United Parcel Service,32720,62720,75.0,75.0,Postage
2,200,United Parcel Service,72620,72620,75.0,75.0,Overnight document delivery
3,202,"Software Technology, LLC",82020,82520,490.0,490.0,Software Maintenance Agreement Tabs3
4,202,"Software Technology, LLC",82020,82520,239.0,239.0,Software Maintenance Agreement Tabs3 General L...


In [42]:
df['inv_dt'] = pd.to_datetime(df['inv_dt'])
df['due_dt'] = pd.to_datetime(df['due_dt'])


In [43]:
df.head()

Unnamed: 0,vend_num,vend_name,inv_dt,due_dt,inv_amt,net_amt,description
0,200,United Parcel Service,2020-03-26,2020-06-26,75.0,75.0,Overnight document delivery
1,200,United Parcel Service,2020-03-27,2020-06-27,75.0,75.0,Postage
2,200,United Parcel Service,2020-07-26,2020-07-26,75.0,75.0,Overnight document delivery
3,202,"Software Technology, LLC",2020-08-20,2020-08-25,490.0,490.0,Software Maintenance Agreement Tabs3
4,202,"Software Technology, LLC",2020-08-20,2020-08-25,239.0,239.0,Software Maintenance Agreement Tabs3 General L...


In [45]:
df['inv_amt'] = df['inv_amt'].map(lambda x: float(x.replace(',', '')))

In [46]:
df['net_amt'] = df['net_amt'].map(lambda x: float(x.replace(',', '')))

In [47]:
df.sum()

vend_num       2002002002022022022022022022022032042042042252...
vend_name      United Parcel ServiceUnited Parcel ServiceUnit...
inv_amt                                                  22476.3
net_amt                                                  22476.3
description    Overnight document deliveryPostageOvernight do...
dtype: object

In [48]:
df.to_csv('inv_lines.csv')