In [125]:
from os import scandir
from os.path import join
from PyPDF2 import PdfFileReader

import pandas as pd
import re

In [126]:
# Both statement folders live under the same Discover data directory.
_discover_root = r"C:\Users\Chris\OneDrive\Documents\Python_Projects\Money_Manager\data\Discover"

# Older statement layout: two "Mon D" dates, free text, then an amount with an
# optional "$", optional "-", optional thousands comma and two decimals.
old_pdfs = _discover_root + r"\Before_Change"
old_reg = r"\w{3} \d{1,2} \w{3} \d{1,2} .* \$?-?\d?,?\d+\.\d{2}"

# Newer statement layout: an "M/DD" date, text that may continue past one
# newline, then an optional "-", a "$", digits and two decimals.
# NOTE(review): this pattern has no thousands-comma allowance -- confirm that
# amounts of $1,000 or more are matched as intended.
new_pdfs = _discover_root + r"\After_Change"
new_reg = r"\d{1,2}/\d{2}.*?\n?.*-?\$\d+\.\d{2}"

In [133]:
def scrape_pdfs(path: str, regex: "str | re.Pattern") -> list:
    """Collect regex matches from the text of every PDF file in a directory.

    Parameters
    ----------
    path
        Directory to scan. Only regular files whose name ends in ``.pdf``
        (case-insensitive) are read; sub-directories and other files are
        skipped instead of crashing the PDF reader.
    regex
        Pattern searched in each page's text. A string is compiled with
        ``re.MULTILINE``; an already compiled pattern is used as-is (its own
        flags apply).

    Returns
    -------
    list
        One string per match, formed as ``"<first 4 chars of file name> <match>"``.
        The four-character prefix is stored in a variable named ``year``, so
        file names presumably start with the statement year. Files are
        processed in file-name order so the result is deterministic. An empty
        list is returned when the directory holds no PDF files.
    """
    # re.findall() rejects a flags argument together with a compiled pattern,
    # so normalise to a compiled pattern once, up front.
    pattern = re.compile(regex, re.MULTILINE) if isinstance(regex, str) else regex
    matches = []

    with scandir(path) as entries:
        pdf_names = sorted(
            entry.name
            for entry in entries
            if entry.is_file() and entry.name.lower().endswith(".pdf")
        )

    for name in pdf_names:
        year = name[:4]

        with open(join(path, name), "rb") as f:
            reader = PdfFileReader(f)
            for page in reader.pages:
                # Trailing newline so a pattern anchored on end-of-line can
                # still match the last line of the page.
                text = page.extract_text(0) + '\n'
                matches.extend(
                    ' '.join([year, found]) for found in pattern.findall(text)
                )

    return matches

In [134]:
# Try the scraper on a small sample folder using the old-layout pattern;
# the returned list is the cell's displayed output.
test_dir = r"C:\Users\Chris\OneDrive\Documents\Python_Projects\Money_Manager\data\test2"
scrape_pdfs(path=test_dir, regex=old_reg)

['2015 Oct 3 Oct 3 INTERNET PAYMENT - THANK YOU $ -265.00',
 '2015 Sep 22 Sep 22 FGT*EDCVEGAS 888-512-7469 TX $ 264.49',
 '2015 Sep 28 Sep 28 EXPEDIA*1117965480211 BELLEVUE WA 1,901.89',
 '2015 Sep 24 Sep 24 THE HOME DEPOT #0969 FORKED RIVER NJ $ 28.42',
 '2015 Nov 14 Nov 14 INTERNET PAYMENT - THANK YOU $ -150.00',
 '2015 Nov 2 Nov 2 REWARD STATEMENT CREDIT $ -23.08',
 '2015 Dec 19 Dec 19 INTERNET PAYMENT - THANK YOU $ -60.00',
 '2016 Jan 19 Jan 19 INTERNET PAYMENT - THANK YOU $ -100.00',
 '2016 Feb 20 Feb 20 INTERNET PAYMENT - THANK YOU $ -150.00',
 '2016 Mar 15 Mar 15 INTERNET PAYMENT - THANK YOU $ -150.00',
 '2016 Mar 23 Mar 23 TCKTWEB*STEVEAOKIEDCWE 800-965-4827 CA $ 61.87',
 '2016 Apr 19 Apr 19 INTERNET PAYMENT - THANK YOU $ -174.38',
 '2016 Apr 19 Apr 19 REWARD STATEMENT CREDIT $ -0.62',
 '2016 Apr 7 Apr 7 BUDGET.COM PREPAY 800-621-2844 NJ $ 269.56',
 '2016 May 20 May 20 INTERNET PAYMENT - THANK YOU $ -172.30',
 '2016 May 20 May 20 CASHBACK BONUS REDEMPTION PYMT/STMT CRDT $ -2.70

In [114]:
# Sample folder of statement PDFs used for quick experiments.
test_pdfs = r"C:\Users\Chris\OneDrive\Documents\Python_Projects\Money_Manager\data\test2"

In [None]:
# clean old PDF data and append to master list

def _parse_old_line(line, months):
    """Turn one old-layout statement line into a ``clean_data`` row.

    The line is split on whitespace and read positionally:
    tokens 0-2 are ``year``, month abbreviation and day; tokens 3-4 are
    skipped (a second "Mon D" date in the scraped samples); the last token is
    the amount; everything in between is the merchant text.

    Parameters
    ----------
    line : str
        A scraped line such as
        ``'2015 Oct 3 Oct 3 INTERNET PAYMENT - THANK YOU $ -265.00'``.
    months : mapping
        Maps the month abbreviation to the month-number *string* used in the
        date (it is joined with ``'/'``, so values must be ``str``).

    Returns
    -------
    list
        ``[date, 'Discover', amount, merchant]`` where ``date`` is
        ``month/day/year`` and ``amount`` has any ``$`` and thousands commas
        removed.
    """
    tokens = line.split()
    year, month_abbr, day = tokens[:3]
    date = '/'.join([months[month_abbr], day, year])

    # The amount may carry a glued-on "$" and/or thousands separators.
    amount = tokens[-1].replace('$', '').replace(',', '')

    # A standalone "$" before the amount is optional (some lines have none),
    # so drop it only when present instead of always discarding a token --
    # otherwise the last word of the merchant text is lost.
    merchant_tokens = tokens[5:-1]
    if merchant_tokens and merchant_tokens[-1] == '$':
        merchant_tokens = merchant_tokens[:-1]

    return [date, 'Discover', amount, ' '.join(merchant_tokens)]


for line in old_lines:
    clean_data.append(_parse_old_line(line, months_dict))

In [None]:
# clean new PDF data and append to master list

def _parse_new_line(line):
    """Turn one new-layout statement line into a ``clean_data`` row.

    The line is split on whitespace and read positionally: token 0 is the
    year, token 1 is the ``MM/DD`` date, the last token is the amount and
    everything in between is the merchant text.

    Returns
    -------
    list
        ``[date, 'Discover', amount, merchant]`` where ``date`` is
        ``month/day/year`` with the month's leading zero removed and
        ``amount`` has any ``$`` and thousands commas removed.
    """
    tokens = line.split()
    year, month_day = tokens[0], tokens[1]

    # Strip only the LEADING zero of the month. Removing every "0" would
    # corrupt the day as well (e.g. "09/20" would become "9/2").
    date = '/'.join([month_day.lstrip('0'), year])

    merchant = ' '.join(tokens[2:-1])
    amount = tokens[-1].replace('$', '').replace(',', '')
    return [date, 'Discover', amount, merchant]


# A match may span two physical lines; flatten it onto one.
new_lines = [line.replace('\n', ' ') for line in new_lines]
# Keep only the text before the first 'PREVIOUS' marker, if there is one.
new_lines = [line.partition('PREVIOUS')[0] for line in new_lines]

for line in new_lines:
    clean_data.append(_parse_new_line(line))


In [None]:
# Column -> dtype mapping applied to the transactions frame via DataFrame.astype.
# Only 'Amount' is numeric; every other listed column is kept as a generic object.
data_types = dict(
    Account=object,
    Amount=float,
    Recipient=object,
    Category=object,
    SubCategory=object,
    Project=object,
    Note=object,
)

In [None]:
# Build the transactions frame from the cleaned rows
# (each row is [date, account, amount, recipient]).
df = pd.DataFrame(clean_data, columns=['Date', 'Account', 'Amount', 'Recipient'])

# Add the remaining columns empty. float('nan') is the same missing-value
# marker as NumPy's NaN but needs no numpy import (and does not rely on the
# `np.NaN` alias, which NumPy 2.0 removed).
for empty_column in ('Category', 'SubCategory', 'Project', 'Note'):
    df[empty_column] = float('nan')

df = df.astype(data_types)

# Index by transaction date, parsed strictly as month/day/4-digit-year,
# and order the rows chronologically.
df = df.set_index('Date')
df.index = pd.to_datetime(df.index, format="%m/%d/%Y", exact=True)
df = df.sort_index(ascending=True)
# df.to_excel('discover_transactions.xlsx')

In [None]:
# Preview the first 20 rows of the transactions frame.
df.head(n=20)