In [1]:
import tabula

In [2]:
# import pdfplumber as pdfp
import pandas as pd
import numpy as np
import warnings
import os
import time
# import tabula

warnings.filterwarnings('ignore')

In [3]:
###################################################################################################################
# Get the year and month that the user wants to analyze
###################################################################################################################

# clear the screen and get the input
os.system('cls' if os.name == 'nt' else 'clear')

year_ = input('\n What year are you looking to analyze? \n\n')
assert int(year_) in (2021, 2022), 'Invalid year entered'
month_ = input('\n What month are you looking to analyze? (1 - 12) \n\n')
assert int(month_) in list(np.arange(1, 13)), 'Invalid month entered'

# re-structure the input in such a way that we can use it
if int(month_) < 10:
    month_ = '0' + month_
str_month = year_[-2:] + '-' + month_

print(f'\n Processing data for {str_month}... \n')
# time.sleep(1.01)

###################################################################################################################
# Get the statement types we want to analyze
###################################################################################################################

statements = {1 : '(1) Neo - Credit',
              2 : '(2) RBC - Credit',
              3 : '(3) RBC - Debit',
              4 : '(4) Tangerine - Chequing',
              5 : '(5) Tangerine - Savings'}

# clear the screen and print the statements available to analyze
os.system('cls' if os.name == 'nt' else 'clear')

print('Available options: ', end='\n\n')
for i in statements:
    print('\t', statements[i])

# time.sleep(1.01)
    
# ask the user which statements they want to analyze
selected_options = []
option = ''
while len(selected_options) != 5:
    
    option = input("\n Select the number of the option you'd like to add (or press enter to continue)\n\n")
    if option == '':
        break
    assert int(option) in (1, 2, 3, 4, 5) and int(option) not in selected_options, 'Invalid year entered'
    selected_options.append(' '.join(statements[int(option)].split(' ')[1:]))

    


 What year are you looking to analyze? 

2022

 What month are you looking to analyze? (1 - 12) 

7

 Processing data for 22-07... 

Available options: 

	 (1) Neo - Credit
	 (2) RBC - Credit
	 (3) RBC - Debit
	 (4) Tangerine - Chequing
	 (5) Tangerine - Savings

 Select the number of the option you'd like to add (or press enter to continue)

2

 Select the number of the option you'd like to add (or press enter to continue)




In [9]:
statement_type = selected_options[0]
statement_type

'RBC - Credit'

In [10]:
statement_type = 'RBC - Debit'

In [11]:
file = f'Statements/{statement_type}/{str_month}.pdf'

test = tabula.read_pdf(file)
test[0]

'pages' argument isn't specified.Will extract only from page 1 by default.


Unnamed: 0.1,29 Jun,Payroll Deposit Wave PYRL,Unnamed: 0,Unnamed: 1,"1,886.89","3,366.02"
0,30 Jun,Online Banking transfer - 8885,,636.61,,2729.41
1,4 Jul,Interac purchase - 3330 GRACE CONVENIEN,,7.01,,2722.4
2,,Misc Payment Questrade Inc,,300.0,,
3,,e-Transfer sent Victoria Rumboldt,,1700.0,,722.4
4,5 Jul,e-Transfer received WSNCC,,,195.0,917.4
5,,GST CANADA,,,108.6,1026.0
6,6 Jul,Online Banking transfer - 9122,,351.14,,674.86
7,8 Jul,Prov/Local Gvt Payment CANADA,,,324.0,998.86


In [71]:
###################################################################################################################
# loop through the statements and parse out the important information
###################################################################################################################    

for statement_type in selected_options:    

    pages = []
    
    # open pdf statement, read each page and extract the text line by line, save results
    with pdfp.open(f'Statements/{statement_type}/{str_month}.pdf') as pdf:

        for page in pdf.pages:
            pages.append(page.extract_text().split('\n'))

        for i, page in enumerate(pages):
            if i == 0:
                df = pd.DataFrame(page, columns=['text'])
            else:
                new_df = pd.DataFrame(page, columns=['text'])
                df = pd.concat([df, new_df])


 What year are you looking to analyze? 

2022

 What month are you looking to analyze? (1 - 12) 

7

 Processing data for 22-07... 

Available options: 

	 (1) Neo - Credit
	 (2) RBC - Credit
	 (3) RBC - Debit
	 (4) Tangerine - Chequing
	 (5) Tangerine - Savings

 Select the number of the option you'd like to add (or press enter to continue)

3

 Select the number of the option you'd like to add (or press enter to continue)




In [59]:
pd.set_option('max_colwidth', None)

In [70]:
df.head(20)

Unnamed: 0,text
0,RBC® Avion® Visa PlatinumJ
1,DOMINICMORTIMER 451409******9415
2,"STATEMENTFROMJUN28TOJUL26,2022"
3,1OF3
4,PREVIOUSACCOUNTBALANCE $74.91 IMPORTANTINFORMATION
5,RBCREWARDSPOINTS
6,"DOMINICMORTIMER PreviousPointsbalance 58,167"
7,"451409******9415-PRIMARY Pointsearnedthisstatement 1,818"
8,"TRANSACTION POSTING ACTIVITYDESCRIPTION AMOUNT($) Newpointsbalance 59,985"
9,DATE DATE


In [75]:
pd.set_option('max_rows', None)

In [76]:
df

Unnamed: 0,text
0,RoyalBankofCanada Your RBC personal banking
1,P.O.Box4047TerminalA
2,TorontoONM5W1L5
3,account statement
4,"FromJune17,2022toJuly15,2022"
5,RBPDA10020_5929731_008 E D 017 00592 04426
6,DOMINICMORTIMER
7,Youraccountnumber: 00592-5039060
8,427BRIMORTONDRIVE
9,SCARBOROUGHON M1H2E4


In [48]:
    ###################################################################################################################
    # some data cleaning
    ###################################################################################################################

    # drop null values
    df = df.dropna()

    # select only lines that have a date associated with them
    df = df.loc[df['text'].str[:3].str.contains('JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC'), :]

    # split the date, description and amount out into their own columns
    df['date'] = df['text'].str.split(' ').str[0]
    df['place'] = df['text'].str.split(' ').str[2]
    df['amount'] = df['text'].str.split(' ').str[3]

    # reset the index and drop unnecessary columns
    df = df.reset_index().drop(['index', 'text'], axis=1)
    df = df.rename(columns={
        'date' : 'Date', 
        'place' : 'Place', 
        'amount' : 'Amount'
    })

    # touch up the amount column so that it will be formatted as a number in google sheet
    df['Amount'] = df['Amount'].str.replace(',', '').str.replace('\$', '')

Available options: 

	 (1) Neo - Credit
	 (2) RBC - Credit
	 (3) RBC - Debit
	 (4) Tangerine - Chequing
	 (5) Tangerine - Savings

 Select the number of the option you'd like to add (or press enter to continue)

2

 Select the number of the option you'd like to add (or press enter to continue)




In [3]:
import pdfplumber as pdfp
import pandas as pd
import numpy as np
import warnings
import os
import time

warnings.filterwarnings('ignore')

def extracting_credit_card_statements():

    ###################################################################################################################
    # Get the year and month that the user wants to analyze
    ###################################################################################################################

    # clear the screen and get the input
    os.system('cls' if os.name == 'nt' else 'clear')

    year_ = input('\n What year are you looking to analyze? \n\n')
    assert int(year_) in (2021, 2022), 'Invalid year entered'
    month_ = input('\n What month are you looking to analyze? (1 - 12) \n\n')
    assert int(month_) in list(np.arange(1, 13)), 'Invalid month entered'

    # re-structure the input in such a way that we can use it
    if int(month_) < 10:
        month_ = '0' + month_
    month = year_[-2:] + '-' + month_

    print(f'\n Processing data for {month}... \n')
    time.sleep(2)

    ###################################################################################################################
    # loading the data
    ###################################################################################################################

    pages = []

    # open pdf statement, read each page and extract the text line by line, save results
    with pdfp.open(f'Credit Card Statements/{month}.pdf') as pdf:

        for page in pdf.pages:
            pages.append(page.extract_text().split('\n'))

        for i, page in enumerate(pages):
            if i == 0:
                df = pd.DataFrame(page, columns=['text'])
            else:
                new_df = pd.DataFrame(page, columns=['text'])
                df = pd.concat([df, new_df])

    ###################################################################################################################
    # some data cleaning
    ###################################################################################################################

    # drop null values
    df = df.dropna()

    # select only lines that have a date associated with them
    df = df.loc[df['text'].str[:3].str.contains('JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC'), :]

    # split the date, description and amount out into their own columns
    df['date'] = df['text'].str.split(' ').str[0]
    df['place'] = df['text'].str.split(' ').str[2]
    df['amount'] = df['text'].str.split(' ').str[3]

    # reset the index and drop unnecessary columns
    df = df.reset_index().drop(['index', 'text'], axis=1)
    df = df.rename(columns={
        'date' : 'Date', 
        'place' : 'Place', 
        'amount' : 'Amount'
    })

    # touch up the amount column so that it will be formatted as a number in google sheet
    df['Amount'] = df['Amount'].str.replace(',', '').str.replace('\$', '')

    ###################################################################################################################
    # exclude entries which are recurring payments
    ###################################################################################################################

    recurring_payments = 'spotify'\
                        '|fido'\
                        '|tsiinternet'\
                        '|insurancecompanymarkham'

    os.system('cls' if os.name == 'nt' else 'clear')
    print('Recurring payments (not included in analysis): ', end='\n\n')
    for item in recurring_payments.split('|'):
        print(f'\t{item}')

    answer = input('\n Do the recurring payments look correct? (y/n): ')
    assert answer == 'y', 'Better update it!'

    df = df.loc[~df['Place'].str.lower().str.contains(recurring_payments), :]

    ###################################################################################################################
    # show the list of charges and ask if anything should be ignored this time around
    ###################################################################################################################

    os.system('cls' if os.name == 'nt' else 'clear')

    print('\n All charges: ')
    pd.set_option('max_rows', None)
    print(df[['Place', 'Amount']])
    pd.reset_option('max_rows')

    payments_to_ignore = 'westjet'\
                        '|paybackwithpoints'\
                        '|bessadakia'\
                        '|payment-thankyou'\
                        '|bestseller'
    # payments_to_ignore = 'nothing to ignore!'

    print(f'\n\n Ignoring the following charges: \n')
    for charge in payments_to_ignore.split('|'):
        print(charge)

    answer = input('\n Looking at the charges above, anything else you would like to add to the ignore list? (y/n): ')
    assert answer == 'n', 'Add it into line 116!'

    df = df.loc[~df['Place'].str.lower().str.contains(payments_to_ignore), :]

    ###################################################################################################################
    # set up the keywords which will be used to group the charges
    ###################################################################################################################

    on_the_go_coffee = 'timhortons'\
                    '|starbuck'\
                    '|coffee'\
                    '|timothy'\
                    '|balzac'\
                    '|madawaskacoffee'\
                    '|secondcup'

    beer_and_weed = 'lcbo'\
                    '|beerstore'\
                    '|oneplant'

    take_out = 'subway'\
            '|domino'\
            '|a&w'\
            '|zoup'\
            '|amazing'\
            '|maestro'\
            '|emily'\
            '|vipei'\
            '|bigbrother'\
            '|booster'\
            '|freshly'\
            '|northwinds'\
            '|jerk'\
            '|milkylane'\
            '|burrito'\
            '|jusdanfoods'\
            '|cornwall'\
            '|bastard'\
            '|rolltation'\
            '|shawarma'\
            '|carleton'\
            '|doordash'\
            '|mcdonald'\
            '|caesar'\
            '|pizza'\
            '|papajohn'\
            '|shakeshack'\
            '|einsteinbrosbagel'\
            '|bagelsvancouver'

    bars_and_restaurants = 'jackastor'\
                        '|portly'\
                        '|oldestone'\
                        '|magwyer'\
                        '|kelsey'\
                        '|chuuk'\
                        '|lacarnita'\
                        '|smitty'\
                        '|milestone'\
                        '|wildwing'\
                        '|popeyes'\
                        '|prenup'\
                        '|moose'\
                        '|sabai'\
                        '|borealis'\
                        '|thepint'\
                        '|chicago'\
                        '|spaghetti'\
                        '|bmofield'\
                        '|yummykorean'\
                        '|eggsmart'\
                        '|aokcraft'\
                        '|legendsmusic'\
                        '|aramark'\
                        '|cineplex'\
                        '|bar-main'\
                        '|petitami'\
                        '|brewhouse'\
                        '|irishtimespub'\
                        '|poncho'\
                        '|kingsheadpub'\
                        '|thecentralseattle'\
                        '|chachalounge'\
                        '|jamcafe'\
                        '|superflux'\
                        '|mileoneeatinghouse'\
                        '|trattoria'

    clothing = 'zara'\
            '|h&m'\
            '|aeo'\
            '|softmoc'\
            '|vans'\
            '|jack&jones'\
            '|oldnavy'\
            '|sportchek'\
            '|winners'\
            '|spencergifts'\
            '|boathouse'

    grocery = 'rcss'\
            '|freshco'\
            '|wal-mart'\
            '|nofrills'\
            '|loblaws'\
            '|zehrs'\
            '|metro'\
            '|foodbasics'\
            '|nikufarms'\
            '|safeway'

    gas = 'shell'\
        '|petro'\
        '|macewen barrys'\
        '|pioneer'\
        '|essofowlers'

    ###################################################################################################################
    # assert that we're not double counting any charges by placing them in multiple categories
    ###################################################################################################################

    cats = {
        'on_the_go_coffee' : on_the_go_coffee,
        'take_out' : take_out,
        'bars_and_restaurants' : bars_and_restaurants,
        'clothing' : clothing,
        'grocery' : grocery,
        'gas' : gas
    }

    for cat1 in cats:
        
        for cat2 in cats:
            
            if cat1 == cat2:
                continue
            else:
                for merchant in cats[cat1].split('|'):
                    assert merchant not in cats[cat2].split('|'), \
                    f'{cat1} contains duplicate values with {cat2}, duplicate value: {merchant}'

                    
    ###################################################################################################################
    ### show the user what is in each category
    ###################################################################################################################

    # for i, cat in enumerate(cats):
        
    #     os.system('cls' if os.name == 'nt' else 'clear')
    #     print('\n')
            
    #     print(f'{cat}: \n')
    #     for item in cats[cat].split('|'):
            
    #         print(f'\t{item}')
            
    #     answer = input('\n Does this category look correct? (y/n): ')
    #     assert answer == 'y', 'Better change it!'

        
    ###################################################################################################################
    ### show the user what is leftover (i.e. what was not caught by their keywords)
    ###################################################################################################################

    # clear the terminal screen
    os.system('cls' if os.name == 'nt' else 'clear')

    # generate full list of categorized charges
    full_list = on_the_go_coffee + '|' + beer_and_weed + '|' + take_out + '|' + \
                bars_and_restaurants + '|' + clothing + '|' + grocery + '|' + gas

    # use this to find uncategorized charges
    tmp = df.loc[~df['Place'].str.lower().str.contains(full_list), ['Place', 'Amount']]
    tmp.index = np.arange(1, len(tmp) + 1)
    print('\n\n', f'{tmp.shape[0]} leftover (uncategorized) charges: \n\n', tmp, '\n\n')

    # check with user that the uncategorized charges are correct
    answer = input('These will all go into the misc category, anything in here that you would like to add to a category? (y/n): ')
    assert answer == 'n', 'Please add to the categories as needed'

    # touch up the final version of the misc charges
    misc_charges = tmp['Place'].unique()
    misc = ''

    for misc_charge in misc_charges:
        misc = misc + '|' + misc_charge
        
    misc = misc.strip('|').lower()

    ###################################################################################################################
    ### aggregate the totals by category, show final output
    ###################################################################################################################

    # clear the terminal screen
    os.system('cls' if os.name == 'nt' else 'clear')

    # aggregate regular categories
    cat_names = [cat for cat in cats]
    final_df = pd.DataFrame(columns=cat_names)

    # set up the category field for the 'all_charges' df
    df['category'] = ''

    # loop through the categories, add the total charges to the final_df, and add the category label to the 'all_charges' df
    for cat in cats:
        final_df.loc[0, cat] = df.loc[df['Place'].str.lower().str.contains(cats[cat]), 'Amount'].astype('float64').sum()
        df['category'] = np.where(df['Place'].str.lower().str.contains(cats[cat]),
                                    cat,
                                    df['category'])
    df['category'] = np.where(df['category'] == '', 'Misc', df['category'])

    # aggregate the misc category
    final_df.loc[0, 'misc'] = df.loc[df['Place'].str.lower().str.contains(misc), 'Amount'].astype('float64').sum()

    # do some final touch ups on the 'all_charges' df
    df['date'] = '2022-' + df['Date'].str[:3].str.lower() + '-' + df['Date'].str[3:]
    df['date'] = pd.to_datetime(df['date'])
    df = df.drop('Date', axis=1)
    df['days_into_period'] = (df['date'] - df['date'].min()).dt.days
    df['Amount'] = df['Amount'].astype('float')
    df = df[['date', 'days_into_period', 'Place', 'category', 'Amount']]

    # show output
    print('Final result: ', end='\n\n')
    for col in final_df.columns:
        charge = round(final_df[col][0])
        print(f'{col} : ${charge}.00')
    print('\n\n')

    return (df, final_df)

# if __name__ == '__main__':
#     df, final_df = extracting_credit_card_statements()

In [82]:
import pdfplumber as pdfp
import pandas as pd
import numpy as np
import warnings
import os
import time

warnings.filterwarnings('ignore')

def extracting_credit_card_statements():

    ###################################################################################################################
    # Get the year and month that the user wants to analyze
    ###################################################################################################################

    # clear the screen and get the input
    os.system('cls' if os.name == 'nt' else 'clear')

    year_ = '2022'
    month_ = '05'
    month = year_[-2:] + '-' + month_
    
    # re-structure the input in such a way that we can use it
#     if int(month_) < 10:
#         month_ = '0' + month_
#     month = year_[-2:] + '-' + month_

    print(f'\n Processing data for {month}... \n')
    time.sleep(2)

    ###################################################################################################################
    # loading the data
    ###################################################################################################################

    pages = []

    # open pdf statement, read each page and extract the text line by line, save results
    with pdfp.open(f'Credit Card Statements/{month}.pdf') as pdf:

        for page in pdf.pages:
            pages.append(page.extract_text().split('\n'))

        for i, page in enumerate(pages):
            if i == 0:
                df = pd.DataFrame(page, columns=['text'])
            else:
                new_df = pd.DataFrame(page, columns=['text'])
                df = pd.concat([df, new_df])

    ###################################################################################################################
    # some data cleaning
    ###################################################################################################################

    # drop null values
    df = df.dropna()

    # select only lines that have a date associated with them
    df = df.loc[df['text'].str[:3].str.contains('JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC'), :]

    # split the date, description and amount out into their own columns
    df['date'] = df['text'].str.split(' ').str[0]
    df['place'] = df['text'].str.split(' ').str[2]
    df['amount'] = df['text'].str.split(' ').str[3]

    # reset the index and drop unnecessary columns
    df = df.reset_index().drop(['index', 'text'], axis=1)
    df = df.rename(columns={
        'date' : 'Date', 
        'place' : 'Place', 
        'amount' : 'Amount'
    })

    # touch up the amount column so that it will be formatted as a number in google sheet
    df['Amount'] = df['Amount'].str.replace(',', '').str.replace('\$', '')

    ###################################################################################################################
    # exclude entries which are recurring payments
    ###################################################################################################################

    recurring_payments = 'spotify'\
                        '|fido'\
                        '|tsiinternet'\
                        '|insurancecompanymarkham'

    os.system('cls' if os.name == 'nt' else 'clear')
    print('Recurring payments (not included in analysis): ', end='\n\n')
    for item in recurring_payments.split('|'):
        print(f'\t{item}')

#     answer = input('\n Do the recurring payments look correct? (y/n): ')
#     assert answer == 'y', 'Better update it!'

    df = df.loc[~df['Place'].str.lower().str.contains(recurring_payments), :]

    ###################################################################################################################
    # show the list of charges and ask if anything should be ignored this time around
    ###################################################################################################################

    os.system('cls' if os.name == 'nt' else 'clear')

    print('\n All charges: ')
    pd.set_option('max_rows', None)
    print(df[['Place', 'Amount']])
    pd.reset_option('max_rows')

    payments_to_ignore = 'westjet'\
                        '|paybackwithpoints'\
                        '|bessadakia'\
                        '|payment-thankyou'\
                        '|bestseller'
    # payments_to_ignore = 'nothing to ignore!'

    print(f'\n\n Ignoring the following charges: \n')
    for charge in payments_to_ignore.split('|'):
        print(charge)

#     answer = input('\n Looking at the charges above, anything else you would like to add to the ignore list? (y/n): ')
#     assert answer == 'n', 'Add it into line 116!'

    df = df.loc[~df['Place'].str.lower().str.contains(payments_to_ignore), :]

    ###################################################################################################################
    # set up the keywords which will be used to group the charges
    ###################################################################################################################

    on_the_go_coffee = 'timhortons'\
                    '|starbuck'\
                    '|coffee'\
                    '|timothy'\
                    '|balzac'\
                    '|madawaskacoffee'\
                    '|secondcup'

    beer_and_weed = 'lcbo'\
                    '|beerstore'\
                    '|oneplant'

    take_out = 'subway'\
            '|domino'\
            '|a&w'\
            '|zoup'\
            '|amazing'\
            '|maestro'\
            '|emily'\
            '|vipei'\
            '|bigbrother'\
            '|booster'\
            '|freshly'\
            '|northwinds'\
            '|jerk'\
            '|milkylane'\
            '|burrito'\
            '|jusdanfoods'\
            '|cornwall'\
            '|bastard'\
            '|rolltation'\
            '|shawarma'\
            '|carleton'\
            '|doordash'\
            '|mcdonald'\
            '|caesar'\
            '|pizza'\
            '|papajohn'\
            '|shakeshack'\
            '|einsteinbrosbagel'\
            '|bagelsvancouver'

    bars_and_restaurants = 'jackastor'\
                        '|portly'\
                        '|oldestone'\
                        '|magwyer'\
                        '|kelsey'\
                        '|chuuk'\
                        '|lacarnita'\
                        '|smitty'\
                        '|milestone'\
                        '|wildwing'\
                        '|popeyes'\
                        '|prenup'\
                        '|moose'\
                        '|sabai'\
                        '|borealis'\
                        '|thepint'\
                        '|chicago'\
                        '|spaghetti'\
                        '|bmofield'\
                        '|yummykorean'\
                        '|eggsmart'\
                        '|aokcraft'\
                        '|legendsmusic'\
                        '|aramark'\
                        '|cineplex'\
                        '|bar-main'\
                        '|petitami'\
                        '|brewhouse'\
                        '|irishtimespub'\
                        '|poncho'\
                        '|kingsheadpub'\
                        '|thecentralseattle'\
                        '|chachalounge'\
                        '|jamcafe'\
                        '|superflux'\
                        '|mileoneeatinghouse'\
                        '|trattoria'

    clothing = 'zara'\
            '|h&m'\
            '|aeo'\
            '|softmoc'\
            '|vans'\
            '|jack&jones'\
            '|oldnavy'\
            '|sportchek'\
            '|winners'\
            '|spencergifts'\
            '|boathouse'

    grocery = 'rcss'\
            '|freshco'\
            '|wal-mart'\
            '|nofrills'\
            '|loblaws'\
            '|zehrs'\
            '|metro'\
            '|foodbasics'\
            '|nikufarms'\
            '|safeway'

    gas = 'shell'\
        '|petro'\
        '|macewen barrys'\
        '|pioneer'\
        '|essofowlers'

    ###################################################################################################################
    # assert that we're not double counting any charges by placing them in multiple categories
    ###################################################################################################################

    cats = {
        'on_the_go_coffee' : on_the_go_coffee,
        'take_out' : take_out,
        'bars_and_restaurants' : bars_and_restaurants,
        'clothing' : clothing,
        'grocery' : grocery,
        'gas' : gas,
        'beer_and_weed' : beer_and_weed
    }

    for cat1 in cats:
        
        for cat2 in cats:
            
            if cat1 == cat2:
                continue
            else:
                for merchant in cats[cat1].split('|'):
                    assert merchant not in cats[cat2].split('|'), \
                    f'{cat1} contains duplicate values with {cat2}, duplicate value: {merchant}'

                    
    ###################################################################################################################
    ### show the user what is in each category
    ###################################################################################################################

    # for i, cat in enumerate(cats):
        
    #     os.system('cls' if os.name == 'nt' else 'clear')
    #     print('\n')
            
    #     print(f'{cat}: \n')
    #     for item in cats[cat].split('|'):
            
    #         print(f'\t{item}')
            
    #     answer = input('\n Does this category look correct? (y/n): ')
    #     assert answer == 'y', 'Better change it!'

        
    ###################################################################################################################
    ### show the user what is leftover (i.e. what was not caught by their keywords)
    ###################################################################################################################

    # clear the terminal screen
    os.system('cls' if os.name == 'nt' else 'clear')

    # generate full list of categorized charges
    full_list = on_the_go_coffee + '|' + beer_and_weed + '|' + take_out + '|' + \
                bars_and_restaurants + '|' + clothing + '|' + grocery + '|' + gas

    # use this to find uncategorized charges
    tmp = df.loc[~df['Place'].str.lower().str.contains(full_list), ['Place', 'Amount']]
    tmp.index = np.arange(1, len(tmp) + 1)
    print('\n\n', f'{tmp.shape[0]} leftover (uncategorized) charges: \n\n', tmp, '\n\n')

    # check with user that the uncategorized charges are correct
#     answer = input('These will all go into the misc category, anything in here that you would like to add to a category? (y/n): ')
#     assert answer == 'n', 'Please add to the categories as needed'

    # touch up the final version of the misc charges
    misc_charges = tmp['Place'].unique()
    misc = ''

    for misc_charge in misc_charges:
        misc = misc + '|' + misc_charge
        
    misc = misc.strip('|').lower()

    ###################################################################################################################
    ### aggregate the totals by category, show final output
    ###################################################################################################################

    # clear the terminal screen
    os.system('cls' if os.name == 'nt' else 'clear')

    df['category'] = ''
    
    # aggregate regular categories
    cat_names = [cat for cat in cats]
    final_df = pd.DataFrame(columns=cat_names)
    for cat in cats:
        print(cat)
        final_df.loc[0, cat] = df.loc[df['Place'].str.lower().str.contains(cats[cat]), 'Amount'].astype('float64').sum()
        df['category'] = np.where(df['Place'].str.lower().str.contains(cats[cat]),
                                  cat,
                                  df['category'])
        
    df['category'] = np.where(df['category'] == '', 'Misc', df['category'])

    # aggregate the misc category
    final_df.loc[0, 'misc'] = df.loc[df['Place'].str.lower().str.contains(misc), 'Amount'].astype('float64').sum()

    # show output
    print('Final result: ', end='\n\n')
    for col in final_df.columns:
        charge = round(final_df[col][0])
        print(f'{col} : ${charge}.00')
    print('\n\n')

    return (df, final_df)

In [5]:
df, df2 = extracting_credit_card_statements()


 What year are you looking to analyze? 

2022

 What month are you looking to analyze? (1 - 12) 

8

 Processing data for 22-08... 

Recurring payments (not included in analysis): 

	spotify
	fido
	tsiinternet
	insurancecompanymarkham

 Do the recurring payments look correct? (y/n): y

 All charges: 
                                    Place    Amount
0   WAL-MARTSUPERCENTER#3635SCARBOROUGHON     33.17
1            UBERCANADA/UBERTRIPTORONTOON     23.27
2               MCDONALD'S#40427TORONTOON      1.31
3         EMILY'SPALACECARIBBEANVAUGHANON     19.78
4           WESTJET8382169513322CALGARYAB   1474.08
5           PETSMARTINC.2009SCARBOROUGHON     -9.03
7    PAYBACKWITHPOINTS/PAYEZAVECVOSPOINTS   -200.00
8      SHN-CENTENARYHOSPITALSCARBOROUGHON      8.00
9        PETROCAN-1514STEELESAVEVAUGHANON     56.98
10               GOOGLE*DOMAINSINTERNETNS     19.21
11        BESSADAKIAOFAJAXANDPPICKERINGON    500.00
12        DOMINOSPIZZA10323905-831-0030ON     29.82
15         WINNERSHOM

In [6]:
df.head()

Unnamed: 0,date,days_into_period,Place,category,Amount
0,2022-07-24,0,WAL-MARTSUPERCENTER#3635SCARBOROUGHON,grocery,33.17
1,2022-07-26,2,UBERCANADA/UBERTRIPTORONTOON,Misc,23.27
2,2022-07-26,2,MCDONALD'S#40427TORONTOON,take_out,1.31
3,2022-07-27,3,EMILY'SPALACECARIBBEANVAUGHANON,take_out,19.78
5,2022-07-27,3,PETSMARTINC.2009SCARBOROUGHON,Misc,-9.03


In [123]:
df['date'] = '2022-' + df['Date'].str[:3].str.lower() + '-' + df['Date'].str[3:]
df['date'] = pd.to_datetime(df['date'])
df = df.drop('Date', axis=1)
df['days_into_period'] = (df['date'] - df['date'].min()).dt.days
df['Amount'] = df['Amount'].astype('float')
df.head()

Unnamed: 0,Place,Amount,category,date,days_into_period
1,NOFRILLSDAVE'S#3924SCARBOROUGHON,24.57,grocery,2022-04-27,0
2,TIMHORTONS#8434SCARBOROUGHON,2.4,on_the_go_coffee,2022-04-27,0
3,PIONEER#385SCARBOROUGHON,65.59,gas,2022-04-29,2
4,TIMHORTONS#8434SCARBOROUGHON,5.64,on_the_go_coffee,2022-04-30,3
5,CHICAGOPUBANDBILLIARDSCAMBRIDGEON,69.52,bars_and_restaurants,2022-04-30,3


In [114]:
fig = go.Figure()

tmp = df.groupby('category').sum().reset_index()[['category', 'Amount']]
tmp['amount_txt'] = round(tmp['Amount'])
tmp['amount_txt'] = '$' + tmp['amount_txt'].astype('str') + '0'

trace1 = go.Bar(x=tmp['category'], y=tmp['Amount'], text=tmp['amount_txt'], textposition='outside')

fig.add_trace(trace1)

fig.update_layout(title='Visualization of Charges by Category',
                  updatemenus=[dict(
            type="buttons",
            buttons=[dict(label="Play",
                          method="animate",
                          args=[None])])]
    )
fig.update_yaxes(title='Charges')
fig.show()

In [144]:
tmp = df.groupby('category').sum().reset_index()[['category', 'Amount']]
tmp['amount_txt'] = round(tmp['Amount'])
tmp['amount_txt'] = '$' + tmp['amount_txt'].astype('str') + '0'
trace1 = go.Bar(x=tmp['category'], y=tmp['Amount'], text=tmp['amount_txt'], textposition='outside')

ymax = tmp['Amount'].max() * 1.2


frames_test = []
for i in range(32):
    tmp = df.loc[df['days_into_period'] <= i, :]
    tmp = tmp.groupby('category').sum().reset_index()[['category', 'Amount']]
    tmp['amount_txt'] = round(tmp['Amount'])
    tmp['amount_txt'] = '$' + tmp['amount_txt'].astype('str') + '0'
    
    frames_test.append(go.Frame(
        data=[
            go.Bar(x=tmp['category'], y=tmp['Amount'], text=tmp['amount_txt'], textposition='outside')
        ]))

In [146]:
ymax

396.9

In [147]:
fig = go.Figure(
    data=[trace1],
    layout=go.Layout(
        yaxis=dict(range=[0, ymax], autorange=False, zeroline=False),
        title_text="Kinematic Generation of a Planar Curve", hovermode="closest",
        updatemenus=[dict(type="buttons",
                          buttons=[dict(label="Play",
                                        method="animate",
                                        args=[None])])]),
    frames=frames_test
)
fig.show()

In [129]:
# we have a list of frames
s = np.linspace(-1, 1, 50)
xx = s + s ** 2
yy = s - s ** 2

frames = []
for k in range(50):
    frames.append(go.Frame(
            data=[
                go.Scatter(
                x=[xx[k]],
                y=[yy[k]],
                mode="markers",
                marker=dict(color="red", size=10))
            ]))


In [117]:
fig.add_trace(trace1)

fig.update_layout(title='Visualization of Charges by Category',
                  updatemenus=[dict(
            type="buttons",
            buttons=[dict(label="Play",
                          method="animate",
                          args=[None])])],
                  frames=[go.Frame(
            data=[go.Scatter(
                x=[xx[k]],
                y=[yy[k]],
                mode="markers",
                marker=dict(color="red", size=10))])

            for k in range(N)]
    )
fig.update_yaxes(title='Charges')
fig.show()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31


NameError: name 'N' is not defined