In [296]:
import numpy as np
import pandas as pd

import glob

import datetime

import re

# Chase statements

## functions

In [297]:
def add_20 (string):
    """
    Expand a 2-digit year at the end of a date string into its 20xx form.

    Takes (string) which is the full date string of format mm/dd/yy or mm/dd/yyyy.

    Returns the date as mm/dd/20yy when the year had 2 digits. Anything else
    (every 4-digit year, not only the 20xx ones, and strings too short to hold
    a year) is returned unchanged instead of falling through and returning None.
    """
    # a '/' three characters from the end means only two year digits follow it
    if len(string) >= 3 and string[-3] == '/':
        return string[:-2] + '20' + string[-2:]

    # already a 4 digit year (or not recognisably a 2 digit one): leave it alone
    return string

In [298]:
def get_dtime (string):
    """
    Parse a full date string into a datetime.datetime object.

    Thin wrapper around datetime.strptime so the conversion can be handed to
    .apply() inside pandas method chains.

    takes (string) which is the full date of format mm/dd/yyyy (a 2 digit year
    does not match the format and makes strptime raise ValueError)
    """
    date_format = '%m/%d/%Y'
    parsed = datetime.datetime.strptime(string, date_format)
    return parsed

In [299]:
def open_close (df_from_pdf):
    """
    Find the statement's opening and closing dates.

    Looks for the row whose 'Unnamed: 0' cell reads 'Opening/Closing Date' and
    expects the dates to sit in the next populated cell of that row, written as
    'mm/dd/yy - mm/dd/yy'.

    ---Takes---
    df_from_pdf : a whole pdf derived credit card statement as a dataframe

    ---Returns---
    dates  :  list with one parsed date per '-' separated piece, opening date
              first and closing date second. Each element is the first entry of
              the parsed column's .values array.
    """
    # work on a copy so the caller's frame is never touched
    statement = df_from_pdf.copy()

    # mask of the rows whose first column carries the label we are after
    is_period_row = statement['Unnamed: 0'] == 'Opening/Closing Date'

    # drop fully empty rows, keep the labelled row, then drop its empty cells
    period_row = statement.dropna(axis=0, how='all')
    period_row = period_row.loc[is_period_row, :]
    period_row = period_row.dropna(axis=1, how='all')

    # swap the default 'Unnamed: n' headers for plain positions:
    # [0] = 'Opening/Closing Date', [1] = 'mm/dd/yy - mm/dd/yy'
    period_row.columns = list(range(len(period_row.columns)))

    # one column per side of the '-'
    halves = period_row[1].str.split('-', expand=True)

    # strip the padding, widen the year to 4 digits, parse, and take the value
    return [
        halves[position].str.strip().apply(add_20).apply(get_dtime).values[0]
        for position in range(len(halves.columns))
    ]

In [300]:
def is_date_string (entry):
    """
    Tell whether entry is a string that starts with something shaped like 'mm/dd'.

    Only the beginning of the string is tested (1-2 digits, a '/', then a digit).
    Nothing is required of the rest, because statements sometimes concatenate a
    whole transaction into one long entry that merely begins with mm/dd.

    Returns True or False; anything that is not a str gives False.
    """
    # NaNs, numbers and the like can never be a date string
    if not isinstance(entry, str):
        return False

    # re.match anchors at the start of the string only
    return re.match(r"([0-9]{1,2}?)/([0-9]{1,2}?)", entry) is not None

In [301]:
def date_desc_price (row_of_df):
    """
    Label the populated cells of one transaction row as date / desc / price.

    takes a row of a dataframe (a Series) holding one transaction's information.
    The non-NaN cells are expected to be date, description, price, in that order.

    returns a dictionary {'date':date, 'desc':description, 'price':price} for use
    in making dataframes later. When fewer than 3 cells are populated only the
    leading keys are present; cells beyond the third are ignored.
    """
    # a row slice is a Series, so the NaNs are dropped along its index
    populated = row_of_df.dropna(axis='index', how='all')

    # zip stops at the shorter side, pairing each label with the cell in the same position
    labels = ('date', 'desc', 'price')
    return dict(zip(labels, populated))

In [302]:
def expand_all_one_cell (df_row):
    """
    Split an entry that has date, description and price all smashed into the
    first ('date') cell of the excel sheet.

    The text before the first space is the date, the text after the last space
    is the price, and everything in between (stripped) is the description.

    takes df_row, anything indexable by 'date' whose value is that single string

    returns a dictionary {'date':..., 'desc':..., 'price':...} that can be added
    into the cleaned transaction dataframe. price is a float when the last token
    parses as a number (thousands separators such as '1,401.21' are accepted);
    otherwise the raw trailing text is returned unchanged.

    raises IndexError when the entry contains no space at all, because there is
    then nothing to split on.
    """
    entry = df_row['date']

    first_space = entry.find(' ')
    if first_space == -1:
        raise IndexError('no space found in entry, cannot split date/desc/price: %r' % (entry,))
    last_space = entry.rfind(' ')

    date = entry[:first_space]
    desc = entry[first_space:last_space].strip()

    # keeps its leading space, exactly what is handed back if parsing fails
    raw_price = entry[last_space:]

    try:
        # statements print large amounts as '1,401.21', which float() rejects
        price = float(raw_price.replace(',', ''))
    except ValueError:
        # not a number: leave it for the caller to look at
        price = raw_price

    return {'date': date, 'desc': desc, 'price': price}

In [303]:
def expand_2_in_one (df_row):
    """
    Split an entry whose 'date' cell holds both the date and the description,
    with the price having landed in the 'desc' cell.

    The text before the first space of the 'date' cell is the date; everything
    after it (stripped) is the description. The 'desc' cell is passed through
    untouched as the price.

    returns a dictionary {'date':..., 'desc':..., 'price':...}
    """
    squished = df_row['date']

    # walk to the first space; if there is none there is nothing to split on
    for split_at, character in enumerate(squished):
        if character == ' ':
            break
    else:
        raise IndexError('list index out of range')

    return {
        'date': squished[:split_at],
        'desc': squished[split_at:].strip(),
        'price': df_row['desc'],
    }

In [304]:
def remove_n_line (df):
    """
    Trim everything from the first newline onwards in each 'date' cell.

    Sometimes the first entry in the date column is concatenated with a newline
    ('\n') and the word Purchases; only the part before the newline is the date.

    The frame is modified in place and also returned.
    """
    for label in df.index:
        cell = df.loc[label, 'date']

        # nothing to trim on this row
        if '\n' not in cell:
            continue

        df.loc[label, 'date'] = cell.partition('\n')[0]

    return df

In [305]:
def extract_all_date_desc_price (raw_df):
    """
    Pull the transaction rows out of a raw statement dataframe.

    takes a raw statement df read in from the xlsx file. Rows whose first column
    ('Unnamed: 0') starts like a 'mm/dd' date are treated as transactions. Each
    such row is passed through date_desc_price() to label its populated cells,
    and newline junk in the date cell is trimmed with remove_n_line().

    returns a dataframe with just the transaction information. Because of
    variable formatting in the parent xlsx some rows will still have fields
    squished together; those are corrected in a later step.
    """
    # boolean mask: True where the first column looks like a date
    looks_like_transaction = raw_df['Unnamed: 0'].apply(is_date_string)

    # transaction rows only, without all-NaN columns, on a fresh 0..n-1 index
    transactions = raw_df.loc[looks_like_transaction, :]
    transactions = transactions.dropna(axis='columns', how='all')
    transactions = transactions.reset_index(drop=True)

    # one {'date', 'desc', 'price'} dictionary per transaction row
    records = [date_desc_price(transactions.loc[label]) for label in transactions.index]

    return remove_n_line(pd.DataFrame(records))

In [306]:
def correct_squishing_errors (date_desc_price_df):
    """
    Repair rows whose fields were concatenated into the cells to their left.

    There is variability in the parent xlsx documents that concatenates the
    description, and sometimes the price, onto the date column. Those rows show
    up with NaNs (or with missing columns) in the transactions dataframe, and
    the NaN pattern decides which correction applies:

      date, desc, price all present      -> row is kept as is
      price missing (or no price column) -> expand_2_in_one()
      only the date cell populated       -> expand_all_one_cell()

    takes date_desc_price_df, the frame produced by extract_all_date_desc_price()

    returns a dataframe with all the same transactions but with the concat
    errors fixed. Corrected rows keep their original index label but are placed
    after the untouched rows, so the index is out of order; sort to fix.
    """
    corrected_rows = []
    corrected_labels = []

    for label, row in date_desc_price_df.iterrows():

        #NaN's aren't equal to themselves so this test will find them
        #without triggering the weird np.isnan error when checking strings
        present = [item == item for item in row]

        #date and desc squashed together, price is in desc. [True, True] is the
        #same situation when no row in the statement had a third cell at all
        if present in ([True, True, False], [True, True]):
            corrected_rows.append(expand_2_in_one(row))
        #all are squashed in first column. [True] is the same situation when
        #the frame only has the one column
        elif present in ([True, False, False], [True]):
            corrected_rows.append(expand_all_one_cell(row))
        #complete row, or a pattern there is no correction for
        else:
            continue

        corrected_labels.append(label)

    #append every corrected row in one go instead of growing the frame per row
    if corrected_rows:
        corrected = pd.DataFrame(corrected_rows, index=corrected_labels)
        date_desc_price_df = pd.concat([date_desc_price_df, corrected], axis='index', sort=False)

    #the broken originals still have a NaN somewhere, so this removes them
    return date_desc_price_df.dropna(axis='index', how='any')

In [307]:
def remove_YEN (trans_df):
    """
    Drop the rows that mention "YEN" in any cell.

    due to japan trip, there are some lines that have a date and the word "YEN"
    in them, but arent transactions.

    takes trans_df, the transactions dataframe

    returns a new dataframe without those rows; the input frame is left
    untouched. A row with "YEN" in several cells is removed once (previously the
    second hit tried to drop the already-dropped label and raised KeyError).
    """
    #one keep/discard decision per row, however many cells contain the word
    keep = [
        not any('YEN' in str(cell) for cell in row)
        for _, row in trans_df.iterrows()
    ]

    #copy so later column assignments on the result don't hit a slice of trans_df
    return trans_df.loc[keep].copy()

In [308]:
def complete_trans_date (trans_df):
    """
    Give every transaction date its year and convert the column to datetimes.

    transaction dates dont have years right now ('mm/dd'), and a statement can
    straddle new year (some are in 12/dd/2016, some are in 01/dd/2017). The year
    is taken from the statement's 'st open' / 'st close' columns, read from the
    first row by position so the index labels do not matter.

    takes trans_df with a string 'date' column plus 'st open' and 'st close'
    datetime columns. The 'date' column is overwritten in place.

    returns the same dataframe with 'date' as datetimes. An empty frame is
    returned unchanged.

    raises ValueError when the closing year is not the opening year or the one
    after it.
    """
    #nothing to complete, and no first row to read the statement period from
    if len(trans_df) == 0:
        return trans_df

    #positional access: label 0 may have been dropped or reordered upstream
    close_date = trans_df['st close'].iloc[0]
    open_date = trans_df['st open'].iloc[0]

    close_year = close_date.year
    open_year = open_date.year

    #check if this statement spans the change of a year
    year_diff = close_year - open_year

    #the statement stays within the same year
    if year_diff == 0:
        #doesn't matter which year to append
        trans_df['date'] = trans_df['date'] + '/' + str(close_year)

    #we change over years in this statement
    elif year_diff == 1:
        #compare month numbers so '1/05' is recognised as january just like '01/05'
        months = trans_df['date'].str.split('/').str[0].astype(int)

        #months up to the closing month belong to the new year, the rest to the old
        in_close_year = months <= close_date.month
        years = in_close_year.map({True: str(close_year), False: str(open_year)})

        trans_df['date'] = trans_df['date'] + '/' + years

    else:
        raise ValueError('this statement appears to span multiple years, that doesnt make sense')

    trans_df['date'] = pd.to_datetime(trans_df['date'], format='%m/%d/%Y')

    return trans_df

In [309]:
def munge_one_statement (filepath):
    """
    Turn one statement workbook into a cleaned transactions dataframe.

    takes filepath, handed straight to pd.read_excel

    returns a dataframe of the statement's transactions with the squishing
    errors corrected, the statement's opening and closing dates attached as
    'st open' / 'st close', the "YEN" lines removed, and 'date' completed with
    its year and converted to datetimes.
    """
    raw_statement = pd.read_excel(filepath)

    # [open, close] of the statement period
    statement_dates = open_close(raw_statement)

    # labelled but still error prone transactions, then the concatenation fixes
    transactions = correct_squishing_errors(extract_all_date_desc_price(raw_statement))

    # tie every transaction to the statement it came from
    transactions['st open'] = statement_dates[0]
    transactions['st close'] = statement_dates[1]

    # "YEN" lines aren't transactions; what is left gets its full datetime date
    return complete_trans_date(remove_YEN(transactions))

In [310]:
def all_sts_together (directory_with_st_xlsx):
    """
    Munge every statement workbook in a directory and stack the results.

    takes directory_with_st_xlsx, a directory path; the files picked up are the
    ones matching '*x8723-.xlsx' directly inside it

    returns one dataframe with the transactions of all statements concatenated
    row-wise, in whatever order glob lists the files
    """
    pattern = directory_with_st_xlsx + '/*x8723-.xlsx'

    per_statement = [munge_one_statement(path) for path in glob.glob(pattern)]

    return pd.concat(per_statement, axis=0)

### work

In [333]:
# Folder holding the statement workbooks. Despite the 'csv' folder name,
# all_sts_together() searches it for files ending in 'x8723-.xlsx'.
dir_path = '../../../Finances/cc info/chase(-8723) statements/csv'

In [334]:
# Munge every matching statement workbook and stack the results into one frame.
hist = all_sts_together(dir_path)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [335]:
# The last statement came out buggy, so its info was pulled out by hand into a
# clean workbook. Only the date-completion step is applied to it here, so the
# workbook has to supply the 'date', 'st open' and 'st close' columns that
# complete_trans_date() reads.
last = complete_trans_date(pd.read_excel('../../../Finances/cc info/chase(-8723) statements/csv/clean st 20181008.xlsx'))

In [336]:
# Put together: append the hand-cleaned last statement to the parsed history.
full_hist = pd.concat([hist, last], axis=0)

In [337]:
# Order the transactions by date, renumber the rows from 0, and tag every row
# with the card it belongs to.
full_hist = full_hist.sort_values(by='date').reset_index(drop=True)
full_hist['card'] = 'chase -8723'

In [342]:
# Spot check: show the rows whose description is the automatic payment line.
full_hist.loc[full_hist['desc'] == 'AUTOMATIC PAYMENT - THANK YOU', :]

Unnamed: 0,date,desc,price,st open,st close,card
28,2016-08-05,AUTOMATIC PAYMENT - THANK YOU,-285.91,2016-07-09,2016-08-08,chase -8723
46,2016-09-05,AUTOMATIC PAYMENT - THANK YOU,-1401.21,2016-08-09,2016-09-08,chase -8723
70,2016-10-05,AUTOMATIC PAYMENT - THANK YOU,-333.94,2016-09-09,2016-10-08,chase -8723
88,2016-11-04,AUTOMATIC PAYMENT - THANK YOU,-999.56,2016-10-09,2016-11-08,chase -8723
107,2016-12-05,AUTOMATIC PAYMENT - THANK YOU,-453.86,2016-11-09,2016-12-08,chase -8723
141,2017-01-05,AUTOMATIC PAYMENT - THANK YOU,-395.25,2016-12-09,2017-01-08,chase -8723
164,2017-02-05,AUTOMATIC PAYMENT - THANK YOU,-695.5,2017-01-09,2017-02-08,chase -8723
183,2017-03-05,AUTOMATIC PAYMENT - THANK YOU,-833.92,2017-02-09,2017-03-08,chase -8723
210,2017-04-05,AUTOMATIC PAYMENT - THANK YOU,-967.29,2017-03-09,2017-04-08,chase -8723
224,2017-05-05,AUTOMATIC PAYMENT - THANK YOU,-1126.64,2017-04-09,2017-05-08,chase -8723


In [340]:
# Write the combined history to CSV, leaving out the row index.
full_hist.to_csv('../../../Finances/cc info/chase(-8723) statements/chase_up_to_20181008.csv', index=False)