In [339]:
import numpy as np
import pandas as pd

import glob

import datetime

import re

# Chase statements

## use glob to get all the statements

In [340]:
dir_path = '../../../Finances/cc info/chase(-8723) statements/csv'

sts = glob.glob(dir_path + '/*x8723-.xlsx')

In [341]:
#test example
sts[0]

'../../../Finances/cc info/chase(-8723) statements/csv\\20160708-statements-x8723-.xlsx'

## functions

In [342]:
def add_20 (string):
    """
    makes 2 digit 2000's years ('17 or '10) into the 4 digit equivalent. If it's already 4 digit, leave it alone.
    
    Takes (string) which is the full date string of format mm/dd/yy or mm/dd/yyyy
    """
    #check if it's a 2 digit year
    if string[-3] == '/':
        
        return string[:-2] + '20' + string[-2:]
    
    #if it's already a 4 digit year, return as is
    elif string[-3] == '0':
        
        return string

In [343]:
def get_dtime (string):
    """
    converts strings to datetime objects using datetime.strptime mostly just wrote this so I could .apply() it in pandas
    method chains
    
    takes (string) which is the full date of format mm/dd/yyyy (can't be 2 digit year)
    """
    return datetime.datetime.strptime(string, '%m/%d/%Y')

In [344]:
def open_close (df_from_pdf):
    """
    produces a list of series slices [opening datetime, closing datetime] by searching for the (hopefully)
    single row where "opening/closing date" is listed, hopefully with the actual values as a separate cell
    than "opening/closing date" string
    ---Takes---
    df_from_pdf : a whole pdf derived credit card statement as a dataframe
    
    ---Returns---
    dates  :  [opening datetime, closing datetime] as series slices (I hope this is ok)
    """
    
    #copy so not mod orig
    df = df_from_pdf.copy()
    
    #get just the actual data in the row with opening and closing date
    df = df.dropna(axis=0, how='all').loc[df['Unnamed: 0'] == 'Opening/Closing Date', :].dropna(axis=1, how='all')
    
    #columns are default column titles that are annoying, rename to numbers
    df.columns = [i for i in range(len(df.columns))]
    
    #df now has 2 columns [0] = 'opening/closing date', [1] = 'mm/dd/yy - mm/dd/yy'
    open_close = df[1].str.split('-', expand=True)
    
    dates = []
    for i in range(len(open_close.columns)):
        dates.append(open_close[i].str.strip().apply(add_20).apply(get_dtime))
    
    #dates has open date first and close date second
    return dates

In [345]:
def is_date_string (entry):
    
    """
    decides whether or not a string matches the pattern 'mm/dd...'
    just checks if that patterns is present at the beginning of the string, doesnt check for end of string
    because statements sometimes have concatenated a bunch of strings into one giant entry that just begins
    with mm/dd
    """
    
    if isinstance(entry, str):
    
        bb = re.match(r"([0-9]{1,2}?)/([0-9]{1,2}?)", entry)

        if bb:
            return True
        else:
            return False
    else:
        return False

In [346]:
def date_desc_price (row_of_df):
    
    """
    takes a row of a dataframe containing one transaction's information
    the row must have at least 3 separate columns (others holding NaNs) with date, description, price
    they must be in that order
    
    returns a dictionary with {'date':date, 'desc':description, 'price':price} for use in making dataframes later
    """
    
    #have to drop on index because a row slice is also a series object which doesn't have multiple columns
    row = row_of_df.dropna(axis='index', how='all')
    
    #create tuples that pair date/desc/price to the appropriate item extracted from the row
    #and make a dictionary where date/desc/price : appropriate data. This makes it easier to
    #make dataframes later
    date_desc_price_dict = {i:j for (i,j) in zip(['date', 'desc', 'price'], [item for item in row])}
    

    return date_desc_price_dict

In [355]:
def expand_all_one_cell (df_row):
    """
    if an entry has all the info smashed into the first ('date') cell in the excel sheet,
    this function pulls out the date, desc and price and returns a dictionary that can be added
    into the cleaned transaction dataframe
    """
    
    entry = df_row['date']
    
    sp = ' '
    spaces = [pos for (pos, char) in enumerate(entry) if char == sp]
    
    first_space = spaces[0]
    last_space = spaces[-1]
    
    date = entry[:first_space]
    try:
        price = float(entry[last_space:])
    except:
        price = entry[last_space:]
    desc = entry[first_space:last_space].strip()
    
    return {i:j for (i,j) in zip(['date', 'desc', 'price'], [date, desc, price])}

In [348]:
def expand_2_in_one (df_row):
    
    entry = df_row['date']
    
    sp = ' '
    spaces = [pos for (pos, char) in enumerate(entry) if char == sp]
    
    first_space = spaces[0]
    
    date = df_row['date'][:first_space]
    desc = df_row['date'][first_space:].strip()
    price = df_row['desc']
    
    return {i:j for (i,j) in zip(['date', 'desc', 'price'], [date, desc, price])}

In [371]:
#open data
df = pd.read_excel(sts[2])

#get the opening and closing dates as a list [open, close]
o_c = open_close(df)

In [372]:
o_c

[49   2016-08-09
 Name: 0, dtype: datetime64[ns], 49   2016-09-08
 Name: 1, dtype: datetime64[ns]]

In [373]:
#get the indices in the statement where the first columns entry is a date, which should identify only transaction rows
indx_where_trans = df['Unnamed: 0'].apply(is_date_string)

#get these transactions as a slice with a fresh index and no junky NaNs
just_trans = (df.loc[indx_where_trans, :]
              .dropna(axis='columns', how='all')
              .reset_index(drop=True)
             )

#make a list of dictionaries that hold all the date/desc/price info
trans_dicts=[]
for row in range(len(just_trans.index)):

    trans_dicts.append(date_desc_price(just_trans.loc[row]))

In [374]:
#make a nice clean dataframe with the information you want
trans_clean = pd.DataFrame(trans_dicts)

In [375]:
trans_clean

Unnamed: 0,date,desc,price
0,09/04 DELTA A...,-14.9,
1,09/05 AUTOMAT...,-1401.21,
2,08/15 RALPHS ...,62.39,
3,08/18 SPIRIT ...,213.09,
4,08/23 ALDI 79...,30.41,
5,08/26 WHOLEFD...,12.04,
6,08/27 LA TRAN...,4.5,
7,08/27,ARCO #42345 AMPM DANA POINT CA,21.6
8,08/29,UBER *US AUG28 Z2C7P 866-576-1039 CA,5.15
9,08/29,DOOMIE`S HOME COOKIN` LOS ANGELES CA,40.42


In [376]:
for index in trans_clean.index:
    
    h = trans_clean.loc[index]

    #NaN's aren't equal to each other so this test will check for them
    #without triggering the weird np.isnan error when checking strings
    test = [item==item for item in h]

    #complete row
    if test == [True, True, True]:
        pass
    #date and desc squashed together, price is in desc
    elif test == [True, True, False]:
        #write function for this
        trans_clean = pd.concat([trans_clean, pd.DataFrame(expand_2_in_one(h), index=[index])], axis='index')
    #all are squashed in first column
    elif test == [True, False, False]:
        trans_clean = pd.concat([trans_clean, pd.DataFrame(expand_all_one_cell(h), index=[index])], axis='index')

In [379]:
trans_clean.dropna(axis='index', how='any')

Unnamed: 0,date,desc,price
7,08/27,ARCO #42345 AMPM DANA POINT CA,21.6
8,08/29,UBER *US AUG28 Z2C7P 866-576-1039 CA,5.15
9,08/29,DOOMIE`S HOME COOKIN` LOS ANGELES CA,40.42
10,08/28,MTA - WIRELESS HANDHELDS LOS ANGELES CA,1.75
11,08/30,DELTA AIR 0062113362008 ATLANTA GA,103.53
0,09/04,DELTA AIR 0062113811452 ATLANTA GA,-14.9
1,09/05,AUTOMATIC PAYMENT - THANK YOU,-1401.21
2,08/15,RALPHS #0021 SOUTH PASADEN CA,62.39
3,08/18,SPIRIT AIRL 4870134791383 800-7727117 FL,213.09
4,08/23,ALDI 79109 06649780 ARCADIA CA,30.41
