# Prepare dataset
Check whether "`Dataset.csv`" exists; if it does, move on to training, otherwise do the following.
Read the separate CSV exports from CBA, Bendigo and ANZ. Give them common column names (date, amount and "description"), and attach the respective bank-identification label to every row. Then combine and shuffle the rows, and re-export the result as "`Dataset.csv`".

## All python imports and globals

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import datetime as dt
import os.path
import parseutils as pu

# Raw per-bank CSV exports and the combined, cached dataset they are merged into.
DATA_FILE_CBA = '../data/CBA.csv'
DATA_FILE_BENDIGO = '../data/Bendigo.csv'
DATA_FILE_ANZ = '../data/ANZ.csv'
DATASET_FILE = '../data/Dataset.csv'

# Column names assigned to the header-less raw files, plus the label column.
COL_NAME_DATE = 'Date'
COL_NAME_AMOUNT = 'Amount'
COL_NAME_DESC = 'Description'
COL_NAME_BANK = 'Bank'

# Values stored in the COL_NAME_BANK column to identify the source bank.
CSV_TYPE_CBA = 'CBA'
CSV_TYPE_BENDIGO = 'BENDIGO'
CSV_TYPE_ANZ = 'ANZ'
CSV_TYPE_UNKNOWN = 'UNKNOWN'


Check if data-set file already exists

## Utility function to build dataset

Build the dataset by combining all the raw data, normalizing the column names, and adding a label that identifies the bank.

In [2]:
def buildDataSet(random_state=None):
    """Combine the raw bank exports into one shuffled, labelled DataFrame.

    Each of the three raw CSV files (CBA, Bendigo, ANZ) is read without a
    header row, given the common column names Date / Amount / Description,
    and tagged with its bank label in the ``Bank`` column. The frames are
    then concatenated and their rows shuffled.

    Parameters
    ----------
    random_state : int or numpy.random.RandomState, optional
        Seed forwarded to ``DataFrame.sample`` so the shuffle can be
        reproduced. The default (``None``) keeps the shuffle unseeded.

    Returns
    -------
    pandas.DataFrame
        All rows from the three files in shuffled order, with columns
        Date, Amount, Description, Bank and a fresh 0..n-1 index.

    Notes
    -----
    Assigning the column names fails inside pandas (length mismatch) if a
    raw file does not contain exactly three columns.
    """
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print("Building dataset...")

    columns = [COL_NAME_DATE, COL_NAME_AMOUNT, COL_NAME_DESC]
    sources = [
        (DATA_FILE_CBA, CSV_TYPE_CBA),
        (DATA_FILE_BENDIGO, CSV_TYPE_BENDIGO),
        (DATA_FILE_ANZ, CSV_TYPE_ANZ),
    ]

    def loadLabelled(path, bankLabel):
        """Read one header-less raw file, name its columns and label the bank."""
        df = pd.read_csv(path, header=None)
        df.columns = columns
        df[COL_NAME_BANK] = bankLabel
        return df

    frames = [loadLabelled(path, bankLabel) for path, bankLabel in sources]

    # Each raw frame carries its own 0..n-1 index, so the concatenated frame
    # has repeated labels; reset after shuffling so every row has a unique
    # label, matching what pd.read_csv yields when the cached file is loaded.
    shuffled = pd.concat(frames).sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)


    

## Build dataset if necessary


In [5]:
# Reuse the cached dataset when it is already on disk; otherwise build it
# from the raw bank exports and cache it for the next run.
# print(...) with a single pre-formatted string behaves identically on
# Python 2 and Python 3, so the cell no longer needs a Python 2 kernel.
DATASET_FILE_EXISTS = os.path.isfile(DATASET_FILE)
dataset = None
if DATASET_FILE_EXISTS:
    print("Dataset file %s exists. \nLoading to pandas" % (DATASET_FILE))
    dataset = pd.read_csv(DATASET_FILE)
else:
    print("Dataset file doesn't exist. Building it from raw files")
    dataset = buildDataSet()
    print("Dataset built...")
    print(dataset.head(10))
    print("Saving to file...%s" % (DATASET_FILE))
    # index=False: only the data columns are cached, not the row labels.
    dataset.to_csv(DATASET_FILE, header=True, index=False)
    # Re-check the path so the message reports whether the file is now on disk.
    print("Saved: %s" % (os.path.isfile(DATASET_FILE)))


Dataset file ../data/Dataset.csv exists. 
Loading to pandas
