In [83]:
import os
import re
import boto3
import json
import time
import requests
import datetime

import pandas as pd
import numpy as np

from sagemaker.session import Session
from bs4 import BeautifulSoup

## Accounting term matching
**Check whether the balance sheet reports totals (e.g. Total Liabilities & Shareholders' Equity) or sub-totals (e.g. Total Financial Instruments). These figures are not needed for constructing the unstructured database and are removed to avoid classification issues.**

In [121]:
def totals_check(df:pd.DataFrame, tol:float=0.0004, verbose:bool=True) -> pd.DataFrame:
    """
    Removes rows whose value equals (or nearly equals) the running sum of the rows directly above 
    them, i.e. rows that behave like major or minor totals of the preceding line items
    ------------------------------------------------------------------------------------
    :param: df (type pandas.DataFrame)
        A DataFrame that represents the Asset or Liability & Equity portion of the balance sheet.
        The first column holds the line-item names and the second column holds the values. Rows are
        addressed by label (df.loc[i] for i in 0..m-1), so the default integer index is expected
    :param: tol (type float)
        Relative tolerance used when a total differs from the lookback sum by one character
    :param: verbose (type bool)
        If True, print a message for every row that is dropped
        
    :return: (type pandas.DataFrame)
        Return a cleaned DataFrame that strips the rows that represent totals. The input frame is
        not modified. A frame with fewer than two columns is returned unchanged
    """
    # without a values column there is nothing to compare, hand the frame back untouched
    if df.shape[1] < 2:
        return df
    
    m = df.shape[0]    # number of line items
    
    def multiple_check(x1:float, x2:float) -> bool:
        """
        Determine whether the two values are the same number scaled by a power of 10, or whether 
        x2 is x1 with exactly one character missing
        """
        # prevent zero division error since x1 is the denominator and log10 zero division error
        if (x1 == 0) or (x2 == 0): 
            return False
        
        # if our backward sum is a power-of-10 multiple of the line item, we return True 
        # (e.g. Total Assets (x1) 745.2322 vs Backward Sum (x2) 7452322)
        # NOTE: a negative ratio yields NaN, for which is_integer() is False
        check1 = np.log10(x2 / x1).is_integer()
        
        # if our backward sum is a substring of a line item, with a difference of one in length
        # (e.g. Total Assets (x1) 174182935 vs Backward Sum (x2) 74182935)
        check2 = (str(x2) in str(x1)) and (len(str(x2)) == len(str(x1)) - 1)
        
        return bool(check1 or check2)
    
    def epsilon_error(x1:float, x2:float, tol:float=0.01) -> bool:
        """
        Determine whether the two values differ in exactly one character position of their string
        form and are within a relative tolerance (default 1%) of one another
        """
        if (x1 == 0) or (x2 == 0): 
            return False
        
        # first we convert the numeric quantities into strings
        current = str(x1)
        lookback = str(x2)
        
        # we only compare numbers of equal length (one element of the number read wrong)
        if len(current) != len(lookback):
            return False
        
        # count the positions at which the two strings disagree
        mismatches = sum(1 for a, b in zip(current, lookback) if a != b)
        if mismatches != 1:
            return False
        
        # check whether the relative difference falls within the tolerance
        return bool(abs(abs(x1 - x2) / x1) <= tol)
    
    data_col = df.columns[1]         # the values column for balance sheet
    
    # np.log10 of a negative ratio raises an 'invalid' RuntimeWarning; we silence it only for the
    # duration of this check so the caller's numpy error settings are left as they were
    with np.errstate(invalid='ignore'):
        
        # iterate through each of the line items
        for i in range(m):
            # value and name of the line item at the current (forward) index
            current_row = df.loc[i].values
            item1 = current_row[1]
            name = current_row[0]
            
            # compute backward sum (lookback index), widening the window one row at a time
            for j in range(i):
                
                # NOTE: Index position (i-1)   = line above current line
                #                      (i-j-1) = first line of the lookback window
                lookback = df.loc[i-j-1:i-1][data_col]
                
                # the lookback window is empty when all of its rows were already dropped
                if lookback.empty:
                    continue
                
                # backward sum for line items (index minus j-periods before)
                item2 = lookback.sum()
                
                # if any check passes we strip the total, no need to continue the backward sum
                if (item1 == item2) or multiple_check(item1, item2) or epsilon_error(item1, item2, tol=tol):
                    df = df.drop(index=i)
                    
                    if verbose:
                        print('We dropped row {}, {}, with lookback window of {}.'.format(i, name, j+1))
                        print('\tOur row is valued at {}, our lookback sum is {}'.format(item1, item2))
                    
                    # row i no longer exists, stop widening the window
                    break     
                
    return df

## Unstructured Database construction
**We develop our unstructured database from each of the non-total rows (concatenating the line items)**

In [122]:
def unstructured_data(df, filing_d, fiscal_y, cik, cik2name:dict) -> pd.DataFrame:
    """
    Forms unstructured row for larger database to be stored in s3 bucket
    ------------------------------------------------------------------------------------------
    Input:
        :param: df (type pandas.DataFrame)
            The balance sheet for a particular filing; the first column holds the line-item names
            and the remaining column(s) hold the values
        :param: filing_d (type str)
            The filing date for release of X-17A-5 filings for a broker dealer e.g. 2013-03-21
        :param: fiscal_y (type str)
            The fiscal year for the balance sheet to cover e.g. 2012 (usually 1-year prior to filing date)
        :param: cik (type str)
            The CIK number for a broker dealer e.g. 887767
        :param: cik2name (type dict)
            A dictionary whose 'broker-dealers' entry maps CIK to Broker Dealer names 
    Output:  
        :return: (type pandas.DataFrame or None)
             Return a transposed dataframe with additional columns corresponding to filing data, 
             or None (after printing a notice) when the balance sheet has fewer than two columns
    Raises:
        KeyError if the CIK is not present in cik2name['broker-dealers']
    """
    
    # a usable balance sheet needs a line-item column and at least one values column; 
    # this is checked before touching df.columns so an empty frame is reported, not raised
    if df is None or len(df.columns) < 2:
        print('{}-{}.csv - encountered issue reading PDF'.format(cik, filing_d))
        return None
    
    # the first column holds the line items
    first_column = df.columns[0]
    
    # transpose split balance sheet figure (our line items are now columns)
    # we first groupby the first column (this becomes index) and sum to group congruent names
    row = df.groupby(first_column).sum().T

    # creating additional columns in row
    row['CIK'] = cik                                  # CIK number for firm 
    row['Filing Date'] = filing_d                     # Filing Date for firm filing
    row['Filing Year'] = fiscal_y                     # Year for balance sheet filing
    row['Name'] = cik2name['broker-dealers'][cik]     # returns the name associated with the CIK
    
    return row
    

In [123]:
def totals_flag(df:pd.DataFrame):
    """
    Check whether any column name (line item) of the unstructured row mentions a total
    ------------------------------------------------------------------------------------------
    Input:
        :param: df (type pandas.DataFrame or None)
            The unstructured row whose columns are the line items; None is accepted because 
            unstructured_data returns None for balance sheets it could not process
    Output:
        :return: (type bool)
            True if at least one column name contains 'total' (case-insensitive), otherwise False.
            A True result signals that structuring should be re-run on the PNG rather than the PDF
    """
    # nothing to inspect when the balance sheet could not be processed
    if df is None:
        return False
    
    # column labels are converted to str so non-string labels do not break the regex search
    return any(re.search('total|totals', str(col), flags=re.I) is not None for col in df.columns)

In [124]:
def reorder_columns(df:pd.DataFrame) -> pd.DataFrame:
    """
    Move the CIK, Name, Filing Date and Filing Year columns to the front of the completed 
    DataFrame and discard every column that contains only NaN values
    ------------------------------------------------------------------------------------------
    Input:
        :param: df (type pandas.DataFrame)
            The unstructured database for balance sheet figures
    Output:  
        :return: (type pandas.DataFrame)
            Return a dataframe with dimensions less than or equal to input dataframe (MxN) -> (MxK), 
            where K <= N
    """
    # identifying columns that lead the frame, in their final order
    leading = ['CIK', 'Name', 'Filing Date', 'Filing Year']
    
    # every remaining column keeps its original relative position
    trailing = df.columns[~df.columns.isin(leading)]
    ordered = df[leading + list(trailing)]

    # flag the columns that are NaN in every row and keep the others
    all_nan = ordered.isnull().all()
    populated = all_nan.index[~all_nan]

    return ordered[populated]

## Final Main Execution

In [125]:
def _parse_filing_name(s3_key):
    """Split an s3 key ending in e.g. '1224385-2005-03-01.csv' into (file name, CIK, filing date, fiscal year)."""
    file_name = s3_key.split('/')[-1]            # e.g. '1224385-2005-03-01.csv'
    parts = file_name[:-4].split('-')            # drop the last four characters (i.e. .csv)
    filing_date = '-'.join(parts[1:])            # join YYYY-mm-dd component for filing date
    fiscal_year = int(parts[1]) - 1              # fiscal year is taken as the year before the filing
    return file_name, parts[0], filing_date, fiscal_year


def _balance_sheet_row(s3, bucket, s3_key, filing_date, fiscal_year, cik, cik2name, local_file='temp.csv'):
    """Download one balance-sheet CSV, strip its total rows and return its unstructured row (or None)."""
    try:
        s3.download_file(bucket, s3_key, local_file)
        sheet = pd.read_csv(local_file)
    finally:
        # the local copy is only a transport vehicle, remove it even when reading fails
        if os.path.exists(local_file):
            os.remove(local_file)
    
    return unstructured_data(totals_check(sheet), filing_date, fiscal_year, cik, cik2name)


def _build_rows(s3, bucket, pdf_paths, png_folder, cik2name, label):
    """Build the list of unstructured rows for every PDF-derived CSV, retrying on the PNG-derived CSV when totals remain."""
    rows = []
    
    for idx, pdf_key in enumerate(pdf_paths):
        file_name, cik, filing_date, fiscal_year = _parse_filing_name(pdf_key)
        row = _balance_sheet_row(s3, bucket, pdf_key, filing_date, fiscal_year, cik, cik2name)
        
        # if a totals category was found then we look to run data on PNG (if possible);
        # a row of None means the PDF could not be structured, so there is nothing to flag
        if row is not None and totals_flag(row):
            try:
                png_row = _balance_sheet_row(s3, bucket, png_folder + file_name, 
                                             filing_date, fiscal_year, cik, cik2name)
            except Exception as err:
                # in event PNG can't be read we keep the PDF result (best effort)
                print('\t{} - PNG fallback unavailable ({}), keeping PDF result'.format(file_name, err))
            else:
                # only replace the PDF result when the PNG actually produced a row
                if png_row is not None:
                    row = png_row
        
        if row is not None:
            rows.append(row)
        
        if ((idx + 1) % 10) == 0:
            print('\tWe have completed {} - {} rows'.format(idx + 1, label))
    
    return rows


def _export_database(s3, bucket, key, rows, local_file):
    """Concatenate the rows, re-order the columns, upload the result to s3 as CSV and return the DataFrame."""
    database = reorder_columns(pd.concat(rows))
    
    try:
        # writing data frame to .csv file, then write .csv file to s3
        database.to_csv(local_file, index=False)
        with open(local_file, 'rb') as data:
            s3.put_object(Bucket=bucket, Key=key, Body=data)
    finally:
        # remove local file after it has been created
        if os.path.exists(local_file):
            os.remove(local_file)
    
    return database


if __name__ == "__main__":
    
    # initiate s3 bucket and corresponding data folder
    bucket = "ran-s3-systemic-risk"
    
    pdf_asset_folder = "Output/X-17A-5-SPLIT-PDFS/Assets/"
    pdf_liable_folder = "Output/X-17A-5-SPLIT-PDFS/Liability & Equity/"
    
    png_asset_folder = "Output/X-17A-5-SPLIT-PNGS/Assets/"
    png_liable_folder = "Output/X-17A-5-SPLIT-PNGS/Liability & Equity/"
    
    out_folder = "Output/"

    # Amazon Textract client and Sagemaker session
    textract = boto3.client('textract')
    s3 = boto3.client('s3')
    session = Session()
    
    # ==============================================================================
    # retrieving CIK-Dealers JSON file from s3 bucket
    s3.download_file(bucket, 'Temp/CIKandDealers.json', 'temp.json')

    try:
        # read all CIK and Dealer name information from storage
        with open('temp.json', 'r') as f: 
            cik2brokers = json.load(f)
    finally:
        # remove local file after it has been read (variable is stored in memory)
        os.remove('temp.json')
    # ==============================================================================
    
    # s3 paths where asset and liability paths are stored
    # NOTE(review): the first listed key is skipped, presumably the folder placeholder - confirm
    asset_paths = session.list_s3_files(bucket, pdf_asset_folder)[1:]
    liable_paths = session.list_s3_files(bucket, pdf_liable_folder)[1:]
    
    # --------------------------------------------
    # Asset Unstructured Database
    # --------------------------------------------
    print('Assets Unstructured Database')
    asset_concat = _build_rows(s3, bucket, asset_paths, png_asset_folder, cik2brokers, 'asset')
        
    # --------------------------------------------
    # Liability & Equity Unstructured Database
    # --------------------------------------------
    print('\nLiability & Equity Unstructured Database')
    liable_concat = _build_rows(s3, bucket, liable_paths, png_liable_folder, cik2brokers, 
                                'liability & equity')
    
    # --------------------------------------------
    # Database exportation
    # --------------------------------------------
    filename1 = 'unstructured_assets.csv'
    asset_df = _export_database(s3, bucket, out_folder + filename1, asset_concat, filename1)
    
    filename2 = 'unstructured_liable.csv'
    liable_df = _export_database(s3, bucket, out_folder + filename2, liable_concat, filename2)
    
    print('\nWe created an unstructured asset and liability & equity')

Assets Unstructured Database
We dropped row 9, Total assets, with lookback window of 9.
	Our row is valued at 35611655000.0, our lookback sum is 35611655000.0
We dropped row 9, Total assets, with lookback window of 9.
	Our row is valued at 24029490000.0, our lookback sum is 24029490000.0
We dropped row 9, Total assets, with lookback window of 9.
	Our row is valued at 20879923000.0, our lookback sum is 20879923000.0
We dropped row 8, Total assets, with lookback window of 8.
	Our row is valued at 21660817000.0, our lookback sum is 21660817000.0
We dropped row 8, Total assets, with lookback window of 8.
	Our row is valued at 9580535000.0, our lookback sum is 9580535000.0
We dropped row 9, Total assets, with lookback window of 9.
	Our row is valued at 17499658000.0, our lookback sum is 17499658000.0
We dropped row 8, Total assets, with lookback window of 8.
	Our row is valued at 37609933000.0, our lookback sum is 37609933000.0
We dropped row 8, Total assets, with lookback window of 8.
	Our

We dropped row 15, Total assets, with lookback window of 15.
	Our row is valued at 265334256000.0, our lookback sum is 265334256000.0
We dropped row 15, Total assets, with lookback window of 15.
	Our row is valued at 310888679000.0, our lookback sum is 310888679000.0
We dropped row 15, Total assets, with lookback window of 15.
	Our row is valued at 412415148000.0, our lookback sum is 412415148000.0
	We have completed 60 - asset rows
We dropped row 16, Total assets, with lookback window of 16.
	Our row is valued at 501699713000.0, our lookback sum is 501699713000.0
We dropped row 16, Total assets, with lookback window of 16.
	Our row is valued at 583404825000.0, our lookback sum is 583404825000.0
We dropped row 19, Total assets, with lookback window of 19.
	Our row is valued at 604395450000.0, our lookback sum is 604395450000.0
We dropped row 19, Total assets, with lookback window of 19.
	Our row is valued at 297422286000.0, our lookback sum is 297422286000.0
We dropped row 8, Total fin

We dropped row 11, TOTAL ASSETS, with lookback window of 11.
	Our row is valued at 150559829000.0, our lookback sum is 150559829000.0
We dropped row 11, TOTAL ASSETS, with lookback window of 11.
	Our row is valued at 164601243000.0, our lookback sum is 164601243000.0
We dropped row 8, Net property and equipment, with lookback window of 3.
	Our row is valued at 107603.0, our lookback sum is 107603.0
We dropped row 9, TOTAL ASSETS, with lookback window of 9.
	Our row is valued at 25673078.0, our lookback sum is 25673078.0
We dropped row 11, TOTAL ASSETS, with lookback window of 11.
	Our row is valued at 193433468000.0, our lookback sum is 193433468000.0
We dropped row 12, TOTAL ASSETS, with lookback window of 12.
	Our row is valued at 191806579000.0, our lookback sum is 191806579000.0
We dropped row 12, TOTAL ASSETS, with lookback window of 12.
	Our row is valued at 236190719000.0, our lookback sum is 236190719000.0
	We have completed 100 - asset rows
We dropped row 12, TOTAL ASSETS, wit

We dropped row 9, Total assets, with lookback window of 9.
	Our row is valued at 77900039000.0, our lookback sum is 77900039000.0
We dropped row 10, Total assets, with lookback window of 10.
	Our row is valued at 74314954000.0, our lookback sum is 74314954000.0
We dropped row 10, Total assets, with lookback window of 10.
	Our row is valued at 81418513000.0, our lookback sum is 81418513000.0
	We have completed 150 - asset rows
We dropped row 9, Total assets, with lookback window of 9.
	Our row is valued at 71580510000.0, our lookback sum is 71580510000.0
We dropped row 9, Total assets, with lookback window of 9.
	Our row is valued at 71759167000.0, our lookback sum is 71759167000.0
We dropped row 9, Total assets, with lookback window of 9.
	Our row is valued at 91308598000.0, our lookback sum is 91308598000.0
We dropped row 10, Total assets, with lookback window of 9.
	Our row is valued at 105062000000.0, our lookback sum is 105062000000.0
We dropped row 9, Total assets, with lookback w

We dropped row 5, Total liabilities, with lookback window of 5.
	Our row is valued at 33345623000.0, our lookback sum is 33345623000.0
We dropped row 9, Total member's equity, with lookback window of 2.
	Our row is valued at 2780810000.0, our lookback sum is 2780810000.0
We dropped row 10, Total liabilities and member's equity, with lookback window of 10.
	Our row is valued at 37609933000.0, our lookback sum is 37609933000.0
We dropped row 5, Total liabilities, with lookback window of 5.
	Our row is valued at 34379197000.0, our lookback sum is 34379197000.0
We dropped row 9, Total member's equity, with lookback window of 2.
	Our row is valued at 2831370000.0, our lookback sum is 2831370000.0
We dropped row 10, Total liabilities and member's equity, with lookback window of 10.
	Our row is valued at 39534067000.0, our lookback sum is 39534067000.0
We dropped row 5, Total liabilities, with lookback window of 5.
	Our row is valued at 37889341000.0, our lookback sum is 37889341000.0
We drop

We dropped row 13, Total partners' capital, with lookback window of 2.
	Our row is valued at 11377000000.0, our lookback sum is 11377000000.0
We dropped row 14, Total liabilities and partners' capital, with lookback window of 4.
	Our row is valued at 477078000000.0, our lookback sum is 477076000000.0
We dropped row 10, Total liabilities, with lookback window of 10.
	Our row is valued at 465699000000.0, our lookback sum is 465699000000.0
We dropped row 13, Total partners' capital, with lookback window of 2.
	Our row is valued at 11377000000.0, our lookback sum is 11377000000.0
We dropped row 14, Total liabilities and partners' capital, with lookback window of 14.
	Our row is valued at 477076000000.0, our lookback sum is 477076000000.0
	We have completed 30 - liability & equity rows
We dropped row 10, Total liabilities, with lookback window of 10.
	Our row is valued at 444847000000.0, our lookback sum is 444848000000.0
We dropped row 13, Total partners' capital, with lookback window of 2

We dropped row 8, Total liabilities, with lookback window of 8.
	Our row is valued at 60277000000.0, our lookback sum is 60277000000.0
We dropped row 12, Total stockholder's equity, with lookback window of 2.
	Our row is valued at 11853000000.0, our lookback sum is 11853000000.0
We dropped row 13, Total liabilities and stockholder's equity, with lookback window of 13.
	Our row is valued at 72130000000.0, our lookback sum is 72130000000.0
We dropped row 17, Total stockholder's equity, with lookback window of 3.
	Our row is valued at 4261550000.0, our lookback sum is 4261550000.0
We dropped row 18, Total liabilities and stockholder's equity, with lookback window of 18.
	Our row is valued at 267625461000.0, our lookback sum is 267625461000.0
We dropped row 17, Total stockholder's equity, with lookback window of 3.
	Our row is valued at 4664554000.0, our lookback sum is 4664554000.0
We dropped row 18, Total liabilities and stockholder's equity, with lookback window of 18.
	Our row is value

We dropped row 8, Payables and accrued liabilities:, with lookback window of 4.
	Our row is valued at 2781603000.0, our lookback sum is 2781603000.0
We dropped row 16, Total liabilities, with lookback window of 16.
	Our row is valued at 57292974000.0, our lookback sum is 57292974000.0
We dropped row 18, Total liabilities and stockholder's equity, with lookback window of 18.
	Our row is valued at 57702809000.0, our lookback sum is 57702809000.0
We dropped row 7, Payables:, with lookback window of 4.
	Our row is valued at 3046503000.0, our lookback sum is 3046503000.0
We dropped row 13, Total liabilities, with lookback window of 13.
	Our row is valued at 62018809000.0, our lookback sum is 62018809000.0
We dropped row 18, Total stockholder's equity, with lookback window of 2.
	Our row is valued at 482409000.0, our lookback sum is 482409000.0
We dropped row 19, Total liabilities and stockholder's equity, with lookback window of 19.
	Our row is valued at 63951218000.0, our lookback sum is 6

We dropped row 16, Total Stockholder's Equity, with lookback window of 5.
	Our row is valued at 3462488000.0, our lookback sum is 3462488000.0
We dropped row 17, TOTAL LIABILITIES AND STOCKHOLDER'S EQUITY, with lookback window of 17.
	Our row is valued at 140655621000.0, our lookback sum is 140655621000.0
We dropped row 16, Total Stockholder's Equity, with lookback window of 5.
	Our row is valued at 3704442000.0, our lookback sum is 3704442000.0
We dropped row 17, TOTAL LIABILITIES AND STOCKHOLDER'S EQUITY, with lookback window of 17.
	Our row is valued at 150559829000.0, our lookback sum is 150559829000.0
We dropped row 17, Total Stockholder's Equity, with lookback window of 5.
	Our row is valued at 4096046000.0, our lookback sum is 4096046000.0
We dropped row 18, TOTAL LIABILITIES AND STOCKHOLDER'S EQUITY, with lookback window of 18.
	Our row is valued at 164601243000.0, our lookback sum is 164601243000.0
We dropped row 6, Total liabilities, with lookback window of 6.
	Our row is val

We dropped row 10, Total liabilities and stockholder's equity, with lookback window of 10.
	Our row is valued at 194436796000.0, our lookback sum is 194436796000.0
	We have completed 120 - liability & equity rows
We dropped row 8, Total liabilities, with lookback window of 8.
	Our row is valued at 234088093000.0, our lookback sum is 234088093000.0
We dropped row 10, Total liabilities and stockholder's equity, with lookback window of 10.
	Our row is valued at 236022658000.0, our lookback sum is 236022658000.0
We dropped row 9, Total liabilities, with lookback window of 9.
	Our row is valued at 295746672000.0, our lookback sum is 295746672000.0
We dropped row 14, Total stockholder's equity, with lookback window of 3.
	Our row is valued at 2074744000.0, our lookback sum is 2074734000.0
We dropped row 15, Total liabilities and stockholder's equity, with lookback window of 15.
	Our row is valued at 297821416000.0, our lookback sum is 297821416000.0
We dropped row 8, Total liabilities, with 

We dropped row 8, Total liabilities, with lookback window of 8.
	Our row is valued at 59918632000.0, our lookback sum is 59918632000.0
We dropped row 11, Total liabilities and stockholders' equity, with lookback window of 11.
	Our row is valued at 61543858000.0, our lookback sum is 61543858000.0
We dropped row 8, Total liabilities, with lookback window of 8.
	Our row is valued at 59140672000.0, our lookback sum is 59140672000.0
We dropped row 11, Total liabilities and stockholders' equity, with lookback window of 11.
	Our row is valued at 60683961000.0, our lookback sum is 60683961000.0
We dropped row 8, Total liabilities, with lookback window of 8.
	Our row is valued at 82287659000.0, our lookback sum is 82287659000.0
We dropped row 11, Total liabilities and stockholders' equity, with lookback window of 11.
	Our row is valued at 83850980000.0, our lookback sum is 83850980000.0
We dropped row 8, Total liabilities, with lookback window of 8.
	Our row is valued at 82287659000.0, our look

We dropped row 21, Total stockholder's equity, with lookback window of 4.
	Our row is valued at 10833000000.0, our lookback sum is 10833000000.0
We dropped row 22, Total liabilities and stockholder's equity, with lookback window of 22.
	Our row is valued at 377951000000.0, our lookback sum is 377951000000.0
We dropped row 4, Financial instruments sold, not yet purchased, and, with lookback window of 2.
	Our row is valued at 205309000000.0, our lookback sum is 205309000000.0
We dropped row 11, Payables and accrued liabilities:, with lookback window of 6.
	Our row is valued at 26313000000.0, our lookback sum is 26313000000.0
We dropped row 16, Total liabilities, with lookback window of 16.
	Our row is valued at 354820000000.0, our lookback sum is 354820000000.0
We dropped row 21, Total stockholder's equity, with lookback window of 4.
	Our row is valued at 8342000000.0, our lookback sum is 8342000000.0
We dropped row 22, Total liabilities and stockholder's equity, with lookback window of 

In [126]:
# asset line items whose name still mentions a total after the accounting check
[col for col in asset_df.columns if re.search('total|totals', col, flags=re.I)]

[]

In [127]:
# liability & equity line items whose name still mentions a total after the accounting check
[col for col in liable_df.columns if re.search('total|totals', col, flags=re.I)]

["Total member's equity",
 "Total stockholder's equity",
 "Total liabilities and stockholder's equity",
 "Total Stockholders' equity:",
 "Total shareholder's equity"]

In [128]:
# filings that still report a value under the combined liabilities-and-equity total
liable_df.loc[np.logical_not(np.isnan(liable_df["Total liabilities and stockholder's equity"]))]

Unnamed: 0,CIK,Name,Filing Date,Filing Year,Accrued interest payable,Other liabilities,Payable to broker-dealers and clearing organizations,Payable to customers,Securities sold under agreements to repurchase,"Securities sold, not yet purchased, at market value",...,"Commitments, contingencies and guarantees: Subordinated indebtedness","Common stock ($10,000 par value, 1,000 shares authorized, issued and outstanding)","Obligations to return securities received as collateral, at fair value","Securities loaned or sold under agreements to repurchase (including $13,083 at fair value)",U.S. Treasury and federal agency securities,"Securities loaned or sold under agreements to repurchase (including $8,793 at fair value)","Securities loaned or sold under agreements to repurchase (including $10,611 at fair value)","Securities loaned or sold under agreements to repurchase (including $8,810 at fair value)","Securities loaned and sold under agreements to repurchase (including $6,521 at fair value)","Securities loaned and sold under agreements to repurchase (including $5,207 at fair value)"
1,851376,BARCLAYS CAPITAL INC.,2014-03-04,2013,,1847000000.0,,,158214000000.0,,...,,,,,,,,,,


In [129]:
# Re-run the totals check on a single Liability & Equity filing to inspect which rows survive it
s3.download_file(bucket, 'Output/X-17A-5-SPLIT-PDFS/Liability & Equity/851376-2014-03-04.csv', 'temp.csv')
try:
    df = pd.read_csv('temp.csv')
finally:
    # remove the local copy even when the CSV cannot be parsed
    os.remove('temp.csv')

# process balance sheet dataframes
temp_df = totals_check(df)                  # run an accounting check removing any total rows
print(temp_df)

                                                    0             1
0      Securities sold under agreements to repurchase  1.582140e+11
1                                   Securities loaned  3.491900e+10
2   Obligation to return securities received as co...  2.716300e+10
3   Financial instruments sold, but not yet purcha...  2.980300e+10
4   Payables to brokers, dealers and clearing orga...  3.391000e+09
5                               Payables to customers  2.304800e+10
6                               Short-term borrowings  6.983000e+09
7              Accrued interest and dividend payables  2.030000e+08
8                                   Other liabilities  1.847000e+09
9                                Long-term borrowings  7.900000e+09
10                                  Subordinated debt  2.500000e+09
11  Common stock - no par value, 5,000 shares auth...  0.000000e+00
12                         Additional paid-in capital  6.281000e+09
13   Accumulated other comprehensive loss, net o