In [222]:
%pip install fuzzywuzzy
%pip install python-Levenshtein

Note: you may need to restart the kernel to use updated packages.
Collecting python-Levenshtein
  Downloading python-Levenshtein-0.12.2.tar.gz (50 kB)
[K     |████████████████████████████████| 50 kB 3.0 MB/s eta 0:00:011
Building wheels for collected packages: python-Levenshtein
  Building wheel for python-Levenshtein (setup.py) ... [?25ldone
[?25h  Created wheel for python-Levenshtein: filename=python_Levenshtein-0.12.2-cp36-cp36m-linux_x86_64.whl size=155937 sha256=50176754e1b9e3a918e1dbbd36ea5cfe3772e58b70e7d741322b8d54a391030a
  Stored in directory: /home/ec2-user/.cache/pip/wheels/4a/a4/bf/d761b0899395c75fa76d003d607b3869ee47f5035b8afc30a2
Successfully built python-Levenshtein
Installing collected packages: python-Levenshtein
Successfully installed python-Levenshtein-0.12.2
Note: you may need to restart the kernel to use updated packages.


In [223]:
import os
import re
import botocore
import boto3
import json
import time
import requests
import datetime

import pandas as pd
import numpy as np

from sagemaker.session import Session
from difflib import SequenceMatcher
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz

## Accounting term matching
**Check to see if we report totals (e.g. Total Liability & Shareholder Equity) or sub-totals (e.g. Total Financial Instruments), these figure are not needed for construction of the unstructured database (avoid classification issue)**

In [204]:
# numpy exception for handling invalid log10 RunTime error (we opt to not show)
# switch 'ignore' to 'warn', if you would like to flag the RunTime error 
np.seterr(invalid = 'ignore') 

def multiple_check(x1:float, x2:float):
    """
    Determine whether the two values are the same number scaled by 10
    """
    # prevent zero division error since x1 is the denominator and log10 zero division error
    if (x1 == 0) or (x2 == 0): return False
    else:
        # if our backward sum is a multiple of 10, we return True 
        # (e.g. Total Assets (x1) 745.2322 vs Backward Sum (x2) 7452322)
        check1 = np.log10(x2 / x1).is_integer()

        # if our backward sum is a substring of a line item, with a difference of one in length, we return True 
        # (e.g. Total Assets (x1) 174182935 vs Backward Sum (x2) 74182935)
        check2 = (str(x2) in str(x1) ) & (len(str(x2)) == len(str(x1)) - 1)

        if check1 or check2: 
            return True
        else: 
            return False

def epsilon_error(x1:float, x2:float, tol:float=0.01):
    """
    Determine whether the two values are within a similar epsilon bound. We default our error tolerance
    to 0.01, implying that if two numbers are within a specified toleracnce (default 1% ) of one another 
    """
    if (x1 == 0) or (x2 == 0): return False
    else:
        # first we convert the numeric quantities into strings
        current = str(x1)
        lookback = str(x2)

        # we only want to check against the relative difference if one element in the number is read wrong
        if len(current) == len(lookback):
            
            # we iterate linearly through each string and check to see the positional match 
            # if we catch and mismatch we flag it with a 1, othewise skip with 0
            changes = [0 if (current[i] == lookback[i]) else 1 for i in range(len(current))]

            # check set differences produce a set with exactly 1 in length
            if sum(changes) == 1:

                diff = abs(x1 - x2)      # compute numeric differences

                # check to see whether an accounting condition was met wihtin a boundary condition
                if abs(diff / x1) <= tol:
                    return True

        return False

In [205]:
def totals_check(df:pd.DataFrame) -> tuple:
    """
    Checks to see if a line row meets the conditon of a total, if true we remove these rows as we make 
    have checked the terms before have meet our conditions (these include major and minor totals)
    ------------------------------------------------------------------------------------
    :param: (type pandas.DataFrame)
        A DataFrame that represents the Asset or Liability & Equity portion of the balance sheet
        
    :return: (type tuple)
        Return a cleaned DataFrame that strips the rows that represent totals
    """
    m, n = df.shape                  # unpack the shape of dataframe
    data_col = df.columns[1]         # the values column for balance sheet
    
    total_flag = 2       # default 2 (no measure found), 1 (sum is correct), 0 (sum is not correct)
    total_amt = np.nan
    
    # iterate through each of the line items
    for i in range(m):
        
        # check the value of line items at a given index (forward index)
        item1 = df.loc[i].values[1]
        name = df.loc[i].values[0]
        
        # ------------------------------------------------------------------
        # Perform regex search to determine "special" total rows
        # ------------------------------------------------------------------
        a_check = re.search('total assets', name, flags=re.I)
        le_check = re.search('(?=.*(liability|liabilities))(?=.*(equity|deficit|capital))', 
                             name, flags=re.I)
        # ------------------------------------------------------------------
        
        # if we find either total measure we re-write indicators
        if a_check is not None or le_check is not None:
            total_flag = 0; total_amt = item1;
        
        # compute backward sum (lookback index) 
        for j in range(i):
            
            # check whether dataframe empty (if so we skip to avoid fitting errors)
            # NOTE: Index position (i-1)   = line above current line
            #                      (i-j-1) = trailing look up line 'j' lines above the line above current line
            lookback = df.loc[i-j-1:i-1][data_col]
            
            # we check whether the lookback period is empty (if so we most likely deleted the row)
            if not lookback.empty:
                # backward sum for line items (index minus j-periods before)
                item2 = lookback.sum()

                # if we achieve this then we strip totals and break, no need to continue backward sum
                if (item1 == item2) or multiple_check(item1, item2) or epsilon_error(item1, item2, tol=0.01):
                    df = df.drop(index=i)
                    
                    # if we drop the "Total" line-item then we re-assign flag to 1
                    if a_check is not None or le_check is not None:
                        total_flag = 1
                    
                    # Error Handling for row deletions (uncomment for when not in use)
                    print('\tWe dropped row {}, {}, with lookback window of {}.'.format(i, name, j+1))
                    print('\t\tOur row is valued at {}, our lookback sum is {}'.format(item1, item2))
                    
                    # we break from inner loop to avoid key error flag 
                    break     
                
    return (df, total_flag, total_amt)

## Merging PDFs and PNGs 
**Functions to combine PDFs and PNGs where rows may be omitted**

In [278]:
def special_merge(df1:pd.DataFrame, df2:pd.DataFrame, col:str) -> pd.DataFrame:
    """
    Special type of merge for dataframes, combining all unique row items for a specified column. 
    This is designed to combine PDF and PNG balance sheets that differ in one or more rows.
    ------------------------------------------------------------------------------------
    :param: (type pandas.DataFrame)
        A DataFrame that represents either the PDF or PNG retreived from Balance Sheet
    :param: (type pandas.DataFrame)
        A DataFrame that represents either the PDF or PNG retreived from Balance Sheet
    :param: (type str)
        A shared column name that exists in both pandas.DataFrames (i.e. df1, df2)
        
    :return: (type pandas.DataFrame)
        Return a cleaned DataFrame that merges any row that was omitted, changed or missing
    """
    arr1 = df1[col].values
    arr2 = df2[col].values
    concat_pdf = []
    
    # find the sequences that match between either lineitems
    sm = SequenceMatcher(a=arr1, b=arr2)

    # The SequenceMathcer returns a 5-tupled for each correspond "obj"
    # 'replace'     a[i1:i2] should be replaced by b[j1:j2].
    # 'delete'      a[i1:i2] should be deleted. Note that j1 == j2 in this case.
    # 'insert'      b[j1:j2] should be inserted at a[i1:i1]. Note that i1 == i2 in this case.
    # 'equal'       a[i1:i2] == b[j1:j2] (the sub-sequences are equal)
    for (obj, i1, i2, j1, j2) in sm.get_opcodes():
        
        # implies that we want to "replace" the left side elment with the corresponding
        # right side element at the same index position (we perseve both)
        if obj == 'replace':
            
            # check the value of a fuzzy match, only insert both rows if they vastly different
            left_names = arr1[i1:i2]
            right_names = arr2[j1:j2]
            
            # iterate through each of the checks
            for it, (left, right) in enumerate(zip(left_names, right_names)):
                
                # compute the fuzz match between string (how close are these values)
                score = fuzz.partial_ratio(left.lower(), right.lower())

                # if not close in match then we append both values
                if score < 90:
                    concat_pdf.append(df1.iloc[i1:i1+it+1])
                    concat_pdf.append(df2.iloc[j1:j1+it+1])
                else:
                    concat_pdf.append(df1.iloc[i1:i1+it+1])
        
        # implies that we want to "delete" the left side element (we preserve this side)
        elif obj == 'delete':
            concat_pdf.append(df1.iloc[i1:i2])
        
        # implied that we want to "insert" the right side element (we preserve this side)
        elif obj == 'insert':
            concat_pdf.append(df2.iloc[j1:j2])
            
        elif obj == 'equal':
            concat_pdf.append(df1.iloc[i1:i2])
    
    # return concated pandas.DataFrame and reset index, removing old index
    return pd.concat(concat_pdf).reset_index(drop=True)
    

## Unstructured Database construction
**We develop our unstructured database from each of the non-total rows (concating the line items)**

In [207]:
def unstructured_data(df, filing_d, fiscal_y, cik, cik2name:dict) -> pd.DataFrame:
    """
    Forms unstructured row for larger database to be stored in s3 bucket
    ------------------------------------------------------------------------------------------
    Input:
        :param: df (type pandas.DataFrame)
            The balance sheet for a particular  
        :paran: filing_d (type str)
            The filing date for release of X-17A-5 filings for a broker dealer e.g. 2013-03-21
        :paran: fiscal_y (type str)
            The fiscal year for the balance sheet to cover e.g. 2012 (usually 1-year prior to filing date)
        :paran: cik (type str)
            The CIK number for a broker dealer e.g. 887767
        :paran: cik2name (type dict)
            A dictionary that maps CIK to Broker Deale names 
    Output:  
        :return: (type pandas.DataFrame)
             Return a transposed dataframe with additional columns corresponding to filing data
    """
    
    # intialize the first column (line items)
    first_column = df.columns[0]
    
    # clean dataframe should be of size greater than 1
    if len(df.columns) > 1:
        
        # transpose split balance sheet figure (our line items are now columns for DataFrame)
        # we first groupby the first column (this become index) and sum to group congruent names
        row = df.groupby(first_column).sum(min_count=1).T
        
        # creating additional columns in row
        row['CIK'] = cik                                  # CIK number for firm 
        row['Filing Date'] = filing_d                     # Filing Date for firm filing
        row['Filing Year'] = fiscal_y                     # Year for balance sheet filing
        row['Name'] = cik2name['broker-dealers'][cik]     # returns the name of associated with the CIK
        
        return row
    
    else:
        print('{}-{}.csv - encountered issue reading PDF'.format(cik, filing_d))
        return None
    

In [208]:
def extra_cols(csv_name:str):
    """
    Check to see whether their exists the presence of a total term from the line items
    ------------------------------------------------------------------------------------------
    Input:
        :param: csv_name (type str)
            The file directory on the s3 where data is stored (e.g. )
    Output:  
        :return: (type tuple)
            Returns a tuple for corresponding (file_name, filing_date, fiscal_year, cik)
    """
    
    file_name = csv_name.split('/')[-1]        # e.g. '1224385-2005-03-01.csv'
    csv_strip = file_name[:-4]                 # ignore last four elements from the back (i.e. .csv)

    # construct a string measure of important data measures 
    data_split = csv_strip.split('-')              
    filing_date = '-'.join(data_split[1:])         # join YYYY-mm-dd component for filing date
    fiscal_year = int(data_split[1]) - 1           # fiscal year are generally the previous year of filing
    cik = data_split[0]                            # extract the CIK number  
    
    return (file_name, filing_date, fiscal_year, cik)   

In [209]:
def reorder_columns(df:pd.DataFrame, col_preserve:list) -> pd.DataFrame:
    """
    Re-order the completed DataFrame by ordering the CIK, Name, Filing Data and Filing Year 
    ------------------------------------------------------------------------------------------
    Input:
        :param: df (type pandas.DataFrame)
            The unstructured database for balance sheet figures
    Output:  
        :return: (type pandas.DataFrame)
            Return a dataframe with dimensions less than or equal to input dataframe (MxN) -> (MxK), 
            where K <= N
    """
    # re-order the CIK and Year columns to appear as the first two columns
    remap = df.columns[~np.isin(df.columns, col_preserve)]                             
    df = df[np.insert(remap,                                       # pass all other columns, not in preserve list
                      np.zeros(len(col_preserve), dtype=int),      # map the location to the first index (i.e. 0)
                      col_preserve)]                               # insert columns we wished to preserve 

    filterNaN = df.isnull().all()                      # find if any column is all NaN 
    cleanCols = filterNaN[filterNaN == False].index    # select columns with at least one value

    # clean dataframe for unstructured asset terms
    return df[cleanCols]

## Final Main Execution

In [277]:
if __name__ == "__main__":
    
    # initiate s3 bucket and corresponding data folder
    bucket = "ran-s3-systemic-risk"
    
    pdf_asset_folder = "Output/X-17A-5-SPLIT-PDFS/Assets/"
    pdf_liable_folder = "Output/X-17A-5-SPLIT-PDFS/Liability & Equity/"
    
    png_asset_folder = "Output/X-17A-5-SPLIT-PNGS/Assets/"
    png_liable_folder = "Output/X-17A-5-SPLIT-PNGS/Liability & Equity/"
    
    out_folder = "Output/"

    # Amazon Textract client and Sagemaker session
    s3 = boto3.client('s3')
    session = Session()
    
    # ==============================================================================
    # ALL TEMPORARY FILE INFORMATION 
    # ==============================================================================
    # retrieving CIK-Dealers JSON file from s3 bucket
    s3.download_file(bucket, 'Temp/CIKandDealers.json', 'temp.json')
    with open('temp.json', 'r') as f: cik2brokers = json.loads(f.read())

    # remove local file after it has been created (variable is stored in memory)
    os.remove('temp.json')
    # ==============================================================================
    
    # s3 paths where asset and liability paths are stored
    asset_paths = session.list_s3_files(bucket, pdf_asset_folder)
    liable_paths = session.list_s3_files(bucket, pdf_liable_folder)
    
    # intialize list to store dataframes for asset and liability & equity
    asset_concat = [0] * len(asset_paths)
    liable_concat = [0] * len(liable_paths)
    
    # --------------------------------------------
    # Asset Unstructured Database
    # --------------------------------------------
    print('Assets Unstructured Database')
    for idx, csv in enumerate(asset_paths):
        
        # decompose csv name into corresponding terms
        fileName, filing_date, fiscal_year, cik = extra_cols(csv)
        
        # first load in both the PNG and PDF split balance sheets
        # NOTE: All these balance sheets are cleaned numerical values
        try:
            s3.download_file(bucket, csv, 'temp.csv')
            pdf_df = pd.read_csv('temp.csv')
            s3.download_file(bucket, png_asset_folder + fileName, 'temp.csv')
            png_df = pd.read_csv('temp.csv')
            os.remove('temp.csv')

            print('Working on {}-{}'.format(cik, filing_date))

            # do a special merge that combines unique line items names between PDF & PNG
            temp_df1 = special_merge(pdf_df, png_df, '0')

            # run accounting check to remove sub-totals for each respective line-item
            df, total_flag, total_amt = totals_check(temp_df1)
            
            # construct row for the unstructured data frame 
            export_df = unstructured_data(df, filing_date, fiscal_year, cik, cik2brokers)
            
            # we have that no "total asset" figure was found
            if total_flag == 2:
                export_df["Total asset check"] = "Total asset not found"
                export_df["Total asset"] = total_amt

            # we have that "total asset" was found and matches
            elif total_flag == 1:
                export_df["Total asset check"] = "Total asset found & match"
                export_df["Total asset"] = total_amt

            # we have that "total asset" was found, but did not match correctly
            elif total_flag == 0:
                export_df["Total asset check"] = "Total asset found & no match"

            # stores the reported data frame 
            asset_concat[idx] = export_df
        
        # in the event we can't download file from s3 (i.e. does not exist, we ignore the )
        except botocore.exceptions.ClientError:
            
            # assign an empty DataFrame and print out error
            asset_concat[idx] = pd.DataFrame()
            
            print('\nCLIENT-ERROR: WE COULD NOT DOWNLOAD SPLIT DATA FOR {}'.format(fileName))
     
    print('\n\n\n\n')
        
    # --------------------------------------------
    # Liability & Equity Unstructured Database
    # --------------------------------------------
    print('\nLiability & Equity Unstructured Database')
    for idx, csv in enumerate(liable_paths):
        
        # decompose csv name into corresponding terms
        fileName, filing_date, fiscal_year, cik = extra_cols(csv)
        
        try:
            # first load in both the PNG and PDF split balance sheets
            # NOTE: All these balance sheets are cleaned numerical values
            s3.download_file(bucket, csv, 'temp.csv')
            pdf_df = pd.read_csv('temp.csv')
            s3.download_file(bucket, png_liable_folder + fileName, 'temp.csv')
            png_df = pd.read_csv('temp.csv')
            os.remove('temp.csv')

            print('Working on {}-{}'.format(cik, filing_date))

            # do a special merge that combines unique line items names between PDF & PNG
            temp_df1 = special_merge(pdf_df, png_df, '0')

            # run accounting check to remove sub-totals for each respective line-item
            df, total_flag, total_amt = totals_check(temp_df1)
            
            # construct row for the unstructured data frame 
            export_df = unstructured_data(df, filing_date, fiscal_year, cik, cik2brokers)
            
            # we have that no "total asset" figure was found
            if total_flag == 2:
                export_df["Total liabilities & shareholder's equity check"] = "Total liabilities & shareholder's equity not found"
                export_df["Total liabilities & shareholder's equity"] = total_amt

            # we have that "total asset" was found and matches
            elif total_flag == 1:
                export_df["Total liabilities & shareholder's equity check"] = "Total liabilities & shareholder's equity found & match"
                export_df["Total liabilities & shareholder's equity"] = total_amt
        
            # we have that "total asset" was found, but did not match correctly
            elif total_flag == 0:
                export_df["Total liabilities & shareholder's equity check"] = "Total liabilities & shareholder's equity found & no match"
                
            # stores the reported data frame 
            liable_concat[idx] = export_df
        
        # in the event we can't download file from s3 (i.e. does not exist, we ignore the )
        except botocore.exceptions.ClientError:
            
            # assign an empty DataFrame and print out error
            liable_concat[idx] = pd.DataFrame()
            
            print('\nCLIENT-ERROR: WE COULD NOT DOWNLOAD SPLIT DATA FOR {}'.format(fileName))
    
    # --------------------------------------------
    # Database exportation
    # --------------------------------------------
    
    # writing data frame to .csv file
    asset_df = pd.concat(asset_concat)        # asset dataframe combining all rows from 
    asset_df = reorder_columns(asset_df,      # re-order columns for dataframe
                               col_preserve=['CIK', 'Name', 'Filing Date', 'Filing Year', 
                                             'Total asset check'])      
    
    filename1 = 'unstructured_assets.csv'
    asset_df.to_csv(filename1, index=False)
    with open(filename1, 'rb') as data:
        s3.put_object(Bucket=bucket, Key=out_folder + filename1, Body=data)
    
    
    # writing data frame to .csv file
    liable_df = pd.concat(liable_concat)     
    liable_df = reorder_columns(liable_df, 
                                col_preserve=['CIK', 'Name', 'Filing Date', 'Filing Year', 
                                              "Total liabilities & shareholder's equity check"])    
    
    filename2 = 'unstructured_liable.csv'
    liable_df.to_csv(filename2, index=False)
    with open(filename2, 'rb') as data:
        s3.put_object(Bucket=bucket, Key=out_folder + filename2, Body=data)
    
    # remove local file after it has been created
    os.remove(filename1)
    os.remove(filename2)
    
    print('\nWe created an unstructured asset and liability & equity')

Assets Unstructured Database
Working on 1224385-2004-03-01
equal 0 11 0 11
	We dropped row 10, Total assets, with lookback window of 10.
		Our row is valued at 21509787000.0, our lookback sum is 21509787000.0
Working on 1224385-2005-03-01
equal 0 10 0 10
	We dropped row 9, Total assets, with lookback window of 9.
		Our row is valued at 35611655000.0, our lookback sum is 35611655000.0
Working on 1224385-2006-03-01
equal 0 10 0 10
	We dropped row 9, Total assets, with lookback window of 9.
		Our row is valued at 24029490000.0, our lookback sum is 24029490000.0
Working on 1224385-2007-03-01
equal 0 10 0 10
	We dropped row 9, Total assets, with lookback window of 9.
		Our row is valued at 20879923000.0, our lookback sum is 20879923000.0
Working on 1224385-2008-02-29
equal 0 9 0 9
	We dropped row 8, Total assets, with lookback window of 8.
		Our row is valued at 21660817000.0, our lookback sum is 21660817000.0
Working on 1224385-2009-03-02
equal 0 9 0 9
	We dropped row 8, Total assets, with

Working on 58056-2009-03-16
equal 0 2 0 2
replace 2 3 2 3
equal 3 11 3 11
	We dropped row 10, Total assets, with lookback window of 10.
		Our row is valued at 244413278000.0, our lookback sum is 244413278000.0
Working on 58056-2010-06-04
equal 0 11 0 11
	We dropped row 10, Total assets, with lookback window of 10.
		Our row is valued at 249693066000.0, our lookback sum is 249693066000.0
Working on 58056-2012-02-29
equal 0 9 0 9
replace 9 10 9 11
equal 10 11 11 12
Working on 58056-2013-03-01
equal 0 1 0 1
replace 1 2 1 2
equal 2 11 2 11
	We dropped row 10, Total assets, with lookback window of 10.
		Our row is valued at 240046000000.0, our lookback sum is 240046000000.0
Working on 58056-2014-03-04
equal 0 11 0 11
	We dropped row 10, Total assets, with lookback window of 10.
		Our row is valued at 227451000000.0, our lookback sum is 227451000000.0
Working on 58056-2015-03-02
equal 0 11 0 11
	We dropped row 10, Total assets, with lookback window of 10.
		Our row is valued at 173638000000.

Working on 72267-2012-03-15
equal 0 10 0 10
delete 10 12 10 10
	We dropped row 11, Total assets, with lookback window of 11.
		Our row is valued at 104827762000.0, our lookback sum is 104827762000.0
Working on 72267-2012-05-30
equal 0 12 0 12
	We dropped row 11, Total assets, with lookback window of 11.
		Our row is valued at 114059495000.0, our lookback sum is 114059495000.0
Working on 72267-2013-05-30
equal 0 12 0 12
Working on 72267-2014-05-30
replace 0 1 0 1
equal 1 10 1 10
replace 10 11 10 12
equal 11 12 12 13
	We dropped row 1, Cash and cash equivalents, with lookback window of 1.
		Our row is valued at 1426073000.0, our lookback sum is 1426073000.0
	We dropped row 5, Trading assets ($19,918,091 were pledged to various parties and $335,455 related to, with lookback window of 2.
		Our row is valued at 76177986000.0, our lookback sum is 76177986000.0
Working on 72267-2015-06-01
equal 0 13 0 13
	We dropped row 4, Trading assets ($16,658,256 were pledged to various parties and $437,5

Working on 782124-2021-03-01
equal 0 10 0 10
	We dropped row 9, Total assets (a), with lookback window of 9.
		Our row is valued at 609782000000.0, our lookback sum is 609782000000.0
Working on 851376-2002-03-01
equal 0 12 0 12
	We dropped row 11, Total assets, with lookback window of 11.
		Our row is valued at 72638615000.0, our lookback sum is 72638615000.0
Working on 851376-2003-03-03
equal 0 12 0 12
	We dropped row 11, Total assets, with lookback window of 11.
		Our row is valued at 103369408000.0, our lookback sum is 103369408000.0
Working on 851376-2004-03-01
equal 0 12 0 12
	We dropped row 11, Total assets, with lookback window of 11.
		Our row is valued at 113650937000.0, our lookback sum is 113650937000.0
Working on 851376-2005-03-02
equal 0 12 0 12
	We dropped row 11, Total assets, with lookback window of 11.
		Our row is valued at 174937091000.0, our lookback sum is 174937091000.0
Working on 851376-2006-03-01
equal 0 12 0 12
	We dropped row 11, Total assets, with lookback wi

Working on 91154-2007-03-01
equal 0 21 0 21
	We dropped row 20, Total assets, with lookback window of 20.
		Our row is valued at 377951000000.0, our lookback sum is 377951000000.0
Working on 91154-2008-02-29
equal 0 23 0 23
	We dropped row 4, Financial instruments owned and contractual commitments, at fair value:, with lookback window of 2.
		Our row is valued at 200500000000.0, our lookback sum is 200500000000.0
	We dropped row 14, Receivables:, with lookback window of 9.
		Our row is valued at 109682000000.0, our lookback sum is 109682000000.0
	We dropped row 22, Total assets, with lookback window of 22.
		Our row is valued at 363162000000.0, our lookback sum is 363162000000.0
Working on 91154-2009-03-02
equal 0 22 0 22
	We dropped row 4, Financial instruments owned and contractual commitments, at fair value:, with lookback window of 2.
		Our row is valued at 163491000000.0, our lookback sum is 163491000000.0
	We dropped row 21, Total assets, with lookback window of 21.
		Our row is 

Working on 1224385-2014-03-04
equal 0 11 0 11
	We dropped row 5, Total liabilities, with lookback window of 5.
		Our row is valued at 41933521000.0, our lookback sum is 41933521000.0
	We dropped row 9, Total member's equity, with lookback window of 2.
		Our row is valued at 3887874000.0, our lookback sum is 3887874000.0
	We dropped row 10, Total liabilities and member's equity, with lookback window of 10.
		Our row is valued at 48144895000.0, our lookback sum is 48144895000.0
Working on 1224385-2015-03-02
equal 0 11 0 11
	We dropped row 5, Total liabilities, with lookback window of 5.
		Our row is valued at 58237482000.0, our lookback sum is 58237482000.0
	We dropped row 9, Total member's equity, with lookback window of 2.
		Our row is valued at 4274759000.0, our lookback sum is 4274759000.0
	We dropped row 10, Total liabilities and member's equity, with lookback window of 10.
		Our row is valued at 64835741000.0, our lookback sum is 64835741000.0
Working on 1224385-2016-02-29
equal 0 

replace 0 1 0 1
equal 1 2 1 2
delete 2 4 2 2
equal 4 5 2 3
delete 5 6 3 3
equal 6 8 3 5
delete 8 15 5 5
	We dropped row 10, Total liabilities, with lookback window of 10.
		Our row is valued at 444847000000.0, our lookback sum is 444848000000.0
	We dropped row 13, Total partners' capital, with lookback window of 2.
		Our row is valued at 9904000000.0, our lookback sum is 9904000000.0
	We dropped row 14, Total liabilities and partners' capital, with lookback window of 14.
		Our row is valued at 454751000000.0, our lookback sum is 454752000000.0
Working on 42352-2017-03-01
replace 0 2 0 2
equal 2 3 2 3
replace 3 5 3 5
equal 5 6 5 6
replace 6 8 6 8
equal 8 15 8 15
	We dropped row 1, Securities sold under agreements to repurchase at fair value, with lookback window of 1.
		Our row is valued at 91297000000.0, our lookback sum is 91297000000.0
	We dropped row 5, Brokers, deaters and clearing organizations, with lookback window of 1.
		Our row is valued at 4244000000.0, our lookback sum is 42

Working on 58056-2019-03-01
replace 0 1 0 1
equal 1 15 1 15
	We dropped row 8, Total liabilities, with lookback window of 8.
		Our row is valued at 73384000000.0, our lookback sum is 73384000000.0
	We dropped row 13, Total stockholder's equity, with lookback window of 2.
		Our row is valued at 10839000000.0, our lookback sum is 10839000000.0
	We dropped row 14, Total liabilities and stockholder's equity, with lookback window of 14.
		Our row is valued at 90946000000.0, our lookback sum is 90946000000.0
Working on 58056-2019-09-30
replace 0 1 0 1
equal 1 15 1 15
	We dropped row 8, Total liabilities, with lookback window of 8.
		Our row is valued at 73384000000.0, our lookback sum is 73384000000.0
	We dropped row 13, Total stockholder's equity, with lookback window of 2.
		Our row is valued at 10839000000.0, our lookback sum is 10839000000.0
	We dropped row 14, Total liabilities and stockholder's equity, with lookback window of 14.
		Our row is valued at 90946000000.0, our lookback sum i

	We dropped row 11, Total liabilities, with lookback window of 11.
		Our row is valued at 331231000000.0, our lookback sum is 331231000000.0
	We dropped row 15, Total member's equity, with lookback window of 2.
		Our row is valued at 3903000000.0, our lookback sum is 3903000000.0
	We dropped row 16, Total liabilities and member's equity, with lookback window of 16.
		Our row is valued at 345134000000.0, our lookback sum is 345134000000.0
Working on 68136-2016-02-29
equal 0 18 0 18
	We dropped row 12, Total liabilities, with lookback window of 12.
		Our row is valued at 284402000000.0, our lookback sum is 284402000000.0
	We dropped row 16, Total member's equity, with lookback window of 2.
		Our row is valued at 5639000000.0, our lookback sum is 5639000000.0
	We dropped row 17, Total liabilities and member's equity, with lookback window of 17.
		Our row is valued at 301341000000.0, our lookback sum is 301341000000.0
Working on 68136-2017-03-01
equal 0 16 0 16
replace 16 17 16 17
equal 17

Working on 72267-2014-05-30
delete 0 1 0 0
equal 1 18 0 17
delete 18 19 17 17
	We dropped row 13, Total liabilities, with lookback window of 12.
		Our row is valued at 124317519000.0, our lookback sum is 124317519000.0
	We dropped row 17, Total stockholder's equity, with lookback window of 2.
		Our row is valued at 3067937000.0, our lookback sum is 3067937000.0
	We dropped row 18, Total liabilities and stockholder's equity, with lookback window of 17.
		Our row is valued at 127385456000.0, our lookback sum is 127385456000.0
Working on 72267-2015-06-01
equal 0 13 0 13
replace 13 14 13 14
equal 14 18 14 18
	We dropped row 12, Total liabilities, with lookback window of 12.
		Our row is valued at 106030731000.0, our lookback sum is 106030731000.0
	We dropped row 16, Total stockholder's equity, with lookback window of 2.
		Our row is valued at 2929606000.0, our lookback sum is 2929606000.0
	We dropped row 17, Total liabilities and stockholder's equity, with lookback window of 17.
		Our row 

Working on 782124-2013-03-01
replace 0 3 0 13
	We dropped row 8, Total Liabilities, with lookback window of 2.
		Our row is valued at 716000000.0, our lookback sum is 716000000.0
Working on 782124-2014-03-05
equal 0 11 0 11
replace 11 12 11 12
equal 12 14 12 14
	We dropped row 11, Total liabilities, with lookback window of 11.
		Our row is valued at 355063000000.0, our lookback sum is 355063000000.0
	We dropped row 13, Total liabilities and member's equity, with lookback window of 13.
		Our row is valued at 372319000000.0, our lookback sum is 372319000000.0
Working on 782124-2015-02-27
equal 0 11 0 11
replace 11 12 11 12
equal 12 17 12 17
	We dropped row 11, Total liabilities, with lookback window of 11.
		Our row is valued at 367038000000.0, our lookback sum is 367038000000.0
	We dropped row 15, Total member's equity, with lookback window of 2.
		Our row is valued at 15293000000.0, our lookback sum is 15293000000.0
	We dropped row 16, Total liabilities and member's equity, with lookba

Working on 851376-2013-03-01
equal 0 18 0 18
	We dropped row 10, Total, with lookback window of 10.
		Our row is valued at 299299000000.0, our lookback sum is 299299000000.0
	We dropped row 16, Total stockholder's equity, with lookback window of 3.
		Our row is valued at 7264000000.0, our lookback sum is 7264000000.0
	We dropped row 17, Total liabilities and stockholder's equity, with lookback window of 17.
		Our row is valued at 309063000000.0, our lookback sum is 309063000000.0
Working on 851376-2014-03-04
equal 0 16 0 16
Working on 851376-2015-03-02
equal 0 18 0 18
	We dropped row 16, Total stockholder's equity, with lookback window of 3.
		Our row is valued at 7857000000.0, our lookback sum is 7857000000.0
	We dropped row 17, Total liabilities and stockholder's equity, with lookback window of 17.
		Our row is valued at 248026000000.0, our lookback sum is 248026000000.0
Working on 851376-2016-02-29
equal 0 19 0 19
	We dropped row 12, Total liabilities, with lookback window of 12.
		

Working on 853784-2016-03-01
equal 0 10 0 10
	We dropped row 6, Total liabilities, with lookback window of 6.
		Our row is valued at 70097715000.0, our lookback sum is 70097715000.0
	We dropped row 9, Total liabilities and stockholder's equity, with lookback window of 9.
		Our row is valued at 71580510000.0, our lookback sum is 71580510000.0
Working on 853784-2017-03-01
equal 0 10 0 10
	We dropped row 6, Total liabilities, with lookback window of 6.
		Our row is valued at 70269822000.0, our lookback sum is 70269822000.0
	We dropped row 9, Total liabilities and shareholder's equity, with lookback window of 9.
		Our row is valued at 71759167000.0, our lookback sum is 71759167000.0
Working on 853784-2018-03-05
equal 0 14 0 14
	We dropped row 6, Total liabilities, with lookback window of 6.
		Our row is valued at 89723445000.0, our lookback sum is 89723445000.0
	We dropped row 12, Shareholder's equity, with lookback window of 3.
		Our row is valued at 835153000.0, our lookback sum is 83515

Working on 91154-2011-03-01
equal 0 25 0 25
	We dropped row 2, Collateralized short-term financing agreements:, with lookback window of 2.
		Our row is valued at 15835000000.0, our lookback sum is 15835000000.0
	We dropped row 5, Trading account liabilities:, with lookback window of 2.
		Our row is valued at 131419000000.0, our lookback sum is 131419000000.0
	We dropped row 12, Payables and accrued liabilities:, with lookback window of 6.
		Our row is valued at 27875000000.0, our lookback sum is 27875000000.0
	We dropped row 18, Total liabilities, with lookback window of 18.
		Our row is valued at 272389000000.0, our lookback sum is 272389000000.0
	We dropped row 23, Total stockholder's equity, with lookback window of 3.
		Our row is valued at 16020000000.0, our lookback sum is 16010000000.0
	We dropped row 24, Total liabilities and stockholder's equity, with lookback window of 24.
		Our row is valued at 288409000000.0, our lookback sum is 288409000000.0
Working on 91154-2012-02-29
equ

In [279]:
# work on combining columns that are issued seperately
s3 = boto3.client('s3')
session = Session()
bucket = "ran-s3-systemic-risk"

# e.g. file name = 1224385-2004-03-01, 42352-2003-01-28
s3.download_file(bucket, 'Output/X-17A-5-SPLIT-PDFS/Assets/42352-2011-03-01.csv', 'temp.csv')
pdf_df = pd.read_csv('temp.csv')
s3.download_file(bucket, 'Output/X-17A-5-SPLIT-PNGS/Assets/42352-2011-03-01.csv', 'temp.csv')
png_df = pd.read_csv('temp.csv')
os.remove('temp.csv')

# do a special merge that combines unique line items names between PDF & PNG
temp_df1 = special_merge(pdf_df, png_df, '0')

# run accounting check to remove sub-totals for each respective line-item
df, total_flag, total_amt = totals_check(temp_df1)

# # we have that no "total asset" figure was found
# if total_flag == 2:
#     row = pd.DataFrame([["Total liabilities & shareholder's equity check", 
#                          "Total liabilities & shareholder's equity"], 
#                         ["Total liabilities & shareholder's equity not found", 
#                          total_amt]], 
#                        index=['0', '1']).T
#     df = df.append(row)

# # we have that "total asset" was found and matches
# elif total_flag == 1:
#     row = pd.DataFrame([["Total liabilities & shareholder's equity check", 
#                          "Total liabilities & shareholder's equity"], 
#                         ["Total liabilities & shareholder's equity found & match", 
#                          total_amt]], 
#                        index=['0', '1']).T
#     df= df.append(row)

# # we have that "total asset" was found, but did not match correctly
# elif total_flag == 0:
#     row = pd.DataFrame(["Total liabilities & shareholder's equity check", 
#                         'Total Liability & Equity found & no match'], 
#                        index=['0', '1']).T
#     df = df.append(row)

# # construct row for the unstructured data frame 
# export_df = unstructured_data(df, '2003-01-28', 2002, '42352', cik2brokers)


In [280]:
pdf_df

Unnamed: 0,0,1
0,Cash and cash equivalents,4507000000.0
1,Cash and securities segregated for regulatory ...,32780000000.0
2,"Securities borrowed (includes $64,750 at fair ...",191890000000.0
3,Securities purchased under agreements to resel...,122926000000.0
4,"Receivables from brokers, dealers and clearing...",9356000000.0
5,Receivables from customers and counterparties ...,21582000000.0
6,"Financial Instruments owned, at fair value",116885000000.0
7,Financial instruments owned and pledged as col...,23436000000.0
8,Total financial instruments owned. at fair value,139321000000.0
9,Other assets,4639000000.0


In [281]:
png_df

Unnamed: 0,0,1
0,Assets Cash and cash equivalents,4507000000.0
1,Cash and securities segregated for regulatory ...,32780000000.0
2,"Securities borrowed (includes $64,750 at fair ...",191890000000.0
3,Securities purchased under agreements to resel...,122926000000.0
4,"Receivables from brokers, dealers and clearing...",9356000000.0
5,Receivables from customers and counterparties ...,21582000000.0
6,"Financial instruments owned, at fair value",115885000000.0
7,Financial instruments owned and pledged as col...,23436000000.0
8,"Total financial instruments owned, at fair value",139321000000.0
9,Other assets,4639000000.0


In [285]:
df

Unnamed: 0,0,1
0,Cash and cash equivalents,4507000000.0
1,Cash and securities segregated for regulatory ...,32780000000.0
2,"Securities borrowed (includes $64,750 at fair ...",191890000000.0
3,Securities purchased under agreements to resel...,122926000000.0
4,"Receivables from brokers, dealers and clearing...",9356000000.0
5,Receivables from customers and counterparties ...,21582000000.0
6,"Financial Instruments owned, at fair value",116885000000.0
7,Financial instruments owned and pledged as col...,23436000000.0
8,Total financial instruments owned. at fair value,139321000000.0
9,Other assets,4639000000.0


In [287]:
df.iloc[8].iloc[1]

139321000000.0

In [292]:
df.iloc[6:8].sum().iloc[1]

140321000000.0

In [293]:
epsilon_error(139321000000.0, 140321000000.0)

False

In [296]:
abs(abs(140321000000.0 - 139321000000) / 139321000000) 

0.007177668836715212