In [16]:
import os
import re
import boto3
import json
import time
import requests
import datetime

import pandas as pd
import numpy as np

from sagemaker.session import Session
from bs4 import BeautifulSoup

## Accounting term matching
**Check to see if we report totals (e.g. Total Liability & Shareholder Equity) or sub-totals (e.g. Total Financial Instruments), these figure are not needed for construction of the unstructured database (avoid classification issue)**

In [17]:
def totals_check(df:pd.DataFrame) -> pd.DataFrame:
    """
    Checks to see if a line row meets the conditon of a total, if true we remove these rows as we make 
    have checked the terms before have meet our conditions (these include major and minor totals)
    ------------------------------------------------------------------------------------
    :param: (type pandas.DataFrame)
        A DataFrame that represents the Asset or Liability & Equity portion of the balance sheet
        
    :return: (type tuple)
        Return a cleaned DataFrame that strips the rows that represent totals
    """
    m, n = df.shape    # unpack the shape of dataframe
    
    # numpy exception for handling invalid log10 RunTime error (we opt to not show)
    # switch 'ignore' to 'warn', if you would like to flag the RunTime error 
    np.seterr(invalid = 'ignore') 
    
    def multiple_check(x1:float, x2:float):
        """
        Determine whether the two values are the same number scaled by 10
        """
        # prevent zero division error since x1 is the denominator and log10 zero division error
        if (x1 == 0) or (x2 == 0): return False
        else:
            # if our backward sum is a multiple of 10, we return True 
            # (e.g. Total Assets (x1) 745.2322 vs Backward Sum (x2) 7452322)
            check1 = np.log10(x2 / x1).is_integer()
             
            # if our backward sum is a substring of a line item, with a difference of one in length, we return True 
            # (e.g. Total Assets (x1) 174182935 vs Backward Sum (x2) 74182935)
            check2 = (str(x2) in str(x1) ) & (len(str(x2)) == len(str(x1)) - 1)
            
            if check1 or check2: 
                return True
            else: 
                return False
    
    def epsilon_error(x1:float, x2:float, tol:float=0.01):
        """
        Determine whether the two values are within a similar epsilon bound. We default our error tolerance
        to 0.01, implying that if two numbers are within a specified toleracnce (default 1% ) of one another 
        """
        if (x1 == 0) or (x2 == 0): return False
        else:
            # first we convert the numeric quantities into strings
            current = str(x1)
            lookback = str(x2)
            
            # we only want to check against the relative difference if one element in the number is read wrong
            if len(current) == len(lookback):

                # we iterate linearly through each string and check to see the positional match 
                # if we catch and mismatch we flag it with a 1, othewise skip with 0
                changes = [0 if (current[i] == lookback[i]) else 1 for i in range(len(current))]
                
                # check set differences produce a set with exactly 1 in length
                if sum(changes) == 1:
                
                    diff = abs(x1 - x2)      # compute numeric differences

                    # check to see whether an accounting condition was met wihtin a boundary condition
                    if abs(diff / x1) <= tol:
                        return True
            
            return False
    
    data_col = df.columns[1]         # the values column for balance sheet
    
    # iterate through each of the line items
    for i in range(m):
        # check the value of line items at a given index (forward index)
        item1 = df.loc[i].values[1]
        name = df.loc[i].values[0]
        
        # compute backward sum (lookback index) 
        for j in range(i):
            
            # check whether dataframe empty (if so we skip to avoid fitting errors)
            # NOTE: Index position (i-1)   = line above current line
            #                      (i-j-1) = trailing look up line 'j' lines above the line above current line
            lookback = df.loc[i-j-1:i-1][data_col]
            
            # we check whether the lookback period is empty (if so we most likely deleted the row)
            if not lookback.empty:
                # backward sum for line items (index minus j-periods before)
                item2 = lookback.sum()

                # if we achieve this then we strip totals and break, no need to continue backward sum
                if (item1 == item2) or multiple_check(item1, item2) or epsilon_error(item1, item2, tol=0.01):
                    
                    ##
                    # Add error handle for when total liability, total equity, total liabilty and equity and 
                    # total assets terms appear in the sample set (if present)
                    ##
                    
                    
                    df = df.drop(index=i)
                    
                    # Error Handling for row deletions (uncomment for when not in use)
                    print('\tWe dropped row {}, {}, with lookback window of {}.'.format(i, name, j+1))
                    print('\t\tOur row is valued at {}, our lookback sum is {}'.format(item1, item2))
                    
                    # we break from inner loop to avoid key error flag 
                    break     
                
    return df

## Unstructured Database construction
**We develop our unstructured database from each of the non-total rows (concating the line items)**

In [18]:
def unstructured_data(df, filing_d, fiscal_y, cik, cik2name:dict) -> pd.DataFrame:
    """
    Forms unstructured row for larger database to be stored in s3 bucket
    ------------------------------------------------------------------------------------------
    Input:
        :param: df (type pandas.DataFrame)
            The balance sheet for a particular  
        :paran: filing_d (type str)
            The filing date for release of X-17A-5 filings for a broker dealer e.g. 2013-03-21
        :paran: fiscal_y (type str)
            The fiscal year for the balance sheet to cover e.g. 2012 (usually 1-year prior to filing date)
        :paran: cik (type str)
            The CIK number for a broker dealer e.g. 887767
        :paran: cik2name (type dict)
            A dictionary that maps CIK to Broker Deale names 
    Output:  
        :return: (type pandas.DataFrame)
             Return a transposed dataframe with additional columns corresponding to filing data
    """
    
    # intialize the first column (line items)
    first_column = df.columns[0]
    
    # clean dataframe should be of size greater than 1
    if len(df.columns) > 1:
        
        # transpose split balance sheet figure (our line items are now columns)
        # we first groupby the first column (this become index) and sum to group congruent names
        row = df.groupby(first_column).sum().T

        # creating additional columns in row
        row['CIK'] = cik                                  # CIK number for firm 
        row['Filing Date'] = filing_d                     # Filing Date for firm filing
        row['Filing Year'] = fiscal_y                     # Year for balance sheet filing
        row['Name'] = cik2name['broker-dealers'][cik]     # returns the name of associated with the CIK
        
        return row
    
    else:
        print('{}-{}.csv - encountered issue reading PDF'.format(cik, filing_d))
        return None
    

In [19]:
def totals_flag(df:pd.DataFrame):
    """
    Check to see whether their exists the presence of a total term from the line items
    ------------------------------------------------------------------------------------------
    """
    # perform regex search 
    selection = [i for i in df.columns if re.search('total|totals', i, flags=re.I) is not None]
    
    # if there exists some mention of the word total in a line item
    # we know to re-run structuring on PNG rather than PDF
    if len(selection) > 0:
        return True
    
    return False    

In [20]:
def reorder_columns(df:pd.DataFrame) -> pd.DataFrame:
    """
    Re-order the completed DataFrame by ordering the CIK, Name, Filing Data and Filing Year 
    ------------------------------------------------------------------------------------------
    Input:
        :param: df (type pandas.DataFrame)
            The unstructured database for balance sheet figures
    Output:  
        :return: (type pandas.DataFrame)
            Return a dataframe with dimensions less than or equal to input dataframe (MxN) -> (MxK), 
            where K <= N
    """
    # re-order the CIK and Year columns to appear as the first two columns
    remap = df.columns[~np.isin(df.columns, ['CIK', 'Name', 'Filing Date', 'Filing Year'])]   # select all other columns
    df = df[np.insert(remap, [0, 0, 0, 0], ['CIK', 'Name', 'Filing Date', 'Filing Year'])]    # insert new order

    # filter out columns with NaN values
    filterNaN = df.isnull().all()                      # find if any column is all NaN 
    cleanCols = filterNaN[filterNaN == False].index    # select columns with at least one value

    # clean dataframe for unstructured asset terms
    return df[cleanCols]

## Final Main Execution

In [21]:
if __name__ == "__main__":
    
    # initiate s3 bucket and corresponding data folder
    bucket = "ran-s3-systemic-risk"
    
    pdf_asset_folder = "Output/X-17A-5-SPLIT-PDFS/Assets/"
    pdf_liable_folder = "Output/X-17A-5-SPLIT-PDFS/Liability & Equity/"
    
    png_asset_folder = "Output/X-17A-5-SPLIT-PNGS/Assets/"
    png_liable_folder = "Output/X-17A-5-SPLIT-PNGS/Liability & Equity/"
    
    out_folder = "Output/"

    # Amazon Textract client and Sagemaker session
    textract = boto3.client('textract')
    s3 = boto3.client('s3')
    session = Session()
    
    # ==============================================================================
    # ALL TEMPORARY FILE INFORMATION 
    # ==============================================================================
    # retrieving CIK-Dealers JSON file from s3 bucket
    s3.download_file(bucket, 'Temp/CIKandDealers.json', 'temp.json')
    with open('temp.json', 'r') as f: cik2brokers = json.loads(f.read())
        
    # retrieving ASSET items and their corresponding model label
    s3.download_file(bucket, 'Temp/assetML.json', 'temp.json')
    with open('temp.json', 'r') as f: asset_ml = json.loads(f.read())
        
    # retrieving Liability & Equity items and their corresponding model label
    s3.download_file(bucket, 'Temp/liabilityML.json', 'temp.json')
    with open('temp.json', 'r') as f: liable_ml = json.loads(f.read())

    # remove local file after it has been created (variable is stored in memory)
    os.remove('temp.json')
    # ==============================================================================
    
    # s3 paths where asset and liability paths are stored
    asset_paths = session.list_s3_files(bucket, pdf_asset_folder)
    liable_paths = session.list_s3_files(bucket, pdf_liable_folder)
    
    # intialize list to store dataframes for asset and liability & equity
    asset_concat = [0] * len(asset_paths)
    liable_concat = [0] * len(liable_paths)
    
    # --------------------------------------------
    # Asset Unstructured Database
    # --------------------------------------------
    print('Assets Unstructured Database')
    for idx, csv in enumerate(asset_paths):
        fileName = csv.split('/')[-1]        # e.g. '1224385-2005-03-01.csv'
        csv_strip = fileName[:-4]            # ignore last four elements from the back (i.e. .csv)
        
        # construct a string measure of important data measures 
        data_split = csv_strip.split('-')              
        filing_date = '-'.join(data_split[1:])         # join YYYY-mm-dd component for filing date
        fiscal_year = int(data_split[1]) - 1           # fiscal year are generally the previous year of filing
        cik = data_split[0]                            # extract the CIK number  
        
        # first load in both the PNG and PDF split balance sheets
        # NOTE: All these balance sheets are cleaned numerical values
        s3.download_file(bucket, csv, 'temp.csv')
        pdf_df = pd.read_csv('temp.csv')
        
        s3.download_file(bucket, png_asset_folder + fileName, 'temp.csv')
        png_df = pd.read_csv('temp.csv')
        
        # remove local file after it has been created
        os.remove('temp.csv')
        print('Working on {}'.format(csv_strip))
        
        # process balance sheet dataframes
        temp_df1 = totals_check(pdf_df)                # run an accounting check removing any total rows
        temp_df2 = totals_check(png_df)
        
        # 
        # Add check for determing which read balance sheet is the correct one and which
        # one is incorrect, compare dataframes https://kanoki.org/2019/07/04/pandas-difference-between-two-dataframes/
        #
        
        
        
        # check to see which sum is larger (we take the assumption that the larger variant is correct)
        if temp_df1[temp_df1.columns[1]].sum() > temp_df1[temp_df1.columns[1]].sum():

            export_df = unstructured_data(temp_df1,         # construct row of unstructured data frame 
                                          filing_date, 
                                          fiscal_year, cik, cik2brokers)
        else:
            export_df = unstructured_data(temp_df2, filing_date, fiscal_year, cik, cik2brokers)
        
        # stores the reported data frame 
        asset_concat[idx] = export_df
     
    print('\n\n\n\n')
        
    # --------------------------------------------
    # Liability & Equity Unstructured Database
    # --------------------------------------------
    print('\nLiability & Equity Unstructured Database')
    for idx, csv in enumerate(liable_paths):
        fileName = csv.split('/')[-1]        # e.g. '1224385-2005-03-01.csv'
        csv_strip = fileName[:-4]            # ignore last four elements from the back (i.e. .csv)
        
        # construct a string measure of important data measures 
        data_split = csv_strip.split('-')              
        filing_date = '-'.join(data_split[1:])         # join YYYY-mm-dd component for filing date
        fiscal_year = int(data_split[1]) - 1           # fiscal year are generally the previous year of filing
        cik = data_split[0]                            # extract the CIK number  
        
        # work on combining columns that are issued seperately
        s3.download_file(bucket, csv, 'temp.csv')
        pdf_df = pd.read_csv('temp.csv')
        
        s3.download_file(bucket, png_liable_folder + fileName, 'temp.csv')
        png_df = pd.read_csv('temp.csv')
        
        # remove local file after it has been created
        os.remove('temp.csv')
        print('Working on {}'.format(csv_strip))
        
        # process balance sheet dataframes
        temp_df1 = totals_check(pdf_df)                # run an accounting check removing any total rows
        temp_df2 = totals_check(png_df)
        
        # check to see which sum is larger (we take the assumption that the larger variant is correct)
        if temp_df1[temp_df1.columns[1]].sum() > temp_df1[temp_df1.columns[1]].sum():

            export_df = unstructured_data(temp_df1,         # construct row of unstructured data frame 
                                          filing_date, 
                                          fiscal_year, cik, cik2brokers)
        else:
            export_df = unstructured_data(temp_df2, filing_date, fiscal_year, cik, cik2brokers)
        
        # stores the reported data frame 
        liable_concat[idx] = export_df
    
    # --------------------------------------------
    # Database exportation
    # --------------------------------------------
    
    asset_df = pd.concat(asset_concat)        # asset dataframe combining all rows from 
    asset_df = reorder_columns(asset_df)      # re-order columns for dataframe
    
    # writing data frame to .csv file
    filename1 = 'unstructured_assets.csv'
    asset_df.to_csv(filename1, index=False)
    
    # write .csv file to s3
    with open(filename1, 'rb') as data:
        s3.put_object(Bucket=bucket, Key=out_folder + filename1, Body=data)
    
    liable_df = pd.concat(liable_concat)      # liablity & equity dataframe combining all rows from 
    liable_df = reorder_columns(liable_df)    # re-order columns for dataframe
    
    # writing data frame to .csv file
    filename2 = 'unstructured_liable.csv'
    liable_df.to_csv(filename2, index=False)
    
    # write .csv file to s3
    with open(filename2, 'rb') as data:
        s3.put_object(Bucket=bucket, Key=out_folder + filename2, Body=data)
    
    # remove local file after it has been created
    os.remove(filename1)
    os.remove(filename2)
    
    print('\nWe created an unstructured asset and liability & equity')

Assets Unstructured Database
Working on 1224385-2004-03-01
	We dropped row 10, Total assets, with lookback window of 10.
		Our row is valued at 21509787000.0, our lookback sum is 21509787000.0
	We dropped row 10, Total assets, with lookback window of 10.
		Our row is valued at 21509787000.0, our lookback sum is 21509787000.0
Working on 1224385-2005-03-01
	We dropped row 9, Total assets, with lookback window of 9.
		Our row is valued at 35611655000.0, our lookback sum is 35611655000.0
	We dropped row 9, Total assets, with lookback window of 9.
		Our row is valued at 35611655000.0, our lookback sum is 35611655000.0
Working on 1224385-2006-03-01
	We dropped row 9, Total assets, with lookback window of 9.
		Our row is valued at 24029490000.0, our lookback sum is 24029490000.0
	We dropped row 9, Total assets, with lookback window of 9.
		Our row is valued at 24029490000.0, our lookback sum is 24029490000.0
Working on 1224385-2007-03-01
	We dropped row 9, Total assets, with lookback window o

Working on 42352-2010-03-01
	We dropped row 8, Total financial instruments owned, at fair value, with lookback window of 2.
		Our row is valued at 132743000000.0, our lookback sum is 132743000000.0
	We dropped row 10, Total assets, with lookback window of 10.
		Our row is valued at 463755000000.0, our lookback sum is 463755000000.0
	We dropped row 8, Total financial instruments owned, at fair value, with lookback window of 2.
		Our row is valued at 132743000000.0, our lookback sum is 132743000000.0
	We dropped row 10, Total assets, with lookback window of 10.
		Our row is valued at 463755000000.0, our lookback sum is 463755000000.0
Working on 42352-2011-03-01
	We dropped row 8, Total financial instruments owned, at fair value, with lookback window of 2.
		Our row is valued at 139321000000.0, our lookback sum is 139321000000.0
	We dropped row 10, Total assets, with lookback window of 10.
		Our row is valued at 527001000000.0, our lookback sum is 527001000000.0
Working on 42352-2012-02-2

Working on 58056-2020-03-02
	We dropped row 9, Total assets, with lookback window of 9.
		Our row is valued at 80071000000.0, our lookback sum is 80071000000.0
	We dropped row 9, Total assets, with lookback window of 9.
		Our row is valued at 80071000000.0, our lookback sum is 80071000000.0
Working on 58056-2021-03-01
	We dropped row 9, Total assets, with lookback window of 9.
		Our row is valued at 72130000000.0, our lookback sum is 72130000000.0
	We dropped row 9, Total assets, with lookback window of 9.
		Our row is valued at 72130000000.0, our lookback sum is 72130000000.0
Working on 68136-2002-01-29
	We dropped row 15, Total assets, with lookback window of 15.
		Our row is valued at 267625461000.0, our lookback sum is 267625461000.0
	We dropped row 15, Total assets, with lookback window of 15.
		Our row is valued at 267625461000.0, our lookback sum is 267625461000.0
Working on 68136-2003-01-30
	We dropped row 15, Total assets, with lookback window of 15.
		Our row is valued at 265

Working on 72267-2005-05-31
	We dropped row 11, Total assets, with lookback window of 11.
		Our row is valued at 78732111000.0, our lookback sum is 78732111000.0
	We dropped row 11, Total assets, with lookback window of 11.
		Our row is valued at 178732111000.0, our lookback sum is 78732111000.0
Working on 72267-2006-05-30
	We dropped row 11, Total assets, with lookback window of 11.
		Our row is valued at 74182935000.0, our lookback sum is 74182935000.0
	We dropped row 11, Total assets, with lookback window of 11.
		Our row is valued at 174182935000.0, our lookback sum is 74182935000.0
Working on 72267-2007-05-29
	We dropped row 11, Total assets, with lookback window of 11.
		Our row is valued at 87542832000.0, our lookback sum is 87542832000.0
	We dropped row 11, Total assets, with lookback window of 11.
		Our row is valued at 87542832000.0, our lookback sum is 87542832000.0
Working on 72267-2008-05-30
	We dropped row 11, Total assets, with lookback window of 11.
		Our row is valued 

	We dropped row 11, TOTAL ASSETS, with lookback window of 11.
		Our row is valued at 150559829000.0, our lookback sum is 150559829000.0
Working on 782124-2004-01-30
	We dropped row 11, TOTAL ASSETS, with lookback window of 11.
		Our row is valued at 164601243000.0, our lookback sum is 164601243000.0
	We dropped row 11, TOTAL ASSETS, with lookback window of 11.
		Our row is valued at 164601243000.0, our lookback sum is 164601243000.0
Working on 782124-2004-05-07
	We dropped row 8, Net property and equipment, with lookback window of 3.
		Our row is valued at 107603.0, our lookback sum is 107603.0
	We dropped row 9, TOTAL ASSETS, with lookback window of 9.
		Our row is valued at 25673078.0, our lookback sum is 25673078.0
	We dropped row 8, Net property and equipment, with lookback window of 3.
		Our row is valued at 107603.0, our lookback sum is 107603.0
	We dropped row 9, TOTAL ASSETS, with lookback window of 9.
		Our row is valued at 25673078.0, our lookback sum is 25673078.0
Working on

ClientError: An error occurred (404) when calling the HeadObject operation: Not Found

In [24]:
print(csv)
print(png_asset_folder + fileName)

Output/X-17A-5-SPLIT-PDFS/Assets/782124-2014-03-05.csv
Output/X-17A-5-SPLIT-PNGS/Assets/782124-2014-03-05.csv


In [25]:
 # work on combining columns that are issued seperately
s3.download_file(bucket, 'Output/X-17A-5-SPLIT-PDFS/Assets/42352-2003-01-28.csv', 'temp.csv')
pdf_df = pd.read_csv('temp.csv')

s3.download_file(bucket, 'Output/X-17A-5-SPLIT-PNGS/Assets/42352-2003-01-28.csv', 'temp.csv')
png_df = pd.read_csv('temp.csv')

# default check the number of lines present (larger is correct)

In [33]:
totals_check(pdf_df)

	We dropped row 7, Total financial instruments owned, at fair value, with lookback window of 2.
		Our row is valued at 40528247000.0, our lookback sum is 40528247000.0


Unnamed: 0,0,1
0,and other regulations,16568220000.0
1,"Receivables from brokers, dealers and clearing...",3964833000.0
2,Receivables from customers and counterparties,10265520000.0
3,Securities borrowed,105728000000.0
4,Securities purchased under agreements to resell,32394690000.0
5,"Financial instruments owned, at fair value",32047940000.0
6,Financial instruments owned and pledged as col...,8480307000.0
8,Other assets,1716894000.0


In [34]:
totals_check(png_df)

	We dropped row 8, Total financial instruments owned, at fair value, with lookback window of 2.
		Our row is valued at 40528247000.0, our lookback sum is 40528247000.0


Unnamed: 0,0,1
0,Cash and cash equivalents,2816419000.0
1,and other regulations,16568220000.0
2,"Receivables from brokers, dealers and clearing...",3964833000.0
3,Receivables from customers and counterparties,10265520000.0
4,Securities borrowed,105728000000.0
5,Securities purchased under agreements to resell,32394690000.0
6,"Financial instruments owned, at fair value",32047940000.0
7,Financial instruments owned and pledged as col...,8480307000.0
9,Other assets,1716894000.0


In [None]:
# look for total assets, keep if the sum matches in the accounting check (Total Asset match Sum of Assets) 
# if match is wrong (Total Asset does not match sum of assets)
# if no total assets is not found, (no total asset found) (No Total Asset found)