In [42]:
import os
import re
import boto3
import json
import time
import requests
import datetime

import pandas as pd
import numpy as np

from sagemaker.session import Session
from bs4 import BeautifulSoup

## Accounting term matching
**Check whether a row reports a total (e.g. Total Liabilities & Shareholders' Equity) or a sub-total (e.g. Total Financial Instruments). These figures are not needed to construct the unstructured database, and removing them avoids classification issues.**

In [55]:
def totals_check(df:pd.DataFrame) -> pd.DataFrame:
    """
    Strip the rows of a balance sheet that represent totals or sub-totals.

    A row is treated as a total when its value matches the sum of a run of
    surviving rows directly above it. "Matches" means exactly equal, equal up
    to a power-of-ten scaling, equal except for one missing character in the
    printed number, or within a 0.01% relative difference. Rows already
    flagged as totals are excluded from later backward sums.
    ------------------------------------------------------------------------------------
    :param: df (type pandas.DataFrame)
        A DataFrame that represents the Asset or Liability & Equity portion of the
        balance sheet; the second column (by position) holds the values

    :return: (type pandas.DataFrame)
        Return a DataFrame without the rows that represent totals. The original
        index labels of the surviving rows are preserved. A frame with fewer than
        two columns is returned unchanged, since it has no values column to check.
    """
    # without a values column there is nothing to compare, leave the frame
    # untouched so the caller can report the malformed sheet
    if df.shape[1] < 2:
        return df

    def multiple_check(x1:float, x2:float) -> bool:
        """
        Determine whether the two values are the same number scaled by a power of 10,
        or whether x2 is x1 with exactly one character missing from its printed form
        """
        # prevent zero division error since x1 is the denominator and log10(0) is undefined
        if (x1 == 0) or (x2 == 0):
            return False

        # if our backward sum is a power-of-ten multiple of the line item, we return True
        # (e.g. Total Assets (x1) 745.2322 vs Backward Sum (x2) 7452322)
        # a negative ratio yields NaN, for which is_integer() is False
        check1 = np.log10(x2 / x1).is_integer()

        # if our backward sum is a substring of the line item, one character shorter, we return True
        # (e.g. Total Assets (x1) 174182935 vs Backward Sum (x2) 74182935)
        check2 = (str(x2) in str(x1)) and (len(str(x2)) == len(str(x1)) - 1)

        return bool(check1 or check2)

    def epsilon_error(x1:float, x2:float, epsilon:float=0.0001) -> bool:
        """
        Determine whether the two values are within a relative epsilon bound.
        We default our epsilon to 0.0001, i.e. two numbers within 0.01% of one
        another are considered equal (we make this assumption to handle rounding errors)
        """
        if (x1 == 0) or (x2 == 0):
            return False

        # relative difference measured against the line item itself
        return bool(abs(abs(x1 - x2) / x1) <= epsilon)

    values = df.iloc[:, 1]                   # the values column for balance sheet
    n_rows = len(df)
    keep = np.ones(n_rows, dtype=bool)       # False marks a row identified as a total

    # silence the invalid-value warning raised by log10 of a negative ratio, but only
    # inside this block so that NumPy's global error settings are left as we found them
    with np.errstate(invalid='ignore'):

        # iterate through each of the line items (by position)
        for i in range(n_rows):
            item1 = values.iloc[i]

            # grow the lookback window upward one row at a time
            for j in range(i):
                start = i - j - 1

                # rows start..i-1 that have not already been flagged as totals
                lookback = values.iloc[start:i][keep[start:i]]

                # an empty window means every row in it was already removed
                if lookback.empty:
                    continue

                # backward sum for line items
                item2 = lookback.sum()

                # once a match is found the row is a total, no need to continue the backward sum
                if (item1 == item2) or multiple_check(item1, item2) or epsilon_error(item1, item2, epsilon=0.0001):
                    keep[i] = False
                    print('Drop index {}, lookback at {} = with top sum of {}, lookback of {}'.format(df.index[i], j, item1, item2))
                    break

    return df[keep]

## Unstructured Database construction
**We build our unstructured database from the non-total rows of each balance sheet (concatenating the line items).**

In [44]:
def unstructured_data(df, filing_d, fiscal_y, cik, cik2name:dict) -> pd.DataFrame:
    """
    Forms unstructured row for larger database to be stored in s3 bucket
    ------------------------------------------------------------------------------------------
    Input:
        :param: df (type pandas.DataFrame)
            The balance sheet for a particular filing; first column holds the line item
            names, the remaining column(s) hold the values
        :param: filing_d (type str)
            The filing date for release of X-17A-5 filings for a broker dealer e.g. 2013-03-21
        :param: fiscal_y (type str)
            The fiscal year for the balance sheet to cover e.g. 2012 (usually 1-year prior to filing date)
        :param: cik (type str)
            The CIK number for a broker dealer e.g. 887767
        :param: cik2name (type dict)
            A dictionary whose 'broker-dealers' entry maps CIK to Broker Dealer names
    Output:
        :return: (type pandas.DataFrame or None)
             Return a transposed dataframe (line items as columns) with additional columns
             corresponding to filing data. Returns None, after printing a notice, when the
             balance sheet has fewer than two columns.
    Raises:
        KeyError when cik is not present in cik2name['broker-dealers']
    """

    # a usable balance sheet needs a line-item column and at least one value column;
    # check this before touching df.columns so an empty frame is reported, not raised
    if len(df.columns) <= 1:
        print('{}-{}.csv - encountered issue reading PDF'.format(cik, filing_d))
        return None

    # the first column holds the line item names
    first_column = df.columns[0]

    # group by line item (this becomes the index) and sum to merge identical names,
    # then transpose so that the line items become columns
    row = df.groupby(first_column).sum().T

    # creating additional columns in row
    row['CIK'] = cik                                  # CIK number for firm
    row['Filing Date'] = filing_d                     # Filing Date for firm filing
    row['Filing Year'] = fiscal_y                     # Year for balance sheet filing
    row['Name'] = cik2name['broker-dealers'][cik]     # name associated with the CIK

    return row
    

In [45]:
def reorder_columns(df:pd.DataFrame) -> pd.DataFrame:
    """
    Re-orders the completed unstructured database so the identifying columns come
    first, and discards every column that holds no values at all
    ------------------------------------------------------------------------------------------
    Input:
        :param: df (type pandas.DataFrame)
            The unstructured database for balance sheet figures
    Output:
        :return: (type pandas.DataFrame)
            Return a dataframe with dimensions less than or equal to input dataframe (MxN) -> (MxK),
            where K <= N
    """
    leading = ['CIK', 'Name', 'Filing Date', 'Filing Year']

    # identifying columns first, every other column afterwards in its existing order
    trailing = [name for name in df.columns if name not in leading]
    ordered = df[leading + trailing]

    # a column survives when at least one of its entries is not NaN
    has_value = ~ordered.isnull().all()
    surviving = has_value[has_value].index

    return ordered[surviving]

In [46]:
def _parse_split_filename(path:str) -> tuple:
    """
    Split an s3 key ending in '<CIK>-<YYYY>-<mm>-<dd>.csv' into (cik, filing_date, fiscal_year)
    """
    file_name = path.split('/')[-1]            # e.g. '1224385-2005-03-01.csv'
    parts = file_name[:-4].split('-')          # drop the last four characters (i.e. .csv)

    cik = parts[0]                             # extract the CIK number
    filing_date = '-'.join(parts[1:])          # join YYYY-mm-dd component for filing date
    fiscal_year = int(parts[1]) - 1            # fiscal year is generally the year before filing
    return cik, filing_date, fiscal_year


def _build_unstructured_rows(s3_client, bucket_name:str, paths:list, cik2name:dict,
                             label:str, local_csv:str='temp.csv') -> list:
    """
    Download each split balance sheet in `paths`, strip its totals and turn it into one
    unstructured row. Returns one entry per path (None where the sheet was unreadable).
    """
    rows = []
    for idx, key in enumerate(paths):
        cik, filing_date, fiscal_year = _parse_split_filename(key)

        # the local copy is only needed until it has been read; remove it even when the
        # download or the parse fails so that no stale file is left behind
        try:
            s3_client.download_file(bucket_name, key, local_csv)
            sheet = pd.read_csv(local_csv)
        finally:
            if os.path.exists(local_csv):
                os.remove(local_csv)

        # process balance sheet dataframes
        cleaned = totals_check(sheet)                     # accounting check removing any total rows
        rows.append(unstructured_data(cleaned,            # construct row of unstructured data frame
                                      filing_date,
                                      fiscal_year, cik, cik2name))

        if ((idx + 1) % 10) == 0:
            print('\tWe have completed {} - {} rows'.format(idx + 1, label))

    return rows


def _export_frame(s3_client, bucket_name:str, frame:pd.DataFrame, key:str, local_name:str) -> None:
    """
    Write `frame` to a local .csv (without the index), upload it to s3 under `key`,
    then remove the local copy whether or not the upload succeeded.
    """
    try:
        frame.to_csv(local_name, index=False)
        with open(local_name, 'rb') as data:
            s3_client.put_object(Bucket=bucket_name, Key=key, Body=data)
    finally:
        if os.path.exists(local_name):
            os.remove(local_name)


if __name__ == "__main__":

    # initiate s3 bucket and corresponding data folder
    bucket = "ran-s3-systemic-risk"
    asset_folder = "Output/X-17A-5-SPLIT/Assets/"
    liable_folder = "Output/X-17A-5-SPLIT/Liability & Equity/"
    out_folder = "Output/"

    # Amazon Textract client and Sagemaker session
    # NOTE(review): textract is not used in this cell; kept in case other cells rely on it
    textract = boto3.client('textract')
    s3 = boto3.client('s3')
    session = Session()

    # ==============================================================================
    # retrieving CIK-Dealers JSON file from s3 bucket and reading it into memory;
    # the local file is removed afterwards, even if the download or parse fails
    try:
        s3.download_file(bucket, 'Temp/CIKandDealers.json', 'temp.json')
        with open('temp.json', 'r') as f:
            cik2brokers = json.load(f)
    finally:
        if os.path.exists('temp.json'):
            os.remove('temp.json')
    # ==============================================================================

    # s3 paths where asset and liability paths are stored
    # NOTE(review): [1:] skips the first listed key - presumably the folder placeholder
    # object; confirm that the first entry is never a real .csv
    asset_paths = session.list_s3_files(bucket, asset_folder)[1:]
    liable_paths = session.list_s3_files(bucket, liable_folder)[1:]

    # --------------------------------------------
    # Asset Unstructured Database
    # --------------------------------------------
    print('Assets Unstructured Database')
    asset_concat = _build_unstructured_rows(s3, bucket, asset_paths, cik2brokers, 'asset')

    # --------------------------------------------
    # Liability & Equity Unstructured Database
    # --------------------------------------------
    print('\nLiability & Equity Unstructured Database')
    liable_concat = _build_unstructured_rows(s3, bucket, liable_paths, cik2brokers,
                                             'liability & equity')

    # --------------------------------------------
    # Database exportation
    # --------------------------------------------

    # pd.concat drops None entries (unreadable sheets) on its own
    asset_df = pd.concat(asset_concat)        # asset dataframe combining all rows
    asset_df = reorder_columns(asset_df)      # re-order columns for dataframe

    filename1 = 'unstructured_assets.csv'
    _export_frame(s3, bucket, asset_df, out_folder + filename1, filename1)

    liable_df = pd.concat(liable_concat)      # liability & equity dataframe combining all rows
    liable_df = reorder_columns(liable_df)    # re-order columns for dataframe

    filename2 = 'unstructured_liable.csv'
    _export_frame(s3, bucket, liable_df, out_folder + filename2, filename2)

    print('\nWe created an unstructured asset and liability & equity')

Assets Unstructured Database
	We have completed 10 - asset rows
	We have completed 20 - asset rows
	We have completed 30 - asset rows
	We have completed 40 - asset rows
	We have completed 50 - asset rows
	We have completed 60 - asset rows
	We have completed 70 - asset rows
	We have completed 80 - asset rows
	We have completed 90 - asset rows
	We have completed 100 - asset rows
	We have completed 110 - asset rows
	We have completed 120 - asset rows
	We have completed 130 - asset rows
	We have completed 140 - asset rows
	We have completed 150 - asset rows
	We have completed 160 - asset rows

Liability & Equity Unstructured Database
	We have completed 10 - liability & equity rows
	We have completed 20 - liability & equity rows
	We have completed 30 - liability & equity rows
	We have completed 40 - liability & equity rows
	We have completed 50 - liability & equity rows
	We have completed 60 - liability & equity rows
	We have completed 70 - liability & equity rows
	We have completed 80 - li

# Testing Sample

In [47]:
# every asset column whose name mentions "total", in any letter case
selection = [column for column in asset_df.columns if re.search('total', column, flags=re.I)]

In [48]:
selection

['Total assets', 'TOTAL ASSETS']

In [49]:
# columns that the driver cell attaches to every row to identify the filing
base_selection = ['CIK', 'Name', 'Filing Date', 'Filing Year']

In [51]:
# show the identifying columns next to one line item, keeping only the
# filings that actually report a value for it
selection = 'TOTAL ASSETS'
a = asset_df[base_selection + [selection]]
a[a[selection].notna()]

Unnamed: 0,CIK,Name,Filing Date,Filing Year,TOTAL ASSETS
1,782124,J.P. MORGAN SECURITIES LLC,2004-05-07,2003,25673078.0


In [52]:
# Pull a single split asset sheet from s3 so it can be inspected by hand
# e.g. file name 'Output/X-17A-5-SPLIT/Assets/91154-2020-03-02.csv'
# NOTE(review): 'temp.csv' is not removed in this cell, unlike the batch loop which
# deletes its local copy after reading
s3.download_file(bucket, 'Output/X-17A-5-SPLIT/Assets/782124-2004-05-07.csv', 'temp.csv')
df = pd.read_csv('temp.csv')

In [53]:
df

Unnamed: 0,0,1
0,Cash equivalents,23055242.0
1,Brokerage fees receivable,64237.0
2,"Investments, at fair value",232559.0
3,Due from Calvert mutual funds,1257153.0
4,Prepaid expenses and other assets,956284.0
5,Equipment,509758.0
6,Furniture and fixtures,124634.0
7,Less: Accumulated depreciation,-526789.0
8,Net property and equipment,107603.0
9,TOTAL ASSETS,25673078.0


In [56]:
totals_check(df)

Drop index 7, lookback at 0 = with top sum of -526789.0, lookback of 124634.0


Unnamed: 0,0,1
0,Cash equivalents,23055242.0
1,Brokerage fees receivable,64237.0
2,"Investments, at fair value",232559.0
3,Due from Calvert mutual funds,1257153.0
4,Prepaid expenses and other assets,956284.0
5,Equipment,509758.0
6,Furniture and fixtures,124634.0
8,Net property and equipment,107603.0
9,TOTAL ASSETS,25673078.0


In [62]:
abs(abs(-526789.0 - -526769.0) / -526789.0)

3.7965864890876616e-05

In [61]:
abs(-526789.0 - -526769.0)

20.0