In [9]:
import os
import re
import boto3
import json
import requests

import pandas as pd
import numpy as np

from sagemaker.session import Session
from bs4 import BeautifulSoup

In [10]:
# S3 bucket used by this notebook, plus the key prefixes under which the
# cleaned (totals-stripped) asset and liability/equity tables are stored
bucket = "ran-s3-systemic-risk"
asset_folder = "Output/X-17A-5-Clean/Assets/"
liable_folder = "Output/X-17A-5-Clean/Liability & Equity/"

# Amazon Textract client, S3 client and SageMaker session
# (the session is used below to list the objects stored under an S3 prefix)
# NOTE(review): `textract` is not referenced by any cell visible here - confirm it is still needed
textract = boto3.client('textract')
s3 = boto3.client('s3')
session = Session()

## Accounting term matching
**Check whether a reported line is a total (major or minor); total rows are not needed, with the exception of the total assets field.**

In [11]:
def totals_check(df:pd.DataFrame) -> pd.DataFrame:
    """
    Remove rows whose value is a (major or minor) total of the rows directly above them.

    For every row `i` the value in the second column is compared against the running
    sums of the rows immediately above it (1 row back, 2 rows back, ... up to the top
    of the table), using the column labelled '1'. The row is dropped as soon as one
    of those backward sums either equals the value, or matches it according to
    `multiple_check` (same number scaled by a power of 10, or the same number with
    one character missing).

    Rows already dropped are excluded from later backward sums, because the sums are
    taken from the progressively reduced frame.

    Input:
        :param: df (type pd.DataFrame)
            Balance sheet table. Rows are addressed by label with `df.loc[i]` for
            i = 0..len(df)-1, so a default integer index is expected; the numeric
            values are read from the second column / the column named '1'.
    Return:
        :param: (type pd.DataFrame)
            A frame without the rows identified as totals (the input is not modified,
            the original index labels of the remaining rows are kept)
    """

    def multiple_check(x1:float, x2:float) -> bool:
        """
        Determine whether the two values are the same number scaled by 10,
        or whether x2 is x1 with a single character missing
        """
        # A zero on either side can never be a scaled or truncated copy of the other value.
        # x1 == 0 would divide by zero; x2 == 0 (e.g. the sum over an empty lookback window
        # left behind by a dropped total) would otherwise pass the substring test for
        # values such as 40 ('0' in '40') and wrongly delete a genuine line item.
        if x1 == 0 or x2 == 0:
            return False

        # if number is a multiple of 10 we return True (e.g. Total Assets 745.2322 vs Backward Sum 7452322)
        # a negative ratio yields NaN, for which is_integer() is False
        check1 = np.log10(x2 / x1).is_integer()

        # if number total is a substring we return True (e.g. Total Assets 174182935 vs Backward Sum 74182935)
        check2 = (str(x2) in str(x1)) and (len(str(x2)) == len(str(x1)) - 1)

        return bool(check1 or check2)

    # silence the invalid-value RuntimeWarning raised by log10 of a negative ratio, but only
    # for the duration of this check, so the caller's numpy error settings are left untouched
    with np.errstate(invalid='ignore'):
        for i in range(df.shape[0]):
            # check the value of at a given index (forward index)
            item1 = df.loc[i].values[1]

            # compute backward sum (lookback index), widening the window one row at a time
            for j in range(i):
                # backward sum (index minus j-periods before)
                item2 = df.loc[i-j-1:i-1]['1'].sum()

                # if we achieve this then we strip totals and break, no need to continue backward sum
                if (item1 == item2) or multiple_check(item1, item2):
                    df = df.drop(index=i)
                    break     # row i is gone, stop looking back for it

    return df

In [119]:
def accounting_fix(in_folder:str, out_folder:str):
    """
    Wrapper function for executing the totals check on every .csv stored under an S3 prefix

    Each object listed under `in_folder` (the first listed entry is skipped) is downloaded
    from `bucket`, passed through `totals_check`, and the resulting table is uploaded as
    .csv to `out_folder` under the same file name.

    Input:
        :param: in_folder (type str)
            S3 key prefix of the tables to check e.g. 'Output/X-17A-5-Split/Assets/'
        :param: out_folder (type str)
            S3 key prefix the checked tables are written to (file name is appended as-is)
    """
    # NOTE(review): the first listed key is dropped - presumably the folder placeholder object; confirm
    paths = session.list_s3_files(bucket, in_folder)[1:]

    # local scratch file used for the download (the S3 objects are .csv files)
    local_copy = 'temp.csv'

    for key in paths:
        fileName = key.split('/')[-1]

        try:
            s3.download_file(bucket, key, local_copy)
            df = pd.read_csv(local_copy)
        finally:
            # always remove the scratch file, even if the download or the parse failed
            if os.path.exists(local_copy):
                os.remove(local_copy)

        # run an accounting check for numeric figures
        tempDF = totals_check(df)

        # serialise in memory and upload directly, no local output file is left behind
        body = tempDF.to_csv(index=False).encode('utf-8')
        s3.put_object(Bucket=bucket, Key=out_folder + fileName, Body=body)

        print('Checked accounting identity for {}'.format(fileName))

    print('\nOur check has been cleared and removed line items accordingly')

In [120]:
# strip total rows from every split asset table and upload the results under asset_folder
accounting_fix('Output/X-17A-5-Split/Assets/', asset_folder)

Checked accounting identity for 1224385-2004.csv
Checked accounting identity for 1224385-2005.csv
Checked accounting identity for 1224385-2006.csv
Checked accounting identity for 1224385-2007.csv
Checked accounting identity for 1224385-2008.csv
Checked accounting identity for 1224385-2009.csv
Checked accounting identity for 1224385-2010.csv
Checked accounting identity for 1224385-2011.csv
Checked accounting identity for 1224385-2012.csv
Checked accounting identity for 1224385-2013.csv
Checked accounting identity for 1224385-2014.csv
Checked accounting identity for 1224385-2015.csv
Checked accounting identity for 1224385-2016.csv
Checked accounting identity for 1224385-2017.csv
Checked accounting identity for 1224385-2018.csv
Checked accounting identity for 1224385-2019.csv
Checked accounting identity for 1224385-2020.csv
Checked accounting identity for 42352-2002.csv
Checked accounting identity for 42352-2003.csv




Checked accounting identity for 42352-2004.csv
Checked accounting identity for 42352-2005.csv
Checked accounting identity for 42352-2006.csv
Checked accounting identity for 42352-2007.csv
Checked accounting identity for 42352-2008.csv
Checked accounting identity for 42352-2010.csv
Checked accounting identity for 42352-2013.csv
Checked accounting identity for 42352-2014.csv
Checked accounting identity for 42352-2015.csv
Checked accounting identity for 42352-2016.csv
Checked accounting identity for 42352-2017.csv
Checked accounting identity for 42352-2018.csv
Checked accounting identity for 42352-2019.csv
Checked accounting identity for 58056-2002.csv
Checked accounting identity for 58056-2003.csv
Checked accounting identity for 58056-2004.csv
Checked accounting identity for 58056-2005.csv
Checked accounting identity for 58056-2006.csv
Checked accounting identity for 58056-2007.csv
Checked accounting identity for 58056-2008.csv
Checked accounting identity for 58056-2009.csv
Checked accou

In [121]:
# strip total rows from every split liability & equity table and upload the results under liable_folder
accounting_fix('Output/X-17A-5-Split/Liability & Equity/', liable_folder)



Checked accounting identity for 1224385-2004.csv
Checked accounting identity for 1224385-2005.csv
Checked accounting identity for 1224385-2006.csv
Checked accounting identity for 1224385-2007.csv
Checked accounting identity for 1224385-2008.csv
Checked accounting identity for 1224385-2009.csv
Checked accounting identity for 1224385-2010.csv
Checked accounting identity for 1224385-2011.csv
Checked accounting identity for 1224385-2012.csv
Checked accounting identity for 1224385-2013.csv
Checked accounting identity for 1224385-2014.csv
Checked accounting identity for 1224385-2015.csv
Checked accounting identity for 1224385-2016.csv
Checked accounting identity for 1224385-2017.csv
Checked accounting identity for 1224385-2018.csv
Checked accounting identity for 1224385-2019.csv
Checked accounting identity for 1224385-2020.csv
Checked accounting identity for 42352-2002.csv
Checked accounting identity for 42352-2003.csv
Checked accounting identity for 42352-2004.csv
Checked accounting identit

## Unstructured Database Construction

In [122]:
# names already resolved in this kernel session, keyed by str(cik); avoids requesting the
# same EDGAR page once per yearly filing of the same firm
_COMPANY_NAME_CACHE = {}

def companyName(cik) -> str:
    """
    Returns the company name for a given CIK number from the SEC by parsing the Edgar site
    e.g. https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=1904&type=X-17A-5&dateb=20201231

    Input:
        :param: cik (type str)
            The CIK number for a broker dealer e.g. 887767
    Return:
        :param: (type str)
            Returns the accompanying name with the CIK provided e.g. 1ST GLOBAL CAPITAL CORP.
            Returns None when the page holds no company name element (nothing is cached
            in that case, so a later call retries the request).
    Raises:
        requests.exceptions.RequestException if the request fails or takes longer than
        the timeout (previously a stalled connection would block indefinitely)
    """
    cache_key = str(cik)
    if cache_key in _COMPANY_NAME_CACHE:
        return _COMPANY_NAME_CACHE[cache_key]

    # company search on EDGAR; the query string is assembled (and escaped) by requests
    res = requests.get(
        'https://www.sec.gov/cgi-bin/browse-edgar',
        params={'action': 'getcompany', 'CIK': cik, 'type': 'X-17A-5', 'dateb': '20201231'},
        allow_redirects=True,
        timeout=30,
    )
    s1 = BeautifulSoup(res.text, 'html.parser')

    # first company information element on the page for this CIK
    val = s1.find('span', attrs={"class": "companyName"})
    if val is None:
        return None

    # keep only the text before the 'CIK' label and before any '/BD' suffix
    name = val.text.split('CIK')[0].split('/BD')[0]
    _COMPANY_NAME_CACHE[cache_key] = name
    return name

In [123]:
# load the asset-side and liability-side line item names (stored as JSON in .txt files)
with open('assetLines.txt', 'r') as asset_file:
    assetSide = json.load(asset_file)

with open('liabilityLines.txt', 'r') as liable_file:
    liableSide = json.load(liable_file)

In [124]:
# template dictionaries: every known line item starts out as missing (NaN)
assetDict = dict.fromkeys(assetSide, np.nan)
liableDict = dict.fromkeys(liableSide, np.nan)

In [125]:
def unstructured_data(filepaths, lineDictionary, lineItems) -> pd.DataFrame:
    """
    Forms unstructured data frame from .csv file(s) located in s3 bucket

    :param: filepaths
        iterable of s3 keys that store .csv file(s); the file name part of each key
        must look like '<CIK>-<year>...' (e.g. '1224385-2004.csv')
    :param: lineDictionary
        dictionary of total unstructured line items and corresponding (default) values,
        copied for every file so the template itself is never modified
    :param: lineItems
        list of line items (asset/liabilites) that will be searched for in the
        first column of each table

    :return: pd.DataFrame
        One row per readable file holding the CIK, Year, Name and the value found for
        each line item (template value where the item is absent). Files that have
        a single column are reported with a message and skipped. When no file
        produced a row, an empty frame with the expected columns is returned.
    """
    # one single-row dataframe per successfully parsed file
    rows = []

    # local scratch file used for the download (the s3 objects are .csv files)
    local_copy = 'temp.csv'

    # iterate through files from s3
    for file in filepaths:

        # retrieving downloaded files from s3 bucket
        try:
            s3.download_file(bucket, file, local_copy)
            df = pd.read_csv(local_copy)
        finally:
            # always remove the scratch file, even if the download or the parse failed
            if os.path.exists(local_copy):
                os.remove(local_copy)

        # clean dataframe should be of size greater than 1, otherwise skip the file
        # (skipped files contribute no row, so the final concat only sees dataframes)
        if len(df.columns) <= 1:
            print('{} - encountered issue reading PDF'.format(file))
            continue

        # create temporary dictionary copy for storage of values
        tempDict = lineDictionary.copy()

        # creating rows to track the CIK and year information released
        cik, year = file.split('/')[-1].split('-')

        tempDict['CIK'] = cik                 # CIK number for firm
        tempDict['Year'] = year[:4]           # Year for firm filing
        tempDict['Name'] = companyName(cik)   # returns the name of associated with the CIK

        # extract line items from each dataframe (balance sheet)
        lines = df[df.columns[0]]

        # filter dataframes according line items, and set line items as index
        filterDF = df[np.isin(lines, lineItems)]
        filterDF = filterDF.set_index(filterDF.columns[0])

        # iterate through each distinct item found (asset or liability items)
        for item in filterDF.index.unique():
            lineVal = filterDF.loc[item]                 # line item e.g. Cash $72,343 $71,231

            # in the event we have repeating 'item' lines (e.g. 2 Prepaid expense categories)
            # .loc returns a DataFrame, and we sum its columns
            if isinstance(lineVal, pd.DataFrame):
                lineVal = lineVal.sum()

            value = lineVal.iloc[0]     # first column value e.g. 72343, either singular or sum

            # store value with appropriate item name (pd.notna also copes with non-float values)
            if pd.notna(value):
                tempDict[item] = value

        # convert the dictionary values to a single-row dataframe for database construction
        rows.append(pd.DataFrame.from_dict(tempDict, orient='index').T)

    if not rows:
        # nothing could be read: return an empty table with the expected columns
        return pd.DataFrame(columns=list(lineDictionary) + ['CIK', 'Year', 'Name'])

    return pd.concat(rows)

In [126]:
def dfBuild(df:pd.DataFrame) -> pd.DataFrame:
    """
    Move the identifier columns to the front and discard columns without any value.

    :param: df
        table holding (at least) the columns 'CIK', 'Name' and 'Year'
    :return: pd.DataFrame
        same rows with 'CIK', 'Name', 'Year' as the first three columns, followed by the
        remaining columns in their original order; columns that are null in every row
        are removed
    """
    id_cols = ['CIK', 'Name', 'Year']

    # identifiers first, every other column afterwards in its existing order
    other_cols = [col for col in df.columns if col not in id_cols]
    ordered = df[id_cols + other_cols]

    # keep a column only if at least one row reports a value for it
    has_value = ordered.notnull().any()
    return ordered[has_value[has_value].index]

In [127]:
# unstructured data table from all cleaned asset .csv files:
# list the keys under asset_folder (the first listed entry is skipped), build one row
# per file, then move CIK/Name/Year to the front and drop line items that are null everywhere
# NOTE(review): the skipped first key is presumably the folder placeholder object - confirm
assetpaths = np.array(session.list_s3_files(bucket, asset_folder))[1:]
assetItters = unstructured_data(assetpaths, assetDict, assetSide)
assetDF = dfBuild(assetItters)

In [128]:
# write the unstructured asset table to a local .csv file (row index is not written)
assetDF.to_csv('unstructAsset.csv', index=False)

In [129]:
# unstructured data table from all cleaned liability & equity .csv files:
# list the keys under liable_folder (the first listed entry is skipped), build one row
# per file, then move CIK/Name/Year to the front and drop line items that are null everywhere
# NOTE(review): the skipped first key is presumably the folder placeholder object - confirm
liablepaths = np.array(session.list_s3_files(bucket, liable_folder))[1:]
liableItters = unstructured_data(liablepaths, liableDict, liableSide)
liableDF = dfBuild(liableItters)

In [130]:
# write the unstructured liability & equity table to a local .csv file (row index is not written)
liableDF.to_csv('unstructLiable.csv', index=False)