In [53]:
import os
import re
import boto3
import itertools

import pandas as pd
import numpy as np
from sagemaker.session import Session

In [376]:
# create a darker background template (better for my eyes)
from jupyterthemes.stylefx import set_nb_theme
set_nb_theme('chesterish')

In [2]:
# initiate s3 bucket and corresponding data folder
# bucket = "ran-s3-systemic-risk"
# data_folder ="Input/X-17A-5/"

bucket = "ran-s3-systemic-risk"
data_folder ="Output/BalanceSheet/"

# Amazon Textract client and Sagemaker session
textract = boto3.client('textract')
s3 = boto3.client('s3')
session = Session()

# discover all of the pdfs that you want to parse
paths = np.array(session.list_s3_files(bucket, data_folder))

In [3]:
selections = np.random.choice(paths[1:], 100)

In [4]:
assetNames = []

In [5]:
for file in selections:
    # retrieving downloaded files from s3 bucket
    s3.download_file(bucket, file, 'temp.pdf')
    
    df = pd.read_csv('temp.pdf')
    names = df[df.columns[1]].dropna().values
    assetNames.append(names)
    
    # remove local file after it has been created
    os.remove('temp.pdf')

In [44]:
assetNames

[array(['Cash and cash equivalents', 'Receivables from broker-dealers',
        'Other receivables', 'Securities owned, at market value',
        'Fixed assets, at cost (net accumulated depreciation of $118)',
        'Total Assets', "Liabilities and Stockholders' Equity",
        'Liabilities:', 'Accounts payable and accrued expenses',
        'Total Liabilities', "Stockholders' equity", 'Common stock',
        'Retained earnings', "Total Liabilities and Stockholders' Equity"],
       dtype=object),
 array(['Cash and cash equivalents', 'Deposit with clearing organization',
        'Commissions and fees receivable', 'Prepaid expenses',
        'Total assets', 'Liabilities:',
        'Accounts payable and accrued expenses',
        'Management fees payable to affiliate',
        'Income taxes payable to CBSI', 'Total liabilities',
        "Shareholder's equity:",
        'Common stock, $1.00 par value, 50,000 shares authorized;',
        '1,000 shares issued and outstanding',
        'A

In [38]:
def intersection(mat:np.ndarray) -> np.ndarray:
# Finds the intersection of all array within the matrix
# :param: mat (np.array)

    interArr = mat[0]
    # itterate through the asset names 
    for i in range(len(mat)-1):
        # find the intersection amongst all array
        interArr = np.intersect1d(interArr, mat[i+1])
        
    return interArr

In [39]:
def splitJoin(nameList: np.ndarray) -> np.ndarray:
# Divides a list of strings seperated by spaces into a list and combines sets 

    a = map(lambda x: x.split(' '), nameList)
    return list(itertools.chain.from_iterable(a))

In [242]:
def regexCheck(string:str, searchTerm:str):
    """
    Checks a regex expression for a given string construct
    string (type str)
        A string to perform a regex search on e.g. "Cash and cash equiavalents"
    searchTerm (type str)
        A regex expression to execute a search e.g. "^Cash"
        
    return: the searched term in question
    """
    try:
        s = re.search(searchTerm, string, flags=re.I)
        return s.string
    except AttributeError:
        return None

In [434]:
sample = "Total liabilities and stockholder's equity"

In [436]:
regexCheck(sample, 'liabilities')

"Total liabilities and stockholder's equity"

In [410]:
keySearch = ['^cash', 'deposits', 'receivable', '^total assets', 'payable', 'total liabilities$',
             'common stock', 'preferred stock', 'earnings', 'equity$']

# keyNames = ['Cash & Equivalents', 'Deposits', 'Receivables', 'Total Assets', 'Other Assets', 'Payables', 'Earnings', 
#              'Total Liabilities', 'Total Equity', 'Common Stock', 'Preferred Stock']

In [288]:
index = 40

In [264]:
# retrieving downloaded files from s3 bucket
s3.download_file(bucket, selections[index], 'temp.pdf')

df = pd.read_csv('temp.pdf')

In [297]:
# clean dataframes for only the items and their immediate values
cleanDf = df[df.columns[1:3]].dropna() 

In [300]:
cleanDf['1'] = cleanDf['1'].apply(cleanNumeric)

In [302]:
cleanDf

Unnamed: 0,0,1
0,Cash and cash equivalents,58008.0
1,Deposits with clearing organizations (includes...,48044.0
2,"Receivable from brokers, dealers and clearing ...",163293.0
3,"Receivable from customers, net of allowance fo...",796626.0
4,"Securities owned, including amounts pledged of...",792201.0
5,"Notes receivable, net of accumulated amortizat...",43670.0
6,"Furniture, equipment and leasehold improvement...",7996.0
7,"Deferred income taxes, net",18498.0
8,"Right-of-use lease assets, net of accumulated ...",5040.0
9,Goodwill,10788.0


In [303]:
print('Data for {}\n'.format(selections[index]))

Data for Output/BalanceSheet/103623-20.csv



In [439]:
def extraction(searchNames:list, df:pd.DataFrame):
    # concat the list of dataframe for each extraction 
    concatList = []
    
    # select key for search names
    for key in searchNames:
        # filter regex for corresponding string expression
        filterSet = df['0'].str.contains(key, regex=True, flags=re.IGNORECASE)
        liability = df['0'].str.contains('liabilities', regex=True, flags=re.IGNORECASE)

        # if flags recievable or payable simply sum the entries
        if key == 'receivable' or key == 'payable':
            data = {'0':key, '1': df[filterSet]['1'].sum()}
            row = pd.DataFrame(data, columnNames)
        else:
            if ~df[filterSet].empty and ~df[liability].empty:
                row = df[filterSet & liability]
                print(row)
                concatList.append(row)
                row = df[filterSet & ~liability]
                print(row)
            else:
                row = df[filterSet]
        
        concatList.append(row)
    
    # concat the filtered dataframes, and remove duplicate rows
    comboDf = pd.concat(concatList)
    comboDf = comboDf.drop_duplicates()
    
    return comboDf

In [440]:
extraction(keySearch, cleanDf)

Empty DataFrame
Columns: [0, 1]
Index: []
                           0        1
0  Cash and cash equivalents  58008.0
Empty DataFrame
Columns: [0, 1]
Index: []
                                                   0        1
1  Deposits with clearing organizations (includes...  48044.0
Empty DataFrame
Columns: [0, 1]
Index: []
               0          1
11  Total assets  2064078.0
                    0          1
23  Total liabilities  1738079.0
Empty DataFrame
Columns: [0, 1]
Index: []
Empty DataFrame
Columns: [0, 1]
Index: []
                                                    0     1
26  Common stock, par value $100 per share - 1,000...  76.0
Empty DataFrame
Columns: [0, 1]
Index: []
Empty DataFrame
Columns: [0, 1]
Index: []
Empty DataFrame
Columns: [0, 1]
Index: []
                    0        1
28  Retained earnings  10283.0
                                             0          1
32  Total liabilities and stockholder's equity  2064078.0
                             0         1
31 

Unnamed: 0,0,1
0,Cash and cash equivalents,58008.0
1,Deposits with clearing organizations (includes...,48044.0
CIK,receivable,1003589.0
11,Total assets,2064078.0
CIK,payable,1036709.0
23,Total liabilities,1738079.0
26,"Common stock, par value $100 per share - 1,000...",76.0
28,Retained earnings,10283.0
32,Total liabilities and stockholder's equity,2064078.0
31,Total stockholder's equity,325999.0


In [215]:
columnNames = ['CIK', 'Year', 'Total Assets', 'Other Asset', 'Total Liabilites', 'Total Liabilites', 
               'Total Equity', 'Other Equity', 'Cash & Equivalents']
intialRow = [None]*len(columnNames)
# cik, year = selections[1].split('/')[-1].split('-')

[None, None, None, None, None, None, None, None, None]

In [232]:
def cleanNumeric(value:str) -> float:
    """
    This function converts a string to a numeric quantity, handles weird string format
    :param: value, string value with hidden numeric quanity  
    :return: floating point values
    
    Complexity -> O(n)
    
    e.g.
        In[0]: $ 19,225     ->   Out[0]: 19255
        In[0]: $ 19,225.76  ->   Out[0]: 19255.76
    """
    # some accounting formats take () to be negative numbers
    if value[0] == '(':
        value = '-' + value

    # perform regex operation scanning for only numeric quantities/identifiers
    cleanValue = re.sub("[^0-9|.|-]", "", value)
    if cleanValue in ['', '.', '-']:
        return 0.0
    else:
        return float(cleanValue)

In [234]:
cleanNumeric('-$19,223')

-19223.0