In [1]:
%%bash
# Install the notebook's third-party dependencies into the current environment.
# pip itself is upgraded first; none of the packages below are version-pinned,
# so a later re-run may resolve to newer releases than the ones logged in the output.
pip install --upgrade pip
# smart_open and minecart are both imported by the import cell of this notebook
pip install smart_open minecart
# presumably the distribution that provides the `trp` module imported by this notebook - TODO confirm
pip install textract-trp

Collecting pip
  Using cached pip-21.0.1-py3-none-any.whl (1.5 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 20.3.3
    Uninstalling pip-20.3.3:
      Successfully uninstalled pip-20.3.3
Successfully installed pip-21.0.1
Collecting smart_open
  Downloading smart_open-4.2.0.tar.gz (119 kB)
Collecting minecart
  Downloading minecart-0.3.0-py3-none-any.whl (23 kB)
Collecting pdfminer3k
  Downloading pdfminer3k-1.3.4-py3-none-any.whl (100 kB)
Building wheels for collected packages: smart-open
  Building wheel for smart-open (setup.py): started
  Building wheel for smart-open (setup.py): finished with status 'done'
  Created wheel for smart-open: filename=smart_open-4.2.0-py3-none-any.whl size=109630 sha256=bf55b8cdfac604fbfd77ead72e826b1a391cb2773bbb43358109ca9142678c84
  Stored in directory: /home/ec2-user/.cache/pip/wheels/05/12/87/d479d6a8f92130cd8b27e331cc433bb28dda9c20e57f0b1ab2
Successfully built smart-open
Installing collecte

In [96]:
import time 
import re
import os
import trp
import boto3
import minecart
import json
import logging 

import numpy as np
import pandas as pd

from smart_open import open
from sagemaker.session import Session

In [3]:
# S3 bucket used by every cell of this notebook, and the key prefix of the input subsets
# (data_folder is defined here but not referenced by any cell visible in this notebook)
bucket = "ran-s3-systemic-risk"
data_folder ="Input/X-17A-5-Subsets/"

# key prefix that the cells below list, download .csv files from and upload cleaned .csv files to
out_folder = 'Output/X-17A-5-BS/'

# Amazon Textract client, S3 client (used for download_file / put_object) and Sagemaker session
# (the session is used to list the S3 keys; the Textract client is not used by any visible cell)
textract = boto3.client('textract')
s3 = boto3.client('s3')
session = Session()

In [4]:
# list the object keys found under the output prefix; the [1:] slice skips the first listed key
# (presumably the folder placeholder object itself - TODO confirm against the bucket contents)
paths = np.array(session.list_s3_files(bucket, out_folder))[1:]

**We begin by dropping every row whose first column is NaN, and then map all remaining NaN terms to an empty string**

In [5]:
# Drop label-less rows from every balance-sheet .csv and write the result back to S3
for csv in paths:
    fileName = csv.split('/')[-1]

    # the S3 object is a .csv file; it is staged locally under the scratch name 'temp.pdf'
    s3.download_file(bucket, csv, 'temp.pdf')

    try:
        df = pd.read_csv('temp.pdf')

        # keep only the rows whose first column (the line-item label) is populated,
        # then blank out every remaining NaN cell
        filterDF = df[df[df.columns[0]].notna()]
        filterDF = filterDF.fillna('')

        # writing data frame to .csv file
        filterDF.to_csv(fileName, index=False)

        # save contents to AWS S3 bucket (same key prefix, so the object is overwritten in place)
        with open(fileName, 'rb') as data:
            s3.put_object(Bucket=bucket, Key=out_folder + fileName, Body=data)
    finally:
        # always clear the scratch files, even when reading or uploading fails,
        # so a failed iteration does not leave stale files behind
        for scratch in (fileName, 'temp.pdf'):
            if os.path.exists(scratch):
                os.remove(scratch)

print('All .csv files are cleaned of NaN terms')

All .csv files are cleaned of NaN terms


## Table column merging
**For tables with three columns we merge the last two columns into one single column**

In [6]:
def singular_merge(df:pd.DataFrame) -> pd.DataFrame:
    """
    Collapse a three-column balance sheet table into a two-column table by joining the
    second and third columns row by row.
    - Example releases include but are not limited to 1224385-2016, 72267-2003
    ----
    e.g.

    Wide balance sheet dataframe, where each figure sits in either column 1 or column 2
                  0                                                 1                 2
            ====================================================================================
        0   Assets                                          | NaN            | NaN
        1   Cash and cash equivalents                       | $ 606,278      | NaN
        2   Cash and securities segregated pursuant         | 273,083        | NaN
        3   Collateralized short-term financing agreements: | NaN            | $ 1,345


    Rectangular form of the dataframe ->
                   0                                                 1
            =====================================================================
        0   Assets                                          |
        1   Cash and cash equivalents                       | $ 606,278
        2   Cash and securities segregated pursuant         | 273,083
        3   Collateralized short-term financing agreements: | $ 1,345

    :param: df, dataframe with at least three columns (only the first three are used)
    :return: new dataframe with columns '0' (the first input column) and '1' (the text of the
             second and third input columns concatenated without a separator); the row index
             of the input is kept

    Raises IndexError when df has fewer than three columns.

    NOTE: missing cells become empty strings. Both figure columns are converted to text
    before joining, so a column that pandas parsed as numeric (e.g. 273083.0) no longer
    raises a TypeError when it is joined with a string column.
    """
    df = df.fillna('')    # fill all NaN values with empty string

    # we assume that the second and third columns are filled with figures, and that at most
    # one of them is populated on any given row
    figures = df.iloc[:, 1].astype(str) + df.iloc[:, 2].astype(str)

    # first column is carried over unchanged, the merged figures form the second column
    return pd.DataFrame({'0': df.iloc[:, 0], '1': figures})

In [7]:
# Reduce every table with more than two columns to a two-column table and write it back to S3
for csv in paths:
    fileName = csv.split('/')[-1]

    # the S3 object is a .csv file; it is staged locally under the scratch name 'temp.pdf'
    s3.download_file(bucket, csv, 'temp.pdf')

    try:
        df = pd.read_csv('temp.pdf')

        # if columns greater than 2, we have a weird data table
        if df.columns.size > 2:

            # two events could occur at this point (either total splits, or year splits)
            # check how sparse the third column is; isna() is used because counting np.nan
            # in a python list misses the NaNs of a numeric (float64) column
            n = df.shape[0]
            k = int(df[df.columns[2]].isna().sum())

            # if at least half of the third column is empty we assume this is a "fake column"
            # and merge it into the second one, otherwise we assume a year split and keep
            # only the first two columns
            if k >= int(n/2):
                tempDF = singular_merge(df)
            else:
                tempDF = df[df.columns[:2]]

            # writing data frame to .csv file
            tempDF.to_csv(fileName, index=False)

            # save contents to AWS S3 bucket
            with open(fileName, 'rb') as data:
                s3.put_object(Bucket=bucket, Key=out_folder + fileName, Body=data)

            print('We merged {}'.format(fileName))
    finally:
        # always clear the scratch files, even when an iteration fails part-way through
        for scratch in (fileName, 'temp.pdf'):
            if os.path.exists(scratch):
                os.remove(scratch)

## Table Row Split
**Since many of the existing tables contain rows that were read as one conjoined row, we split these rows back into separate rows with their own values**

In [8]:
def row_split(df:pd.DataFrame) -> pd.DataFrame:
    """
    Function designed to split conjoined rows from Balance sheet dataframes
    - Example releases include but are not limited to 42352-2015, 58056-2009, 58056-2013

    A row is treated as conjoined when its second column is a string holding more than one
    whitespace-separated figure (stand-alone "$" tokens are ignored). Such a row is replaced
    by two rows: the label is cut at 66% of its length and the first two figures are assigned
    to the two halves, any further figure on the row is discarded.

    :param: df, dataframe whose first column holds line labels and second column the figures
    :return: dataframe with every conjoined row replaced by two rows. Untouched rows keep
             their index label, the inserted rows all carry the index label 0 (so the index
             is not unique, use reset_index(drop=True) if unique labels are needed). When no
             row is conjoined the input dataframe itself is returned.

    NOTE: Our objective isn't to achieve a perfect split, but rather create labels easy enough for our predictive
    model to identify and accurately predict. This is not a perfect method and we make assumptions as to the data
    """

    def figure_tokens(val) -> list:
        """
        Break a cell into its figures, e.g. "$ 9,112,943 13,151,663" -> ["9,112,943", "13,151,663"]
        Non-string cells (NaN, numbers) hold no tokens.
        """
        if not isinstance(val, str):
            return []
        # split() without argument also absorbs repeated / leading / trailing spaces, so padding
        # is never mistaken for an extra figure; every "$" token is dropped, not just the first
        return [token for token in val.split() if token != '$']

    def find_splits(val) -> bool:
        """
        Compute a boolean measure to assess whether a row is conjoined or not
        """
        return len(figure_tokens(val)) > 1

    label_col, value_col = df.columns[0], df.columns[1]

    # positions (not index labels) of all the rows that hold more than one figure
    flags = df[value_col].apply(find_splits)
    positions = [pos for pos, flag in enumerate(flags) if flag]

    if not positions:
        return df

    # collect the untouched slices and the replacement rows in order, then concatenate once;
    # slicing by position keeps this independent of whatever labels the index carries
    pieces = []
    cursor = 0

    for pos in positions:
        if pos > cursor:
            pieces.append(df.iloc[cursor:pos])

        values = figure_tokens(df[value_col].iloc[pos])

        lineName = df[label_col].iloc[pos]
        if not isinstance(lineName, str):
            # a missing label cannot be cut, fall back to empty labels for both halves
            lineName = '' if pd.isna(lineName) else str(lineName)
        split = int(len(lineName) * .66)   # index where to cut the string (we assign a 66% cut-off)

        # only the first two figures are kept (so always two rows are created)
        mid = pd.DataFrame({label_col: [lineName[:split], lineName[split:]],
                            value_col: values[:2]},
                           index=[0, 0])
        pieces.append(mid)
        cursor = pos + 1

    if cursor < len(df):
        pieces.append(df.iloc[cursor:])

    return pd.concat(pieces)

In [10]:
# Split conjoined rows in every table; a table is re-uploaded only when its shape changed
for csv in paths:
    fileName = csv.rsplit('/', 1)[-1]

    # stage the S3 object locally (scratch name 'temp.pdf') and load it as a dataframe
    s3.download_file(Bucket=bucket, Key=csv, Filename='temp.pdf')
    df = pd.read_csv('temp.pdf')

    tempDF = row_split(df)

    # a different shape means at least one conjoined row was split
    if tempDF.shape != df.shape:
        print("Fixed the rows for {}".format(fileName))

        # write the repaired table locally, push it to the S3 bucket, then drop the local copy
        tempDF.to_csv(fileName, index=False)
        with open(fileName, 'rb') as data:
            s3.put_object(Bucket=bucket, Key=out_folder + fileName, Body=data)
        os.remove(fileName)

    # the staged download is no longer needed
    os.remove('temp.pdf')

print('\nWe fixed all conjoined tables in sample')

Fixed the rows for 42352-2007.csv
Fixed the rows for 42352-2015.csv
Fixed the rows for 42352-2016.csv
Fixed the rows for 42352-2017.csv
Fixed the rows for 42352-2018.csv
Fixed the rows for 42352-2019.csv
Fixed the rows for 58056-2009.csv
Fixed the rows for 58056-2010.csv
Fixed the rows for 58056-2012.csv
Fixed the rows for 58056-2013.csv
Fixed the rows for 58056-2015.csv
Fixed the rows for 58056-2016.csv
Fixed the rows for 58056-2018.csv
Fixed the rows for 58056-2019.csv
Fixed the rows for 58056-2020.csv
Fixed the rows for 68136-2002.csv
Fixed the rows for 68136-2004.csv
Fixed the rows for 68136-2005.csv
Fixed the rows for 68136-2008.csv
Fixed the rows for 68136-2009.csv
Fixed the rows for 72267-2008.csv
Fixed the rows for 72267-2009.csv
Fixed the rows for 72267-2010.csv
Fixed the rows for 72267-2012.csv
Fixed the rows for 72267-2014.csv
Fixed the rows for 72267-2015.csv
Fixed the rows for 72267-2016.csv
Fixed the rows for 72267-2017.csv
Fixed the rows for 72267-2018.csv
Fixed the rows

## Numeric Conversion
**Work on converting all string and poorly formatted quantities to a numerical type**

In [15]:
def cleanNumeric(value):
    """
    This function converts a string to a numeric quantity, handles weird string format
    :param: value, string with a hidden numeric quantity, a number, or a numpy array of these
    :return: float for a string input, the number itself for a numeric input, a float numpy
             array (converted element by element) for an array input. Anything that cannot
             be interpreted (empty string, text without digits, None, booleans, ...) gives NaN

    Complexity -> O(n)

    e.g.
        In[0]: $ 19,225     ->   Out[0]: 19225.0
        In[0]: $ 19,225.76  ->   Out[0]: 19225.76
        In[0]: (1,234)      ->   Out[0]: -1234.0
        In[0]: -            ->   Out[0]: 0.0

    """

    # ##############################################################
    def num_strip(number):
        """
        Nested function for extracting numerical quantities from one scalar
        """
        # if provided a non-empty string, perform regex operation
        if isinstance(number, str) and len(number) > 0:
            text = number.strip()

            # some accounting formats take () to be negative numbers, the opening bracket
            # may be preceded by a currency sign e.g. "$ (1,234)"
            if text.lstrip('$ ').startswith('('):
                text = '-' + text

            # keep only digits, the decimal point and the minus sign
            cleanValue = re.sub(r"[^0-9.\-]", "", text)

            # a lone "-" or "." is a placeholder, which we read as zero
            if cleanValue in ('-', '.'):
                return 0.0

            # try to convert the stripped value, otherwise return NaN
            try:
                return float(cleanValue)
            except ValueError:
                return np.nan

        # python and numpy numbers are returned unmodified; booleans are not figures
        elif isinstance(number, (int, float, np.integer, np.floating)) and not isinstance(number, bool):
            return number

        else:
            return np.nan
    # ##############################################################

    # if provided a numpy array then we perform an extraction per element in array
    if isinstance(value, np.ndarray):
        # otypes is fixed to float so that empty arrays work and the output type does not
        # depend on whatever the first element happens to be
        vFunc = np.vectorize(num_strip, otypes=[float])
        return vFunc(value)

    # scalars (strings, numbers, anything else) follow the same rules as array elements
    return num_strip(value)

In [68]:
# Convert the figures of every table to numbers, drop unreadable rows and write the table back to S3
for csv in paths:
    fileName = csv.rsplit('/', 1)[-1]

    # stage the S3 object locally (scratch name 'temp.pdf') and load it as a dataframe
    s3.download_file(Bucket=bucket, Key=csv, Filename='temp.pdf')
    df = pd.read_csv('temp.pdf')

    # run the numeric converter over the second column
    df[df.columns[1]] = df[df.columns[1]].apply(cleanNumeric)

    # discard every row that holds a NaN, rebuild the index and keep the '0' / '1' columns only
    df = df.dropna().reset_index()[['0', '1']]

    # write the table locally and push it to the S3 bucket
    df.to_csv(fileName, index=False)
    with open(fileName, 'rb') as data:
        s3.put_object(Bucket=bucket, Key=out_folder + fileName, Body=data)

    # both local files have served their purpose
    os.remove(fileName)
    os.remove('temp.pdf')

print('We converted all tables in the sample to numeric figures')

We converted all tables in the sample to numeric figures


## Accounting term matching
**Check to see if we report totals; these figures are not needed, with the exception of the total assets field**

In [179]:
def totals_check(df:pd.DataFrame) -> pd.DataFrame:
    """
    Checks to see if a row meets the condition of a total, if true we remove the row
    (this includes major and minor totals).

    A row is a total when its figure (second column) equals the sum of the figures of a
    contiguous run of rows ending directly above it. Rows already removed as totals and
    missing figures contribute nothing to these sums, which lets a grand total match the
    sum of the plain line items once its sub-totals are gone. The first row is never removed.

    :param: df, dataframe whose second column holds the numeric figures
    :return: copy of df without the rows identified as totals (remaining index labels are kept)

    NOTE: figures are compared with a tiny tolerance rather than exact equality, so totals of
    figures with cents (e.g. 0.1 + 0.2 vs 0.3) are still recognised.
    NOTE(review): no row is exempt from removal here, including a total assets row; confirm
    whether the caller is expected to protect that field.
    """
    m, n = df.shape

    # figures by position, independent of the index labels; non-numeric cells become NaN
    figures = pd.to_numeric(df.iloc[:, 1], errors='coerce').to_numpy(dtype=float)
    keep = np.ones(m, dtype=bool)

    for i in range(1, m):
        # check the value at a given index (forward index)
        item1 = figures[i]
        if np.isnan(item1):
            continue

        # compute backward sum (lookback index), extending the window one row at a time
        item2 = 0.0
        for j in range(i - 1, -1, -1):
            if keep[j] and not np.isnan(figures[j]):
                item2 += figures[j]

            # if we achieve this then we strip the total and break, no need to continue backward sum
            if np.isclose(item2, item1, rtol=1e-9, atol=1e-9):
                keep[i] = False
                break

    return df[keep]

In [158]:
# Strip the total rows from every table and store the result under the clean output prefix
for csv in paths:
    fileName = csv.rsplit('/', 1)[-1]

    # stage the S3 object locally (scratch name 'temp.pdf') and load it as a dataframe
    s3.download_file(Bucket=bucket, Key=csv, Filename='temp.pdf')
    df = pd.read_csv('temp.pdf')

    # accounting check on the numeric figures, total rows are removed
    tempDF = totals_check(df)

    # write the table locally and push it to the S3 bucket
    tempDF.to_csv(fileName, index=False)
    with open(fileName, 'rb') as data:
        s3.put_object(Bucket=bucket, Key='Output/X-17A-5-Clean/' + fileName, Body=data)

    # both local files have served their purpose
    os.remove(fileName)
    os.remove('temp.pdf')

print('We checked for accounting identity and removed accordingly')