In [108]:
%%bash
# upgrade pip itself before installing the notebook's dependencies
pip install --upgrade pip
# smart_open and minecart are both imported in the import cell below
pip install smart_open minecart
# textract-trp supplies the `trp` module imported in the import cell below
pip install textract-trp



In [109]:
import time 
import re
import os
import trp
import boto3
import minecart
import json
import logging 

import numpy as np
import pandas as pd

from smart_open import open
from sagemaker.session import Session

In [110]:
# S3 bucket used throughout the notebook, and the key prefix of the X-17A-5 subset inputs
# (data_folder is not referenced by the cells visible in this notebook section)
bucket = "ran-s3-systemic-risk"
data_folder ="Input/X-17A-5-Subsets/"

# key prefix of the balance-sheet .csv tables: the cells below list, download and re-upload files under it
out_folder = 'Output/X-17A-5-BS/'

# boto3 clients for Amazon Textract and S3, plus a SageMaker session (used below to list S3 keys)
textract = boto3.client('textract')
s3 = boto3.client('s3')
session = Session()

In [111]:
# list every object key under the output prefix, skipping the first entry returned
# NOTE(review): presumably the first key is the folder placeholder object itself - confirm
paths = np.array(session.list_s3_files(bucket, out_folder))[1:]

**We begin by dropping the rows whose first column is NaN, and then map all remaining NaN values to an empty string**

In [112]:
for csv in paths:
    fileName = csv.split('/')[-1]

    try:
        # download the table locally (despite its name, 'temp.pdf' holds .csv content)
        s3.download_file(bucket, csv, 'temp.pdf')
        df = pd.read_csv('temp.pdf')

        # keep only the rows whose first column (the line-item name) is populated,
        # then map every remaining NaN to an empty string
        first_col = df.columns[0]
        filterDF = df[df[first_col].notna()].fillna('')

        # writing data frame to .csv file
        filterDF.to_csv(fileName, index=False)

        try:
            # save contents to AWS S3 bucket (overwrites the object the table was read from)
            with open(fileName, 'rb') as data:
                s3.put_object(Bucket=bucket, Key=out_folder + fileName, Body=data)
        finally:
            # remove the local copy even if the upload failed
            os.remove(fileName)

    finally:
        # remove the downloaded file even if reading or cleaning failed
        if os.path.exists('temp.pdf'):
            os.remove('temp.pdf')

print('All .csv files are cleaned of NaN terms')

All .csv files are cleaned of NaN terms


## Table column merging
**For tables with three columns we merge the last two columns into a single column**

In [113]:
def singular_merge(df:pd.DataFrame) -> pd.DataFrame:
    """
    Collapse a three-column balance sheet (name, value, value) into two columns (name, value)
    - Example releases include but are not limited to 1224385-2016, 72267-2003
    ----
    e.g.
    
    Converts a wide dataframe, balance sheet into a smaller rectangular form
                  0                                                 1                 2
            ====================================================================================
        0   Assets                                          | NaN            | NaN  
        1   Cash and cash equivalents                       | $ 606,278      | NaN
        2   Cash and securities segregated pursuant         | 273,083        | NaN
        3   Collateralized short-term financing agreements: | NaN            | $ 1,345
    
    
    Rectangular form of the dataframe ->
                   0                                                 1          
            =====================================================================
        0   Cash and cash equivalents                       | $ 606,278        
        1   Cash and securities segregated pursuant         | 273,083        
        2   Collateralized short-term financing agreements: | $ 1,345            
    
    Rules applied to each row, in order:
        1. if the second column is populated, keep (name, second column)
        2. otherwise, if the third column is populated, keep (name, third column)
        3. otherwise (both missing), keep (name, third column of the row above) only when
           the row above has BOTH value columns populated; in every other case the row is dropped
    
    :param df: dataframe with at least three columns, addressed by position
    :return: a new two-column dataframe with columns '0' and '1' and a fresh integer index
    """
    trans = []

    for i in range(df.shape[0]):
        row = df.iloc[i]

        name = row.iloc[0]
        col1 = row.iloc[1]
        col2 = row.iloc[2]

        # pd.isna is used instead of `is np.nan`: values coming from float columns are
        # NaN objects that are not the np.nan singleton, so an identity check misses them
        if not pd.isna(col1):
            trans.append([name, col1])
        elif not pd.isna(col2):
            trans.append([name, col2])

        # both values missing - look one row up; the first row has no row above it
        # (df.iloc[-1] would silently wrap around to the LAST row, so guard on i > 0)
        elif i > 0:
            prior_row = df.iloc[i-1]
            prior_col1 = prior_row.iloc[1]
            prior_col2 = prior_row.iloc[2]

            # if both values are present above, we use the right hand side value from that row
            if (not pd.isna(prior_col1)) and (not pd.isna(prior_col2)):
                trans.append([name, prior_col2])

    return pd.DataFrame(trans, columns=['0', '1'])

In [114]:
for csv in paths:
    fileName = csv.split('/')[-1]

    try:
        # download the table locally (despite its name, 'temp.pdf' holds .csv content)
        s3.download_file(bucket, csv, 'temp.pdf')
        df = pd.read_csv('temp.pdf')

        # if columns greater than 2, we have a weird data table
        if df.columns.size > 2:

            # two events could occur at this point (either total splits, or year splits)
            third_col = df[df.columns[2]]

            # count the missing entries of the third column; isna() is used because
            # list.count(np.nan) relies on object identity and misses NaNs in float columns
            n = third_col.size
            k = third_col.isna().sum()

            # K-check: if at least half the column is NaN we assume this is a "fake column"
            # and merge it into the second one, otherwise we assume a year split and keep
            # only the first two columns (an empty table also takes this branch)
            if n > 0 and k/n >= 0.50:
                tempDF = singular_merge(df)
            else:
                tempDF = df[df.columns[:2]]

            # writing data frame to .csv file
            tempDF.to_csv(fileName, index=False)

            try:
                # save contents to AWS S3 bucket
                with open(fileName, 'rb') as data:
                    s3.put_object(Bucket=bucket, Key=out_folder + fileName, Body=data)

                print('We merged {}'.format(fileName))
            finally:
                # remove the local copy even if the upload failed
                os.remove(fileName)

    finally:
        # remove the downloaded file even if reading or merging failed
        if os.path.exists('temp.pdf'):
            os.remove('temp.pdf')

## Table Row Split
**Since many of the existing tables contain conjoined (overlapping) rows, we split these rows into their separate line items and values**

In [115]:
def row_split(df:pd.DataFrame, text_file:dict) -> pd.DataFrame:
    """
    Function designed to split conjoined rows from Balance sheet dataframes
    - Example releases include but are not limited to 42352-2015, 58056-2009, 58056-2013, 58056-2019
    
    Input:
        :param df: (type pandas dataframe)
            References the balance sheet dataframe read in from AWS Textract; the first column
            is treated as the line-item name and the second column as the value
        :param text_file: (type dictionary)
            Stores text values with corresponding confidence level for balance sheet pages.
            Only the keys are consulted: each key longer than one character that occurs
            inside a conjoined line name is taken as one of its line items
    
    Output:
        :param return: (type pandas dataframe) 
            A processed dataframe of size greater than or equal to the inputed dataframe.
            When no row is split the input dataframe is returned as-is; otherwise a new
            dataframe with a fresh integer index is returned
    
    NOTE: Our objective isn't to achieve a perfect split, but rather create labels easy enough for our predictive 
    model to identify and accurately predict. This is not a perfect method and we make the assumption that a merged 
    row exists when a space exists in the value column (e.g. [19,345 2,213]). A conjoined row whose line items 
    cannot be matched one-to-one with its values is left untouched rather than raising an error.
    """
    
    # ##############################################################
    # ##############################################################
    
    def split_values(val) -> list:
        """
        Break a value cell into its individual figures, e.g. "$ 9,112,943 13,151,663" -> 
        ["9,112,943", "13,151,663"]; non-string cells (NaN, numbers) yield an empty list
        """
        if not isinstance(val, str):
            return []
        
        # split on any run of whitespace (so double or trailing spaces create no empty figures)
        # and drop stand-alone $ signs, which would otherwise cause false positives
        return [token for token in val.split() if token != '$']
    
    def extract_lineitems(val, dictionary, n_values:int) -> list:
        """
        Extract the line items found inside a conjoined line name, in the order of the dictionary keys
        """
        if not isinstance(val, str):
            return []
        
        # we check for real key names, avoiding single character keys
        splits = [key for key in dictionary
                  if isinstance(key, str) and len(key) > 1 and val.find(key) >= 0]
        
        # check whether we have a one-to-one mapping between line items and line values,
        # if we found too many names we keep only the trailing ones
        surplus = len(splits) - n_values
        if surplus > 0: return splits[surplus:]
        else: return splits
    
    # ##############################################################
    # ##############################################################    
    
    label_col, value_col = df.columns[0], df.columns[1]
    
    # the output is assembled once from positional slices of the original dataframe;
    # slicing by label after each concat breaks as soon as the index stops being unique
    pieces = []
    start = 0
    split_made = False
    
    for pos in range(df.shape[0]):
        
        # a row is conjoined when its value cell holds more than one figure
        values = split_values(df.iloc[pos, 1])
        if len(values) <= 1:
            continue
        
        # extract line names according to Text parsed list (requires parsed Text string)
        lineName = extract_lineitems(df.iloc[pos, 0], text_file, len(values))
        
        # without one name per value we cannot build the split rows - keep the row as it is
        if len(lineName) != len(values):
            continue
        
        # untouched rows between the previous split and this one
        if pos > start:
            pieces.append(df.iloc[start:pos])
        
        # one new row per (line item, value) pair, labelled like the input columns
        pieces.append(pd.DataFrame({label_col: lineName, value_col: values}))
        start = pos + 1
        split_made = True
    
    if not split_made:
        return df
    
    # untouched rows after the last split
    if start < df.shape[0]:
        pieces.append(df.iloc[start:])
    
    return pd.concat(pieces, ignore_index=True)

In [116]:
# track the presence of json file storing information on forms
if os.path.exists('X17A5-text.json'):
    with open('X17A5-text.json', 'r') as f: text = json.load(f)

    for csv in paths:
        fileName = csv.split('/')[-1]

        try:
            # download the table locally (despite its name, 'temp.pdf' holds .csv content)
            s3.download_file(bucket, csv, 'temp.pdf')
            df = pd.read_csv('temp.pdf')

            # compute the row merged figures
            # NOTE(review): `text` is loaded above but never used, and row_split receives the
            # first two '-' separated pieces of the file name (a list) where its signature
            # documents a dictionary of parsed text. The entry of `text` for this file was
            # presumably intended - confirm the JSON schema and pass that dictionary instead
            filter_json = fileName.split('-')[:2]
            tempDF = row_split(df, filter_json)

            # a change of shape means at least one conjoined row was split
            if tempDF.shape != df.shape:
                print("Fixed the rows for {}".format(fileName))

                # writing data frame to .csv file
                tempDF.to_csv(fileName, index=False)

                try:
                    # save contents to AWS S3 bucket
                    with open(fileName, 'rb') as data:
                        s3.put_object(Bucket=bucket, Key=out_folder + fileName, Body=data)
                finally:
                    # remove the local copy even if the upload failed
                    os.remove(fileName)

        finally:
            # remove the downloaded file even if reading or splitting failed
            if os.path.exists('temp.pdf'):
                os.remove('temp.pdf')

    print('We fixed all conjoined tables in sample')
    
else:
    print('We do not have a text file')

We fixed all conjoined tables in sample


## Numeric Conversion
**Convert all string and poorly formatted quantities to a numeric type**

In [117]:
def cleanNumeric(value):
    """
    This function converts a string to a numeric quantity, handles weird string format
    :param: value, a string with a hidden numeric quantity, a plain number, or a numpy array of either
    :return: for a string, a floating point value (np.nan when no number can be recovered);
             for an int or float (including numpy scalars), the value unchanged;
             for a numpy array, a float array with every element converted by the same rules;
             for anything else (None, booleans, empty strings), np.nan
    
    Complexity -> O(n)
    
    e.g.
        In[0]: $ 19,225     ->   Out[0]: 19225.0
        In[0]: $ 19,225.76  ->   Out[0]: 19225.76
        In[0]: $ (1,234)    ->   Out[0]: -1234.0
        In[0]: -            ->   Out[0]: 0.0
        
    """
    
    # ##############################################################
    # ##############################################################
    
    def num_strip(number):
        # booleans are integers in Python, but they are never a balance sheet figure
        if isinstance(number, (bool, np.bool_)):
            return np.nan
        
        # numbers (python or numpy scalars) are returned as they are, no need to modify
        if isinstance(number, (int, float, np.integer, np.floating)):
            return number
        
        # anything that is not a non-empty string cannot hold a figure
        if not isinstance(number, str) or len(number) == 0:
            return np.nan
        
        # check for accounting formats that use parenthesis to signal losses; the opening
        # parenthesis may follow a currency sign and spaces, e.g. "$ (1,234)"
        if number.replace('$', '').strip().startswith('('):
            number = '-' + number

        # case replacing to handle poor textract reading of numbers
        number = number.replace('I', '1').replace('l', '1')
        
        # --------------------------------------------------------------
        # Explanation of the Regex Expression:
        #      [^0-9.\-]      = match all elements that are not numeric 0-9, periods "." or hyphens "-"
        #      (?<!^)-        = match all elements that are hyphens "-" not in the first index position
        #      \.(?=[^.]*\.)  = match all elements that are periods "." except the last instance
        # --------------------------------------------------------------
        
        check1 = re.sub(r"[^0-9.\-]", "", number)         # remove everything but digits, periods "." and hyphens "-"
        check2 = re.sub(r"(?<!^)-", "", check1)           # removes all "-" that aren't in the first index 
        check3 = re.sub(r"\.(?=[^.]*\.)", "", check2)     # removes all periods except the last instance of "." 
        
        # --------------------------------------------------------------
        
        # we consider weird decimal values that exceed 2 spaces to the right (e.g. 432.2884);
        # with more than 2 trailing digits we assume the period was misplaced and drop it
        period_check = check3.find('.')
        if period_check >= 0 and len(check3) - period_check - 1 > 2:
            check3 = check3.replace('.', '')
        
        # a lone "-" or "." is read as zero
        if check3 in ('-', '.'):
            return 0.0
        
        # try to cast to floating point value, else flag NaN
        try: 
            return float(check3)
        except ValueError: 
            return np.nan
        
    # ##############################################################
    # ##############################################################
    
    # if provided a numpy array we perform the extraction per element; the output type is
    # fixed to float so that empty arrays work and the first element cannot force integers
    if isinstance(value, np.ndarray):
        vFunc = np.vectorize(num_strip, otypes=[float])
        return vFunc(value)
    
    return num_strip(value)

In [118]:
for csv in paths:
    fileName = csv.split('/')[-1]

    try:
        # download the table locally (despite its name, 'temp.pdf' holds .csv content)
        s3.download_file(bucket, csv, 'temp.pdf')
        df = pd.read_csv('temp.pdf')

        # pass numeric converter to the value column (second column)
        value_col = df.columns[1]
        df[value_col] = df[value_col].apply(cleanNumeric)

        # remove rows holding NaN values, renumber the index and keep the name and value
        # columns; they are selected by position so the step does not depend on the
        # columns being labelled '0' and '1'
        df = df.dropna().reset_index(drop=True).iloc[:, :2]

        # writing data frame to .csv file
        df.to_csv(fileName, index=False)

        try:
            # save contents to AWS S3 bucket
            with open(fileName, 'rb') as data:
                s3.put_object(Bucket=bucket, Key=out_folder + fileName, Body=data)
        finally:
            # remove the local copy even if the upload failed
            os.remove(fileName)

    finally:
        # remove the downloaded file even if reading or converting failed
        if os.path.exists('temp.pdf'):
            os.remove('temp.pdf')

print('We converted all tables in the sample to numeric figures')

We converted all tables in the sample to numeric figures
