In [1]:
%%bash
# Upgrade pip itself, then install the extra packages imported further down.
# NOTE(review): versions are unpinned; the recorded run installed smart_open 4.1.2,
# minecart 0.3.0 and textract-trp 0.1.3 -- consider pinning them for reproducibility.
pip install --upgrade pip
# smart_open supplies the `open` imported below; minecart is imported as `minecart`
pip install smart_open minecart
# presumably provides the `trp` module imported below -- TODO confirm
pip install textract-trp

Collecting pip
  Using cached pip-21.0.1-py3-none-any.whl (1.5 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 20.3.3
    Uninstalling pip-20.3.3:
      Successfully uninstalled pip-20.3.3
Successfully installed pip-21.0.1
Collecting smart_open
  Downloading smart_open-4.1.2-py3-none-any.whl (111 kB)
Collecting minecart
  Downloading minecart-0.3.0-py3-none-any.whl (23 kB)
Collecting pdfminer3k
  Downloading pdfminer3k-1.3.4-py3-none-any.whl (100 kB)
Installing collected packages: pdfminer3k, smart-open, minecart
Successfully installed minecart-0.3.0 pdfminer3k-1.3.4 smart-open-4.1.2
Collecting textract-trp
  Downloading textract_trp-0.1.3-py3-none-any.whl (5.8 kB)
Installing collected packages: textract-trp
Successfully installed textract-trp-0.1.3


In [35]:
import time 
import re
import os
import trp
import boto3
import minecart
import json
import logging 

import numpy as np
import pandas as pd

from smart_open import open
from sagemaker.session import Session

In [36]:
# name of the S3 bucket and the key prefix of the input X-17A-5 subsets
bucket = "ran-s3-systemic-risk"
data_folder ="Input/X-17A-5-Subsets/"

# key prefix of the balance-sheet .csv tables; the cells below list the files
# under this prefix and upload their cleaned versions back to the same place
out_folder = 'Output/X-17A-5-BS/'

# Amazon Textract and S3 clients, plus the Sagemaker session used to list S3 keys
textract = boto3.client('textract')
s3 = boto3.client('s3')
session = Session()

In [205]:
# keys of every object under the output prefix; the first listed entry is skipped
# (presumably the folder placeholder key itself -- TODO confirm)
paths = np.array(session.list_s3_files(bucket, out_folder))[1:]

**We begin by dropping the rows whose first column is NaN, and then replace every remaining NaN value with an empty string**

In [None]:
# Clean every table under the output prefix in place: download it, drop the rows
# without a value in the first column, blank out the remaining NaN cells and
# upload the result under the same key.
for csv in paths:
    fileName = csv.split('/')[-1]

    # local scratch copy of the table (it holds .csv text despite the '.pdf' name)
    s3.download_file(bucket, csv, 'temp.pdf')

    try:
        df = pd.read_csv('temp.pdf')

        # keep only the rows whose first column is populated, then map the
        # remaining NaN cells to an empty string
        filterDF = df[df[df.columns[0]].notna()].fillna('')

        # write the cleaned frame itself to disk; writing any other frame here
        # would upload stale data or fail on a fresh kernel
        filterDF.to_csv(fileName, index=False)

        try:
            # save contents to AWS S3 bucket (`open` is the one imported from smart_open)
            with open(fileName, 'rb') as data:
                s3.put_object(Bucket=bucket, Key=out_folder + fileName, Body=data)
        finally:
            # remove the local .csv even when the upload fails
            os.remove(fileName)
    finally:
        # remove the downloaded scratch copy even when cleaning fails
        os.remove('temp.pdf')

print('All .csv files are cleaned of NaN terms')

## Table column merging
**For tables with three columns, we merge the last two columns into a single column**

In [207]:
def singular_merge(df:pd.DataFrame) -> pd.DataFrame:
    """
    Collapse the second and third columns of a balance-sheet table into one column
    ----
    e.g.
    
    Converts a wide dataframe, balance sheet into a smaller rectangular form
                  0                                                 1                 2
            ====================================================================================
        0   Assets                      
        1   Cash and cash equivalents                       | $ 606,278      |     
        2   Cash and securities segregated pursuant         | 273,083        | 
        3   Collateralized short-term financing agreements: | NaN            | $ 1,345
    
    
    Rectangular form of the the dataframe ->
                   0                                                 1          
            =====================================================================
        0   Assets                      
        1   Cash and cash equivalents                       | $ 606,278        
        2   Cash and securities segregated pursuant         | 273,083        
        3   Collateralized short-term financing agreements: | $ 1,345            

    For each row the figure is taken from whichever of the two columns is populated.
    When both are populated they are combined with `+` (string concatenation for
    text, addition for numbers); when neither is, the result stays NaN.

    Parameters
    ----
    df : table with at least three columns (line item, figure, figure). Columns are
         addressed by position and any column after the third is ignored.

    Returns
    ----
    New dataframe with the same index as `df` and two columns named '0' and '1'.

    Raises
    ----
    IndexError if `df` has fewer than three columns.
    """
    labels = df.iloc[:, 0]
    left = df.iloc[:, 1]
    right = df.iloc[:, 2]

    # coalesce: prefer the second column and fall back to the third where it is
    # missing (a plain `left + right` would give NaN as soon as one side is NaN)
    merged = left.where(left.notna(), right)

    # rows holding a figure in both columns are still combined with `+`
    both = left.notna() & right.notna()
    if both.any():
        merged = merged.where(~both, left[both] + right[both])

    return pd.DataFrame({'0': labels, '1': merged})

In [211]:
# Collapse every table that carries its figures in more than one column and upload
# the merged version under the same key; two-column tables are left untouched.
for csv in paths:
    fileName = csv.split('/')[-1]
    
    # local scratch copy of the table (it holds .csv text despite the '.pdf' name)
    s3.download_file(bucket, csv, 'temp.pdf')
    
    try:
        df = pd.read_csv('temp.pdf')

        # more than two columns means the figures are spread over separate columns
        if df.columns.size > 2:
            tempDF = singular_merge(df)

            # writing data frame to .csv file
            tempDF.to_csv(fileName, index=False)

            try:
                # save contents to AWS S3 bucket (`open` is the one imported from smart_open)
                with open(fileName, 'rb') as data:
                    s3.put_object(Bucket=bucket, Key=out_folder + fileName, Body=data)

                print('We merged {}'.format(fileName))
            finally:
                # remove the local .csv even when the upload fails
                os.remove(fileName)
    finally:
        # remove the downloaded scratch copy even when merging fails
        os.remove('temp.pdf')

## Table Row Split
**Since rows in many of the existing tables can run together (two line items fused into a single row), we split such rows into separate rows with their respective values**

In [216]:
# issues with conjoined rows in tables
s3.download_file(bucket, 'Output/X-17A-5-BS/782124-2002.csv', 'temp.pdf')
df2 = pd.read_csv('temp.pdf')

In [217]:
# display the downloaded example table
df2

Unnamed: 0,0,1
0,ASSETS,
1,Cash and cash equivalents,"$ 222,336"
2,Cash and securities deposited with clearing or...,
3,in compliance with federal regulations,9071138
4,Securities purchased under agreements to resell,34764794
5,Securities borrowed,51094781
6,Securities received as collateral,3037956
7,Receivables:,
8,Customers,12373732
9,"Brokers, dealers and others",2751879


In [None]:
def row_split(df:pd.DataFrame, label_ratio:float=.66) -> pd.DataFrame:
    """
    Function designed to split compounded rows from Balance sheet dataframes
    
    A row counts as compounded when the cell in its second column is a string that
    holds several whitespace-separated tokens and does not start with a lone "$"
    (e.g. "9071138 34764794", but not "$ 222,336"). Such a row is replaced by two
    rows: its label is cut at `label_ratio` of its length and the two label pieces
    are paired, in order, with the first two tokens of the value.
    
    NOTE: Our objective isn't to achieve a perfect split, but rather create labels easy enough for our predictive 
    model can identify and accurately predict. This is not a perfect method and we make assumptions as to the data 

    Parameters
    ----
    df          : table whose first column holds the line-item label and whose second
                  column holds the figure; columns are addressed by position
    label_ratio : fraction of the label kept for the first of the two new rows

    Returns
    ----
    New dataframe, with a fresh 0..n-1 index, in which every compounded row has been
    replaced by its two pieces; `df` itself is not modified. Any column after the
    second is left empty (NaN) on the rows created by a split.
    """
    
    def find_splits(val) -> bool:
        """
        Compute a boolean measure to assess whether a value cell holds more than one figure
        """
        # NaN and numeric cells carry at most one figure
        if not isinstance(val, str):
            return False

        # split on any run of whitespace so stray or repeated blanks are not
        # mistaken for a second figure
        tokens = val.split()
        return len(tokens) > 1 and tokens[0] != '$'
    
    label_col, value_col = df.columns[0], df.columns[1]

    # positions (not index labels) of all the rows that need splitting
    positions = [pos for pos, val in enumerate(df.iloc[:, 1]) if find_splits(val)]
    
    pieces = []
    cursor = 0   # first row position not yet copied into `pieces`

    for pos in positions:

        # untouched rows between the previous split and this one
        if pos > cursor:
            pieces.append(df.iloc[cursor:pos])

        # a missing or non-text label is treated as text so it can still be cut
        lineName = df.iloc[pos, 0]
        if not isinstance(lineName, str):
            lineName = '' if pd.isna(lineName) else str(lineName)
        split = int(len(lineName) * label_ratio)

        # pair the two label pieces with the leading figures; a list of pairs
        # (rather than a dict) keeps both rows even when the pieces are equal
        values = df.iloc[pos, 1].split()
        pairs = list(zip([lineName[:split], lineName[split:]], values))

        # reuse the original column names so the rows line up when concatenated
        pieces.append(pd.DataFrame(pairs, columns=[label_col, value_col]))
        cursor = pos + 1

    # untouched rows after the last split
    if cursor < len(df):
        pieces.append(df.iloc[cursor:])

    # nothing to stitch together (empty table)
    if not pieces:
        return df.copy()

    return pd.concat(pieces, ignore_index=True)

## Table Totals Counting

In [38]:
# issues with unpacking total values from tables
s3.download_file(bucket, 'Output/X-17A-5-BS/42352-2005.csv', 'temp.pdf')
df = pd.read_csv('temp.pdf')

In [41]:
# preview the rows at positions 0 through 11 of the example table
df.iloc[:12]

Unnamed: 0,0,1
0,Cash and cash equivalents,"$ 665,532"
1,Cash and securities segregated in compliance w...,
2,and other regulations,33665205
3,"Receivables from brokers, dealers and clearing...",8974752
4,Receivables from customers and counterparties,12342912
5,Securities borrowed,193784018
6,Securities purchased under agreements to resell,30376416
7,"Financial instruments owned, at fair value",50257864
8,Financial instruments owned and pledged as col...,15194916
9,"Total financial instruments owned, at fair value",65452780
