In [1]:
%%bash
# Upgrade pip itself before installing the notebook's dependency.
pip install --upgrade pip
# Pin PyPDF2 to the release this notebook was executed with: the
# PdfFileReader / PdfFileWriter classes imported below were removed in PyPDF2 3.0,
# so an unpinned install breaks the notebook on a fresh environment.
pip install PyPDF2==1.26.0

Collecting pip
  Using cached pip-21.0-py3-none-any.whl (1.5 MB)
  Downloading pip-20.3.4-py2.py3-none-any.whl (1.5 MB)
Collecting PyPDF2
  Downloading PyPDF2-1.26.0.tar.gz (77 kB)
Building wheels for collected packages: PyPDF2
  Building wheel for PyPDF2 (setup.py): started
  Building wheel for PyPDF2 (setup.py): finished with status 'done'
  Created wheel for PyPDF2: filename=PyPDF2-1.26.0-py3-none-any.whl size=61084 sha256=ca2cd27f14af88c85985c63735155a8b8c795abe65b3ef2fabbd66581f3b7d0c
  Stored in directory: /home/ec2-user/.cache/pip/wheels/97/28/4b/142b7d8c98eeeb73534b9c5b6558ddd3bab3c2c8192aa7ab30
Successfully built PyPDF2
Installing collected packages: PyPDF2
Successfully installed PyPDF2-1.26.0


## PDF Slicing for X-17A-5 Files
We slice the first 15 pages from each of the merged X-17A-5 files retrieved from the SEC.

In [1]:
import os
import boto3
import numpy as np 

from sagemaker.session import Session
from PyPDF2 import PdfFileReader, PdfFileWriter, utils

In [2]:
# --- AWS handles ---
# SageMaker session (used below to list S3 keys) and the low-level boto3 clients
session = Session()
s3 = boto3.client('s3')
textract = boto3.client('textract')

# --- S3 locations ---
bucket = "ran-s3-systemic-risk"           # bucket that holds both inputs and outputs
data_folder = 'Input/X-17A-5/'            # key prefix of the source PDFs
export_folder = "Input/X-17A-5-Subsets/"  # key prefix the sliced PDFs are uploaded to

In [3]:
# import paths for all the source files stored under data_folder
# NOTE(review): the previous `[1:]` slice presumably existed to skip the S3 "folder"
# placeholder key (a key ending in '/'). Filtering such keys explicitly gives the same
# result when the placeholder is present, and no longer drops a real PDF when it is not.
importPaths = np.array([key for key in session.list_s3_files(bucket, data_folder)
                        if not key.endswith('/')])

In [6]:
def selectPages(pdf:PdfFileReader, pages) -> PdfFileWriter:
    """
    Extracts pages and returns a PdfFileWriter object

    Parameters
    ----------
    pdf : PdfFileReader
        Reader for the source document.
    pages : iterable of int
        Zero-based page numbers to copy (numpy array, list, range, ...).
        May be empty, in which case an empty writer is returned.

    Returns
    -------
    PdfFileWriter
        Writer holding the requested pages that exist in ``pdf``, in the
        order given by ``pages``. Page numbers at or beyond the end of the
        document are skipped, so a short document yields only the pages it has.
    """
    # initialize a pdf object to store the selected pdf pages
    pdfWriter = PdfFileWriter()
    nPages = pdf.getNumPages()

    for page_num in pages:
        # to manage pdfs that don't contain as many pages as listed:
        # only copy page numbers that are present in the document
        if page_num < nPages:
            # int() so numpy integers are handed over as plain Python ints
            pdfWriter.addPage(pdf.getPage(int(page_num)))

    return pdfWriter

In [7]:
def extractSubset(path:str, export:str, pages:np.ndarray, bucket:str='ran-s3-systemic-risk', 
                  export_folder:str="Input/X-17A-5-Subsets/"):
    """
    Extracts a subset of pages from a pdf, provided the page numbers are specified

    The source pdf is downloaded from S3 to a local 'temp.pdf', the requested pages
    are written to a local file named ``export``, and that file is uploaded to
    ``export_folder + export`` in the same bucket. Both local files are removed
    afterwards, whether or not the extraction succeeded.

    Parameters
    ----------
    path : str
        S3 key of the source pdf inside ``bucket``.
    export : str
        File name of the subset pdf (used locally and as the S3 key suffix).
    pages : np.ndarray
        Zero-based page numbers to keep, passed on to ``selectPages``.
    bucket : str
        S3 bucket used for both the download and the upload.
    export_folder : str
        S3 key prefix under which the subset is stored.

    Notes
    -----
    A ``utils.PdfReadError`` raised while reading the pdf is reported and swallowed.
    Failures while writing or uploading the subset are reported and swallowed.
    Errors raised by the S3 download itself propagate to the caller.
    """

    try:
        # retrieving downloaded files from s3 bucket
        s3.download_file(bucket, path, 'temp.pdf')

        # read pdf file and initialize empty pdf file to create subset
        pdf = PdfFileReader('temp.pdf')
        subset = selectPages(pdf, pages)

        try:
            # open file and save to local instance (the with-block closes the file)
            with open(export, 'wb') as f:
                subset.write(f)

            # save contents to AWS S3 bucket as specified
            with open(export, 'rb') as data:
                s3.upload_fileobj(data, bucket, export_folder + export)

            print('Saved file -> {}'.format(export))

        except Exception as err:
            # best-effort: report which file failed (and why) and carry on;
            # KeyboardInterrupt / SystemExit are intentionally not intercepted
            print('Not able to save {} ({})'.format(export, err))

    except utils.PdfReadError:
        print('EOF marker not found - reject {}'.format(export))

    finally:
        # remove local files after use; either may be missing if a step above failed
        for local_file in ('temp.pdf', export):
            if os.path.exists(local_file):
                os.remove(local_file)

In [8]:
# pages to keep from each pdf
pages = np.arange(15) 

# export file paths to document subfolder
exportPaths = session.list_s3_files(bucket, export_folder)

# set of already exported keys, built once so the membership test in the loop
# below does not rescan the whole listing for every input file
exportedKeys = set(exportPaths)

for pdf_file_path in importPaths:
    
    # derive the subset file name from the source file name:
    # last path component, truncated at its first '.'
    baseFile = pdf_file_path.split('/')[-1].split('.')[0]
    exportFile = '{0}-subset.pdf'.format(baseFile)

    # if our subset is not found in our s3 bucket we look to extract it 
    if export_folder + exportFile not in exportedKeys:
        extractSubset(pdf_file_path, exportFile, pages)
    else:
        print('{} already saved'.format(exportFile))
        

Saved file -> 1224385-2004-subset.pdf
Saved file -> 1224385-2005-subset.pdf
Saved file -> 1224385-2006-subset.pdf
Saved file -> 1224385-2007-subset.pdf
Saved file -> 1224385-2008-subset.pdf
Saved file -> 1224385-2009-subset.pdf
Saved file -> 1224385-2010-subset.pdf
Saved file -> 1224385-2011-subset.pdf
Saved file -> 1224385-2012-subset.pdf
Saved file -> 1224385-2013-subset.pdf
Saved file -> 1224385-2014-subset.pdf
Saved file -> 1224385-2015-subset.pdf
Saved file -> 1224385-2016-subset.pdf
Saved file -> 1224385-2017-subset.pdf
Saved file -> 1224385-2018-subset.pdf
Saved file -> 1224385-2019-subset.pdf
Saved file -> 1224385-2020-subset.pdf
Saved file -> 42352-2002-subset.pdf
Saved file -> 42352-2003-subset.pdf
Saved file -> 42352-2004-subset.pdf
Saved file -> 42352-2005-subset.pdf
Saved file -> 42352-2006-subset.pdf
Saved file -> 42352-2007-subset.pdf
Saved file -> 42352-2008-subset.pdf
Saved file -> 42352-2009-subset.pdf
Saved file -> 42352-2010-subset.pdf
Saved file -> 42352-2011-subse