In [79]:
%%bash
# The notebook is written against the PyPDF2 1.x API (PdfFileReader,
# PdfFileWriter, getNumPages/getPage/addPage, utils.PdfReadError). Later major
# releases deprecate and then remove these names, so pin the version to keep
# "Restart & Run All" reproducible.
pip install --upgrade pip
pip install "PyPDF2==1.26.0"



## PDF Slicing for X-17A-5 Files
We slice the first 15 pages from each of the X-17A-5 files.

In [80]:
import os
import boto3
import numpy as np 

from sagemaker.session import Session
from PyPDF2 import PdfFileReader, PdfFileWriter, utils

In [81]:
# S3 layout: one bucket, a prefix holding the source filings, and a prefix
# that receives the extracted page subsets
bucket = "ran-s3-systemic-risk"
data_folder = 'Input/X-17A-5/'
export_folder = "Input/X-17A-5-Subsets/"

# AWS handles shared by the cells below: SageMaker session (S3 listings),
# S3 client (downloads / uploads) and Textract client
session = Session()
s3 = boto3.client('s3')
textract = boto3.client('textract')

In [82]:
# S3 keys of every source filing found under data_folder.
# A prefix listing may include zero-byte "folder" placeholder keys (they end
# in '/'). Filter those out explicitly instead of assuming exactly one such key
# sits at index 0 -- a blind [1:] silently drops a real filing when no
# placeholder is present.
importPaths = np.array([key for key in session.list_s3_files(bucket, data_folder)
                        if not key.endswith('/')])

In [83]:
def selectPages(pdf:PdfFileReader, pages) -> PdfFileWriter:
    """
    Copies the requested pages of a pdf into a new PdfFileWriter object.

    Parameters
    ----------
    pdf : PdfFileReader
        Source document the pages are read from.
    pages : iterable of int
        Zero-based page indices to keep, in the order they should appear
        (e.g. ``np.arange(15)``). Any iterable of integers works, including
        a plain list or an empty sequence.

    Returns
    -------
    PdfFileWriter
        Writer holding every requested page that exists in ``pdf``. Indices
        at or beyond the document's page count are skipped, so a document
        shorter than the request yields only the pages it actually has.
    """
    # writer that accumulates the selected pages
    pdfWriter = PdfFileWriter()
    nPages = pdf.getNumPages()

    for page_num in pages:
        # plain int so numpy integer types are handled uniformly
        page_num = int(page_num)

        # skip indices past the end of short documents instead of failing
        if page_num < nPages:
            pdfWriter.addPage(pdf.getPage(page_num))

    return pdfWriter

In [84]:
def extractSubset(path:str, export:str, pages:np.ndarray, bucket:str='ran-s3-systemic-risk',
                  export_folder:str="Input/X-17A-5-Subsets/"):
    """
    Extracts a subset of pages from a pdf stored on S3 and uploads the result.

    The source object is downloaded to a local scratch file, the requested
    pages are selected with ``selectPages``, written to a local file named
    ``export`` and uploaded to ``export_folder + export`` in the same bucket.
    Both local files are removed afterwards, whether or not the run succeeded.

    Parameters
    ----------
    path : str
        S3 key of the source pdf inside ``bucket``.
    export : str
        File name used for the local subset file and, prefixed with
        ``export_folder``, for the uploaded S3 key.
    pages : np.ndarray
        Page numbers forwarded to ``selectPages``.
    bucket : str
        Name of the S3 bucket read from and written to.
    export_folder : str
        S3 key prefix under which the subset is stored.

    Notes
    -----
    Uses the module-level ``s3`` client. A ``utils.PdfReadError`` while
    reading the pdf and any error while saving/uploading are reported with
    ``print`` rather than raised; download errors still propagate.
    """
    temp_file = 'temp.pdf'

    try:
        # retrieve the source pdf from the s3 bucket
        s3.download_file(bucket, path, temp_file)

        # read the pdf and build the subset of requested pages
        pdf = PdfFileReader(temp_file)
        subset = selectPages(pdf, pages)

        try:
            # write the subset to the local instance (the with-block closes it)
            with open(export, 'wb') as f:
                subset.write(f)

            # save contents to the AWS S3 bucket as specified
            with open(export, 'rb') as data:
                s3.upload_fileobj(data, bucket, export_folder + export)

            print('Saved file -> {}'.format(export))

        except Exception as err:
            # best-effort: report which file failed and why, then carry on
            print('Not able to save {}: {}'.format(export, err))

    except utils.PdfReadError:
        print('EOF marker not found - reject {}'.format(export))

    finally:
        # always clean up local scratch files, including on failure paths
        for local_file in (temp_file, export):
            if os.path.exists(local_file):
                os.remove(local_file)

In [90]:
# pages to keep from each pdf (zero-based indices 0..14, i.e. the first 15 pages)
pages = np.arange(15)

# keys already present in the export sub-folder
exportPaths = session.list_s3_files(bucket, export_folder)

# set copy for constant-time "already exported?" checks inside the loop
exportedKeys = set(exportPaths)

# NOTE(review): only the first import path is processed ([:1]) -- presumably a
# smoke test; drop the slice to process every file.
for pdf_file_path in importPaths[:1]:

    # derive the export file name from the source file's base name
    baseFile = pdf_file_path.split('/')[-1].split('.')[0]
    exportFile = '{0}-subset.pdf'.format(baseFile)

    # if our subset is not found in our s3 bucket we look to extract it
    if export_folder + exportFile not in exportedKeys:
        # pass bucket / export_folder explicitly so the upload target always
        # matches the location checked above, not the function's defaults
        extractSubset(pdf_file_path, exportFile, pages,
                      bucket=bucket, export_folder=export_folder)
    else:
        print('{} already saved'.format(exportFile))
        

Saved file -> 782124-2002-subset.pdf
