In [1]:
# Run on first instance to install required libraries
%pip install PyPDF2 pdf2image fitz PyMuPDF
# %pip install PyMuPDF==1.16.14

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import cv2
import fitz
import boto3
import numpy as np 

from sagemaker.session import Session
from pdf2image import convert_from_path
from PyPDF2 import PdfFileReader, PdfFileWriter, utils

## PDF Slicing for X-17A-5 Files
We slice the first 15 pages from each merged X-17A-5 file retrieved from the SEC, saving them as a PDF subset and as per-page PNG images.

In [3]:
def selectPages(pdf:PdfFileReader, pageSelection:list) -> PdfFileWriter:
    """
    Extracts pages from a pdf and returns a PdfFileWriter object 
    ------------------------------------------------------------------------------------------
    Input:
        :param: pdf (type PdfFileReader)
            A PdfFileReader object that represents a pdf file that has been read and interpreted
        :param: pageSelection (type list)   
            The page numbers to be selected from the pdf, as passed to pdf.getPage(). NOTE, these 
            page numbers do not have to be sequential, but often times are read as such
    Return:
        :param: pdfWriter (type PdfFileWriter)
            A writer holding the selected pages. If the pdf does not have more pages than the largest 
            requested page number, ALL pages of the pdf are returned instead of the selection. An empty 
            selection yields an empty writer.
    """
    # initialize a pdf object to store pdf pages
    pdfWriter = PdfFileWriter()
    nPages = pdf.getNumPages()

    # nothing requested -> nothing to copy (max() below would raise ValueError on an
    # empty selection); len() is used so numpy arrays are accepted as well as lists
    if len(pageSelection) == 0:
        return pdfWriter

    # to manage pdfs that don't contain as many pages as listed we fall back to
    # copying every page of the document that was provided
    if nPages > max(pageSelection):
        pagesToCopy = pageSelection
    else:
        pagesToCopy = range(nPages)

    for page_num in pagesToCopy:
        pdfWriter.addPage(pdf.getPage(page_num))

    return pdfWriter

In [4]:
def extractSubset(pages:list, export_file:str, source_file:str='temp.pdf'):
    """
    Extracts a subset of pages from a pdf, provided the page numbers are specified
    ------------------------------------------------------------------------------------------
    Input:
        :param: pages (type list)
            A list of page numbers to extract from the pdf; they are forwarded unchanged to selectPages 
        :param: export_file (type str)   
            The name for the pdf file to be exported, we traditionally keep the original pdf name, with the 
            accompanying subset tag (e.g. 'CITI-2020-02-22-subset.pdf')
        :param: source_file (type str)
            Path of the pdf to read from; defaults to 'temp.pdf', the local scratch file this 
            function has always read
    Return:
        This is a void function. On success the subset is written to export_file on the local disk. 
        Failures are reported with a printed message rather than an exception: when the pdf is 
        rejected with a PdfReadError no file is written at all.
    """

    try:
        # read pdf file and build the truncated copy
        pdf = PdfFileReader(source_file)
        subset = selectPages(pdf, pages)
    except utils.PdfReadError:
        print('EOF marker not found - reject {}'.format(export_file))
        return

    try:
        # the context manager closes the file, no explicit close() needed
        with open(export_file, 'wb') as f:
            subset.write(f)
    except Exception as err:
        # best-effort save: report the reason instead of hiding it, but do not swallow
        # KeyboardInterrupt / SystemExit the way a bare except would
        print('Not able to save local file {} ({})'.format(export_file, err))

In [5]:
def pdf2png(doc, index, filename):
    """
    Exports the image(s) embedded in one page of a pdf as a png file
    ------------------------------------------------------------------------------------------
    Input:
        :param: doc (type fitz.Document)
            The opened pdf document to read images from
        :param: index (type int)   
            The page number whose embedded images are exported
        :param: filename (type str)   
            Prefix of the exported png; the file is written to '<filename>-p<index>.png'
    Return: (type str or None)
        The name of the png file that was written, or None when the page holds no images.
        NOTE: every image found on the page is written to the same file name, so for a page 
        with several images only the last one remains on disk.
    """

    # PyMuPDF renamed its camelCase methods to snake_case; prefer the new name
    # when it exists and fall back to the legacy one otherwise
    list_page_images = getattr(doc, 'get_page_images', None) or doc.getPageImageList
    doc_pages = list_page_images(index)

    # check against empty lists
    if len(doc_pages) == 0:
        return None

    # write the png name for exportation
    export_file_name = "{}-p{}.png".format(filename, index)

    # iterate through each image found on the page
    for img in doc_pages:

        # first entry of the image record is the xref used to build the pixmap
        xref = img[0]
        pix = fitz.Pixmap(doc, xref)

        # a pixmap with a single component per pixel is written as-is,
        # anything else is converted to RGB before it is saved
        if pix.n != 1:
            pix = fitz.Pixmap(fitz.csRGB, pix)

        write_png = getattr(pix, 'save', None) or pix.writePNG
        write_png(export_file_name)

        # drop the reference so the pixmap memory can be released
        pix = None

    return export_file_name

In [6]:
if __name__ == "__main__":
    
    bucket = "ran-s3-systemic-risk"
    import_folder = 'Input/X-17A-5/'
    export_folder_pdf = "Input/X-17A-5-PDF-SUBSETS/"
    export_folder_png = "Input/X-17A-5-PNG-SUBSETS/"

    # Amazon Textract client and Sagemaker session
    textract = boto3.client('textract')
    s3 = boto3.client('s3')
    session = Session()
    
    # pages to keep from each pdf: page numbers 0..14, i.e. the first 15 pages.
    # The same limit drives both the pdf subset and the png export so they agree.
    n_pages = 15
    pages = np.arange(n_pages)
    
    # import paths for all the X-17A-5 files
    # (NOTE: keys ending in '/' are folder placeholders; they are filtered out
    #  explicitly rather than assuming one always sits at position zero)
    importPaths = np.array([path for path in session.list_s3_files(bucket, import_folder)
                            if not path.endswith('/')])

    # export file paths to document subfolder
    pdf_paths = session.list_s3_files(bucket, export_folder_pdf)
    png_paths = session.list_s3_files(bucket, export_folder_png)

    # sets give constant-time "already exported?" checks inside the loop
    pdf_saved = set(pdf_paths)
    png_saved = set(png_paths)
    
    for path_name in importPaths:
        print('Slicing information for ', path_name)
        
        # check to see if values are downloaded to s3 sub-bin
        # NOTE: the png check only looks for the page-0 image of the filing
        baseFile = path_name.split('/')[-1].split('.')[0]
        png_look_up = export_folder_png + baseFile + '/' + baseFile + '-p0.png'
        pdf_look_up = export_folder_pdf + baseFile + '-subset.pdf'

        need_pdf = pdf_look_up not in pdf_saved
        need_png = png_look_up not in png_saved

        # download the filing once, and only when at least one export is missing
        if need_pdf or need_png:
            s3.download_file(bucket, path_name, 'temp.pdf')

        try:
            # ---------------------------------------------------------------
            # PDF FILE DOWNLOAD
            # ---------------------------------------------------------------

            if need_pdf:

                # run the subset function to save a local subset file (void-function)
                export_name = baseFile + '-subset.pdf'
                extractSubset(pages.tolist(), export_name)

                # extractSubset only prints a message when the pdf is rejected, so
                # confirm a file was produced before trying to upload it
                if os.path.isfile(export_name):
                    try:
                        # save contents to AWS S3 bucket as specified
                        with open(export_name, 'rb') as data:
                            s3.upload_fileobj(data, bucket, export_folder_pdf + export_name)
                        print('\tSaved pdf files for -> {}'.format(export_name))
                    finally:
                        # remove local file after it has been created
                        os.remove(export_name)
                else:
                    print('\tNo pdf subset written for {} - upload skipped'.format(export_name))

            else:
                print('\t{} already saved pdf'.format(baseFile))

            # ---------------------------------------------------------------
            # PNG FILE DOWNLOAD
            # ---------------------------------------------------------------

            if need_png:

                # document class for temporary pdf (correspond to X-17A-5) filing 
                doc = fitz.open('temp.pdf')

                try:
                    # iterate through (at most) the first 15 pages of the document
                    for i in range(min(len(doc), n_pages)):
                        export_name = pdf2png(doc, i, baseFile)

                        # pdf2png returns None for pages without images
                        if export_name is not None:
                            try:
                                # save contents to AWS S3 bucket as specified
                                with open(export_name, 'rb') as data:
                                    s3.upload_fileobj(data, bucket,
                                                      export_folder_png + baseFile + '/' + export_name)
                            finally:
                                os.remove(export_name)
                finally:
                    # release the file handle before temp.pdf is deleted
                    doc.close()

                print('\tSaved png files for -> {}'.format(baseFile))

            else:
                print('\t{} already saved png'.format(baseFile))

        finally:
            # remove the scratch download, even when one of the steps failed
            if os.path.exists('temp.pdf'):
                os.remove('temp.pdf')

Slicing information for  Input/X-17A-5/1224385-2004-03-01.pdf
	1224385-2004-03-01 already saved pdf
	1224385-2004-03-01 already saved png
Slicing information for  Input/X-17A-5/1224385-2005-03-01.pdf
	1224385-2005-03-01 already saved pdf
	1224385-2005-03-01 already saved png
Slicing information for  Input/X-17A-5/1224385-2006-03-01.pdf
	1224385-2006-03-01 already saved pdf
	1224385-2006-03-01 already saved png
Slicing information for  Input/X-17A-5/1224385-2007-03-01.pdf
	1224385-2007-03-01 already saved pdf
	1224385-2007-03-01 already saved png
Slicing information for  Input/X-17A-5/1224385-2008-02-29.pdf
	1224385-2008-02-29 already saved pdf
	1224385-2008-02-29 already saved png
Slicing information for  Input/X-17A-5/1224385-2009-03-02.pdf
	1224385-2009-03-02 already saved pdf
	1224385-2009-03-02 already saved png
Slicing information for  Input/X-17A-5/1224385-2010-03-12.pdf
	1224385-2010-03-12 already saved pdf
	1224385-2010-03-12 already saved png
Slicing information for  Input/X-1

	Saved png files for -> 68136-2016-02-29
Slicing information for  Input/X-17A-5/68136-2017-03-01.pdf
	68136-2017-03-01 already saved pdf
	Saved png files for -> 68136-2017-03-01
Slicing information for  Input/X-17A-5/68136-2018-03-01.pdf
	68136-2018-03-01 already saved pdf
	Saved png files for -> 68136-2018-03-01
Slicing information for  Input/X-17A-5/68136-2019-03-01.pdf
	68136-2019-03-01 already saved pdf
	Saved png files for -> 68136-2019-03-01
Slicing information for  Input/X-17A-5/68136-2020-03-02.pdf
	68136-2020-03-02 already saved pdf
	68136-2020-03-02 already saved png
Slicing information for  Input/X-17A-5/68136-2021-02-26.pdf
	68136-2021-02-26 already saved pdf
	Saved png files for -> 68136-2021-02-26
Slicing information for  Input/X-17A-5/72267-2003-05-30.pdf
	72267-2003-05-30 already saved pdf
	72267-2003-05-30 already saved png
Slicing information for  Input/X-17A-5/72267-2004-05-28.pdf
	72267-2004-05-28 already saved pdf
	72267-2004-05-28 already saved png
Slicing informa

	Saved png files for -> 853784-2021-03-01
Slicing information for  Input/X-17A-5/91154-2002-03-01.pdf
	91154-2002-03-01 already saved pdf
	91154-2002-03-01 already saved png
Slicing information for  Input/X-17A-5/91154-2003-03-03.pdf
	91154-2003-03-03 already saved pdf
	91154-2003-03-03 already saved png
Slicing information for  Input/X-17A-5/91154-2004-02-27.pdf
	91154-2004-02-27 already saved pdf
	91154-2004-02-27 already saved png
Slicing information for  Input/X-17A-5/91154-2005-03-01.pdf
	91154-2005-03-01 already saved pdf
	91154-2005-03-01 already saved png
Slicing information for  Input/X-17A-5/91154-2006-03-01.pdf
	91154-2006-03-01 already saved pdf
	91154-2006-03-01 already saved png
Slicing information for  Input/X-17A-5/91154-2007-03-01.pdf
	91154-2007-03-01 already saved pdf
	91154-2007-03-01 already saved png
Slicing information for  Input/X-17A-5/91154-2008-02-29.pdf
	91154-2008-02-29 already saved pdf
	91154-2008-02-29 already saved png
Slicing information for  Input/X-1