In [1]:
# One-time environment setup for this instance: refresh conda itself, then install
# poppler from conda-forge. pdf2image is imported further down in this notebook;
# NOTE(review): presumably poppler is the backend it needs for PDF -> image conversion.
%conda update -n base -c defaults conda  # to update conda environment
%conda install -c conda-forge poppler    # to install poppler PDF backend

Collecting package metadata (current_repodata.json): done
Solving environment: | 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/noarch::tifffile==2021.1.14=pyhd3eb1b0_1
  - defaults/linux-64::numpy-base==1.19.2=py37hfa32c7d_0
  - conda-forge/linux-64::numexpr==2.7.3=py37hdc94413_0
  - defaults/linux-64::secretstorage==3.3.1=py37h06a4308_0
  - defaults/linux-64::bokeh==2.2.3=py37_0
  - defaults/linux-64::anaconda-client==1.7.2=py37_0
  - defaults/linux-64::bottleneck==1.3.2=py37heb32a55_1
  - defaults/linux-64::imagecodecs==2021.1.11=py37h581e88b_1
  - defaults/linux-64::keyring==22.0.1=py37h06a4308_0
  - defaults/noarch::dask==2021.2.0=pyhd3eb1b0_0
  - defaults/linux-64::_anaconda_depends==2020.07=py37_0
  - defaults/linux-64::mkl_fft==1.3.0=py37h54f3939_0
  - defaults/linux-64::scikit-learn==0.23.2=py37h0573a6f_0
  - defaults/linux-64::spyder==4.2.1=py37h06a4308_1
  - defaults/linux-64::harf

done


  current version: 4.8.4
  latest version: 4.10.1

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/ec2-user/anaconda3/envs/python3

  added / updated specs:
    - poppler


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    aiobotocore-1.3.0          |     pyhd8ed1ab_0          40 KB  conda-forge
    astroid-2.5.6              |   py36h5fab9bb_0         300 KB  conda-forge
    botocore-1.20.49           |     pyhd8ed1ab_0         4.6 MB  conda-forge
    dataclasses-0.8            |     pyh787bdff_0          22 KB  conda-forge
    docutils-0.17.1            |   py36h5fab9bb_0         762 KB  conda-forge
    flask-cors-3.0.8           |             py_0          14 KB  conda-forge
    jupyter_console-5.2.0      |           py36_1          34 KB  conda-forge
    lxml-4.6.3                 |   py36h04a5ba7_

In [2]:
# Run on first instance to install required libraries and backend packages
%pip install PyPDF2 pdf2image fitz pillow
%pip install PyMuPDF==1.16.14

Collecting pdf2image
  Downloading pdf2image-1.15.1-py3-none-any.whl (10 kB)
Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl (20 kB)
Collecting pyxnat
  Downloading pyxnat-1.4.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 3.9 MB/s eta 0:00:011
Collecting configparser
  Downloading configparser-5.0.2-py3-none-any.whl (19 kB)
Collecting nibabel
  Downloading nibabel-3.2.1-py3-none-any.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 7.2 MB/s eta 0:00:01
Collecting configobj
  Downloading configobj-5.0.6.tar.gz (33 kB)
Collecting httplib2
  Downloading httplib2-0.19.1-py3-none-any.whl (95 kB)
[K     |████████████████████████████████| 95 kB 6.9 MB/s  eta 0:00:01
[?25hCollecting nipype
  Downloading nipype-1.6.0-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 31.7 MB/s eta 0:00:01
Collecting prov>=1.5.2
  Downloading prov-2.0.0-py3-none-any.whl (421 kB)
[K     |████████████████████████████████| 421 kB 50.1 MB/

In [3]:
import os
import cv2
import fitz
import boto3
import numpy as np 

from sagemaker.session import Session
from pdf2image import convert_from_path
from pdf2image.exceptions import PDFPageCountError
from PyPDF2 import PdfFileReader, PdfFileWriter, utils

## PDF Slicing for X-17A-5 Files
We slice the first 15 pages from each of the merged X-17A-5 files retrieved from the SEC.

In [4]:
def selectPages(pdf:PdfFileReader, pageSelection:list) -> PdfFileWriter:
    """
    Extracts pages from a pdf and returns a PdfFileWriter object 
    ------------------------------------------------------------------------------------------
    Input:
        :param: pdf (type PdfFileReader)
            A PdfFileReader object that represents a pdf file that has been read and interpreted
        :param: pageSelection (type list)   
            The zero-based page numbers to be selected from the pdf. NOTE, these page numbers do 
            not have to be sequential, but often times are read as such. Page numbers that the 
            document does not contain are skipped, and an empty selection is allowed
    Return:
        :param: pdfWriter (type PdfFileWriter)
            Returns a truncated PdfFile object that is smaller than or equal to the original parsed pdf;
            it holds the requested pages that exist, in the order they were requested
    """
    # initialize a pdf object to store pdf pages
    pdfWriter = PdfFileWriter()
    nPages = pdf.getNumPages()

    # Keep only the requested pages that are actually present. For a contiguous selection
    # starting at page 0 this yields "the first n pages, or the whole document when it is
    # shorter", without copying pages that were never requested and without calling max()
    # on an empty selection.
    for page_num in pageSelection:
        if -nPages <= page_num < nPages:
            pdfWriter.addPage(pdf.getPage(page_num))

    return pdfWriter 

In [5]:
def extractSubset(pages:list, export_file:str, source_file:str='temp.pdf'):
    """
    Extracts a subset of pages from a pdf, provided the page numbers are specified
    ------------------------------------------------------------------------------------------
    Input:
        :param: pages (type list)
            A list of page numbers to extract from a given pdf (e.g. [1, 2, 3, 4, 5, 6]) 
        :param: export_file (type str)   
            The name for the pdf file to be exported, we traditionally keep the original pdf name, with the 
            accompanying subset tag (e.g. 'CITI-2020-02-22-subset.pdf')
        :param: source_file (type str)
            Local path of the pdf to read the pages from; defaults to 'temp.pdf', the scratch file 
            this function has always read
    Return:
        This is a void function, we return no value(s); the subset is written to the local file 
        `export_file`. When the source pdf cannot be read, or the subset cannot be written, a 
        message is printed and no exception is raised
    """
    
    try:
        # read pdf file and build the page subset
        pdf = PdfFileReader(source_file)
        subset = selectPages(pdf, pages)

        try:
            # open file and save to local instance (the with-block closes the file)
            with open(export_file, 'wb') as f:
                subset.write(f)
        except Exception:
            # best-effort save: report and carry on, but let KeyboardInterrupt / SystemExit
            # propagate so an interrupted kernel actually stops
            print('Not able to save local file {}'.format(export_file))

    except utils.PdfReadError:
        print('EOF marker not found - reject {}'.format(export_file))

## Main File Execution

In [6]:
if __name__ == "__main__":
    
    bucket = "ran-s3-systemic-risk"
    import_folder = 'Input/X-17A-5/'
    export_folder_pdf = "Input/X-17A-5-PDF-SUBSETS/"
    export_folder_png = "Input/X-17A-5-PNG-SUBSETS/"
    
    # local scratch copy of the filing being processed (extractSubset reads this file name)
    temp_pdf = 'temp.pdf'
    
    # number of leading pages kept from every filing, and the PNG rendering resolution
    n_pages = 15
    png_dpi = 500
    
    # Amazon S3 client and Sagemaker session
    s3 = boto3.client('s3')
    session = Session()
    
    # zero-based page numbers to keep from each pdf
    page_numbers = list(range(n_pages))
    
    # import paths for all the X-17A-5 files
    # (NOTE: the listing can contain the folder placeholder key itself, which ends in '/';
    #  it is filtered out explicitly instead of assuming it sits at position zero)
    importPaths = [key for key in session.list_s3_files(bucket, import_folder) if not key.endswith('/')]
    
    # keys already present in the export sub-folders (sets give constant-time membership checks)
    pdf_paths = set(session.list_s3_files(bucket, export_folder_pdf))
    png_paths = set(session.list_s3_files(bucket, export_folder_png))
    
    for path_name in importPaths:
        print('Slicing information for ', path_name)
        
        # check to see if values are downloaded to s3 sub-bin
        baseFile = path_name.split('/')[-1].split('.')[0]
        png_look_up = export_folder_png + baseFile + '/' + baseFile + '-p0.png'
        pdf_look_up = export_folder_pdf + baseFile + '-subset.pdf'
        
        need_pdf = pdf_look_up not in pdf_paths
        need_png = png_look_up not in png_paths
        
        try:
            # retrieve the filing from the s3 bucket once, and only when something is missing
            if need_pdf or need_png:
                s3.download_file(bucket, path_name, temp_pdf)
            
            # ---------------------------------------------------------------
            # PDF FILE DOWNLOAD
            # ---------------------------------------------------------------
            
            if need_pdf:
                
                # run the subset function to save a local subset file (void-function)
                export_name = baseFile + '-subset.pdf'
                extractSubset(page_numbers, export_name)
                
                # extractSubset only prints a message when it rejects a pdf, so the local
                # file may not exist; skip the upload instead of aborting the whole batch
                if os.path.exists(export_name):
                    try:
                        # save contents to AWS S3 bucket as specified
                        with open(export_name, 'rb') as data:
                            s3.upload_fileobj(data, bucket, export_folder_pdf + export_name)
                            print('\tSaved pdf files for -> {}'.format(export_name))
                    finally:
                        # remove local file after it has been created
                        os.remove(export_name)
                else:
                    print('\tNo pdf subset was created for -> {}'.format(baseFile))
                
            else:
                print('\t{} already saved pdf'.format(baseFile))
            
            # ---------------------------------------------------------------
            # PNG FILE DOWNLOAD
            # ---------------------------------------------------------------
            
            if need_png:
                
                try:
                    # render only the pages that are kept; last_page stops the conversion
                    # early so long filings are not fully rasterized at high resolution
                    images = convert_from_path(temp_pdf, dpi=png_dpi, last_page=n_pages)
                    
                    for idx, image in enumerate(images[:n_pages]):
                        # write the png name for exportation
                        export_file_name = "{}-p{}.png".format(baseFile, idx)
                        
                        try:
                            # storing PDF page as a PNG file locally (using pdf2image)
                            image.save(export_file_name, 'PNG')
                            
                            # save contents to AWS S3 bucket as specified
                            with open(export_file_name, 'rb') as data:
                                s3.upload_fileobj(data, bucket, export_folder_png + baseFile + '/' + export_file_name)
                        finally:
                            # never leave page images behind, even if the upload fails
                            if os.path.exists(export_file_name):
                                os.remove(export_file_name)
                        
                    print('\tSaved png files for -> {}'.format(baseFile))
                    
                except PDFPageCountError:
                    print('\tEncountered PDFPageCounterError when trying to convert to png for -> {}'.format(baseFile))
                
            else:
                print('\t{} already saved png'.format(baseFile))
        
        finally:
            # remove the scratch pdf on every path (success, rejected pdf, or error)
            if os.path.exists(temp_pdf):
                os.remove(temp_pdf)

Slicing information for  Input/X-17A-5/1146184-2004-03-01.pdf
	Saved pdf files for -> 1146184-2004-03-01-subset.pdf
	Saved png files for -> 1146184-2004-03-01
Slicing information for  Input/X-17A-5/1146184-2005-03-02.pdf
	Saved pdf files for -> 1146184-2005-03-02-subset.pdf
	Saved png files for -> 1146184-2005-03-02
Slicing information for  Input/X-17A-5/1146184-2006-03-01.pdf
	Saved pdf files for -> 1146184-2006-03-01-subset.pdf
	Saved png files for -> 1146184-2006-03-01
Slicing information for  Input/X-17A-5/1146184-2007-02-26.pdf
	Saved pdf files for -> 1146184-2007-02-26-subset.pdf
	Saved png files for -> 1146184-2007-02-26
Slicing information for  Input/X-17A-5/1146184-2008-02-29.pdf
	Saved pdf files for -> 1146184-2008-02-29-subset.pdf
	Saved png files for -> 1146184-2008-02-29
Slicing information for  Input/X-17A-5/1146184-2009-03-02.pdf
	Saved pdf files for -> 1146184-2009-03-02-subset.pdf
	Saved png files for -> 1146184-2009-03-02
Slicing information for  Input/X-17A-5/1146184

	Saved pdf files for -> 1261467-2005-03-08-subset.pdf
	Saved png files for -> 1261467-2005-03-08
Slicing information for  Input/X-17A-5/1261467-2006-03-01.pdf
	Saved pdf files for -> 1261467-2006-03-01-subset.pdf
	Saved png files for -> 1261467-2006-03-01
Slicing information for  Input/X-17A-5/1261467-2007-03-02.pdf
	Saved pdf files for -> 1261467-2007-03-02-subset.pdf
	Saved png files for -> 1261467-2007-03-02
Slicing information for  Input/X-17A-5/1261467-2008-02-29.pdf
	Saved pdf files for -> 1261467-2008-02-29-subset.pdf
	Saved png files for -> 1261467-2008-02-29
Slicing information for  Input/X-17A-5/1261467-2009-03-02.pdf
	Saved pdf files for -> 1261467-2009-03-02-subset.pdf
	Saved png files for -> 1261467-2009-03-02
Slicing information for  Input/X-17A-5/1261467-2010-03-01.pdf
	Saved pdf files for -> 1261467-2010-03-01-subset.pdf
	Saved png files for -> 1261467-2010-03-01
Slicing information for  Input/X-17A-5/1261467-2011-03-01.pdf
	Saved pdf files for -> 1261467-2011-03-01-sub

	Saved pdf files for -> 26617-2005-05-26-subset.pdf
	Saved png files for -> 26617-2005-05-26
Slicing information for  Input/X-17A-5/26617-2006-05-26.pdf
	Saved pdf files for -> 26617-2006-05-26-subset.pdf
	Saved png files for -> 26617-2006-05-26
Slicing information for  Input/X-17A-5/26617-2007-05-29.pdf
	Saved pdf files for -> 26617-2007-05-29-subset.pdf
	Saved png files for -> 26617-2007-05-29
Slicing information for  Input/X-17A-5/26617-2008-05-29.pdf
	Saved pdf files for -> 26617-2008-05-29-subset.pdf
	Saved png files for -> 26617-2008-05-29
Slicing information for  Input/X-17A-5/26617-2009-05-29.pdf
	Saved pdf files for -> 26617-2009-05-29-subset.pdf
	Saved png files for -> 26617-2009-05-29
Slicing information for  Input/X-17A-5/26617-2009-06-08.pdf
	Saved pdf files for -> 26617-2009-06-08-subset.pdf
	Saved png files for -> 26617-2009-06-08
Slicing information for  Input/X-17A-5/26617-2010-05-28.pdf
	Saved pdf files for -> 26617-2010-05-28-subset.pdf
	Saved png files for -> 26617-

	Encountered PDFPageCounterError when trying to convert to png for -> 356628-1998-02-23
Slicing information for  Input/X-17A-5/356628-2002-02-25.pdf
	356628-2002-02-25 already saved pdf
	356628-2002-02-25 already saved png
Slicing information for  Input/X-17A-5/356628-2003-02-24.pdf
	356628-2003-02-24 already saved pdf
	356628-2003-02-24 already saved png
Slicing information for  Input/X-17A-5/356628-2004-03-01.pdf
	356628-2004-03-01 already saved pdf
	356628-2004-03-01 already saved png
Slicing information for  Input/X-17A-5/356628-2005-03-01.pdf
	356628-2005-03-01 already saved pdf
	356628-2005-03-01 already saved png
Slicing information for  Input/X-17A-5/356628-2006-03-02.pdf
	356628-2006-03-02 already saved pdf
	356628-2006-03-02 already saved png
Slicing information for  Input/X-17A-5/356628-2007-03-01.pdf
	356628-2007-03-01 already saved pdf
	356628-2007-03-01 already saved png
Slicing information for  Input/X-17A-5/356628-2008-02-29.pdf
	356628-2008-02-29 already saved pdf
	356

	Saved pdf files for -> 803012-2002-12-31-subset.pdf
	Saved png files for -> 803012-2002-12-31
Slicing information for  Input/X-17A-5/803012-2003-12-31.pdf
	Saved pdf files for -> 803012-2003-12-31-subset.pdf
	Saved png files for -> 803012-2003-12-31
Slicing information for  Input/X-17A-5/803012-2005-01-03.pdf
	Saved pdf files for -> 803012-2005-01-03-subset.pdf
	Saved png files for -> 803012-2005-01-03
Slicing information for  Input/X-17A-5/803012-2006-01-03.pdf
	Saved pdf files for -> 803012-2006-01-03-subset.pdf
	Saved png files for -> 803012-2006-01-03
Slicing information for  Input/X-17A-5/803012-2007-01-29.pdf
	Saved pdf files for -> 803012-2007-01-29-subset.pdf
	Saved png files for -> 803012-2007-01-29
Slicing information for  Input/X-17A-5/803012-2007-12-28.pdf
	Saved pdf files for -> 803012-2007-12-28-subset.pdf
	Saved png files for -> 803012-2007-12-28
Slicing information for  Input/X-17A-5/803012-2008-12-30.pdf
	Saved pdf files for -> 803012-2008-12-30-subset.pdf
	Saved png 

	Saved pdf files for -> 867626-2002-04-29-subset.pdf
	Saved png files for -> 867626-2002-04-29
Slicing information for  Input/X-17A-5/867626-2003-04-25.pdf
	Saved pdf files for -> 867626-2003-04-25-subset.pdf
	Saved png files for -> 867626-2003-04-25
Slicing information for  Input/X-17A-5/867626-2004-04-28.pdf
	Saved pdf files for -> 867626-2004-04-28-subset.pdf
	Saved png files for -> 867626-2004-04-28
Slicing information for  Input/X-17A-5/867626-2005-04-29.pdf
	Saved pdf files for -> 867626-2005-04-29-subset.pdf
	Saved png files for -> 867626-2005-04-29
Slicing information for  Input/X-17A-5/867626-2006-04-28.pdf
	Saved pdf files for -> 867626-2006-04-28-subset.pdf
	Saved png files for -> 867626-2006-04-28
Slicing information for  Input/X-17A-5/867626-2007-04-26.pdf
	Saved pdf files for -> 867626-2007-04-26-subset.pdf
	Saved png files for -> 867626-2007-04-26
Slicing information for  Input/X-17A-5/867626-2008-02-29.pdf
	Saved pdf files for -> 867626-2008-02-29-subset.pdf
	Saved png 