In [1]:
import os
import sys
import urllib.request

sys.path.append("..")

from pdf2image import convert_from_path
import pytesseract
from PIL import Image

from utils.case import Case

In [2]:
# Directory (relative to this notebook) that is scanned for case JSON files.
DATA_LOCATION = "../data/samples"

# Paths of every ".json" file found directly inside DATA_LOCATION.
# os.listdir() returns names in arbitrary order, so the list is sorted to make
# the position of each case in the list reproducible between runs and machines.
case_files = sorted(
    os.path.join(DATA_LOCATION, file_name)
    for file_name in os.listdir(DATA_LOCATION)
    if file_name.endswith(".json")
)

In [3]:
def extract_text(doc, dpi=450, verbose=False):
    """Download a document's PDF, OCR every page and return the joined text.

    The PDF is fetched from ``doc.get_file_url()`` into ``tmp/<doc id>/doc.pdf``,
    rendered to one image per page, cropped, written out as JPEG files and
    passed through Tesseract. The temporary PDF and page images are removed
    before returning, even when a step fails; the directory itself is kept.

    Args:
        doc: Object providing ``get_file_url()`` and ``get_id()``.
        dpi: Resolution handed to ``convert_from_path`` when rendering pages.
        verbose: When true, print each page's text as soon as it is recognised.
            Off by default so that a caller printing the return value does not
            show every page twice.

    Returns:
        The text of all pages joined with single spaces, or ``None`` when the
        document has no file URL.

    Raises:
        Whatever ``urllib.request.urlopen``, ``convert_from_path``, Pillow or
        ``pytesseract`` raise; nothing is swallowed here.
    """
    url = doc.get_file_url()
    # Without a URL there is nothing to download. The previous guard only
    # returned when the document was *also* flagged unavailable, which let an
    # "available" document with no URL crash inside urlopen().
    if not url:
        return None

    directory = 'tmp/{}'.format(doc.get_id())
    # exist_ok makes this a no-op when the directory is already there.
    os.makedirs(directory, exist_ok=True)

    pdf_filename = '{0}/doc.pdf'.format(directory)
    page_filenames = []

    try:
        with urllib.request.urlopen(url) as response, open(pdf_filename, 'wb') as out_file:
            out_file.write(response.read())

        # convert PDF to images (one PIL image per page)
        page_images = convert_from_path(pdf_filename, dpi)

        # Crop away the top 5.5% and the bottom 11% of every page before OCR
        # (presumably header/footer stamps -- TODO confirm the intent).
        for i, page in enumerate(page_images):
            cropped = page.crop(
                (
                    0,
                    page.height * 0.055,
                    page.width,
                    page.height - page.height * 0.11,
                )
            )
            fn = '{0}/pg_{1}.jpg'.format(directory, i)
            # Register the name before saving so that a partially written file
            # is still cleaned up if save() fails.
            page_filenames.append(fn)
            cropped.save(fn)

        # convert images to text and compile
        texts = []
        for fn in page_filenames:
            # The context manager closes the image file handle after OCR.
            with Image.open(fn) as page_image:
                text = str(pytesseract.image_to_string(page_image))
            text = text.replace('-\n', '')  # drop hyphen + line break pairs
            text = text.replace('\n', ' ')
            text = text.strip()
            if verbose:
                print(text)
            texts.append(text)

        return ' '.join(texts)
    finally:
        # clean up temporary files whether or not the steps above succeeded
        for fn in page_filenames + [pdf_filename]:
            if os.path.exists(fn):
                os.remove(fn)

In [None]:
# Go through every sample case file, keep the cases whose chapter is 11, and
# print the OCR text of each document attached to their entries.
case = None  # remains None when case_files is empty
for i, fn in enumerate(case_files):
    case = Case(fn)
    if case.get_chapter() == 11:
        print('*************** Case {} ***************'.format(i))
        for entry in case.get_entries():
            for doc in entry.documents:
                print('*************** Document {}'.format(doc.get_id()))
                ocr_text = extract_text(doc)
                print(ocr_text)

*************** Case 82 ***************
*************** Document 6129377
United States Bankruptcy Court  Eastern District of North Carolina - Greenville Division V01untary Petltlon  Name of Debtor (if individual, enter Last, First, Middle): Gunboat International, Ltd.  Name of Joint Debtor (Spouse) (Last, First, Middle):  All Other Names used by the Joint Debtor in the last 8 years (include married, maiden, and trade names):  All Other Names used by the Debtor in the last 8 years (include married, maiden, and trade names):  FDBA Pure Yachting, Ltd; FDBA Gunboat Company  Last four digits of Soc. Sec. or Individual—Taxpayer I.D. (ITIN)/Complete EIN  (if more than one, state all)  27—4433540 Street Address of Debtor (N0. and Street, City, and State):  829 Harbor Road Wanchese, NC  Last four digits of Soc. Sec. 0r Individual—Taxpayer I.D. (ITIN) No./Complete EIN  (if more than one, state all)  Street Address of Joint Debtor (N0. and Street, City, and State):  ZIP Code ZIP Code  County of R

B4 (Ofﬁcial Form 4) (12/07)  In re  United States Bankruptcy Court  Eastern District of North Carolina - Greenville Division  Gunboat International, Ltd.  Case No.     LIST OF CREDITORS HOLDING 20 LARGEST UN SECURED CLAIMS  Debtor(s)     Chapter 1 1     Following is the list of the debtor's creditors holding the 20 largest unsecured Claims. The list is prepared in accordance with Fed. R. Bankr. P. 1007(d) for filing in this chapter 11 [or Chapter 9] case. The list does not include (1) persons who come Within the definition of "insider" set forth in 11 U.S.C. § 101, or (2) secured creditors unless the value of the collateral is such that the unsecured deficiency places the creditor among the holders of the 20 largest unsecured claims. If a minor child is one of the creditors holding the 20 largest unsecured claims, state the Child's initials and the name and address of the Child's parent or guardian, such as "A.B., a minor child, by John Doe, guardian." Do not disclose the child's  name

BLUEWATER SUPPLY  ATTN: MANAGER OR AGENT 1000 CLASSIC ROAD  APEX, NC 27539  BRADFORD MARINE  ATTN: MANAGER OR AGENT PO. BOX F-44867  FREEPORT  GRAND BAHAMAS  COMMONWEALTH FOREIGN EXCHG.  C/O JAMES J. NAGELBERG, ESQ. HINCKLEY ALLEN  100 WESTMINSTER ST, STE 1500 PROVIDENCE, RI 02903-2319  CORE COMPOSITES  ATTN: MANAGER OR AGENT ROM DEVELOPMENT  108 TUPELO STREET BRISTOL, RI 02809  CUSTOM COMFORT BY WINN, LTD ATTN: MANAGER OR AGENT  15 TERMINAL STREET  HOPEWELL, VA 23860  DIMENSION HARDWOODS ATTN: MANAGER OR AGENT 415 INDUSTRIAL BLVD.  NEW ALBANY, IN 47150  DUNNING AND ASSOCIATES, LLC ATTN: MANAGER OR AGENT  449 THAMES STREET  SUITE 308  NEWPORT, RI 02840  ERNIE'S BOAT CANVAS  & AWNING COMPANY  101 MEADOW LAKE CIRCLE JARVISBURG, NC 27947  SHANNON FALCONE FALMOUTH HARBOUR, ANTIGUA WEST INDIES  JOHN BLUMBERG 4550 GORDON DRIVE NAPLES, FL 34102  CAY ELECTRONICS  ATTN: MANAGER OR AGENT ONE MARITIME DRIVE PORTSMOUTH, RI 02871  COMPOSITE RIGGING  ATTN: MANAGER OR AGENT  342 COMPASS CIRCLE, UNIT 

PROCESS MACHINE AUTOMATION ATTN: MANAGER OR AGENT 5275 SADDLEBROOK DRIVE COLUMBUS, OH 43221  R.A. HOY HEARING & A/C ATTN: MANAGER OR AGENT PO. BOX 179  KITTY HAWK, NC 27949  ROSE WELDING & CRANE SVC. ATTN: MANAGER OR AGENT 1060 SOUTH GUMNECK ROAD COLUMBIA, NC 27925  SAIL MARINE GROUP  ATTN: MANAGER OR AGENT PO. BOX 864792  ORLANDO, FL 32886-4792  SAILTEC, INC.  ATTN: MANAGER OR AGENT 2930 CONGER COURT OSHKOSH, WI 54904  SEA-FIRE MARINE  ATTN: MANAGER OR AGENT 9331-A PHILADELPHIA ROAD ROSEDALE, MD 21237  SOUNDOWN CORPORATION ATTN: MANAGER OR AGENT 16 BROADWAY  SALEM, MA 01970  SPRING MEDIA  D/B/A BONNIER CORP 15255 ALTON PKWAY SUITE 300  IRVINE, CA 92618  TEAM ONE NEWPORT ATTN: MANAGER OR AGENT PO. BOX 1443  All—lﬂlnnh'l' I-‘I nnnAn  QUICK USA  ATTN: MANAGER OR AGENT 810 OREGON AVENUE  STE F  LINTHICUM HEIGHTS, MD 21090  REAL EARTH LANDSCAPES & IRRIGATION  ATTN: MANAGER OR AGENT PO. BOX 1652  KILL DEVIL HILLS, NC 27948  SAERTEX  ATTN: MANAGER OR AGENT 12000-A MT. HOLLYHUNTERSVILLE ROAD 

UNITED STATES BANKRUPTCY COURT EASTERN DISTRICT OF NORTH CAROLINA  GREENVILLE DIVISION IN RE: GUNBOAT INTERNATIONAL, LTD, CASE NO. 15-06271-5-DMW CHAPTER 11 DEBTOR.  EMERGENCY MOTION FOR AUTHORIZATION TO USE CASH COLLATERAL '  COME NOW GUNBOAT INTERNATIONAL, LTD. (the “Debtor”), by and through its undersigned counsel and pursuant to § 363 of the Bankruptcy Code, hereby respectfully moves this Court for an Order authorizing the Debtor to use cash collateral and, pursuant to Fed. R. Bankr. P. 4001(b)(2), in order to avoid immediate and irreparable harm to the bankruptcy estate, to hold an emergency hearing on this matter.  Pursuant to Fed. R. Bankr. P. 4001(b)(l)(B), the Debtor sets forth the following introductory statement:  The Debtor is a corporation organized and existing under the laws of the State of Rhode Island. The Debtor’s principal place of business is in Wanchese, Dare County, North Carolina. The Debtor designs and manufactures luxury performance cruising catamarans under th

EXHIBIT  -m.m 35.3 312. 8&8 33 89.3 SEN  lllllll E. ~53 Bialamnﬂ. algae lllllll  mnwﬁh whmdm $3.3 mmm.m~H maﬁa mnwdm mowdm  lgﬁﬁllllﬁﬁllﬁlgﬁalllllg 2? m g  ﬂllllllllllllllllillﬁ 8 h  Goods 8nd  mmﬁ  Illﬁlllillllll  Nah  m8; 838  "Haggai lllll . ﬁﬁﬁnﬁ'ﬁ ENE” ﬁﬁﬁﬁ  «3.8  8.3  3% a  ° .3. § 3 9‘  Iﬁﬂﬁlﬁllllli ‘3  S 56 Quad“  QNJ  O  a ‘7. H  5353 «2535 333$ 385$ 35an 3253 323::  mmmeXw .w 0252:“.     .33:  55.2 38  OS Wu 3.52 22m w..  Edkzm as  22.5.:  50.  Show:  «a 3:23:— En  .: ulna—o 0....  :3 «.389 «5.5.2.5 an 9:0.  .053.  85:...  wean-u—  " 8.3 wm¢ W”  2 mg mug  h  ESE—.3...”  3.33.2.3.  d: agﬂgﬁ
UNITED STATES BANKRUPTCY COURT EASTERN DISTRICT OF NORTH CAROLINA  GREENVILLE DIVISION IN RE: GUNBOAT INTERNATIONAL, LTD, CASE NO. 15-06271-5-DMW . CHAPTER 11  DEBTOR.  NOTICE OF EMERGENCY MOTION FOR AUTHORIZATION TO USE CASH COLLATERAL  PLEASE TAKE NOTICE of the EMERGENCY MOTION FOR AUTHORIZATION TO USE CASH COLLATERAL (the “Motion”) ﬁled simultaneously herewith on behalf of the Debtor in 